├── utils
│   ├── __init__.py
│   ├── misc_utils.py
│   ├── infer_utils.py
│   └── train_utils.py
├── benchmarks
│   ├── __init__.py
│   └── OTB_Toolkit
│       └── scripts
│           └── bscripts
│               ├── __init__.py
│               ├── README.txt
│               ├── run_SA_Siam_Semantic.py
│               ├── run_SA_Siam_Appearance.py
│               └── run_SA_Siam.py
├── embeddings
│   ├── __init__.py
│   └── sa_siam.py
├── inference
│   ├── __init__.py
│   ├── tracker.py
│   └── inference_wrapper.py
├── metrics
│   ├── __init__.py
│   └── track_metrics.py
├── scripts
│   ├── __init__.py
│   ├── preprocess_VID_data.py
│   └── build_VID2015_imdb.py
├── datasets
│   ├── __init__.py
│   ├── sampler.py
│   ├── transforms.py
│   ├── vid.py
│   └── dataloader.py
├── README.md
├── LICENSE
├── experiments
│   ├── train-semantic-network.py
│   ├── train-appearance-network.py
│   └── gen-sa-siam-cfg.py
├── .gitignore
├── SECURITY.md
├── configuration.py
├── train_siamese_model.py
└── siamese_model.py
/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /benchmarks/OTB_Toolkit/scripts/bscripts/__init__.py: -------------------------------------------------------------------------------- 1 | from .run_SA_Siam_Semantic import * 2 | from .run_SA_Siam_Appearance import * 3 | from .run_SA_Siam import * -------------------------------------------------------------------------------- /benchmarks/OTB_Toolkit/scripts/bscripts/README.txt: -------------------------------------------------------------------------------- 1 | Functions for running trackers. 2 | You can add your own script files. 3 | - form : run_<tracker name>(seq, resultpath, saveimage) 4 | - return : dictionary type variable (has 'res', 'type', 'fps' fields) 5 | You must import them in '__init__.py' and add the exe (or matlab script) file into tracker_benchmark/trackers/<tracker name>/ 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributing 3 | 4 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 5 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 6 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
7 | 8 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 9 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 10 | provided by the bot. You will only need to do this once across all repos using our CLA. 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 14 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 15 | -------------------------------------------------------------------------------- /datasets/sampler.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | 8 | """Dataset Sampler""" 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import numpy as np 15 | 16 | 17 | class Sampler(object): 18 | def __init__(self, data_source, shuffle=True): 19 | self.data_source = data_source 20 | self.shuffle = shuffle 21 | 22 | def __iter__(self): 23 | data_idxs = np.arange(len(self.data_source)) 24 | if self.shuffle: 25 | np.random.shuffle(data_idxs) 26 | 27 | for idx in data_idxs: 28 | yield idx 29 | 30 | 31 | if __name__ == '__main__': 32 | x = [1, 2, 3] 33 | sampler = Sampler(x, shuffle=True) 34 | p = 0 35 | for xx in sampler: 36 | print(x[xx]) 37 | p += 1 38 | if p == 10: break 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /experiments/train-semantic-network.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 
6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 9 | 10 | """Train the color model in the SiamFC paper from scratch""" 11 | from __future__ import absolute_import 12 | from __future__ import division 13 | from __future__ import print_function 14 | 15 | import os.path as osp 16 | import sys 17 | 18 | CURRENT_DIR = osp.dirname(__file__) 19 | sys.path.append(osp.join(CURRENT_DIR, '..')) 20 | 21 | from configuration import LOG_DIR 22 | from train_siamese_model import ex 23 | 24 | if __name__ == '__main__': 25 | RUN_NAME = 'SA-Siam-Semantic' 26 | ex.run(config_updates={'train_config': {'train_dir': osp.join(LOG_DIR, 'track_model_checkpoints', RUN_NAME), }, 27 | 'track_config': {'log_dir': osp.join(LOG_DIR, 'track_model_inference', RUN_NAME), }, 28 | 'model_config': {'sa_siam_config': {'en_semantic': True, }, }, 29 | }, 30 | options={'--name': RUN_NAME, 31 | '--force': True, 32 | '--enforce_clean': False, 33 | }) 34 | -------------------------------------------------------------------------------- /experiments/train-appearance-network.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 9 | 10 | """Train the color model in the SiamFC paper from scratch""" 11 | from __future__ import absolute_import 12 | from __future__ import division 13 | from __future__ import print_function 14 | 15 | import os.path as osp 16 | import sys 17 | 18 | CURRENT_DIR = osp.dirname(__file__) 19 | sys.path.append(osp.join(CURRENT_DIR, '..')) 20 | 21 | from configuration import LOG_DIR 22 | from train_siamese_model import ex 23 | 24 | if __name__ == '__main__': 25 | RUN_NAME = 'SA-Siam-Appearance' 26 | ex.run(config_updates={'train_config': {'train_dir': osp.join(LOG_DIR, 'track_model_checkpoints', RUN_NAME), }, 27 | 'track_config': {'log_dir': osp.join(LOG_DIR, 'track_model_inference', RUN_NAME), }, 28 | 'model_config': {'sa_siam_config': {'en_appearance': True, }, }, 29 | }, 30 | options={'--name': RUN_NAME, 31 | '--force': True, 32 | '--enforce_clean': False, 33 | }) 34 | -------------------------------------------------------------------------------- /experiments/gen-sa-siam-cfg.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 
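# Configuration-only run: the overrides below request 0 training epochs and enable both
# 'en_semantic' and 'en_appearance', so this experiment mainly materializes the merged
# SA-Siam run directory (with its saved config JSONs) rather than training a model;
# benchmarks/OTB_Toolkit/scripts/bscripts/run_SA_Siam.py later reads that configuration
# via load_cfgs() and restores the separately trained branch checkpoints into it.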
9 | 10 | """Train the color model in the SiamFC paper from scratch""" 11 | from __future__ import absolute_import 12 | from __future__ import division 13 | from __future__ import print_function 14 | 15 | import os.path as osp 16 | import sys 17 | 18 | CURRENT_DIR = osp.dirname(__file__) 19 | sys.path.append(osp.join(CURRENT_DIR, '..')) 20 | 21 | from configuration import LOG_DIR 22 | from train_siamese_model import ex 23 | 24 | if __name__ == '__main__': 25 | RUN_NAME = 'SA-Siam' 26 | ex.run(config_updates={'train_config': {'train_dir': osp.join(LOG_DIR, 'track_model_checkpoints', RUN_NAME), 27 | 'train_data_config':{'epoch': 0}}, 28 | 'track_config': {'log_dir': osp.join(LOG_DIR, 'track_model_inference', RUN_NAME), }, 29 | 'model_config': {'sa_siam_config': {'en_semantic': True, 30 | 'en_appearance': True}, }, 31 | }, 32 | options={'--name': RUN_NAME, 33 | '--force': True, 34 | '--enforce_clean': False, 35 | }) 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /datasets/transforms.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 
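# Usage sketch (mirroring datasets/dataloader.py): each transform below is a callable on
# an image tensor, and transforms are chained with Compose, e.g.
#
#   z_transform = Compose([RandomStretch(),
#                          CenterCrop((255 - 8, 255 - 8)),
#                          RandomCrop(255 - 2 * 8),
#                          CenterCrop((255 - 2 * 8, 255 - 2 * 8))])
#   exemplar_image = z_transform(exemplar_image)  # exemplar_image: an HxWx3 image tensor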
7 | 8 | 9 | """Various transforms for video and image augmentation""" 10 | 11 | import numbers 12 | 13 | import tensorflow as tf 14 | 15 | 16 | class Compose(object): 17 | """Composes several transforms together.""" 18 | 19 | def __init__(self, transforms): 20 | self.transforms = transforms 21 | 22 | def __call__(self, example): 23 | for t in self.transforms: 24 | example = t(example) 25 | return example 26 | 27 | 28 | class RandomGray(object): 29 | def __init__(self, gray_ratio=0.25): 30 | self.gray_ratio = gray_ratio 31 | 32 | def __call__(self, img_sequence): 33 | def rgb_to_gray(): 34 | gray_images = tf.image.rgb_to_grayscale(img_sequence) 35 | return tf.concat([gray_images] * 3, axis=3) 36 | 37 | def identity(): 38 | return tf.identity(img_sequence) 39 | 40 | return tf.cond(tf.less(tf.random_uniform([], 0, 1), self.gray_ratio), rgb_to_gray, identity) 41 | 42 | 43 | class RandomStretch(object): 44 | def __init__(self, max_stretch=0.05, interpolation='bilinear'): 45 | self.max_stretch = max_stretch 46 | self.interpolation = interpolation 47 | 48 | def __call__(self, img): 49 | scale = 1.0 + tf.random_uniform([], -self.max_stretch, self.max_stretch) 50 | img_shape = tf.shape(img) 51 | ts = tf.to_int32(tf.round(tf.to_float(img_shape[:2]) * scale)) 52 | resize_method_map = {'bilinear': tf.image.ResizeMethod.BILINEAR, 53 | 'bicubic': tf.image.ResizeMethod.BICUBIC} 54 | return tf.image.resize_images(img, ts, method=resize_method_map[self.interpolation]) 55 | 56 | 57 | class CenterCrop(object): 58 | def __init__(self, size): 59 | if isinstance(size, numbers.Number): 60 | self.size = (int(size), int(size)) 61 | else: 62 | self.size = size 63 | 64 | def __call__(self, img): 65 | th, tw = self.size 66 | return tf.image.resize_image_with_crop_or_pad(img, th, tw) 67 | 68 | 69 | class RandomCrop(object): 70 | def __init__(self, size): 71 | if isinstance(size, numbers.Number): 72 | self.size = (int(size), int(size)) 73 | else: 74 | self.size = size 75 | 76 | def __call__(self, img): 77 | img_shape = tf.shape(img) 78 | th, tw = self.size 79 | 80 | y1 = tf.random_uniform([], 0, img_shape[0] - th, dtype=tf.int32) 81 | x1 = tf.random_uniform([], 0, img_shape[1] - tw, dtype=tf.int32) 82 | 83 | return tf.image.crop_to_bounding_box(img, y1, x1, th, tw) 84 | -------------------------------------------------------------------------------- /datasets/vid.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 
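# Indexing sketch: every item is one video reduced to `time_steps` (= 2) frame paths, an
# exemplar/instance pair whose temporal gap is bounded by `max_frame_dist`. Paths below
# are illustrative:
#
#   dataset = VID('data/train_imdb.pickle', max_frame_dist=100)
#   exemplar_path, instance_path = dataset[0]  # two utf-8 encoded image paths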
7 | 8 | """VID Dataset""" 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import pickle 15 | 16 | import numpy as np 17 | 18 | 19 | def downsample(n_in, n_out, max_frame_dist=1): 20 | # Get a list of frame distance between consecutive frames 21 | max_frame_dist = np.minimum(n_in, max_frame_dist) 22 | possible_frame_dist = range(1, max_frame_dist + 1) 23 | frame_dist = np.random.choice(possible_frame_dist, n_out - 1) 24 | end_to_start_frame_dist = np.sum(frame_dist) 25 | 26 | # Check frame dist boundary 27 | possible_max_start_idx = n_in - 1 - end_to_start_frame_dist 28 | if possible_max_start_idx < 0: 29 | n_extra = - possible_max_start_idx 30 | while n_extra > 0: 31 | for idx, dist in enumerate(frame_dist): 32 | if dist > 1: 33 | frame_dist[idx] = dist - 1 34 | n_extra -= 1 35 | if n_extra == 0: break 36 | 37 | # Get frame dist 38 | end_to_start_frame_dist = np.sum(frame_dist) 39 | possible_max_start_idx = n_in - 1 - end_to_start_frame_dist 40 | start_idx = np.random.choice(possible_max_start_idx + 1, 1) 41 | out_idxs = np.cumsum(np.concatenate((start_idx, frame_dist))) 42 | return out_idxs 43 | 44 | 45 | def upsample(n_in, n_out): 46 | n_more = n_out - n_in 47 | in_idxs = range(n_in) 48 | more_idxs = np.random.choice(in_idxs, n_more) 49 | out_idxs = sorted(list(in_idxs) + list(more_idxs)) 50 | return out_idxs 51 | 52 | 53 | class VID: 54 | def __init__(self, imdb_path, max_frame_dist, epoch_size=None): 55 | with open(imdb_path, 'rb') as f: 56 | imdb = pickle.load(f) 57 | 58 | self.videos = imdb['videos'] 59 | self.time_steps = 2 60 | self.max_frame_dist = max_frame_dist 61 | 62 | if epoch_size is None: 63 | self.epoch_size = len(self.videos) 64 | else: 65 | self.epoch_size = int(epoch_size) 66 | 67 | def __getitem__(self, index): 68 | img_ids = self.videos[index % len(self.videos)] 69 | n_frames = len(img_ids) 70 | 71 | if n_frames < self.time_steps: 72 | out_idxs = upsample(n_frames, self.time_steps) 73 | elif n_frames == self.time_steps: 74 | out_idxs = range(n_frames) 75 | else: 76 | out_idxs = downsample(n_frames, self.time_steps, self.max_frame_dist) 77 | 78 | video = [] 79 | for j, frame_idx in enumerate(out_idxs): 80 | img_path = img_ids[frame_idx] 81 | video.append(img_path.encode('utf-8')) 82 | return video 83 | 84 | def __len__(self): 85 | return self.epoch_size 86 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /benchmarks/OTB_Toolkit/scripts/bscripts/run_SA_Siam_Semantic.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 
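# Benchmark entry point: the OTB toolkit imports this module through bscripts/__init__.py
# and calls run_SA_Siam_Semantic(seq, rp, bSaveImage), expecting a dictionary with 'res'
# (a list of Rectangle results), 'type' ('rect') and 'fps'. Illustrative call, assuming a
# toolkit-provided `seq` object with s_frames, init_rect and len attributes:
#
#   result = run_SA_Siam_Semantic(seq, rp='./results', bSaveImage=False, epoch=30)
#   print(result['fps'], len(result['res']))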
9 | 10 | r"""Support integration with OTB benchmark""" 11 | 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | import logging 17 | import os 18 | import sys 19 | import time 20 | 21 | import tensorflow as tf 22 | 23 | sys.path.append(os.getcwd()) 24 | 25 | from configuration import LOG_DIR 26 | 27 | # Code root absolute path 28 | CODE_ROOT = './' 29 | 30 | # Checkpoint for evaluation 31 | CHECKPOINT = os.path.join(LOG_DIR, 'track_model_checkpoints', 'SA-Siam-Semantic', 'model.ckpt-{iter_ckpt}') 32 | 33 | sys.path.insert(0, CODE_ROOT) 34 | 35 | from utils.misc_utils import auto_select_gpu, load_cfgs 36 | from inference import inference_wrapper 37 | from inference.tracker import Tracker 38 | from utils.infer_utils import Rectangle 39 | 40 | # Set GPU 41 | os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu() 42 | logging.getLogger().setLevel(logging.INFO) 43 | 44 | 45 | def run_SA_Siam_Semantic(seq, rp, bSaveImage, epoch=30): 46 | iter_ckpt = epoch * 6650 - 1 47 | checkpoint_path = CHECKPOINT.format(iter_ckpt=iter_ckpt) 48 | logging.info('Evaluating {}...'.format(checkpoint_path)) 49 | 50 | # Read configurations from json 51 | model_config, _, track_config = load_cfgs(checkpoint_path) 52 | 53 | track_config['log_level'] = 0 # Skip verbose logging for speed 54 | 55 | # Build the inference graph. 56 | g = tf.Graph() 57 | with g.as_default(): 58 | model = inference_wrapper.InferenceWrapper() 59 | restore_fn = model.build_graph_from_config(model_config, track_config, checkpoint_path) 60 | g.finalize() 61 | 62 | gpu_options = tf.GPUOptions(allow_growth=True) 63 | sess_config = tf.ConfigProto(gpu_options=gpu_options) 64 | 65 | with tf.Session(graph=g, config=sess_config) as sess: 66 | # Load the model from checkpoint. 67 | restore_fn(sess) 68 | 69 | tracker = Tracker(model, model_config, track_config) 70 | 71 | tic = time.clock() 72 | frames = seq.s_frames 73 | init_rect = seq.init_rect 74 | x, y, width, height = init_rect # OTB format 75 | init_bb = Rectangle(x - 1, y - 1, width, height) 76 | 77 | trajectory_py = tracker.track(sess, init_bb, frames) 78 | trajectory = [Rectangle(val.x + 1, val.y + 1, val.width, val.height) for val in 79 | trajectory_py] # x, y add one to match OTB format 80 | duration = time.clock() - tic 81 | 82 | result = dict() 83 | result['res'] = trajectory 84 | result['type'] = 'rect' 85 | result['fps'] = round(seq.len / duration, 3) 86 | return result 87 | -------------------------------------------------------------------------------- /benchmarks/OTB_Toolkit/scripts/bscripts/run_SA_Siam_Appearance.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 
9 | 10 | r"""Support integration with OTB benchmark""" 11 | 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | import logging 17 | import os 18 | import sys 19 | import time 20 | 21 | import tensorflow as tf 22 | 23 | sys.path.append(os.getcwd()) 24 | 25 | from configuration import LOG_DIR 26 | 27 | # Code root absolute path 28 | CODE_ROOT = './' 29 | 30 | # Checkpoint for evaluation 31 | CHECKPOINT = os.path.join(LOG_DIR, 'track_model_checkpoints', 'SA-Siam-Appearance', 'model.ckpt-{iter_ckpt}') 32 | 33 | sys.path.insert(0, CODE_ROOT) 34 | 35 | from utils.misc_utils import auto_select_gpu, load_cfgs 36 | from inference import inference_wrapper 37 | from inference.tracker import Tracker 38 | from utils.infer_utils import Rectangle 39 | 40 | # Set GPU 41 | os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu() 42 | logging.getLogger().setLevel(logging.INFO) 43 | 44 | 45 | def run_SA_Siam_Appearance(seq, rp, bSaveImage, epoch=30): 46 | iter_ckpt = epoch * 6650 - 1 47 | checkpoint_path = CHECKPOINT.format(iter_ckpt=iter_ckpt) 48 | logging.info('Evaluating {}...'.format(checkpoint_path)) 49 | 50 | # Read configurations from json 51 | model_config, _, track_config = load_cfgs(checkpoint_path) 52 | 53 | track_config['log_level'] = 0 # Skip verbose logging for speed 54 | 55 | # Build the inference graph. 56 | g = tf.Graph() 57 | with g.as_default(): 58 | model = inference_wrapper.InferenceWrapper() 59 | restore_fn = model.build_graph_from_config(model_config, track_config, checkpoint_path) 60 | g.finalize() 61 | 62 | gpu_options = tf.GPUOptions(allow_growth=True) 63 | sess_config = tf.ConfigProto(gpu_options=gpu_options) 64 | 65 | with tf.Session(graph=g, config=sess_config) as sess: 66 | # Load the model from checkpoint. 67 | restore_fn(sess) 68 | 69 | tracker = Tracker(model, model_config, track_config) 70 | 71 | tic = time.clock() 72 | frames = seq.s_frames 73 | init_rect = seq.init_rect 74 | x, y, width, height = init_rect # OTB format 75 | init_bb = Rectangle(x - 1, y - 1, width, height) 76 | 77 | trajectory_py = tracker.track(sess, init_bb, frames) 78 | trajectory = [Rectangle(val.x + 1, val.y + 1, val.width, val.height) for val in 79 | trajectory_py] # x, y add one to match OTB format 80 | duration = time.clock() - tic 81 | 82 | result = dict() 83 | result['res'] = trajectory 84 | result['type'] = 'rect' 85 | result['fps'] = round(seq.len / duration, 3) 86 | return result 87 | -------------------------------------------------------------------------------- /benchmarks/OTB_Toolkit/scripts/bscripts/run_SA_Siam.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 
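# Combined SA-Siam inference: unlike the single-branch scripts, this one enables both
# branches in one graph and restores two checkpoints into it. Two savers are built with
# get_saver() (see utils/infer_utils.py), one excluding 'appearance' variables and one
# excluding 'semantic' variables, so the separately trained SA-Siam-Semantic and
# SA-Siam-Appearance checkpoints can each be loaded into the corresponding part of the
# joint model before tracking starts.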
9 | 10 | r"""Support integration with OTB benchmark""" 11 | 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | import logging 17 | import os 18 | import sys 19 | import time 20 | 21 | import tensorflow as tf 22 | 23 | sys.path.append(os.getcwd()) 24 | 25 | from configuration import LOG_DIR 26 | from utils.infer_utils import get_saver 27 | 28 | # Code root absolute path 29 | CODE_ROOT = './' 30 | 31 | # Checkpoint for evaluation 32 | CHECKPOINT_APPEARANCE = os.path.join(LOG_DIR, 'track_model_checkpoints', 'SA-Siam-Appearance', 'model.ckpt-{iter_ckpt}') 33 | CHECKPOINT_SEMANTIC = os.path.join(LOG_DIR, 'track_model_checkpoints', 'SA-Siam-Semantic', 'model.ckpt-{iter_ckpt}') 34 | CHECKPOINT_SA_SIAM = os.path.join(LOG_DIR, 'track_model_checkpoints', 'SA-Siam') 35 | 36 | sys.path.insert(0, CODE_ROOT) 37 | 38 | from utils.misc_utils import auto_select_gpu, load_cfgs 39 | from inference import inference_wrapper 40 | from inference.tracker import Tracker 41 | from utils.infer_utils import Rectangle 42 | 43 | # Set GPU 44 | os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu() 45 | logging.getLogger().setLevel(logging.INFO) 46 | 47 | 48 | def run_SA_Siam(seq, rp, bSaveImage, epoch=30): 49 | iter_ckpt = epoch * 6650 - 1 50 | checkpoint_appearance_path = CHECKPOINT_APPEARANCE.format(iter_ckpt=iter_ckpt) 51 | logging.info('Evaluating {}...'.format(checkpoint_appearance_path)) 52 | checkpoint_semantic_path = CHECKPOINT_SEMANTIC.format(iter_ckpt=iter_ckpt) 53 | logging.info('Evaluating {}...'.format(checkpoint_semantic_path)) 54 | 55 | # Read configurations from json 56 | model_config, _, track_config = load_cfgs(CHECKPOINT_SA_SIAM) 57 | 58 | track_config['log_level'] = 0 # Skip verbose logging for speed 59 | 60 | # Build the inference graph. 61 | g = tf.Graph() 62 | with g.as_default(): 63 | model = inference_wrapper.InferenceWrapper() 64 | model.build_model(model_config, track_config) 65 | saver_loader_semantic = get_saver('', removes=[':0', '_semantic'], excepts=['appearance', 'State']) 66 | saver_loader_appearance = get_saver('', removes=[':0', '_appearance'], excepts=['semantic', 'State']) 67 | g.finalize() 68 | 69 | gpu_options = tf.GPUOptions(allow_growth=True) 70 | sess_config = tf.ConfigProto(gpu_options=gpu_options) 71 | 72 | with tf.Session(graph=g, config=sess_config) as sess: 73 | # Load the model from checkpoint. 74 | # restore_fn(sess) 75 | saver_loader_semantic.restore(sess, checkpoint_semantic_path) 76 | saver_loader_appearance.restore(sess, checkpoint_appearance_path) 77 | 78 | tracker = Tracker(model, model_config, track_config) 79 | 80 | tic = time.clock() 81 | frames = seq.s_frames 82 | init_rect = seq.init_rect 83 | x, y, width, height = init_rect # OTB format 84 | init_bb = Rectangle(x - 1, y - 1, width, height) 85 | 86 | trajectory_py = tracker.track(sess, init_bb, frames) 87 | trajectory = [Rectangle(val.x + 1, val.y + 1, val.width, val.height) for val in 88 | trajectory_py] # x, y add one to match OTB format 89 | duration = time.clock() - tic 90 | 91 | result = dict() 92 | result['res'] = trajectory 93 | result['type'] = 'rect' 94 | result['fps'] = round(seq.len / duration, 3) 95 | return result 96 | -------------------------------------------------------------------------------- /datasets/dataloader.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import logging 13 | logging.getLogger().setLevel(logging.INFO) 14 | 15 | import tensorflow as tf 16 | 17 | from datasets.sampler import Sampler 18 | from datasets.transforms import Compose, RandomGray, RandomCrop, CenterCrop, RandomStretch 19 | from datasets.vid import VID 20 | from utils.misc_utils import get 21 | 22 | 23 | class DataLoader(object): 24 | def __init__(self, config, is_training): 25 | self.config = config 26 | self.is_training = is_training 27 | 28 | preprocess_name = get(config, 'preprocessing_name', None) 29 | logging.info('preproces -- {}'.format(preprocess_name)) 30 | 31 | if preprocess_name == 'siamese_fc_color': 32 | self.v_transform = None 33 | # TODO: use a single operation (tf.image.crop_and_resize) to achieve all transformations ? 34 | self.z_transform = Compose([RandomStretch(), 35 | CenterCrop((255 - 8, 255 - 8)), 36 | RandomCrop(255 - 2 * 8), 37 | CenterCrop((255 - 2 * 8, 255 - 2 * 8))]) 38 | self.x_transform = Compose([RandomStretch(), 39 | CenterCrop((255 - 8, 255 - 8)), 40 | RandomCrop(255 - 2 * 8), ]) 41 | elif preprocess_name == 'siamese_fc_gray': 42 | self.v_transform = RandomGray() 43 | self.z_transform = Compose([RandomStretch(), 44 | CenterCrop((255 - 8, 255 - 8)), 45 | RandomCrop(255 - 2 * 8), 46 | CenterCrop((255 - 2 * 8, 255 - 2 * 8))]) 47 | self.x_transform = Compose([RandomStretch(), 48 | CenterCrop((255 - 8, 255 - 8)), 49 | RandomCrop(255 - 2 * 8), ]) 50 | elif preprocess_name == 'None': 51 | self.v_transform = None 52 | self.z_transform = CenterCrop((255, 255)) 53 | self.x_transform = CenterCrop((255, 255)) 54 | else: 55 | raise ValueError('Preprocessing name {} was not recognized.'.format(preprocess_name)) 56 | 57 | self.dataset_py = VID(config['input_imdb'], config['max_frame_dist']) 58 | self.sampler = Sampler(self.dataset_py, shuffle=is_training) 59 | 60 | def build(self): 61 | self.build_dataset() 62 | self.build_iterator() 63 | 64 | def build_dataset(self): 65 | def sample_generator(): 66 | for video_id in self.sampler: 67 | sample = self.dataset_py[video_id] 68 | yield sample 69 | 70 | def transform_fn(video): 71 | exemplar_file = tf.read_file(video[0]) 72 | instance_file = tf.read_file(video[1]) 73 | exemplar_image = tf.image.decode_jpeg(exemplar_file, channels=3, dct_method="INTEGER_ACCURATE") 74 | instance_image = tf.image.decode_jpeg(instance_file, channels=3, dct_method="INTEGER_ACCURATE") 75 | 76 | if self.v_transform is not None: 77 | video = tf.stack([exemplar_image, instance_image]) 78 | video = self.v_transform(video) 79 | exemplar_image = video[0] 80 | instance_image = video[1] 81 | 82 | if self.z_transform is not None: 83 | exemplar_image = self.z_transform(exemplar_image) 84 | 85 | if self.x_transform is not None: 86 | instance_image = self.x_transform(instance_image) 87 | 88 | return exemplar_image, instance_image 89 | 90 | dataset = tf.data.Dataset.from_generator(sample_generator, 91 | output_types=(tf.string), 92 | output_shapes=(tf.TensorShape([2]))) 93 | dataset = dataset.map(transform_fn, num_parallel_calls=self.config['prefetch_threads']) 94 | dataset = dataset.prefetch(self.config['prefetch_capacity']) 95 | dataset = dataset.repeat() 96 | dataset = dataset.batch(self.config['batch_size']) 97 | 
self.dataset_tf = dataset 98 | 99 | def build_iterator(self): 100 | self.iterator = self.dataset_tf.make_one_shot_iterator() 101 | 102 | def get_one_batch(self): 103 | return self.iterator.get_next() 104 | -------------------------------------------------------------------------------- /scripts/preprocess_VID_data.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import os 13 | import os.path as osp 14 | import sys 15 | import xml.etree.ElementTree as ET 16 | from glob import glob 17 | from multiprocessing.pool import ThreadPool 18 | 19 | import cv2 20 | from cv2 import imread, imwrite 21 | 22 | CURRENT_DIR = osp.dirname(__file__) 23 | ROOT_DIR = osp.join(CURRENT_DIR, '..') 24 | sys.path.append(ROOT_DIR) 25 | 26 | from utils.infer_utils import get_crops, Rectangle, convert_bbox_format 27 | from utils.misc_utils import mkdir_p 28 | 29 | 30 | def get_track_save_directory(save_dir, split, subdir, video): 31 | subdir_map = {'ILSVRC2015_VID_train_0000': 'a', 32 | 'ILSVRC2015_VID_train_0001': 'b', 33 | 'ILSVRC2015_VID_train_0002': 'c', 34 | 'ILSVRC2015_VID_train_0003': 'd', 35 | '': 'e'} 36 | return osp.join(save_dir, 'Data', 'VID', split, subdir_map[subdir], video) 37 | 38 | 39 | def process_split(root_dir, save_dir, split, subdir='', ): 40 | data_dir = osp.join(root_dir, 'Data', 'VID', split) 41 | anno_dir = osp.join(root_dir, 'Annotations', 'VID', split, subdir) 42 | video_names = os.listdir(anno_dir) 43 | 44 | for idx, video in enumerate(video_names): 45 | print('{split}-{subdir} ({idx}/{total}): Processing {video}...'.format(split=split, subdir=subdir, 46 | idx=idx, total=len(video_names), 47 | video=video)) 48 | video_path = osp.join(anno_dir, video) 49 | xml_files = glob(osp.join(video_path, '*.xml')) 50 | 51 | for xml in xml_files: 52 | tree = ET.parse(xml) 53 | root = tree.getroot() 54 | 55 | folder = root.find('folder').text 56 | filename = root.find('filename').text 57 | 58 | # Read image 59 | img_file = osp.join(data_dir, folder, filename + '.JPEG') 60 | img = None 61 | 62 | # Get all object bounding boxes 63 | bboxs = [] 64 | for object in root.iter('object'): 65 | bbox = object.find('bndbox') 66 | xmax = float(bbox.find('xmax').text) 67 | xmin = float(bbox.find('xmin').text) 68 | ymax = float(bbox.find('ymax').text) 69 | ymin = float(bbox.find('ymin').text) 70 | width = xmax - xmin + 1 71 | height = ymax - ymin + 1 72 | bboxs.append([xmin, ymin, width, height]) 73 | 74 | for idx, object in enumerate(root.iter('object')): 75 | id = object.find('trackid').text 76 | class_name = object.find('name').text 77 | 78 | track_save_dir = get_track_save_directory(save_dir, 'train', subdir, video) 79 | mkdir_p(track_save_dir) 80 | savename = osp.join(track_save_dir, '{}.{:02d}.crop.x.jpg'.format(filename, int(id))) 81 | if osp.isfile(savename): continue # skip existing images 82 | 83 | if img is None: 84 | img = imread(img_file) 85 | 86 | # Get crop 87 | target_box = convert_bbox_format(Rectangle(*bboxs[idx]), 'center-based') 88 | crop, _ = get_crops(img, target_box, 89 | size_z=127, size_x=255, 90 | context_amount=0.5, ) 91 | 92 | imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90]) 93 | 94 | 95 | if __name__ == '__main__': 96 | 
vid_dir = osp.join(ROOT_DIR, 'data/ILSVRC2015') 97 | 98 | # Or, you could save the actual curated data to a disk with sufficient space 99 | # then create a soft link in `data/ILSVRC2015-VID-Curation` 100 | save_dir = 'data/ILSVRC2015-VID-Curation' 101 | 102 | pool = ThreadPool(processes=5) 103 | 104 | one_work = lambda a, b: process_split(vid_dir, save_dir, a, b) 105 | 106 | results = [] 107 | results.append(pool.apply_async(one_work, ['val', ''])) 108 | results.append(pool.apply_async(one_work, ['train', 'ILSVRC2015_VID_train_0000'])) 109 | results.append(pool.apply_async(one_work, ['train', 'ILSVRC2015_VID_train_0001'])) 110 | results.append(pool.apply_async(one_work, ['train', 'ILSVRC2015_VID_train_0002'])) 111 | results.append(pool.apply_async(one_work, ['train', 'ILSVRC2015_VID_train_0003'])) 112 | ans = [res.get() for res in results] 113 | -------------------------------------------------------------------------------- /scripts/build_VID2015_imdb.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | 8 | """Save the paths of crops from the ImageNet VID 2015 dataset in pickle format""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import glob 14 | import os 15 | import os.path as osp 16 | import pickle 17 | import sys 18 | 19 | import numpy as np 20 | import tensorflow as tf 21 | 22 | CURRENT_DIR = osp.dirname(__file__) 23 | sys.path.append(osp.join(CURRENT_DIR, '..')) 24 | 25 | from utils.misc_utils import sort_nicely 26 | 27 | 28 | class Config: 29 | ### Dataset 30 | # directory where curated dataset is stored 31 | dataset_dir = 'data/ILSVRC2015-VID-Curation' 32 | save_dir = 'data/' 33 | 34 | # percentage of all videos for validation 35 | validation_ratio = 0.1 36 | 37 | 38 | class DataIter: 39 | """Container for dataset of one iteration""" 40 | pass 41 | 42 | 43 | class Dataset: 44 | def __init__(self, config): 45 | self.config = config 46 | 47 | def _get_unique_trackids(self, video_dir): 48 | """Get unique trackids within video_dir""" 49 | x_image_paths = glob.glob(video_dir + '/*.crop.x.jpg') 50 | trackids = [os.path.basename(path).split('.')[1] for path in x_image_paths] 51 | unique_trackids = set(trackids) 52 | return unique_trackids 53 | 54 | def dataset_iterator(self, video_dirs): 55 | video_num = len(video_dirs) 56 | iter_size = 150 57 | iter_num = int(np.ceil(video_num / float(iter_size))) 58 | for iter_ in range(iter_num): 59 | iter_start = iter_ * iter_size 60 | iter_videos = video_dirs[iter_start: iter_start + iter_size] 61 | 62 | data_iter = DataIter() 63 | num_videos = len(iter_videos) 64 | instance_videos = [] 65 | for index in range(num_videos): 66 | print('Processing {}/{}...'.format(iter_start + index, video_num)) 67 | video_dir = iter_videos[index] 68 | trackids = self._get_unique_trackids(video_dir) 69 | 70 | for trackid in trackids: 71 | instance_image_paths = glob.glob(video_dir + '/*' + trackid + '.crop.x.jpg') 72 | 73 | # sort image paths by frame number 74 | instance_image_paths = sort_nicely(instance_image_paths) 75 | 76 | # get image absolute path 77 | instance_image_paths = [os.path.abspath(p) for p in instance_image_paths] 78 | instance_videos.append(instance_image_paths) 79 | data_iter.num_videos = len(instance_videos) 80 | 
data_iter.instance_videos = instance_videos 81 | yield data_iter 82 | 83 | def get_all_video_dirs(self): 84 | ann_dir = os.path.join(self.config.dataset_dir, 'Data', 'VID') 85 | all_video_dirs = [] 86 | 87 | # We have already combined all training and validation videos in ILSVRC2015 and put them in the `train` directory. 88 | # The file structure is like: 89 | # train 90 | # |- a 91 | # |- b 92 | # |_ c 93 | # |- ILSVRC2015_train_00024001 94 | # |- ILSVRC2015_train_00024002 95 | # |_ ILSVRC2015_train_00024003 96 | # |- 000045.00.crop.x.jpg 97 | # |- 000046.00.crop.x.jpg 98 | # |- ... 99 | train_dirs = os.listdir(os.path.join(ann_dir, 'train')) 100 | for dir_ in train_dirs: 101 | train_sub_dir = os.path.join(ann_dir, 'train', dir_) 102 | video_names = os.listdir(train_sub_dir) 103 | train_video_dirs = [os.path.join(train_sub_dir, name) for name in video_names] 104 | all_video_dirs = all_video_dirs + train_video_dirs 105 | 106 | return all_video_dirs 107 | 108 | 109 | def main(): 110 | # Get the data. 111 | config = Config() 112 | dataset = Dataset(config) 113 | all_video_dirs = dataset.get_all_video_dirs() 114 | num_validation = int(len(all_video_dirs) * config.validation_ratio) 115 | 116 | ### validation 117 | validation_dirs = all_video_dirs[:num_validation] 118 | validation_imdb = dict() 119 | validation_imdb['videos'] = [] 120 | for i, data_iter in enumerate(dataset.dataset_iterator(validation_dirs)): 121 | validation_imdb['videos'] += data_iter.instance_videos 122 | validation_imdb['n_videos'] = len(validation_imdb['videos']) 123 | validation_imdb['image_shape'] = (255, 255, 3) 124 | 125 | ### train 126 | train_dirs = all_video_dirs[num_validation:] 127 | train_imdb = dict() 128 | train_imdb['videos'] = [] 129 | for i, data_iter in enumerate(dataset.dataset_iterator(train_dirs)): 130 | train_imdb['videos'] += data_iter.instance_videos 131 | train_imdb['n_videos'] = len(train_imdb['videos']) 132 | train_imdb['image_shape'] = (255, 255, 3) 133 | 134 | if not tf.gfile.IsDirectory(config.save_dir): 135 | tf.logging.info('Creating training directory: %s', config.save_dir) 136 | tf.gfile.MakeDirs(config.save_dir) 137 | 138 | with open(os.path.join(config.save_dir, 'validation_imdb.pickle'), 'wb') as f: 139 | pickle.dump(validation_imdb, f) 140 | with open(os.path.join(config.save_dir, 'train_imdb.pickle'), 'wb') as f: 141 | pickle.dump(train_imdb, f) 142 | 143 | 144 | if __name__ == '__main__': 145 | main() 146 | -------------------------------------------------------------------------------- /configuration.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright @ 2017 bily Huazhong University of Science and Technology 5 | # 6 | 7 | """Default configurations of model specification, training and tracking 8 | 9 | For most of the time, DO NOT modify the configurations within this file. 10 | Use the configurations here as the default configurations and only update 11 | them following the examples in the `experiments` directory. 
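For example, experiments/train-semantic-network.py overrides these defaults through the
experiment object `ex` exported by train_siamese_model.py (paths shortened here):

    ex.run(config_updates={'train_config': {'train_dir': '<LOG_DIR>/track_model_checkpoints/SA-Siam-Semantic'},
                           'track_config': {'log_dir': '<LOG_DIR>/track_model_inference/SA-Siam-Semantic'},
                           'model_config': {'sa_siam_config': {'en_semantic': True}}},
           options={'--name': 'SA-Siam-Semantic', '--force': True, '--enforce_clean': False})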
12 | """ 13 | 14 | from __future__ import absolute_import 15 | from __future__ import division 16 | from __future__ import print_function 17 | 18 | import os.path as osp 19 | 20 | WORKSPACE_DIR = './' 21 | LOG_DIR = osp.join(WORKSPACE_DIR, 'Logs/SA-Siam') # where checkpoints, logs are saved 22 | DATA_DIR = osp.join(WORKSPACE_DIR, 'data') 23 | RUN_NAME = '' # identifier of the experiment 24 | OTB_DATA_DIR = '/data/anfeng/tracking/data/OTB/' 25 | 26 | MODEL_CONFIG = { 27 | 'z_image_size': 127, # Exemplar image size 28 | 29 | 'embed_config': {'embedding_name': 'sa_siam', 30 | 'embedding_checkpoint_file': None, # mat file path of the pretrained embedding model. 31 | 'train_embedding': True, 32 | 'init_method': None, 33 | 'use_bn': True, 34 | 'bn_scale': True, 35 | 'bn_momentum': 0.05, 36 | 'bn_epsilon': 1e-6, 37 | 'weight_decay': 5e-4, 38 | 'stride': 8, }, 39 | 'sa_siam_config':{'en_appearance': False, 40 | 'en_semantic': False, 41 | 'n_out': 256, 42 | 'all_combine_layers_appearance': {'conv5':1.0}, 43 | 'all_combine_layers_semantic': {'conv5':1.0, 'conv4':0.1}, 44 | 'sz_conv5_z': 6, 45 | 'en_semantic_att': True, 46 | }, 47 | 48 | 'adjust_response_config': {'train_bias': True, 49 | 'scale': 1e-3, }, 50 | } 51 | 52 | TRAIN_CONFIG = { 53 | 'train_dir': osp.join(LOG_DIR, 'track_model_checkpoints', RUN_NAME), 54 | 'caffenet_dir': osp.join(DATA_DIR, 'caffenet.npy'), 55 | 56 | 'seed': 0, # fix seed for reproducing experiments 57 | 58 | 'train_data_config': {'input_imdb': osp.join(DATA_DIR, 'train_imdb.pickle'), 59 | 'preprocessing_name': 'siamese_fc_color', 60 | 'num_examples_per_epoch': 5.32e4, 61 | 'epoch': 30, 62 | 'batch_size': 8, 63 | 'max_frame_dist': 100, # Maximum distance between any two random frames draw from videos. 64 | 'prefetch_threads': 4, 65 | 'prefetch_capacity': 15 * 8, }, # The maximum elements number in the data loading queue 66 | 67 | 'validation_data_config': {'input_imdb': osp.join(DATA_DIR, 'validation_imdb.pickle'), 68 | 'preprocessing_name': 'None', 69 | 'batch_size': 8, 70 | 'max_frame_dist': 100, # Maximum distance between any two random frames draw from videos. 71 | 'prefetch_threads': 1, 72 | 'prefetch_capacity': 15 * 8, }, # The maximum elements number in the data loading queue 73 | 74 | # Configurations for generating groundtruth maps 75 | 'gt_config': {'rPos': 16, 76 | 'rNeg': 0, }, 77 | 78 | # Optimizer for training the model. 79 | # 'optimizer_config': {'optimizer': 'MOMENTUM', # SGD and MOMENTUM are supported 80 | # 'momentum': 0.9, 81 | # 'use_nesterov': False, }, 82 | 'optimizer_config': {'optimizer': 'SGD'}, 83 | 84 | # Learning rate configs 85 | 'lr_config': {'policy': 'exponential', 86 | 'initial_lr': 0.01, 87 | 'num_epochs_per_decay': 25, 88 | 'lr_decay_factor': 0.1, 89 | 'staircase': True, }, 90 | 91 | # If not None, clip gradients to this value. 92 | 'clip_gradients': None, 93 | 94 | # Frequency at which loss and global step are logged 95 | 'log_every_n_steps': 10, 96 | 97 | # Frequency to save model 98 | 'save_model_every_n_step': 5.32e4 // 8, # save model every epoch 99 | 100 | # How many model checkpoints to keep. No limit if None. 101 | 'max_checkpoints_to_keep': None, 102 | } 103 | 104 | TRACK_CONFIG = { 105 | # Directory for saving log files during tracking. 106 | 'log_dir': osp.join(LOG_DIR, 'track_model_inference', RUN_NAME), 107 | 108 | # Logging level of inference, use 1 for detailed inspection. 0 for speed. 
109 | 'log_level': 0, 110 | 111 | 'x_image_size': 255, # Search image size during tracking 112 | 113 | # Configurations for upsampling score maps 114 | 'upsample_method': 'bicubic', 115 | 'upsample_factor': 16, 116 | 117 | # Configurations for searching scales 118 | 'num_scales': 3, # Number of scales to search 119 | 'scale_step': 1.0375, # Scale changes between different scale search 120 | 'scale_damp': 0.59, # Damping factor for scale update 121 | 'scale_penalty': 0.9745, # Score penalty for scale change 122 | 123 | # Configurations for penalizing large displacement from the center 124 | 'window_influence': 0.176, 125 | 126 | 'include_first': False, # If track the first frame 127 | } 128 | -------------------------------------------------------------------------------- /utils/misc_utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | 8 | """Miscellaneous Utilities.""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import errno 14 | import json 15 | import logging 16 | logging.getLogger().setLevel(logging.INFO) 17 | import os 18 | import re 19 | import sys 20 | from os import path as osp 21 | 22 | try: 23 | import pynvml # nvidia-ml provides utility for NVIDIA management 24 | 25 | HAS_NVML = True 26 | except: 27 | HAS_NVML = False 28 | 29 | 30 | def auto_select_gpu(): 31 | """Select gpu which has largest free memory""" 32 | if HAS_NVML: 33 | pynvml.nvmlInit() 34 | deviceCount = pynvml.nvmlDeviceGetCount() 35 | largest_free_mem = 0 36 | largest_free_idx = 0 37 | for i in range(deviceCount): 38 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 39 | info = pynvml.nvmlDeviceGetMemoryInfo(handle) 40 | if info.free > largest_free_mem: 41 | largest_free_mem = info.free 42 | largest_free_idx = i 43 | pynvml.nvmlShutdown() 44 | largest_free_mem = largest_free_mem / 1024. / 1024. # Convert to MB 45 | 46 | idx_to_gpu_id = {} 47 | for i in range(deviceCount): 48 | idx_to_gpu_id[i] = '{}'.format(i) 49 | 50 | gpu_id = idx_to_gpu_id[largest_free_idx] 51 | logging.info('Using largest free memory GPU {} with free memory {}MB'.format(gpu_id, largest_free_mem)) 52 | return gpu_id 53 | else: 54 | logging.info('nvidia-ml-py is not installed, automatically select gpu is disabled!') 55 | return '0' 56 | 57 | 58 | def get_center(x): 59 | return (x - 1.) / 2. 60 | 61 | 62 | def get(config, key, default): 63 | """Get value in config by key, use default if key is not set 64 | 65 | This little function is useful for dynamical experimental settings. 66 | For example, we can add a new configuration without worrying compatibility with older versions. 
67 | You can also achieve this by just calling config.get(key, default), but add a warning is even better : ) 68 | """ 69 | val = config.get(key) 70 | if val is None: 71 | logging.warning('{} is not explicitly specified, using default value: {}'.format(key, default)) 72 | val = default 73 | return val 74 | 75 | 76 | def mkdir_p(path): 77 | """mimic the behavior of mkdir -p in bash""" 78 | try: 79 | os.makedirs(path) 80 | except OSError as exc: # Python >2.5 81 | if exc.errno == errno.EEXIST and os.path.isdir(path): 82 | pass 83 | else: 84 | raise 85 | 86 | 87 | def tryfloat(s): 88 | try: 89 | return float(s) 90 | except: 91 | return s 92 | 93 | 94 | def alphanum_key(s): 95 | """ Turn a string into a list of string and number chunks. 96 | "z23a" -> ["z", 23, "a"] 97 | """ 98 | return [tryfloat(c) for c in re.split('([0-9.]+)', s)] 99 | 100 | 101 | def sort_nicely(l): 102 | """Sort the given list in the way that humans expect.""" 103 | return sorted(l, key=alphanum_key) 104 | 105 | 106 | class Tee(object): 107 | """Mimic the behavior of tee in bash 108 | 109 | From: http://web.archive.org/web/20141016185743/https://mail.python.org/pipermail/python-list/2007-May/460639.html 110 | Usage: 111 | tee=Tee('logfile', 'w') 112 | print 'abcdefg' 113 | print 'another line' 114 | tee.close() 115 | print 'screen only' 116 | del tee # should do nothing 117 | """ 118 | 119 | def __init__(self, name, mode): 120 | self.file = open(name, mode) 121 | self.stdout = sys.stdout 122 | sys.stdout = self 123 | 124 | def close(self): 125 | if self.stdout is not None: 126 | sys.stdout = self.stdout 127 | self.stdout = None 128 | if self.file is not None: 129 | self.file.close() 130 | self.file = None 131 | 132 | def write(self, data): 133 | self.file.write(data) 134 | self.stdout.write(data) 135 | 136 | def flush(self): 137 | self.file.flush() 138 | self.stdout.flush() 139 | 140 | def __del__(self): 141 | self.close() 142 | 143 | 144 | def save_cfgs(train_dir, model_config, train_config, track_config): 145 | """Save all configurations in JSON format for future reference""" 146 | with open(osp.join(train_dir, 'model_config.json'), 'w') as f: 147 | json.dump(model_config, f, indent=2) 148 | with open(osp.join(train_dir, 'train_config.json'), 'w') as f: 149 | json.dump(train_config, f, indent=2) 150 | with open(osp.join(train_dir, 'track_config.json'), 'w') as f: 151 | json.dump(track_config, f, indent=2) 152 | 153 | 154 | def load_cfgs(checkpoint): 155 | if osp.isdir(checkpoint): 156 | train_dir = checkpoint 157 | else: 158 | train_dir = osp.dirname(checkpoint) 159 | 160 | with open(osp.join(train_dir, 'model_config.json'), 'r') as f: 161 | model_config = json.load(f) 162 | with open(osp.join(train_dir, 'train_config.json'), 'r') as f: 163 | train_config = json.load(f) 164 | with open(osp.join(train_dir, 'track_config.json'), 'r') as f: 165 | track_config = json.load(f) 166 | return model_config, train_config, track_config 167 | 168 | shape_of = lambda x: x.get_shape().as_list() 169 | same_hw = lambda a: (shape_of(a)[1] == shape_of(a)[2]) -------------------------------------------------------------------------------- /metrics/track_metrics.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 
7 | 8 | 9 | import tensorflow as tf 10 | from tensorflow.python.framework import ops 11 | from tensorflow.python.ops.metrics_impl import _confusion_matrix_at_thresholds 12 | 13 | 14 | def _auc(labels, predictions, weights=None, num_thresholds=200, 15 | metrics_collections=None, updates_collections=None, 16 | curve='ROC', name=None, summation_method='trapezoidal'): 17 | """Computes the approximate AUC via a Riemann sum. 18 | 19 | Modified version of tf.metrics.auc. Add support for AUC computation 20 | of the recall curve. 21 | """ 22 | with tf.variable_scope( 23 | name, 'auc', (labels, predictions, weights)): 24 | if curve != 'ROC' and curve != 'PR' and curve != 'R': 25 | raise ValueError('curve must be either ROC, PR or R, %s unknown' % 26 | (curve)) 27 | kepsilon = 1e-7 # to account for floating point imprecisions 28 | thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) 29 | for i in range(num_thresholds - 2)] 30 | thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] 31 | 32 | values, update_ops = _confusion_matrix_at_thresholds( 33 | labels, predictions, thresholds, weights) 34 | 35 | # Add epsilons to avoid dividing by 0. 36 | epsilon = 1.0e-6 37 | 38 | def compute_auc(tp, fn, tn, fp, name): 39 | """Computes the roc-auc or pr-auc based on confusion counts.""" 40 | rec = tf.div(tp + epsilon, tp + fn + epsilon) 41 | if curve == 'ROC': 42 | fp_rate = tf.div(fp, fp + tn + epsilon) 43 | x = fp_rate 44 | y = rec 45 | elif curve == 'R': # recall auc 46 | x = tf.linspace(1., 0., num_thresholds) 47 | y = rec 48 | else: # curve == 'PR'. 49 | prec = tf.div(tp + epsilon, tp + fp + epsilon) 50 | x = rec 51 | y = prec 52 | if summation_method == 'trapezoidal': 53 | return tf.reduce_sum( 54 | tf.multiply(x[:num_thresholds - 1] - x[1:], 55 | (y[:num_thresholds - 1] + y[1:]) / 2.), 56 | name=name) 57 | elif summation_method == 'minoring': 58 | return tf.reduce_sum( 59 | tf.multiply(x[:num_thresholds - 1] - x[1:], 60 | tf.minimum(y[:num_thresholds - 1], y[1:])), 61 | name=name) 62 | elif summation_method == 'majoring': 63 | return tf.reduce_sum( 64 | tf.multiply(x[:num_thresholds - 1] - x[1:], 65 | tf.maximum(y[:num_thresholds - 1], y[1:])), 66 | name=name) 67 | else: 68 | raise ValueError('Invalid summation_method: %s' % summation_method) 69 | 70 | # sum up the areas of all the trapeziums 71 | auc_value = compute_auc( 72 | values['tp'], values['fn'], values['tn'], values['fp'], 'value') 73 | update_op = compute_auc( 74 | update_ops['tp'], update_ops['fn'], update_ops['tn'], update_ops['fp'], 75 | 'update_op') 76 | 77 | if metrics_collections: 78 | ops.add_to_collections(metrics_collections, auc_value) 79 | 80 | if updates_collections: 81 | ops.add_to_collections(updates_collections, update_op) 82 | 83 | return auc_value, update_op 84 | 85 | 86 | def get_center_index(response): 87 | """Get the index of the center in the response map""" 88 | shape = tf.shape(response) 89 | c1 = tf.to_int32((shape[1] - 1) / 2) 90 | c2 = tf.to_int32((shape[2] - 1) / 2) 91 | return c1, c2 92 | 93 | 94 | def center_score_error(response): 95 | """Center score error. 96 | 97 | The error is low when the center of the response map is classified as target. 
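    Concretely, this is the streaming fraction of examples whose response value at the
    center location is negative, i.e. whose true-target position is scored as background.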
98 | """ 99 | with tf.name_scope('CS-err'): 100 | r, c = get_center_index(response) 101 | center_score = response[:, r, c] 102 | mean, update_op = tf.metrics.mean(tf.to_float(center_score < 0)) 103 | with tf.control_dependencies([update_op]): 104 | mean = tf.identity(mean) 105 | return mean 106 | 107 | 108 | def get_maximum_index(response): 109 | """Get the index of the maximum value in the response map""" 110 | response_shape = response.get_shape().as_list() 111 | response_spatial_size = response_shape[-2:] # e.g. [29, 29] 112 | length = response_spatial_size[0] * response_spatial_size[1] 113 | 114 | # Get maximum response index (note index starts from zero) 115 | ind_max = tf.argmax(tf.reshape(response, [-1, length]), 1) 116 | ind_row = tf.div(ind_max, response_spatial_size[1]) 117 | ind_col = tf.mod(ind_max, response_spatial_size[1]) 118 | return ind_row, ind_col 119 | 120 | 121 | def center_dist_error(response): 122 | """Center distance error. 123 | 124 | The error is low when the maximum response is at the center of the response map. 125 | """ 126 | with tf.name_scope('CD-err'): 127 | radius_in_pixel = 50. 128 | total_stride = 8. 129 | num_thresholds = 100 130 | radius_in_response = radius_in_pixel / total_stride 131 | 132 | gt_r, gt_c = get_center_index(response) 133 | max_r, max_c = get_maximum_index(response) 134 | gt_r = tf.to_float(gt_r) 135 | gt_c = tf.to_float(gt_c) 136 | max_r = tf.to_float(max_r) 137 | max_c = tf.to_float(max_c) 138 | distances = tf.sqrt((gt_r - max_r) ** 2 + (gt_c - max_c) ** 2) 139 | 140 | # We cast distances as prediction accuracies in the range [0, 1] where 0 means fail and 141 | # 1 means success. In this way, we can readily use streaming_auc to compute area 142 | # under curve. 143 | dist_norm = distances / radius_in_response 144 | dist_norm = tf.minimum(dist_norm, 1.) 145 | predictions = 1. - dist_norm 146 | labels = tf.ones_like(predictions) 147 | 148 | auc, update_op = _auc(labels, predictions, num_thresholds=num_thresholds, curve='R') 149 | with tf.control_dependencies([update_op]): 150 | err = 1. - auc 151 | return err 152 | -------------------------------------------------------------------------------- /utils/infer_utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 
7 | 8 | """ 9 | Inference Utilities 10 | """ 11 | 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | import collections 17 | 18 | import numpy as np 19 | import tensorflow as tf 20 | from cv2 import resize 21 | import logging 22 | 23 | from utils.misc_utils import get_center 24 | 25 | Rectangle = collections.namedtuple('Rectangle', ['x', 'y', 'width', 'height']) 26 | 27 | 28 | def im2rgb(im): 29 | if len(im.shape) != 3: 30 | im = np.stack([im, im, im], -1) 31 | return im 32 | 33 | 34 | def convert_bbox_format(bbox, to): 35 | x, y, target_width, target_height = bbox.x, bbox.y, bbox.width, bbox.height 36 | if to == 'top-left-based': 37 | x -= get_center(target_width) 38 | y -= get_center(target_height) 39 | elif to == 'center-based': 40 | y += get_center(target_height) 41 | x += get_center(target_width) 42 | else: 43 | raise ValueError("Bbox format: {} was not recognized".format(to)) 44 | return Rectangle(x, y, target_width, target_height) 45 | 46 | 47 | def get_exemplar_images(images, exemplar_size, targets_pos=None): 48 | """Crop exemplar image from input images""" 49 | with tf.name_scope('get_exemplar_image'): 50 | batch_size, x_height, x_width = images.get_shape().as_list()[:3] 51 | z_height, z_width = exemplar_size 52 | 53 | if targets_pos is None: 54 | target_pos_single = [[get_center(x_height), get_center(x_width)]] 55 | targets_pos_ = tf.tile(target_pos_single, [batch_size, 1]) 56 | else: 57 | targets_pos_ = targets_pos 58 | 59 | # convert to top-left corner based coordinates 60 | top = tf.to_int32(tf.round(targets_pos_[:, 0] - get_center(z_height))) 61 | bottom = tf.to_int32(top + z_height) 62 | left = tf.to_int32(tf.round(targets_pos_[:, 1] - get_center(z_width))) 63 | right = tf.to_int32(left + z_width) 64 | 65 | def _slice(x): 66 | f, t, l, b, r = x 67 | c = f[t:b, l:r] 68 | return c 69 | 70 | exemplar_img = tf.map_fn(_slice, (images, top, left, bottom, right), dtype=images.dtype) 71 | exemplar_img.set_shape([batch_size, z_height, z_width, 3]) 72 | return exemplar_img 73 | 74 | 75 | def get_crops(im, bbox, size_z, size_x, context_amount): 76 | """Obtain image sub-window, padding with avg channel if area goes outside of border 77 | 78 | Adapted from https://github.com/bertinetto/siamese-fc/blob/master/ILSVRC15-curation/save_crops.m#L46 79 | 80 | Args: 81 | im: Image ndarray 82 | bbox: Named tuple (x, y, width, height) x, y corresponds to the crops center 83 | size_z: Target + context size 84 | size_x: The resultant crop size 85 | context_amount: The amount of context 86 | 87 | Returns: 88 | image crop: Image ndarray 89 | """ 90 | cy, cx, h, w = bbox.y, bbox.x, bbox.height, bbox.width 91 | wc_z = w + context_amount * (w + h) 92 | hc_z = h + context_amount * (w + h) 93 | s_z = np.sqrt(wc_z * hc_z) 94 | scale_z = size_z / s_z 95 | 96 | d_search = (size_x - size_z) / 2 97 | pad = d_search / scale_z 98 | s_x = s_z + 2 * pad 99 | scale_x = size_x / s_x 100 | 101 | image_crop_x, _, _, _, _ = get_subwindow_avg(im, [cy, cx], 102 | [size_x, size_x], 103 | [np.round(s_x), np.round(s_x)]) 104 | 105 | return image_crop_x, scale_x 106 | 107 | 108 | def get_subwindow_avg(im, pos, model_sz, original_sz): 109 | # avg_chans = np.mean(im, axis=(0, 1)) # This version is 3x slower 110 | avg_chans = [np.mean(im[:, :, 0]), np.mean(im[:, :, 1]), np.mean(im[:, :, 2])] 111 | if not original_sz: 112 | original_sz = model_sz 113 | sz = original_sz 114 | im_sz = im.shape 115 | # make sure the size is not too small 116 | assert 
im_sz[0] > 2 and im_sz[1] > 2 117 | c = [get_center(s) for s in sz] 118 | 119 | # check out-of-bounds coordinates, and set them to avg_chans 120 | context_xmin = np.int(np.round(pos[1] - c[1])) 121 | context_xmax = np.int(context_xmin + sz[1] - 1) 122 | context_ymin = np.int(np.round(pos[0] - c[0])) 123 | context_ymax = np.int(context_ymin + sz[0] - 1) 124 | left_pad = np.int(np.maximum(0, -context_xmin)) 125 | top_pad = np.int(np.maximum(0, -context_ymin)) 126 | right_pad = np.int(np.maximum(0, context_xmax - im_sz[1] + 1)) 127 | bottom_pad = np.int(np.maximum(0, context_ymax - im_sz[0] + 1)) 128 | 129 | context_xmin = context_xmin + left_pad 130 | context_xmax = context_xmax + left_pad 131 | context_ymin = context_ymin + top_pad 132 | context_ymax = context_ymax + top_pad 133 | if top_pad > 0 or bottom_pad > 0 or left_pad > 0 or right_pad > 0: 134 | R = np.pad(im[:, :, 0], ((top_pad, bottom_pad), (left_pad, right_pad)), 135 | 'constant', constant_values=(avg_chans[0])) 136 | G = np.pad(im[:, :, 1], ((top_pad, bottom_pad), (left_pad, right_pad)), 137 | 'constant', constant_values=(avg_chans[1])) 138 | B = np.pad(im[:, :, 2], ((top_pad, bottom_pad), (left_pad, right_pad)), 139 | 'constant', constant_values=(avg_chans[2])) 140 | 141 | im = np.stack((R, G, B), axis=2) 142 | 143 | im_patch_original = im[context_ymin:context_ymax + 1, 144 | context_xmin:context_xmax + 1, :] 145 | if not (model_sz[0] == original_sz[0] and model_sz[1] == original_sz[1]): 146 | im_patch = resize(im_patch_original, tuple(model_sz)) 147 | else: 148 | im_patch = im_patch_original 149 | return im_patch, left_pad, top_pad, right_pad, bottom_pad 150 | 151 | def get_saver(keyword, removes, excepts,repl=[]): 152 | vars_need_load = {} 153 | for v in (tf.global_variables()): 154 | vname = v.name 155 | if vname.find(keyword)!=-1: 156 | for eeexxx in excepts: 157 | if vname.find(eeexxx)!=-1: 158 | logging.warning('No Load: '+vname) 159 | break 160 | else: 161 | vname_ori = vname 162 | for r in removes: 163 | vname = vname.replace(r,'') 164 | for r in repl: 165 | vname = vname.replace(r[0],r[1]) 166 | vars_need_load[vname] = v 167 | logging.warning('Load: ' + vname + ' as ' + vname_ori) 168 | else: 169 | logging.warning('No Load: '+vname) 170 | return tf.train.Saver(vars_need_load) -------------------------------------------------------------------------------- /train_siamese_model.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 
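# --- Editor's note: illustrative invocation (hypothetical command line) ---
# This script is wrapped as a `sacred` experiment (see @ex.automain below), so the
# configuration entries defined in configurations() can be overridden from the
# command line using sacred's `with` syntax, e.g.:
#
#     python train_siamese_model.py with train_config.train_data_config.batch_size=8
#
# The exact keys and default values come from configuration.py, which is not
# shown in this section.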
7 | 8 | """Train the model""" 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import logging 15 | logging.getLogger().setLevel(logging.INFO) 16 | import os 17 | import os.path as osp 18 | import random 19 | import time 20 | from datetime import datetime 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | from sacred import Experiment 25 | from sacred.observers import FileStorageObserver 26 | 27 | import configuration 28 | import siamese_model 29 | from utils.misc_utils import auto_select_gpu, mkdir_p, save_cfgs 30 | from utils.train_utils import load_caffenet 31 | 32 | ex = Experiment(configuration.RUN_NAME) 33 | ex.observers.append(FileStorageObserver.create(osp.join(configuration.LOG_DIR, 'sacred'))) 34 | 35 | 36 | @ex.config 37 | def configurations(): 38 | # Add configurations for current script, for more details please see the documentation of `sacred`. 39 | # REFER: http://sacred.readthedocs.io/en/latest/index.html 40 | model_config = configuration.MODEL_CONFIG 41 | train_config = configuration.TRAIN_CONFIG 42 | track_config = configuration.TRACK_CONFIG 43 | 44 | 45 | def _configure_learning_rate(train_config, global_step): 46 | lr_config = train_config['lr_config'] 47 | 48 | num_batches_per_epoch = \ 49 | int(train_config['train_data_config']['num_examples_per_epoch'] / train_config['train_data_config']['batch_size']) 50 | 51 | lr_policy = lr_config['policy'] 52 | if lr_policy == 'piecewise_constant': 53 | lr_boundaries = [int(e * num_batches_per_epoch) for e in lr_config['lr_boundaries']] 54 | return tf.train.piecewise_constant(global_step, 55 | lr_boundaries, 56 | lr_config['lr_values']) 57 | elif lr_policy == 'exponential': 58 | decay_steps = int(num_batches_per_epoch) * lr_config['num_epochs_per_decay'] 59 | return tf.train.exponential_decay(lr_config['initial_lr'], 60 | global_step, 61 | decay_steps=decay_steps, 62 | decay_rate=lr_config['lr_decay_factor'], 63 | staircase=lr_config['staircase']) 64 | elif lr_policy == 'cosine': 65 | T_total = train_config['train_data_config']['epoch'] * num_batches_per_epoch 66 | return 0.5 * lr_config['initial_lr'] * (1 + tf.cos(np.pi * tf.to_float(global_step) / T_total)) 67 | else: 68 | raise ValueError('Learning rate policy [%s] was not recognized', lr_policy) 69 | 70 | 71 | def _configure_optimizer(train_config, learning_rate): 72 | optimizer_config = train_config['optimizer_config'] 73 | optimizer_name = optimizer_config['optimizer'].upper() 74 | if optimizer_name == 'MOMENTUM': 75 | optimizer = tf.train.MomentumOptimizer( 76 | learning_rate, 77 | momentum=optimizer_config['momentum'], 78 | use_nesterov=optimizer_config['use_nesterov'], 79 | name='Momentum') 80 | elif optimizer_name == 'SGD': 81 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) 82 | else: 83 | raise ValueError('Optimizer [%s] was not recognized', optimizer_config['optimizer']) 84 | return optimizer 85 | 86 | 87 | @ex.automain 88 | def main(model_config, train_config, track_config): 89 | logging.getLogger().setLevel(logging.INFO) 90 | os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu() 91 | 92 | # Create training directory which will be used to save: configurations, model files, TensorBoard logs 93 | train_dir = train_config['train_dir'] 94 | if not osp.isdir(train_dir): 95 | logging.info('Creating training directory: %s', train_dir) 96 | mkdir_p(train_dir) 97 | 98 | g = tf.Graph() 99 | with g.as_default(): 100 | # Set fixed seed for reproducible experiments 
101 | random.seed(train_config['seed']) 102 | np.random.seed(train_config['seed']) 103 | tf.set_random_seed(train_config['seed']) 104 | 105 | # Build the training and validation model 106 | model = siamese_model.SiameseModel(model_config, train_config, mode='train') 107 | model.build() 108 | model_va = siamese_model.SiameseModel(model_config, train_config, mode='validation') 109 | model_va.build(reuse=True) 110 | 111 | # Save configurations for future reference 112 | save_cfgs(train_dir, model_config, train_config, track_config) 113 | 114 | learning_rate = _configure_learning_rate(train_config, model.global_step) 115 | optimizer = _configure_optimizer(train_config, learning_rate) 116 | tf.summary.scalar('learning_rate', learning_rate) 117 | logging.info('Trainable variables:') 118 | for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES): 119 | logging.info('-- {}'.format(v)) 120 | # Set up the training ops 121 | opt_op = tf.contrib.layers.optimize_loss( 122 | loss=model.total_loss, 123 | global_step=model.global_step, 124 | learning_rate=learning_rate, 125 | optimizer=optimizer, 126 | clip_gradients=train_config['clip_gradients'], 127 | learning_rate_decay_fn=None, 128 | summaries=['learning_rate']) 129 | 130 | with tf.control_dependencies([opt_op]): 131 | train_op = tf.no_op(name='train') 132 | 133 | saver = tf.train.Saver(tf.global_variables(), 134 | max_to_keep=train_config['max_checkpoints_to_keep']) 135 | 136 | summary_writer = tf.summary.FileWriter(train_dir, g) 137 | summary_op = tf.summary.merge_all() 138 | 139 | global_variables_init_op = tf.global_variables_initializer() 140 | local_variables_init_op = tf.local_variables_initializer() 141 | 142 | # Dynamically allocate GPU memory 143 | gpu_options = tf.GPUOptions(allow_growth=True) 144 | sess_config = tf.ConfigProto(gpu_options=gpu_options) 145 | 146 | sess = tf.Session(config=sess_config) 147 | model_path = tf.train.latest_checkpoint(train_config['train_dir']) 148 | 149 | if not model_path: 150 | sess.run(global_variables_init_op) 151 | sess.run(local_variables_init_op) 152 | start_step = 0 153 | if model_config['sa_siam_config']['en_semantic']: 154 | load_caffenet(train_config['caffenet_dir'], sess) 155 | if model_config['embed_config']['embedding_checkpoint_file']: 156 | model.init_fn(sess) 157 | 158 | else: 159 | logging.info('Restore from last checkpoint: {}'.format(model_path)) 160 | sess.run(local_variables_init_op) 161 | saver.restore(sess, model_path) 162 | start_step = tf.train.global_step(sess, model.global_step.name) + 1 163 | 164 | g.finalize() # Finalize graph to avoid adding ops by mistake 165 | # Training loop 166 | data_config = train_config['train_data_config'] 167 | total_steps = int(data_config['epoch'] * 168 | data_config['num_examples_per_epoch'] / 169 | data_config['batch_size']) 170 | logging.info('Train for {} steps'.format(total_steps)) 171 | for step in range(start_step, total_steps): 172 | start_time = time.time() 173 | _, loss, batch_loss = sess.run([train_op, model.total_loss, model.batch_loss]) 174 | duration = time.time() - start_time 175 | 176 | if step % 10 == 0: 177 | examples_per_sec = data_config['batch_size'] / float(duration) 178 | time_remain = data_config['batch_size'] * (total_steps - step) / examples_per_sec 179 | m, s = divmod(time_remain, 60) 180 | h, m = divmod(m, 60) 181 | format_str = ('%s: step %d, total loss = %.2f, batch loss = %.2f (%.1f examples/sec; %.3f ' 182 | 'sec/batch; %dh:%02dm:%02ds remains)') 183 | logging.info(format_str % (datetime.now(), step, loss, 
batch_loss, 184 | examples_per_sec, duration, h, m, s)) 185 | 186 | if step % 100 == 0: 187 | summary_str = sess.run(summary_op) 188 | summary_writer.add_summary(summary_str, step) 189 | 190 | if (step + 1) % train_config['save_model_every_n_step'] == 0 or (step + 1) == total_steps: 191 | checkpoint_path = osp.join(train_config['train_dir'], 'model.ckpt') 192 | saver.save(sess, checkpoint_path, global_step=step) 193 | -------------------------------------------------------------------------------- /inference/tracker.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | """Class for tracking using a track model.""" 8 | 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import logging 14 | logging.getLogger().setLevel(logging.INFO) 15 | import os.path as osp 16 | 17 | import numpy as np 18 | import cv2 19 | from cv2 import imwrite 20 | 21 | from utils.infer_utils import convert_bbox_format, Rectangle 22 | from utils.misc_utils import get_center, get 23 | 24 | 25 | class TargetState(object): 26 | """Represent the target state.""" 27 | 28 | def __init__(self, bbox, search_pos, scale_idx): 29 | self.bbox = bbox # (cx, cy, w, h) in the original image 30 | self.search_pos = search_pos # target center position in the search image 31 | self.scale_idx = scale_idx # scale index in the searched scales 32 | 33 | 34 | class Tracker(object): 35 | """Tracker based on the siamese model.""" 36 | 37 | def __init__(self, siamese_model, model_config, track_config): 38 | self.siamese_model = siamese_model 39 | self.model_config = model_config 40 | self.track_config = track_config 41 | 42 | self.num_scales = track_config['num_scales'] 43 | logging.info('track num scales -- {}'.format(self.num_scales)) 44 | scales = np.arange(self.num_scales) - get_center(self.num_scales) 45 | self.search_factors = [self.track_config['scale_step'] ** x for x in scales] 46 | 47 | self.x_image_size = track_config['x_image_size'] # Search image size 48 | self.window = None # Cosine window 49 | self.log_level = track_config['log_level'] 50 | 51 | def track(self, sess, first_bbox, frames, logdir='/tmp'): 52 | """Runs tracking on a single image sequence.""" 53 | # Get initial target bounding box and convert to center based 54 | bbox = convert_bbox_format(first_bbox, 'center-based') 55 | 56 | # Feed in the first frame image to set initial state. 57 | bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width] 58 | input_feed = [frames[0], bbox_feed] 59 | frame2crop_scale = self.siamese_model.initialize(sess, input_feed) 60 | 61 | # Storing target state 62 | original_target_height = bbox.height 63 | original_target_width = bbox.width 64 | search_center = np.array([get_center(self.x_image_size), 65 | get_center(self.x_image_size)]) 66 | current_target_state = TargetState(bbox=bbox, 67 | search_pos=search_center, 68 | scale_idx=int(get_center(self.num_scales))) 69 | 70 | include_first = get(self.track_config, 'include_first', False) 71 | logging.info('Tracking include first -- {}'.format(include_first)) 72 | 73 | # Run tracking loop 74 | reported_bboxs = [] 75 | for i, filename in enumerate(frames): 76 | if i > 0 or include_first: # We don't really want to process the first image unless intended to do so. 
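        # (Editor's note) Each iteration feeds the previous target state together with
        # the current frame to the model, picks the best scale from the penalized
        # multi-scale response below, and then updates the target position and size.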
77 | bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x, 78 | current_target_state.bbox.height, current_target_state.bbox.width] 79 | input_feed = [filename, bbox_feed] 80 | 81 | outputs, metadata = self.siamese_model.inference_step(sess, input_feed) 82 | search_scale_list = outputs['scale_xs'] 83 | response = outputs['response'] 84 | response_size = response.shape[1] 85 | 86 | # Choose the scale whole response map has the highest peak 87 | if self.num_scales > 1: 88 | response_max = np.max(response, axis=(1, 2)) 89 | penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales)) 90 | current_scale_idx = int(get_center(self.num_scales)) 91 | penalties[current_scale_idx] = 1.0 92 | response_penalized = response_max * penalties 93 | best_scale = np.argmax(response_penalized) 94 | if np.max(response_max)<0: 95 | logging.warning('MAX_RESPONSE LESS THAN ZERO!') 96 | # best_scale = current_scale_idx 97 | else: 98 | best_scale = 0 99 | 100 | response = response[best_scale] 101 | 102 | with np.errstate(all='raise'): # Raise error if something goes wrong 103 | response = response - np.min(response) 104 | response = response / np.sum(response) 105 | 106 | if self.window is None: 107 | window = np.dot(np.expand_dims(np.hanning(response_size), 1), 108 | np.expand_dims(np.hanning(response_size), 0)) 109 | self.window = window / np.sum(window) # normalize window 110 | window_influence = self.track_config['window_influence'] 111 | response = (1 - window_influence) * response + window_influence * self.window 112 | 113 | # Find maximum response 114 | r_max, c_max = np.unravel_index(response.argmax(), 115 | response.shape) 116 | 117 | # Convert from crop-relative coordinates to frame coordinates 118 | p_coor = np.array([r_max, c_max]) 119 | # displacement from the center in instance final representation ... 120 | disp_instance_final = p_coor - get_center(response_size) 121 | # ... in instance feature space ... 122 | upsample_factor = self.track_config['upsample_factor'] 123 | disp_instance_feat = disp_instance_final / upsample_factor 124 | # ... Avoid empty position ... 125 | r_radius = int(response_size / upsample_factor / 2) 126 | disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius) 127 | # ... in instance input ... 128 | disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride'] 129 | # ... 
in instance original crop (in frame coordinates) 130 | disp_instance_frame = disp_instance_input / search_scale_list[best_scale] 131 | # Position within frame in frame coordinates 132 | y = current_target_state.bbox.y 133 | x = current_target_state.bbox.x 134 | y += disp_instance_frame[0] 135 | x += disp_instance_frame[1] 136 | 137 | # Target scale damping and saturation 138 | target_scale = current_target_state.bbox.height / original_target_height 139 | search_factor = self.search_factors[best_scale] 140 | scale_damp = self.track_config['scale_damp'] # damping factor for scale update 141 | target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor) 142 | target_scale = np.maximum(0.2, np.minimum(5.0, target_scale)) 143 | 144 | # Some book keeping 145 | height = original_target_height * target_scale 146 | width = original_target_width * target_scale 147 | current_target_state.bbox = Rectangle(x, y, width, height) 148 | current_target_state.scale_idx = best_scale 149 | current_target_state.search_pos = search_center + disp_instance_input 150 | 151 | assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \ 152 | 'target position in feature space should be no larger than input image size' 153 | assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \ 154 | 'target position in feature space should be no larger than input image size' 155 | 156 | if self.log_level > 0: 157 | np.save(osp.join(logdir, 'num_frames.npy'), [i + 1]) 158 | 159 | # Select the image with the highest score scale and convert it to uint8 160 | image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8) 161 | # Note that imwrite in cv2 assumes the image is in BGR format. 162 | # However, the cropped image returned by TensorFlow is RGB. 163 | # Therefore, we convert color format using cv2.cvtColor 164 | imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)), 165 | cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR)) 166 | 167 | np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale]) 168 | np.save(osp.join(logdir, 'response{}.npy'.format(i)), response) 169 | 170 | y_search, x_search = current_target_state.search_pos 171 | search_scale = search_scale_list[best_scale] 172 | target_height_search = height * search_scale 173 | target_width_search = width * search_scale 174 | bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search) 175 | bbox_search = convert_bbox_format(bbox_search, 'top-left-based') 176 | np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), 177 | [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height]) 178 | 179 | reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based') 180 | reported_bboxs.append(reported_bbox) 181 | return reported_bboxs 182 | -------------------------------------------------------------------------------- /utils/train_utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 
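# --- Editor's note: illustrative usage (hypothetical shapes and config values) ---
# construct_gt_score_maps() below is called from siamese_model.build_loss(), e.g.:
#
#     gt = construct_gt_score_maps(response_size=[15, 15], batch_size=8, stride=8,
#                                  gt_config={'rPos': 16, 'rNeg': 0})
#
# The real response_size, batch_size and gt_config values are taken from
# configuration.py at run time.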
7 | 8 | """Utilities for model construction""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import re 14 | 15 | import numpy as np 16 | import tensorflow as tf 17 | from scipy import io as sio 18 | import logging 19 | logging.getLogger().setLevel(logging.INFO) 20 | from utils.misc_utils import get_center 21 | 22 | 23 | def construct_gt_score_maps(response_size, batch_size, stride, gt_config=None): 24 | """Construct a batch of groundtruth score maps 25 | 26 | Args: 27 | response_size: A list or tuple with two elements [ho, wo] 28 | batch_size: An integer e.g., 16 29 | stride: Embedding stride e.g., 8 30 | gt_config: Configurations for groundtruth generation 31 | 32 | Return: 33 | A float tensor of shape [batch_size] + response_size 34 | """ 35 | with tf.name_scope('construct_gt'): 36 | ho = response_size[0] 37 | wo = response_size[1] 38 | y = tf.cast(tf.range(0, ho), dtype=tf.float32) - get_center(ho) 39 | x = tf.cast(tf.range(0, wo), dtype=tf.float32) - get_center(wo) 40 | [Y, X] = tf.meshgrid(y, x) 41 | 42 | def _logistic_label(X, Y, rPos, rNeg): 43 | # dist_to_center = tf.sqrt(tf.square(X) + tf.square(Y)) # L2 metric 44 | dist_to_center = tf.abs(X) + tf.abs(Y) # Block metric 45 | Z = tf.where(dist_to_center <= rPos, 46 | tf.ones_like(X), 47 | tf.where(dist_to_center < rNeg, 48 | 0.5 * tf.ones_like(X), 49 | tf.zeros_like(X))) 50 | return Z 51 | 52 | rPos = gt_config['rPos'] / stride 53 | rNeg = gt_config['rNeg'] / stride 54 | gt = _logistic_label(X, Y, rPos, rNeg) 55 | 56 | # Duplicate a batch of maps 57 | gt_expand = tf.reshape(gt, [1] + response_size) 58 | gt = tf.tile(gt_expand, [batch_size, 1, 1]) 59 | return gt 60 | 61 | 62 | def get_params_from_mat(matpath): 63 | """Get parameter from .mat file into parms(dict)""" 64 | 65 | def squeeze(vars_): 66 | # Matlab save some params with shape (*, 1) 67 | # However, we don't need the trailing dimension in TensorFlow. 
68 | if isinstance(vars_, (list, tuple)): 69 | return [np.squeeze(v, 1) for v in vars_] 70 | else: 71 | return np.squeeze(vars_, 1) 72 | 73 | netparams = sio.loadmat(matpath)["net"]["params"][0][0] 74 | params = dict() 75 | 76 | for i in range(netparams.size): 77 | param = netparams[0][i] 78 | name = param["name"][0] 79 | value = param["value"] 80 | value_size = param["value"].shape[0] 81 | 82 | match = re.match(r"([a-z]+)([0-9]+)([a-z]+)", name, re.I) 83 | if match: 84 | items = match.groups() 85 | elif name == 'adjust_f': 86 | params['detection/weights'] = squeeze(value) 87 | continue 88 | elif name == 'adjust_b': 89 | params['detection/biases'] = squeeze(value) 90 | continue 91 | else: 92 | raise Exception('unrecognized layer params') 93 | 94 | op, layer, types = items 95 | layer = int(layer) 96 | if layer in [1, 3]: 97 | if op == 'conv': # convolution 98 | if types == 'f': 99 | params['conv%d/weights' % layer] = value 100 | elif types == 'b': 101 | value = squeeze(value) 102 | params['conv%d/biases' % layer] = value 103 | elif op == 'bn': # batch normalization 104 | if types == 'x': 105 | m, v = squeeze(np.split(value, 2, 1)) 106 | params['conv%d/BatchNorm/moving_mean' % layer] = m 107 | params['conv%d/BatchNorm/moving_variance' % layer] = np.square(v) 108 | elif types == 'm': 109 | value = squeeze(value) 110 | params['conv%d/BatchNorm/gamma' % layer] = value 111 | elif types == 'b': 112 | value = squeeze(value) 113 | params['conv%d/BatchNorm/beta' % layer] = value 114 | else: 115 | raise Exception 116 | elif layer in [2, 4]: 117 | if op == 'conv' and types == 'f': 118 | b1, b2 = np.split(value, 2, 3) 119 | else: 120 | b1, b2 = np.split(value, 2, 0) 121 | if op == 'conv': 122 | if types == 'f': 123 | params['conv%d/b1/weights' % layer] = b1 124 | params['conv%d/b2/weights' % layer] = b2 125 | elif types == 'b': 126 | b1, b2 = squeeze(np.split(value, 2, 0)) 127 | params['conv%d/b1/biases' % layer] = b1 128 | params['conv%d/b2/biases' % layer] = b2 129 | elif op == 'bn': 130 | if types == 'x': 131 | m1, v1 = squeeze(np.split(b1, 2, 1)) 132 | m2, v2 = squeeze(np.split(b2, 2, 1)) 133 | params['conv%d/b1/BatchNorm/moving_mean' % layer] = m1 134 | params['conv%d/b2/BatchNorm/moving_mean' % layer] = m2 135 | params['conv%d/b1/BatchNorm/moving_variance' % layer] = np.square(v1) 136 | params['conv%d/b2/BatchNorm/moving_variance' % layer] = np.square(v2) 137 | elif types == 'm': 138 | params['conv%d/b1/BatchNorm/gamma' % layer] = squeeze(b1) 139 | params['conv%d/b2/BatchNorm/gamma' % layer] = squeeze(b2) 140 | elif types == 'b': 141 | params['conv%d/b1/BatchNorm/beta' % layer] = squeeze(b1) 142 | params['conv%d/b2/BatchNorm/beta' % layer] = squeeze(b2) 143 | else: 144 | raise Exception 145 | 146 | elif layer in [5]: 147 | if op == 'conv' and types == 'f': 148 | b1, b2 = np.split(value, 2, 3) 149 | else: 150 | b1, b2 = squeeze(np.split(value, 2, 0)) 151 | assert op == 'conv', 'layer5 contains only convolution' 152 | if types == 'f': 153 | params['conv%d/b1/weights' % layer] = b1 154 | params['conv%d/b2/weights' % layer] = b2 155 | elif types == 'b': 156 | params['conv%d/b1/biases' % layer] = b1 157 | params['conv%d/b2/biases' % layer] = b2 158 | 159 | return params 160 | 161 | 162 | def load_mat_model(matpath, embed_scope, detection_scope=None): 163 | """Restore SiameseFC models from .mat model files""" 164 | params = get_params_from_mat(matpath) 165 | 166 | assign_ops = [] 167 | 168 | def _assign(ref_name, params, scope=embed_scope): 169 | var_in_model = 
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 170 | scope + ref_name)[0] 171 | var_in_mat = params[ref_name] 172 | op = tf.assign(var_in_model, var_in_mat) 173 | assign_ops.append(op) 174 | 175 | for l in range(1, 6): 176 | logging.info('Loading layer {} from mat.'.format(l)) 177 | if l in [1, 3]: 178 | _assign('conv%d/weights' % l, params) 179 | # _assign('conv%d/biases' % l, params) 180 | _assign('conv%d/BatchNorm/beta' % l, params) 181 | _assign('conv%d/BatchNorm/gamma' % l, params) 182 | _assign('conv%d/BatchNorm/moving_mean' % l, params) 183 | _assign('conv%d/BatchNorm/moving_variance' % l, params) 184 | elif l in [2, 4]: 185 | # Branch 1 186 | _assign('conv%d/b1/weights' % l, params) 187 | # _assign('conv%d/b1/biases' % l, params) 188 | _assign('conv%d/b1/BatchNorm/beta' % l, params) 189 | _assign('conv%d/b1/BatchNorm/gamma' % l, params) 190 | _assign('conv%d/b1/BatchNorm/moving_mean' % l, params) 191 | _assign('conv%d/b1/BatchNorm/moving_variance' % l, params) 192 | # Branch 2 193 | _assign('conv%d/b2/weights' % l, params) 194 | # _assign('conv%d/b2/biases' % l, params) 195 | _assign('conv%d/b2/BatchNorm/beta' % l, params) 196 | _assign('conv%d/b2/BatchNorm/gamma' % l, params) 197 | _assign('conv%d/b2/BatchNorm/moving_mean' % l, params) 198 | _assign('conv%d/b2/BatchNorm/moving_variance' % l, params) 199 | elif l in [5]: 200 | # Branch 1 201 | _assign('conv%d/b1/weights' % l, params) 202 | _assign('conv%d/b1/biases' % l, params) 203 | # Branch 2 204 | _assign('conv%d/b2/weights' % l, params) 205 | _assign('conv%d/b2/biases' % l, params) 206 | else: 207 | raise Exception('layer number must below 5') 208 | 209 | if detection_scope: 210 | _assign(detection_scope + 'biases', params, scope='') 211 | 212 | initialize = tf.group(*assign_ops) 213 | return initialize 214 | def load_caffenet(path_caffenet, _sess): 215 | logging.info('Load object model from ' + path_caffenet) 216 | data_dict = np.load(path_caffenet, encoding='latin1').item() 217 | for op_name in data_dict: 218 | if op_name.find('fc')!=-1: 219 | continue 220 | full_op_name = 'sa_siam/semantic_net/'+op_name 221 | with tf.variable_scope(full_op_name, reuse=True): 222 | if op_name in ['conv2','conv4','conv5']: 223 | for param_name, data in data_dict[op_name].items(): 224 | d1, d2 = tf.split(data, 2, -1+len(data.shape)) # Last dim is selected to split 225 | for [d_, b_] in [[d1,'b1'],[d2,'b2']]: 226 | with tf.variable_scope(b_, reuse=True): 227 | logging.info('Loading: ' + full_op_name + ' ' + b_ + ' ' + param_name) 228 | var = tf.get_variable(param_name) 229 | _sess.run(var.assign(d_)) 230 | else: 231 | for param_name, data in data_dict[op_name].items(): 232 | logging.info('Loading: ' + full_op_name + ' ' + param_name) 233 | var = tf.get_variable(param_name) 234 | _sess.run(var.assign(data)) -------------------------------------------------------------------------------- /siamese_model.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | 8 | """Construct the computational graph of siamese model for training. 
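
Typical use (editor's sketch; the concrete configs come from configuration.py):

  model = SiameseModel(model_config, train_config, mode='train')
  model.build()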
""" 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import functools 15 | 16 | import tensorflow as tf 17 | 18 | from datasets.dataloader import DataLoader 19 | from embeddings.sa_siam import sa_siam_arg_scope, sa_siam 20 | from metrics.track_metrics import center_dist_error, center_score_error 21 | from utils.train_utils import construct_gt_score_maps, load_mat_model 22 | 23 | slim = tf.contrib.slim 24 | 25 | 26 | class SiameseModel: 27 | def __init__(self, model_config, train_config, mode='train'): 28 | self.model_config = model_config 29 | self.train_config = train_config 30 | self.mode = mode 31 | assert mode in ['train', 'validation', 'inference'] 32 | 33 | if self.mode == 'train': 34 | self.data_config = self.train_config['train_data_config'] 35 | elif self.mode == 'validation': 36 | self.data_config = self.train_config['validation_data_config'] 37 | 38 | self.dataloader = None 39 | self.exemplars = None 40 | self.instances = None 41 | self.response = None 42 | self.batch_loss = None 43 | self.total_loss = None 44 | self.init_fn = None 45 | self.global_step = None 46 | 47 | def is_training(self): 48 | """Returns true if the model is built for training mode""" 49 | return self.mode == 'train' 50 | 51 | def build_inputs(self): 52 | """Input fetching and batching 53 | 54 | Outputs: 55 | self.exemplars: image batch of shape [batch, hz, wz, 3] 56 | self.instances: image batch of shape [batch, hx, wx, 3] 57 | """ 58 | if self.mode in ['train', 'validation']: 59 | with tf.device("/cpu:0"): # Put data loading and preprocessing in CPU is substantially faster 60 | self.dataloader = DataLoader(self.data_config, self.is_training()) 61 | self.dataloader.build() 62 | exemplars, instances = self.dataloader.get_one_batch() 63 | 64 | exemplars = tf.to_float(exemplars) 65 | instances = tf.to_float(instances) 66 | else: 67 | self.examplar_feed = tf.placeholder(shape=[None, None, None, 3], 68 | dtype=tf.uint8, 69 | name='examplar_input') 70 | self.instance_feed = tf.placeholder(shape=[None, None, None, 3], 71 | dtype=tf.uint8, 72 | name='instance_input') 73 | exemplars = tf.to_float(self.examplar_feed) 74 | instances = tf.to_float(self.instance_feed) 75 | 76 | self.exemplars = exemplars 77 | self.instances = instances 78 | 79 | def build_image_embeddings(self, reuse=False): 80 | """Builds the image model subgraph and generates image embeddings 81 | 82 | Inputs: 83 | self.exemplars: A tensor of shape [batch, hz, wz, 3] 84 | self.instances: A tensor of shape [batch, hx, wx, 3] 85 | 86 | Outputs: 87 | self.exemplar_embeds: A Tensor of shape [batch, hz_embed, wz_embed, embed_dim] 88 | self.instance_embeds: A Tensor of shape [batch, hx_embed, wx_embed, embed_dim] 89 | """ 90 | config = self.model_config['embed_config'] 91 | arg_scope = sa_siam_arg_scope(config, 92 | trainable=config['train_embedding'], 93 | is_training=self.is_training()) 94 | with slim.arg_scope(arg_scope): 95 | self.exemplar_embeds, _ = sa_siam(inputs=self.exemplars, is_example=True, reuse=reuse, sa_siam_config=self.model_config['sa_siam_config']) 96 | self.instance_embeds, _ = sa_siam(inputs=self.instances, is_example=False, reuse=True, sa_siam_config=self.model_config['sa_siam_config']) 97 | 98 | def build_template(self): 99 | # The template is simply the feature of the exemplar image in SiamFC. 
100 | self.templates = self.exemplar_embeds 101 | 102 | def build_detection(self, reuse=False): 103 | with tf.variable_scope('detection', reuse=reuse): 104 | def _translation_match(x, z): # translation match for one example within a batch 105 | x = tf.expand_dims(x, 0) # [1, in_height, in_width, in_channels] 106 | z = tf.expand_dims(z, -1) # [filter_height, filter_width, in_channels, 1] 107 | return tf.nn.conv2d(x, z, strides=[1, 1, 1, 1], padding='VALID', name='translation_match') 108 | 109 | output = tf.map_fn(lambda x: _translation_match(x[0], x[1]), 110 | (self.instance_embeds, self.templates), 111 | dtype=self.instance_embeds.dtype) 112 | output = tf.squeeze(output, [1, 4]) # of shape e.g., [8, 15, 15] 113 | 114 | # Adjust score, this is required to make training possible. 115 | config = self.model_config['adjust_response_config'] 116 | bias = tf.get_variable('biases', [1], 117 | dtype=tf.float32, 118 | initializer=tf.constant_initializer(0.0, dtype=tf.float32), 119 | trainable=config['train_bias']) 120 | response = config['scale'] * output + bias 121 | self.response = response 122 | 123 | def build_loss(self): 124 | response = self.response 125 | response_size = response.get_shape().as_list()[1:3] # [height, width] 126 | 127 | gt = construct_gt_score_maps(response_size, 128 | self.data_config['batch_size'], 129 | self.model_config['embed_config']['stride'], 130 | self.train_config['gt_config']) 131 | 132 | with tf.name_scope('Loss'): 133 | loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=response, 134 | labels=gt) 135 | 136 | with tf.name_scope('Balance_weights'): 137 | n_pos = tf.reduce_sum(tf.to_float(tf.equal(gt[0], 1))) 138 | n_neg = tf.reduce_sum(tf.to_float(tf.equal(gt[0], 0))) 139 | w_pos = 0.5 / n_pos 140 | w_neg = 0.5 / n_neg 141 | class_weights = tf.where(tf.equal(gt, 1), 142 | w_pos * tf.ones_like(gt), 143 | tf.ones_like(gt)) 144 | class_weights = tf.where(tf.equal(gt, 0), 145 | w_neg * tf.ones_like(gt), 146 | class_weights) 147 | loss = loss * class_weights 148 | 149 | # Note that we use reduce_sum instead of reduce_mean since the loss has 150 | # already been normalized by class_weights in spatial dimension. 
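      # (Editor's note) With the 0/1 labels weighted to a total of 0.5 each, the
      # spatial sum is already balanced between positive and negative locations,
      # so only the batch mean is taken afterwards.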
151 | loss = tf.reduce_sum(loss, [1, 2]) 152 | 153 | batch_loss = tf.reduce_mean(loss, name='batch_loss') 154 | tf.losses.add_loss(batch_loss) 155 | 156 | total_loss = tf.losses.get_total_loss() 157 | self.batch_loss = batch_loss 158 | self.total_loss = total_loss 159 | 160 | tf.summary.image('exemplar', self.exemplars, family=self.mode) 161 | tf.summary.image('instance', self.instances, family=self.mode) 162 | 163 | mean_batch_loss, update_op1 = tf.metrics.mean(batch_loss) 164 | mean_total_loss, update_op2 = tf.metrics.mean(total_loss) 165 | with tf.control_dependencies([update_op1, update_op2]): 166 | tf.summary.scalar('batch_loss', mean_batch_loss, family=self.mode) 167 | tf.summary.scalar('total_loss', mean_total_loss, family=self.mode) 168 | 169 | if self.mode == 'train': 170 | tf.summary.image('GT', tf.reshape(gt[0], [1] + response_size + [1]), family='GT') 171 | tf.summary.image('Response', tf.expand_dims(tf.sigmoid(response), -1), family=self.mode) 172 | tf.summary.histogram('Response', self.response, family=self.mode) 173 | 174 | # Two more metrics to monitor the performance of training 175 | tf.summary.scalar('center_score_error', center_score_error(response), family=self.mode) 176 | tf.summary.scalar('center_dist_error', center_dist_error(response), family=self.mode) 177 | 178 | def setup_global_step(self): 179 | global_step = tf.Variable( 180 | initial_value=0, 181 | name='global_step', 182 | trainable=False, 183 | collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES]) 184 | 185 | self.global_step = global_step 186 | 187 | def setup_embedding_initializer(self): 188 | """Sets up the function to restore embedding variables from checkpoint.""" 189 | embed_config = self.model_config['embed_config'] 190 | if embed_config['embedding_checkpoint_file']: 191 | # Restore Siamese FC models from .mat model files 192 | initialize = load_mat_model(embed_config['embedding_checkpoint_file'], 193 | 'sa_siam/appearance_net/', 'detection/') 194 | 195 | def restore_fn(sess): 196 | tf.logging.info("Restoring embedding variables from checkpoint file %s", 197 | embed_config['embedding_checkpoint_file']) 198 | sess.run([initialize]) 199 | 200 | self.init_fn = restore_fn 201 | 202 | def build(self, reuse=False): 203 | """Creates all ops for training and evaluation""" 204 | with tf.name_scope(self.mode): 205 | self.build_inputs() 206 | self.build_image_embeddings(reuse=reuse) 207 | self.build_template() 208 | self.build_detection(reuse=reuse) 209 | self.setup_embedding_initializer() 210 | 211 | if self.mode in ['train', 'validation']: 212 | self.build_loss() 213 | 214 | if self.is_training(): 215 | self.setup_global_step() 216 | -------------------------------------------------------------------------------- /inference/inference_wrapper.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 
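# --- Editor's note: illustrative wiring (hedged sketch, variable names are ours) ---
# A typical inference setup, as suggested by inference/tracker.py:
#
#     model = InferenceWrapper()
#     restore_fn = model.build_graph_from_config(model_config, track_config, checkpoint)
#     with tf.Session() as sess:
#         restore_fn(sess)
#         tracker = Tracker(model, model_config, track_config)
#         tracker.track(sess, first_bbox, frames, logdir)
#
# The actual driver scripts (e.g. under benchmarks/OTB_Toolkit/scripts/bscripts/)
# are not shown in this section.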
9 | 10 | """Model Wrapper class for performing inference with a SiameseModel""" 11 | 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | import functools 17 | import logging 18 | logging.getLogger().setLevel(logging.INFO) 19 | import os 20 | import os.path as osp 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from embeddings.sa_siam import sa_siam_arg_scope, sa_siam 26 | from utils.infer_utils import get_exemplar_images 27 | from utils.misc_utils import get_center, get, shape_of 28 | 29 | slim = tf.contrib.slim 30 | 31 | 32 | class InferenceWrapper(): 33 | """Model wrapper class for performing inference with a siamese model.""" 34 | 35 | def __init__(self): 36 | self.image = None 37 | self.target_bbox_feed = None 38 | self.search_images = None 39 | self.embeds = None 40 | self.templates = None 41 | self.init = None 42 | self.model_config = None 43 | self.track_config = None 44 | self.response_up = None 45 | 46 | def build_graph_from_config(self, model_config, track_config, checkpoint_path): 47 | """Build the inference graph and return a restore function.""" 48 | self.build_model(model_config, track_config) 49 | ema = tf.train.ExponentialMovingAverage(0) 50 | variables_to_restore = ema.variables_to_restore(moving_avg_variables=[]) 51 | 52 | # Filter out State variables 53 | variables_to_restore_filterd = {} 54 | for key, value in variables_to_restore.items(): 55 | if key.split('/')[1] != 'State': 56 | variables_to_restore_filterd[key] = value 57 | 58 | saver = tf.train.Saver(variables_to_restore_filterd) 59 | 60 | if osp.isdir(checkpoint_path): 61 | checkpoint_path = tf.train.latest_checkpoint(checkpoint_path) 62 | if not checkpoint_path: 63 | raise ValueError("No checkpoint file found in: {}".format(checkpoint_path)) 64 | 65 | def _restore_fn(sess): 66 | logging.info("Loading model from checkpoint: %s", checkpoint_path) 67 | saver.restore(sess, checkpoint_path) 68 | logging.info("Successfully loaded checkpoint: %s", os.path.basename(checkpoint_path)) 69 | 70 | return _restore_fn 71 | 72 | def build_model(self, model_config, track_config): 73 | self.model_config = model_config 74 | self.track_config = track_config 75 | 76 | self.build_inputs() 77 | self.build_search_images() 78 | self.build_template() 79 | self.build_detection() 80 | self.build_upsample() 81 | self.dumb_op = tf.no_op('dumb_operation') 82 | 83 | def build_inputs(self): 84 | filename = tf.placeholder(tf.string, [], name='filename') 85 | image_file = tf.read_file(filename) 86 | image = tf.image.decode_jpeg(image_file, channels=3, dct_method="INTEGER_ACCURATE") 87 | image = tf.to_float(image) 88 | self.image = image 89 | self.target_bbox_feed = tf.placeholder(dtype=tf.float32, 90 | shape=[4], 91 | name='target_bbox_feed') # center's y, x, height, width 92 | 93 | def build_search_images(self): 94 | """Crop search images from the input image based on the last target position 95 | 96 | 1. The input image is scaled such that the area of target&context takes up to (scale_factor * z_image_size) ^ 2 97 | 2. Crop an image patch as large as x_image_size centered at the target center. 98 | 3. If the cropped image region is beyond the boundary of the input image, mean values are padded. 
99 | """ 100 | model_config = self.model_config 101 | track_config = self.track_config 102 | 103 | size_z = model_config['z_image_size'] 104 | size_x = track_config['x_image_size'] 105 | context_amount = 0.5 106 | 107 | num_scales = track_config['num_scales'] 108 | scales = np.arange(num_scales) - get_center(num_scales) 109 | assert np.sum(scales) == 0, 'scales should be symmetric' 110 | search_factors = [track_config['scale_step'] ** x for x in scales] 111 | 112 | frame_sz = tf.shape(self.image) 113 | target_yx = self.target_bbox_feed[0:2] 114 | target_size = self.target_bbox_feed[2:4] 115 | avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan') 116 | 117 | # Compute base values 118 | base_z_size = target_size 119 | base_z_context_size = base_z_size + context_amount * tf.reduce_sum(base_z_size) 120 | base_s_z = tf.sqrt(tf.reduce_prod(base_z_context_size)) # Canonical size 121 | base_scale_z = tf.div(tf.to_float(size_z), base_s_z) 122 | d_search = (size_x - size_z) / 2.0 123 | base_pad = tf.div(d_search, base_scale_z) 124 | base_s_x = base_s_z + 2 * base_pad 125 | base_scale_x = tf.div(tf.to_float(size_x), base_s_x) 126 | 127 | boxes = [] 128 | for factor in search_factors: 129 | s_x = factor * base_s_x 130 | frame_sz_1 = tf.to_float(frame_sz[0:2] - 1) 131 | topleft = tf.div(target_yx - get_center(s_x), frame_sz_1) 132 | bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1) 133 | box = tf.concat([topleft, bottomright], axis=0) 134 | boxes.append(box) 135 | boxes = tf.stack(boxes) 136 | 137 | scale_xs = [] 138 | for factor in search_factors: 139 | scale_x = base_scale_x / factor 140 | scale_xs.append(scale_x) 141 | self.scale_xs = tf.stack(scale_xs) 142 | 143 | # Note we use different padding values for each image 144 | # while the original implementation uses only the average value 145 | # of the first image for all images. 
146 | image_minus_avg = tf.expand_dims(self.image - avg_chan, 0) 147 | image_cropped = tf.image.crop_and_resize(image_minus_avg, boxes, 148 | box_ind=tf.zeros((track_config['num_scales']), tf.int32), 149 | crop_size=[size_x, size_x]) 150 | self.search_images = image_cropped + avg_chan 151 | 152 | def get_image_embedding(self, images, is_example, sa_siam_config, reuse=None): 153 | config = self.model_config['embed_config'] 154 | arg_scope = sa_siam_arg_scope(config, 155 | trainable=config['train_embedding'], 156 | is_training=False) 157 | 158 | @functools.wraps(sa_siam) 159 | def embedding_fn(images, is_example, sa_siam_config, reuse=False): 160 | with slim.arg_scope(arg_scope): 161 | return sa_siam(images, is_example, sa_siam_config, reuse=reuse) 162 | 163 | embed, _ = embedding_fn(images=images, is_example=is_example, sa_siam_config=sa_siam_config, reuse=reuse) 164 | return embed 165 | 166 | def build_template(self): 167 | model_config = self.model_config 168 | track_config = self.track_config 169 | 170 | # Exemplar image lies at the center of the search image in the first frame 171 | exemplar_images = get_exemplar_images(self.search_images, [track_config['x_image_size'], 172 | track_config['x_image_size']]) 173 | templates = self.get_image_embedding(exemplar_images, is_example=True, sa_siam_config=self.model_config['sa_siam_config']) 174 | center_scale = int(get_center(track_config['num_scales'])) 175 | center_template = tf.identity(templates[center_scale]) 176 | templates = tf.stack([center_template for _ in range(track_config['num_scales'])]) 177 | 178 | with tf.variable_scope('target_template'): 179 | # Store template in Variable such that we don't have to feed this template every time. 180 | with tf.variable_scope('State'): 181 | state = tf.get_variable('exemplar', 182 | initializer=tf.zeros(templates.get_shape().as_list(), dtype=templates.dtype), 183 | trainable=False) 184 | with tf.control_dependencies([templates]): 185 | self.init = tf.assign(state, templates, validate_shape=True) 186 | self.templates = state 187 | 188 | def build_detection(self): 189 | self.embeds = self.get_image_embedding(self.search_images, reuse=True, is_example=False, sa_siam_config=self.model_config['sa_siam_config']) 190 | with tf.variable_scope('detection'): 191 | def _get_mask_any(shape_mask, _u, _d, _l, _r): 192 | _mask = np.zeros(shape_mask, dtype='float32') 193 | _mask[_u:_d, _l:_r] = 1.0 194 | return _mask 195 | def _get_center_mask(shape_mask, _sz): # mask center a _sz x _sz patch 196 | _u = int((shape_mask[0] - _sz) / 2) 197 | _d = _u + _sz 198 | _l = int((shape_mask[1] - _sz) / 2) 199 | _r = _l + _sz 200 | return _get_mask_any(shape_mask, _u, _d, _l, _r) 201 | def _translation_match(x, z, mask_center=np.array([[1.0]], dtype='float32')): 202 | x = tf.expand_dims(x, 0) # [batch, in_height, in_width, in_channels] 203 | z = tf.expand_dims(z, -1) # [filter_height, filter_width, in_channels, out_channels] 204 | mask_center = tf.expand_dims(mask_center, -1) 205 | mask_center = tf.expand_dims(mask_center, -1) 206 | return tf.nn.conv2d(x, z * mask_center, strides=[1, 1, 1, 1], padding='VALID', name='translation_match') 207 | logging.info('Shape of templates: {}'.format(self.templates.shape)) 208 | logging.info('Shape of embeds: {}'.format(self.embeds.shape)) 209 | en_appearance = get(self.model_config['sa_siam_config'], 'en_appearance', False) 210 | en_semantic = get(self.model_config['sa_siam_config'], 'en_semantic', False) 211 | if en_appearance and en_semantic: 212 | c_appearance = 
get(self.model_config['sa_siam_config'], 'c_appearance', 0.3) 213 | out_scale = self.model_config['adjust_response_config']['scale'] 214 | temp_appearance, temp_semantic = tf.split(self.templates, 2, 3) 215 | inst_appearance, inst_semantic = tf.split(self.embeds, 2, 3) 216 | bias_semantic = tf.get_variable('biases_semantic', [1], 217 | dtype=tf.float32, 218 | initializer=tf.constant_initializer(0.0, dtype=tf.float32), 219 | trainable=False) 220 | bias_appearance = tf.get_variable('biases_appearance', [1], 221 | dtype=tf.float32, 222 | initializer=tf.constant_initializer(0.0, dtype=tf.float32), 223 | trainable=False) 224 | sz_feat = shape_of(temp_appearance)[1:3] # [h,w] 225 | self.mask_all = { 226 | 'keep_all': 1 - _get_center_mask(sz_feat, 0) 227 | } 228 | self.response_all = {} 229 | for k in sorted(self.mask_all.keys()): 230 | logging.info('Make match: {}'.format(k)) 231 | match_k = lambda x: _translation_match(x[0], x[1], mask_center=self.mask_all[k]) 232 | out_appearance_mask_k = tf.map_fn(match_k, (inst_appearance, temp_appearance), dtype=inst_appearance.dtype) 233 | out_semantic_mask_k = tf.map_fn(match_k, (inst_semantic, temp_semantic), dtype=inst_semantic.dtype) 234 | 235 | out_appearance_mask_k = tf.squeeze(out_appearance_mask_k, [1,4]) 236 | out_semantic_mask_k = tf.squeeze(out_semantic_mask_k, [1,4]) 237 | 238 | response_appearance_mask_k = out_scale * out_appearance_mask_k 239 | response_semantic_mask_k = out_scale * out_semantic_mask_k 240 | 241 | self.response_all[k] = (response_appearance_mask_k + bias_appearance) * c_appearance + (response_semantic_mask_k + bias_semantic) * (1-c_appearance) 242 | response = self.response_all['keep_all'] 243 | else: 244 | output = tf.map_fn( 245 | lambda x: _translation_match(x[0], x[1]), 246 | (self.embeds, self.templates), dtype=self.embeds.dtype) # of shape [16, 1, 17, 17, 1] 247 | output = tf.squeeze(output, [1, 4]) # of shape e.g. 
[16, 17, 17] 248 | bias = tf.get_variable('biases', [1], 249 | dtype=tf.float32, 250 | initializer=tf.constant_initializer(0.0, dtype=tf.float32), 251 | trainable=False) 252 | response = (self.model_config['adjust_response_config']['scale'] * output + bias) 253 | self.response = response 254 | 255 | def build_upsample(self): 256 | """Upsample response to obtain finer target position""" 257 | with tf.variable_scope('upsample'): 258 | response = tf.expand_dims(self.response, 3) 259 | up_method = self.track_config['upsample_method'] 260 | methods = {'bilinear': tf.image.ResizeMethod.BILINEAR, 261 | 'bicubic': tf.image.ResizeMethod.BICUBIC} 262 | up_method = methods[up_method] 263 | response_spatial_size = self.response.get_shape().as_list()[1:3] 264 | up_size = [s * self.track_config['upsample_factor'] for s in response_spatial_size] 265 | response_up = tf.image.resize_images(response, 266 | up_size, 267 | method=up_method, 268 | align_corners=True) 269 | response_up = tf.squeeze(response_up, [3]) 270 | self.response_up = response_up 271 | 272 | def initialize(self, sess, input_feed): 273 | image_path, target_bbox = input_feed 274 | scale_xs, _ = sess.run([self.scale_xs, self.init], 275 | feed_dict={'filename:0': image_path, 276 | "target_bbox_feed:0": target_bbox, }) 277 | return scale_xs 278 | 279 | def inference_step(self, sess, input_feed): 280 | image_path, target_bbox = input_feed 281 | log_level = self.track_config['log_level'] 282 | image_cropped_op = self.search_images if log_level > 0 else self.dumb_op 283 | image_cropped, scale_xs, response_output = sess.run( 284 | fetches=[image_cropped_op, self.scale_xs, self.response_up], 285 | feed_dict={ 286 | "filename:0": image_path, 287 | "target_bbox_feed:0": target_bbox, }) 288 | 289 | output = { 290 | 'image_cropped': image_cropped, 291 | 'scale_xs': scale_xs, 292 | 'response': response_output} 293 | return output, None 294 | -------------------------------------------------------------------------------- /embeddings/sa_siam.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 9 | 10 | """Contains definitions of the network in [1][2]. 11 | 12 | [1] Bertinetto, L., et al. (2016). 13 | "Fully-Convolutional Siamese Networks for Object Tracking." 14 | arXiv preprint arXiv:1606.09549. 15 | [2] Anfeng He, et al. (2018). 16 | "A Twofold Siamese Network for Real-Time Object Tracking." 17 | arXiv preprint arXiv:1802.08817. 18 | 19 | Typical use: 20 | 21 | import sa_siam 22 | with slim.arg_scope(sa_siam.sa_siam_arg_scope()): 23 | net, end_points = sa_siam.sa_siam(inputs, is_training=False) 24 | """ 25 | from __future__ import absolute_import 26 | from __future__ import division 27 | from __future__ import print_function 28 | 29 | import logging 30 | logging.getLogger().setLevel(logging.INFO) 31 | 32 | import tensorflow as tf 33 | 34 | from utils.misc_utils import get, shape_of, same_hw 35 | from functools import reduce 36 | 37 | slim = tf.contrib.slim 38 | 39 | 40 | def sa_siam_arg_scope(embed_config, 41 | trainable=True, 42 | is_training=False): 43 | """Defines the default arg scope. 
44 | 45 | Args: 46 | embed_config: A dictionary which contains configurations for the embedding function. 47 | trainable: If the weights in the embedding function is trainable. 48 | is_training: If the embedding function is built for training. 49 | 50 | Returns: 51 | An `arg_scope` to use for the SA-Siam models. 52 | """ 53 | # Only consider the model to be in training mode if it's trainable. 54 | # This is vital for batch_norm since moving_mean and moving_variance 55 | # will get updated even if not trainable. 56 | is_model_training = trainable and is_training 57 | 58 | if get(embed_config, 'use_bn', True): 59 | batch_norm_scale = get(embed_config, 'bn_scale', True) 60 | batch_norm_decay = 1 - get(embed_config, 'bn_momentum', 3e-4) 61 | batch_norm_epsilon = get(embed_config, 'bn_epsilon', 1e-6) 62 | batch_norm_params = { 63 | "scale": batch_norm_scale, 64 | # Decay for the moving averages. 65 | "decay": batch_norm_decay, 66 | # Epsilon to prevent 0s in variance. 67 | "epsilon": batch_norm_epsilon, 68 | "trainable": trainable, 69 | "is_training": is_model_training, 70 | # Collection containing the moving mean and moving variance. 71 | "variables_collections": { 72 | "beta": None, 73 | "gamma": None, 74 | "moving_mean": ["moving_vars"], 75 | "moving_variance": ["moving_vars"], 76 | }, 77 | 'updates_collections': None, # Ensure that updates are done within a frame 78 | } 79 | normalizer_fn = slim.batch_norm 80 | else: 81 | batch_norm_params = {} 82 | normalizer_fn = None 83 | 84 | weight_decay = get(embed_config, 'weight_decay', 5e-4) 85 | if trainable: 86 | weights_regularizer = slim.l2_regularizer(weight_decay) 87 | else: 88 | weights_regularizer = None 89 | 90 | init_method = get(embed_config, 'init_method', None) 91 | if is_model_training: 92 | logging.info('embedding init method -- {}'.format(init_method)) 93 | if init_method == 'kaiming_normal': 94 | # The same setting as siamese-fc 95 | initializer = slim.variance_scaling_initializer(factor=2.0, mode='FAN_OUT', uniform=False) 96 | else: 97 | initializer = slim.xavier_initializer() 98 | 99 | with slim.arg_scope( 100 | [slim.conv2d], 101 | weights_regularizer=weights_regularizer, 102 | weights_initializer=initializer, 103 | padding='VALID', 104 | trainable=trainable, 105 | activation_fn=tf.nn.relu, 106 | normalizer_fn=normalizer_fn, 107 | normalizer_params=batch_norm_params): 108 | with slim.arg_scope([slim.batch_norm], **batch_norm_params): 109 | with slim.arg_scope([slim.batch_norm], is_training=is_model_training) as arg_sc: 110 | return arg_sc 111 | def appearance_net(layer_in): 112 | logging.info('Building Appearence branch of SA-Siam') 113 | layers_all = [] 114 | layer_cur = slim.conv2d(layer_in, 96, [11, 11], 2, scope='conv1') 115 | layer_cur = slim.max_pool2d(layer_cur, [3, 3], 2, scope='pool1') 116 | layers_all.append(layer_cur) 117 | with tf.variable_scope('conv2'): 118 | b1, b2 = tf.split(layer_cur, 2, 3) 119 | b1 = slim.conv2d(b1, 128, [5, 5], scope='b1') 120 | # The original implementation has bias terms for all convolution, but 121 | # it actually isn't necessary if the convolution layer is followed by a batch 122 | # normalization layer since batch norm will subtract the mean. 
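    # (Editor's note) The b1/b2 split mirrors the two-group (AlexNet-style) convolution
    # of the SiamFC backbone; the corresponding per-branch weights are restored in
    # utils/train_utils.load_mat_model().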
123 | b2 = slim.conv2d(b2, 128, [5, 5], scope='b2') 124 | layer_cur = tf.concat([b1, b2], 3) 125 | layer_cur = slim.max_pool2d(layer_cur, [3, 3], 2, scope='pool2') 126 | layers_all.append(layer_cur) 127 | layer_cur = slim.conv2d(layer_cur, 384, [3, 3], 1, scope='conv3') 128 | layers_all.append(layer_cur) 129 | with tf.variable_scope('conv4'): 130 | b1, b2 = tf.split(layer_cur, 2, 3) 131 | b1 = slim.conv2d(b1, 192, [3, 3], 1, scope='b1') 132 | b2 = slim.conv2d(b2, 192, [3, 3], 1, scope='b2') 133 | layer_cur = tf.concat([b1, b2], 3) 134 | layers_all.append(layer_cur) 135 | # Conv 5 with only convolution 136 | with tf.variable_scope('conv5'): 137 | with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None): 138 | b1, b2 = tf.split(layer_cur, 2, 3) 139 | b1 = slim.conv2d(b1, 128, [3, 3], 1, scope='b1') 140 | b2 = slim.conv2d(b2, 128, [3, 3], 1, scope='b2') 141 | layer_cur = tf.concat([b1, b2], 3) 142 | layers_all.append(layer_cur) 143 | return layer_cur, layers_all 144 | 145 | def semantic_net(layer_in): 146 | logging.info('Building Semantic branch of SA-Siam..') 147 | layers_all = [] 148 | with slim.arg_scope([slim.conv2d], normalizer_fn=None, trainable=False, normalizer_params=False): 149 | layer_cur = layer_in - [123.0,117.0,104.0] # RGB sub mean 150 | layer_cur = tf.reverse(layer_cur,[3]) # convert img to BGR 151 | layer_cur = slim.conv2d(layer_cur, 96, [11, 11], 2, scope='conv1') 152 | layer_cur = slim.max_pool2d(layer_cur, [3, 3], 2, scope='pool1') 153 | layer_cur = tf.nn.local_response_normalization(layer_cur,depth_radius=2,alpha=2e-5,beta=0.75,bias=1.0,name='norm1') 154 | layers_all.append(layer_cur) 155 | with tf.variable_scope('conv2'): 156 | b1, b2 = tf.split(layer_cur, 2, 3) 157 | b1 = slim.conv2d(b1, 128, [5, 5], scope='b1') 158 | b2 = slim.conv2d(b2, 128, [5, 5], scope='b2') 159 | layer_cur = tf.concat([b1, b2], 3) 160 | layer_cur = slim.max_pool2d(layer_cur, [3, 3], 2, scope='pool2') 161 | layer_cur = tf.nn.local_response_normalization(layer_cur,depth_radius=2,alpha=2e-5,beta=0.75,bias=1.0,name='norm2') 162 | layers_all.append(layer_cur) 163 | layer_cur = slim.conv2d(layer_cur, 384, [3, 3], 1, scope='conv3') 164 | layers_all.append(layer_cur) 165 | with tf.variable_scope('conv4'): 166 | b1, b2 = tf.split(layer_cur, 2, 3) 167 | b1 = slim.conv2d(b1, 192, [3, 3], 1, scope='b1') 168 | b2 = slim.conv2d(b2, 192, [3, 3], 1, scope='b2') 169 | layer_cur = tf.concat([b1, b2], 3) 170 | layers_all.append(layer_cur) 171 | # Conv 5 with only convolution 172 | with tf.variable_scope('conv5'): 173 | with slim.arg_scope([slim.conv2d],activation_fn=tf.nn.relu): 174 | b1, b2 = tf.split(layer_cur, 2, 3) 175 | b1 = slim.conv2d(b1, 128, [3, 3], 1, scope='b1') 176 | b2 = slim.conv2d(b2, 128, [3, 3], 1, scope='b2') 177 | layer_cur = tf.concat([b1, b2], 3) 178 | layers_all.append(layer_cur) 179 | return layer_cur, layers_all 180 | def combine_sa_net(a_net, s_net): 181 | all_feat = a_net + s_net 182 | assert(all(list(map(same_hw, all_feat)))) 183 | max_feat_size = max(list(map(lambda a: shape_of(a)[1], all_feat))) 184 | logging.info('Max_feat_size={}'.format(max_feat_size)) 185 | def pad_feat(feat): 186 | if max_feat_size is None and shape_of(feat)[1] is None: 187 | return feat 188 | pad_size = max_feat_size - shape_of(feat)[1] 189 | pad_l = pad_size // 2 190 | pad_r = pad_size - pad_l 191 | return tf.pad(feat,[[0,0],[pad_l,pad_r],[pad_l,pad_r],[0,0]]) 192 | all_feat = list(map(pad_feat, all_feat)) 193 | return tf.concat(all_feat, axis=3) 194 | 195 | def sa_siam(inputs, 196 | is_example, 
197 | sa_siam_config={}, 198 | reuse=None, 199 | scope='sa_siam'): 200 | en_appearance = get(sa_siam_config, 'en_appearance', False) 201 | en_semantic = get(sa_siam_config, 'en_semantic', False) 202 | n_out = get(sa_siam_config, 'n_out', 256) 203 | all_combine_layers_appearance = get(sa_siam_config, 'all_combine_layers_appearance', {'conv5':1.0}) 204 | all_combine_layers_semantic = get(sa_siam_config, 'all_combine_layers_semantic', {'conv5':1.0, 'conv4':0.1}) 205 | sz_conv5_z = get(sa_siam_config, 'sz_conv5_z', 6) 206 | en_semantic_att = get(sa_siam_config, 'en_semantic_att', True) 207 | 208 | with tf.variable_scope(scope, 'sa_siam', [inputs], reuse=reuse) as sc: 209 | end_points_collection = sc.name + '_end_points' 210 | with slim.arg_scope([slim.conv2d, slim.max_pool2d], 211 | outputs_collections=end_points_collection): 212 | def proc_raw_all_feat(feat, is_appearance, n_out_cur, all_combine_layers): 213 | res = [] 214 | max_feat_size = 0 215 | for l in range(1,6): 216 | for k in all_combine_layers.keys(): 217 | if k.find(str(l)) != -1: 218 | if shape_of(feat[l-1])[3] is None: 219 | res.append(feat[l-1]) 220 | break 221 | if l == 5 and is_appearance and abs(n_out_cur - shape_of(feat[l-1])[3]) < 0.1: 222 | res.append(feat[l-1]) 223 | else: 224 | if not is_appearance: 225 | feat[l-1] *= all_combine_layers[k] # Multiply by the combine weight to aid convergence during training 226 | with slim.arg_scope([slim.conv2d],activation_fn=None, normalizer_fn=None): 227 | c1x1 = slim.conv2d(feat[l-1], n_out_cur, [1,1], 1, scope='c1x1_' + k) 228 | res.append(c1x1) 229 | logging.info('Keep {} .. is_appearance={} shape={}'.format(k,is_appearance,shape_of(res[-1]))) 230 | return res 231 | def re_weight_crop(feat, all_combine_layers, only_crop=False): 232 | feat_shape = list(map(shape_of, feat)) 233 | res = [] 234 | for l in range(1,6): # process layers 1 to 5 in order 235 | for k in all_combine_layers.keys(): # find the corresponding layer in all layers 236 | if k.find(str(l)) != -1: 237 | logging.info('For layer {} ...'.format(k)) 238 | cur_ly_idx = l - 1 239 | if feat_shape[cur_ly_idx][2] is None and feat_shape[4][2] is None: 240 | res.append(feat[cur_ly_idx]) 241 | break 242 | pad_val = feat_shape[cur_ly_idx][2] - feat_shape[4][2] 243 | sz_conv5_z_cur = pad_val + sz_conv5_z 244 | sz_conv5_x_cur = feat_shape[cur_ly_idx][2] 245 | n_left = int((sz_conv5_x_cur - sz_conv5_z_cur) / 2 + 0.5) 246 | div_left_st = [0, n_left, n_left + sz_conv5_z_cur, sz_conv5_x_cur] 247 | logging.info('.. Crop as {}'.format(div_left_st)) # crop 9 patches and max pool each patch 248 | if not only_crop: 249 | all_max = [] 250 | for j in [0,1,2]: 251 | for i in [0,1,2]: 252 | l_crop = div_left_st[i] 253 | r_crop = div_left_st[i + 1] 254 | u_crop = div_left_st[j] 255 | d_crop = div_left_st[j+1] 256 | max_patch = tf.reduce_max(feat[cur_ly_idx][:, u_crop:d_crop, l_crop:r_crop, :], axis=[1, 2]) #shape = [n, c] 257 | all_max.append(max_patch) 258 | max_map = tf.stack(all_max, axis=2) #shape = [n, c, 9] 259 | logging.info('.. Max_map.shape = {}'.format(max_map.shape)) 260 | max_map = slim.fully_connected(max_map, 9, scope='att_fc1_' + k) # fully_connected is applied only to the last dim 261 | logging.info('.. Max_map_fc1.shape = {}'.format(max_map.shape)) 262 | max_map = slim.fully_connected(max_map, 1, scope='att_fc2_' + k, activation_fn=None, normalizer_fn=None,) 263 | logging.info('.. Max_map_fc2.shape = {}'.format(max_map.shape)) # shape = [n, c, 1]
264 | att_map = tf.reshape(max_map, [-1, 1, 1, feat_shape[cur_ly_idx][3]]) 265 | logging.info('.. att_map.shape = {}'.format(att_map.shape)) 266 | att_map = tf.sigmoid(att_map) + 0.5 # important bias to avoid losing too much feature information 267 | feat[cur_ly_idx] = att_map * feat[cur_ly_idx] 268 | feat[cur_ly_idx] = feat[cur_ly_idx][:, div_left_st[1]:div_left_st[2], div_left_st[1]:div_left_st[2], :] # crop center feat 269 | res.append(feat[cur_ly_idx]) 270 | break 271 | else: 272 | res.append(None) 273 | return res 274 | layer_cur = inputs 275 | if en_appearance: 276 | n_out_appearance = n_out / len(all_combine_layers_appearance.keys()) 277 | with tf.variable_scope('appearance_net'): 278 | _, feat_appearance_all = appearance_net(layer_cur) 279 | if is_example: 280 | feat_appearance_all = re_weight_crop(feat_appearance_all, all_combine_layers_appearance, only_crop=True) 281 | net_appearance = proc_raw_all_feat(feat_appearance_all, is_appearance=True, n_out_cur=n_out_appearance, all_combine_layers=all_combine_layers_appearance) 282 | if en_semantic: 283 | n_out_semantic = n_out / len(all_combine_layers_semantic.keys()) 284 | with tf.variable_scope('semantic_net'): 285 | _, feat_semantic_all = semantic_net(layer_cur) 286 | if is_example: 287 | feat_semantic_all = re_weight_crop(feat_semantic_all, all_combine_layers_semantic, only_crop=not en_semantic_att) 288 | net_semantic = proc_raw_all_feat(feat_semantic_all, is_appearance=False, n_out_cur=n_out_semantic, all_combine_layers=all_combine_layers_semantic) 289 | if en_appearance and en_semantic: 290 | layer_cur = combine_sa_net(net_appearance, net_semantic) 291 | elif en_appearance: layer_cur = combine_sa_net(net_appearance, []) 292 | elif en_semantic: layer_cur = combine_sa_net(net_semantic, []) 293 | else: raise ValueError('At least one of the semantic and appearance branches must be enabled!') 294 | # Convert end_points_collection into a dictionary of end_points. 295 | end_points = slim.utils.convert_collection_to_dict(end_points_collection) 296 | return layer_cur, end_points 297 | 298 | sa_siam.stride = 8 299 | --------------------------------------------------------------------------------
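A minimal usage sketch for embeddings/sa_siam.py, following the "Typical use" note in its docstring: it shows how sa_siam_arg_scope and sa_siam could be combined to embed an exemplar/search pair with shared weights. The patch sizes (127/255), placeholder names, and config values below are illustrative assumptions, not the project's actual settings; those come from configuration.py and the experiment scripts.

    import tensorflow as tf
    from embeddings import sa_siam

    slim = tf.contrib.slim

    # Illustrative configs only; real values are defined in configuration.py.
    embed_config = {'use_bn': True, 'weight_decay': 5e-4}
    sa_siam_config = {'en_appearance': True, 'en_semantic': True, 'n_out': 256}

    # Assumed SiamFC-style patch sizes for exemplar and search images.
    exemplars = tf.placeholder(tf.float32, [None, 127, 127, 3], name='exemplars')
    instances = tf.placeholder(tf.float32, [None, 255, 255, 3], name='instances')

    with slim.arg_scope(sa_siam.sa_siam_arg_scope(embed_config,
                                                  trainable=False,
                                                  is_training=False)):
        # Exemplar branch: is_example=True enables center cropping / channel attention.
        templates, _ = sa_siam.sa_siam(exemplars, is_example=True,
                                       sa_siam_config=sa_siam_config, scope='sa_siam')
        # Search branch reuses the variables created by the exemplar branch.
        search_feat, _ = sa_siam.sa_siam(instances, is_example=False,
                                         sa_siam_config=sa_siam_config,
                                         scope='sa_siam', reuse=True)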