├── utils
│   ├── __init__.py
│   ├── misc_utils.py
│   ├── infer_utils.py
│   └── train_utils.py
├── benchmarks
│   ├── __init__.py
│   └── OTB_Toolkit
│       └── scripts
│           └── bscripts
│               ├── __init__.py
│               ├── README.txt
│               ├── run_SA_Siam_Semantic.py
│               ├── run_SA_Siam_Appearance.py
│               └── run_SA_Siam.py
├── embeddings
│   ├── __init__.py
│   └── sa_siam.py
├── inference
│   ├── __init__.py
│   ├── tracker.py
│   └── inference_wrapper.py
├── metrics
│   ├── __init__.py
│   └── track_metrics.py
├── scripts
│   ├── __init__.py
│   ├── preprocess_VID_data.py
│   └── build_VID2015_imdb.py
├── datasets
│   ├── __init__.py
│   ├── sampler.py
│   ├── transforms.py
│   ├── vid.py
│   └── dataloader.py
├── README.md
├── LICENSE
├── experiments
│   ├── train-semantic-network.py
│   ├── train-appearance-network.py
│   └── gen-sa-siam-cfg.py
├── .gitignore
├── SECURITY.md
├── configuration.py
├── train_siamese_model.py
└── siamese_model.py
/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /benchmarks/OTB_Toolkit/scripts/bscripts/__init__.py: -------------------------------------------------------------------------------- 1 | from .run_SA_Siam_Semantic import * 2 | from .run_SA_Siam_Appearance import * 3 | from .run_SA_Siam import * -------------------------------------------------------------------------------- /benchmarks/OTB_Toolkit/scripts/bscripts/README.txt: -------------------------------------------------------------------------------- 1 | Functions for running trackers. 2 | You can add your own script files. 3 | - form : run_<tracker name>(seq, resultpath, saveimage) 4 | - return : dictionary type variable (has 'res', 'type', 'fps' fields) 5 | You must import them in '__init__.py' and add the exe (or matlab script) file into tracker_benchmark/trackers/<tracker name>/ 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributing 3 | 4 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 5 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 6 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
7 | 8 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 9 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 10 | provided by the bot. You will only need to do this once across all repos using our CLA. 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 14 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 15 | -------------------------------------------------------------------------------- /datasets/sampler.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | 8 | """Dataset Sampler""" 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import numpy as np 15 | 16 | 17 | class Sampler(object): 18 | def __init__(self, data_source, shuffle=True): 19 | self.data_source = data_source 20 | self.shuffle = shuffle 21 | 22 | def __iter__(self): 23 | data_idxs = np.arange(len(self.data_source)) 24 | if self.shuffle: 25 | np.random.shuffle(data_idxs) 26 | 27 | for idx in data_idxs: 28 | yield idx 29 | 30 | 31 | if __name__ == '__main__': 32 | x = [1, 2, 3] 33 | sampler = Sampler(x, shuffle=True) 34 | p = 0 35 | for xx in sampler: 36 | print(x[xx]) 37 | p += 1 38 | if p == 10: break 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /experiments/train-semantic-network.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 
6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 9 | 10 | """Train the color model in the SiamFC paper from scratch""" 11 | from __future__ import absolute_import 12 | from __future__ import division 13 | from __future__ import print_function 14 | 15 | import os.path as osp 16 | import sys 17 | 18 | CURRENT_DIR = osp.dirname(__file__) 19 | sys.path.append(osp.join(CURRENT_DIR, '..')) 20 | 21 | from configuration import LOG_DIR 22 | from train_siamese_model import ex 23 | 24 | if __name__ == '__main__': 25 | RUN_NAME = 'SA-Siam-Semantic' 26 | ex.run(config_updates={'train_config': {'train_dir': osp.join(LOG_DIR, 'track_model_checkpoints', RUN_NAME), }, 27 | 'track_config': {'log_dir': osp.join(LOG_DIR, 'track_model_inference', RUN_NAME), }, 28 | 'model_config': {'sa_siam_config': {'en_semantic': True, }, }, 29 | }, 30 | options={'--name': RUN_NAME, 31 | '--force': True, 32 | '--enforce_clean': False, 33 | }) 34 | -------------------------------------------------------------------------------- /experiments/train-appearance-network.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 9 | 10 | """Train the color model in the SiamFC paper from scratch""" 11 | from __future__ import absolute_import 12 | from __future__ import division 13 | from __future__ import print_function 14 | 15 | import os.path as osp 16 | import sys 17 | 18 | CURRENT_DIR = osp.dirname(__file__) 19 | sys.path.append(osp.join(CURRENT_DIR, '..')) 20 | 21 | from configuration import LOG_DIR 22 | from train_siamese_model import ex 23 | 24 | if __name__ == '__main__': 25 | RUN_NAME = 'SA-Siam-Appearance' 26 | ex.run(config_updates={'train_config': {'train_dir': osp.join(LOG_DIR, 'track_model_checkpoints', RUN_NAME), }, 27 | 'track_config': {'log_dir': osp.join(LOG_DIR, 'track_model_inference', RUN_NAME), }, 28 | 'model_config': {'sa_siam_config': {'en_appearance': True, }, }, 29 | }, 30 | options={'--name': RUN_NAME, 31 | '--force': True, 32 | '--enforce_clean': False, 33 | }) 34 | -------------------------------------------------------------------------------- /experiments/gen-sa-siam-cfg.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 
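# Configuration-only run: the overrides below request 0 training epochs and enable both
# 'en_semantic' and 'en_appearance', so this experiment mainly materializes the merged
# SA-Siam run directory (with its saved config JSONs) rather than training a model;
# benchmarks/OTB_Toolkit/scripts/bscripts/run_SA_Siam.py later reads that configuration
# via load_cfgs() and restores the separately trained branch checkpoints into it.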
9 | 10 | """Train the color model in the SiamFC paper from scratch""" 11 | from __future__ import absolute_import 12 | from __future__ import division 13 | from __future__ import print_function 14 | 15 | import os.path as osp 16 | import sys 17 | 18 | CURRENT_DIR = osp.dirname(__file__) 19 | sys.path.append(osp.join(CURRENT_DIR, '..')) 20 | 21 | from configuration import LOG_DIR 22 | from train_siamese_model import ex 23 | 24 | if __name__ == '__main__': 25 | RUN_NAME = 'SA-Siam' 26 | ex.run(config_updates={'train_config': {'train_dir': osp.join(LOG_DIR, 'track_model_checkpoints', RUN_NAME), 27 | 'train_data_config':{'epoch': 0}}, 28 | 'track_config': {'log_dir': osp.join(LOG_DIR, 'track_model_inference', RUN_NAME), }, 29 | 'model_config': {'sa_siam_config': {'en_semantic': True, 30 | 'en_appearance': True}, }, 31 | }, 32 | options={'--name': RUN_NAME, 33 | '--force': True, 34 | '--enforce_clean': False, 35 | }) 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /datasets/transforms.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 
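# Usage sketch (mirroring datasets/dataloader.py): each transform below is a callable on
# an image tensor, and transforms are chained with Compose, e.g.
#
#   z_transform = Compose([RandomStretch(),
#                          CenterCrop((255 - 8, 255 - 8)),
#                          RandomCrop(255 - 2 * 8),
#                          CenterCrop((255 - 2 * 8, 255 - 2 * 8))])
#   exemplar_image = z_transform(exemplar_image)  # exemplar_image: an HxWx3 image tensor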
7 | 8 | 9 | """Various transforms for video and image augmentation""" 10 | 11 | import numbers 12 | 13 | import tensorflow as tf 14 | 15 | 16 | class Compose(object): 17 | """Composes several transforms together.""" 18 | 19 | def __init__(self, transforms): 20 | self.transforms = transforms 21 | 22 | def __call__(self, example): 23 | for t in self.transforms: 24 | example = t(example) 25 | return example 26 | 27 | 28 | class RandomGray(object): 29 | def __init__(self, gray_ratio=0.25): 30 | self.gray_ratio = gray_ratio 31 | 32 | def __call__(self, img_sequence): 33 | def rgb_to_gray(): 34 | gray_images = tf.image.rgb_to_grayscale(img_sequence) 35 | return tf.concat([gray_images] * 3, axis=3) 36 | 37 | def identity(): 38 | return tf.identity(img_sequence) 39 | 40 | return tf.cond(tf.less(tf.random_uniform([], 0, 1), self.gray_ratio), rgb_to_gray, identity) 41 | 42 | 43 | class RandomStretch(object): 44 | def __init__(self, max_stretch=0.05, interpolation='bilinear'): 45 | self.max_stretch = max_stretch 46 | self.interpolation = interpolation 47 | 48 | def __call__(self, img): 49 | scale = 1.0 + tf.random_uniform([], -self.max_stretch, self.max_stretch) 50 | img_shape = tf.shape(img) 51 | ts = tf.to_int32(tf.round(tf.to_float(img_shape[:2]) * scale)) 52 | resize_method_map = {'bilinear': tf.image.ResizeMethod.BILINEAR, 53 | 'bicubic': tf.image.ResizeMethod.BICUBIC} 54 | return tf.image.resize_images(img, ts, method=resize_method_map[self.interpolation]) 55 | 56 | 57 | class CenterCrop(object): 58 | def __init__(self, size): 59 | if isinstance(size, numbers.Number): 60 | self.size = (int(size), int(size)) 61 | else: 62 | self.size = size 63 | 64 | def __call__(self, img): 65 | th, tw = self.size 66 | return tf.image.resize_image_with_crop_or_pad(img, th, tw) 67 | 68 | 69 | class RandomCrop(object): 70 | def __init__(self, size): 71 | if isinstance(size, numbers.Number): 72 | self.size = (int(size), int(size)) 73 | else: 74 | self.size = size 75 | 76 | def __call__(self, img): 77 | img_shape = tf.shape(img) 78 | th, tw = self.size 79 | 80 | y1 = tf.random_uniform([], 0, img_shape[0] - th, dtype=tf.int32) 81 | x1 = tf.random_uniform([], 0, img_shape[1] - tw, dtype=tf.int32) 82 | 83 | return tf.image.crop_to_bounding_box(img, y1, x1, th, tw) 84 | -------------------------------------------------------------------------------- /datasets/vid.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 
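# Indexing sketch: every item is one video reduced to `time_steps` (= 2) frame paths, an
# exemplar/instance pair whose temporal gap is bounded by `max_frame_dist`. Paths below
# are illustrative:
#
#   dataset = VID('data/train_imdb.pickle', max_frame_dist=100)
#   exemplar_path, instance_path = dataset[0]  # two utf-8 encoded image paths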
7 | 8 | """VID Dataset""" 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import pickle 15 | 16 | import numpy as np 17 | 18 | 19 | def downsample(n_in, n_out, max_frame_dist=1): 20 | # Get a list of frame distance between consecutive frames 21 | max_frame_dist = np.minimum(n_in, max_frame_dist) 22 | possible_frame_dist = range(1, max_frame_dist + 1) 23 | frame_dist = np.random.choice(possible_frame_dist, n_out - 1) 24 | end_to_start_frame_dist = np.sum(frame_dist) 25 | 26 | # Check frame dist boundary 27 | possible_max_start_idx = n_in - 1 - end_to_start_frame_dist 28 | if possible_max_start_idx < 0: 29 | n_extra = - possible_max_start_idx 30 | while n_extra > 0: 31 | for idx, dist in enumerate(frame_dist): 32 | if dist > 1: 33 | frame_dist[idx] = dist - 1 34 | n_extra -= 1 35 | if n_extra == 0: break 36 | 37 | # Get frame dist 38 | end_to_start_frame_dist = np.sum(frame_dist) 39 | possible_max_start_idx = n_in - 1 - end_to_start_frame_dist 40 | start_idx = np.random.choice(possible_max_start_idx + 1, 1) 41 | out_idxs = np.cumsum(np.concatenate((start_idx, frame_dist))) 42 | return out_idxs 43 | 44 | 45 | def upsample(n_in, n_out): 46 | n_more = n_out - n_in 47 | in_idxs = range(n_in) 48 | more_idxs = np.random.choice(in_idxs, n_more) 49 | out_idxs = sorted(list(in_idxs) + list(more_idxs)) 50 | return out_idxs 51 | 52 | 53 | class VID: 54 | def __init__(self, imdb_path, max_frame_dist, epoch_size=None): 55 | with open(imdb_path, 'rb') as f: 56 | imdb = pickle.load(f) 57 | 58 | self.videos = imdb['videos'] 59 | self.time_steps = 2 60 | self.max_frame_dist = max_frame_dist 61 | 62 | if epoch_size is None: 63 | self.epoch_size = len(self.videos) 64 | else: 65 | self.epoch_size = int(epoch_size) 66 | 67 | def __getitem__(self, index): 68 | img_ids = self.videos[index % len(self.videos)] 69 | n_frames = len(img_ids) 70 | 71 | if n_frames < self.time_steps: 72 | out_idxs = upsample(n_frames, self.time_steps) 73 | elif n_frames == self.time_steps: 74 | out_idxs = range(n_frames) 75 | else: 76 | out_idxs = downsample(n_frames, self.time_steps, self.max_frame_dist) 77 | 78 | video = [] 79 | for j, frame_idx in enumerate(out_idxs): 80 | img_path = img_ids[frame_idx] 81 | video.append(img_path.encode('utf-8')) 82 | return video 83 | 84 | def __len__(self): 85 | return self.epoch_size 86 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /benchmarks/OTB_Toolkit/scripts/bscripts/run_SA_Siam_Semantic.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 
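# Benchmark entry point: the OTB toolkit imports this module through bscripts/__init__.py
# and calls run_SA_Siam_Semantic(seq, rp, bSaveImage), expecting a dictionary with 'res'
# (a list of Rectangle results), 'type' ('rect') and 'fps'. Illustrative call, assuming a
# toolkit-provided `seq` object with s_frames, init_rect and len attributes:
#
#   result = run_SA_Siam_Semantic(seq, rp='./results', bSaveImage=False, epoch=30)
#   print(result['fps'], len(result['res']))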
9 | 10 | r"""Support integration with OTB benchmark""" 11 | 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | import logging 17 | import os 18 | import sys 19 | import time 20 | 21 | import tensorflow as tf 22 | 23 | sys.path.append(os.getcwd()) 24 | 25 | from configuration import LOG_DIR 26 | 27 | # Code root absolute path 28 | CODE_ROOT = './' 29 | 30 | # Checkpoint for evaluation 31 | CHECKPOINT = os.path.join(LOG_DIR, 'track_model_checkpoints', 'SA-Siam-Semantic', 'model.ckpt-{iter_ckpt}') 32 | 33 | sys.path.insert(0, CODE_ROOT) 34 | 35 | from utils.misc_utils import auto_select_gpu, load_cfgs 36 | from inference import inference_wrapper 37 | from inference.tracker import Tracker 38 | from utils.infer_utils import Rectangle 39 | 40 | # Set GPU 41 | os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu() 42 | logging.getLogger().setLevel(logging.INFO) 43 | 44 | 45 | def run_SA_Siam_Semantic(seq, rp, bSaveImage, epoch=30): 46 | iter_ckpt = epoch * 6650 - 1 47 | checkpoint_path = CHECKPOINT.format(iter_ckpt=iter_ckpt) 48 | logging.info('Evaluating {}...'.format(checkpoint_path)) 49 | 50 | # Read configurations from json 51 | model_config, _, track_config = load_cfgs(checkpoint_path) 52 | 53 | track_config['log_level'] = 0 # Skip verbose logging for speed 54 | 55 | # Build the inference graph. 56 | g = tf.Graph() 57 | with g.as_default(): 58 | model = inference_wrapper.InferenceWrapper() 59 | restore_fn = model.build_graph_from_config(model_config, track_config, checkpoint_path) 60 | g.finalize() 61 | 62 | gpu_options = tf.GPUOptions(allow_growth=True) 63 | sess_config = tf.ConfigProto(gpu_options=gpu_options) 64 | 65 | with tf.Session(graph=g, config=sess_config) as sess: 66 | # Load the model from checkpoint. 67 | restore_fn(sess) 68 | 69 | tracker = Tracker(model, model_config, track_config) 70 | 71 | tic = time.clock() 72 | frames = seq.s_frames 73 | init_rect = seq.init_rect 74 | x, y, width, height = init_rect # OTB format 75 | init_bb = Rectangle(x - 1, y - 1, width, height) 76 | 77 | trajectory_py = tracker.track(sess, init_bb, frames) 78 | trajectory = [Rectangle(val.x + 1, val.y + 1, val.width, val.height) for val in 79 | trajectory_py] # x, y add one to match OTB format 80 | duration = time.clock() - tic 81 | 82 | result = dict() 83 | result['res'] = trajectory 84 | result['type'] = 'rect' 85 | result['fps'] = round(seq.len / duration, 3) 86 | return result 87 | -------------------------------------------------------------------------------- /benchmarks/OTB_Toolkit/scripts/bscripts/run_SA_Siam_Appearance.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 
9 | 10 | r"""Support integration with OTB benchmark""" 11 | 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | import logging 17 | import os 18 | import sys 19 | import time 20 | 21 | import tensorflow as tf 22 | 23 | sys.path.append(os.getcwd()) 24 | 25 | from configuration import LOG_DIR 26 | 27 | # Code root absolute path 28 | CODE_ROOT = './' 29 | 30 | # Checkpoint for evaluation 31 | CHECKPOINT = os.path.join(LOG_DIR, 'track_model_checkpoints', 'SA-Siam-Appearance', 'model.ckpt-{iter_ckpt}') 32 | 33 | sys.path.insert(0, CODE_ROOT) 34 | 35 | from utils.misc_utils import auto_select_gpu, load_cfgs 36 | from inference import inference_wrapper 37 | from inference.tracker import Tracker 38 | from utils.infer_utils import Rectangle 39 | 40 | # Set GPU 41 | os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu() 42 | logging.getLogger().setLevel(logging.INFO) 43 | 44 | 45 | def run_SA_Siam_Appearance(seq, rp, bSaveImage, epoch=30): 46 | iter_ckpt = epoch * 6650 - 1 47 | checkpoint_path = CHECKPOINT.format(iter_ckpt=iter_ckpt) 48 | logging.info('Evaluating {}...'.format(checkpoint_path)) 49 | 50 | # Read configurations from json 51 | model_config, _, track_config = load_cfgs(checkpoint_path) 52 | 53 | track_config['log_level'] = 0 # Skip verbose logging for speed 54 | 55 | # Build the inference graph. 56 | g = tf.Graph() 57 | with g.as_default(): 58 | model = inference_wrapper.InferenceWrapper() 59 | restore_fn = model.build_graph_from_config(model_config, track_config, checkpoint_path) 60 | g.finalize() 61 | 62 | gpu_options = tf.GPUOptions(allow_growth=True) 63 | sess_config = tf.ConfigProto(gpu_options=gpu_options) 64 | 65 | with tf.Session(graph=g, config=sess_config) as sess: 66 | # Load the model from checkpoint. 67 | restore_fn(sess) 68 | 69 | tracker = Tracker(model, model_config, track_config) 70 | 71 | tic = time.clock() 72 | frames = seq.s_frames 73 | init_rect = seq.init_rect 74 | x, y, width, height = init_rect # OTB format 75 | init_bb = Rectangle(x - 1, y - 1, width, height) 76 | 77 | trajectory_py = tracker.track(sess, init_bb, frames) 78 | trajectory = [Rectangle(val.x + 1, val.y + 1, val.width, val.height) for val in 79 | trajectory_py] # x, y add one to match OTB format 80 | duration = time.clock() - tic 81 | 82 | result = dict() 83 | result['res'] = trajectory 84 | result['type'] = 'rect' 85 | result['fps'] = round(seq.len / duration, 3) 86 | return result 87 | -------------------------------------------------------------------------------- /benchmarks/OTB_Toolkit/scripts/bscripts/run_SA_Siam.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 
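# Combined SA-Siam inference: unlike the single-branch scripts, this one enables both
# branches in one graph and restores two checkpoints into it. Two savers are built with
# get_saver() (see utils/infer_utils.py), one excluding 'appearance' variables and one
# excluding 'semantic' variables, so the separately trained SA-Siam-Semantic and
# SA-Siam-Appearance checkpoints can each be loaded into the corresponding part of the
# joint model before tracking starts.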
9 | 10 | r"""Support integration with OTB benchmark""" 11 | 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | import logging 17 | import os 18 | import sys 19 | import time 20 | 21 | import tensorflow as tf 22 | 23 | sys.path.append(os.getcwd()) 24 | 25 | from configuration import LOG_DIR 26 | from utils.infer_utils import get_saver 27 | 28 | # Code root absolute path 29 | CODE_ROOT = './' 30 | 31 | # Checkpoint for evaluation 32 | CHECKPOINT_APPEARANCE = os.path.join(LOG_DIR, 'track_model_checkpoints', 'SA-Siam-Appearance', 'model.ckpt-{iter_ckpt}') 33 | CHECKPOINT_SEMANTIC = os.path.join(LOG_DIR, 'track_model_checkpoints', 'SA-Siam-Semantic', 'model.ckpt-{iter_ckpt}') 34 | CHECKPOINT_SA_SIAM = os.path.join(LOG_DIR, 'track_model_checkpoints', 'SA-Siam') 35 | 36 | sys.path.insert(0, CODE_ROOT) 37 | 38 | from utils.misc_utils import auto_select_gpu, load_cfgs 39 | from inference import inference_wrapper 40 | from inference.tracker import Tracker 41 | from utils.infer_utils import Rectangle 42 | 43 | # Set GPU 44 | os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu() 45 | logging.getLogger().setLevel(logging.INFO) 46 | 47 | 48 | def run_SA_Siam(seq, rp, bSaveImage, epoch=30): 49 | iter_ckpt = epoch * 6650 - 1 50 | checkpoint_appearance_path = CHECKPOINT_APPEARANCE.format(iter_ckpt=iter_ckpt) 51 | logging.info('Evaluating {}...'.format(checkpoint_appearance_path)) 52 | checkpoint_semantic_path = CHECKPOINT_SEMANTIC.format(iter_ckpt=iter_ckpt) 53 | logging.info('Evaluating {}...'.format(checkpoint_semantic_path)) 54 | 55 | # Read configurations from json 56 | model_config, _, track_config = load_cfgs(CHECKPOINT_SA_SIAM) 57 | 58 | track_config['log_level'] = 0 # Skip verbose logging for speed 59 | 60 | # Build the inference graph. 61 | g = tf.Graph() 62 | with g.as_default(): 63 | model = inference_wrapper.InferenceWrapper() 64 | model.build_model(model_config, track_config) 65 | saver_loader_semantic = get_saver('', removes=[':0', '_semantic'], excepts=['appearance', 'State']) 66 | saver_loader_appearance = get_saver('', removes=[':0', '_appearance'], excepts=['semantic', 'State']) 67 | g.finalize() 68 | 69 | gpu_options = tf.GPUOptions(allow_growth=True) 70 | sess_config = tf.ConfigProto(gpu_options=gpu_options) 71 | 72 | with tf.Session(graph=g, config=sess_config) as sess: 73 | # Load the model from checkpoint. 74 | # restore_fn(sess) 75 | saver_loader_semantic.restore(sess, checkpoint_semantic_path) 76 | saver_loader_appearance.restore(sess, checkpoint_appearance_path) 77 | 78 | tracker = Tracker(model, model_config, track_config) 79 | 80 | tic = time.clock() 81 | frames = seq.s_frames 82 | init_rect = seq.init_rect 83 | x, y, width, height = init_rect # OTB format 84 | init_bb = Rectangle(x - 1, y - 1, width, height) 85 | 86 | trajectory_py = tracker.track(sess, init_bb, frames) 87 | trajectory = [Rectangle(val.x + 1, val.y + 1, val.width, val.height) for val in 88 | trajectory_py] # x, y add one to match OTB format 89 | duration = time.clock() - tic 90 | 91 | result = dict() 92 | result['res'] = trajectory 93 | result['type'] = 'rect' 94 | result['fps'] = round(seq.len / duration, 3) 95 | return result 96 | -------------------------------------------------------------------------------- /datasets/dataloader.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import logging 13 | logging.getLogger().setLevel(logging.INFO) 14 | 15 | import tensorflow as tf 16 | 17 | from datasets.sampler import Sampler 18 | from datasets.transforms import Compose, RandomGray, RandomCrop, CenterCrop, RandomStretch 19 | from datasets.vid import VID 20 | from utils.misc_utils import get 21 | 22 | 23 | class DataLoader(object): 24 | def __init__(self, config, is_training): 25 | self.config = config 26 | self.is_training = is_training 27 | 28 | preprocess_name = get(config, 'preprocessing_name', None) 29 | logging.info('preproces -- {}'.format(preprocess_name)) 30 | 31 | if preprocess_name == 'siamese_fc_color': 32 | self.v_transform = None 33 | # TODO: use a single operation (tf.image.crop_and_resize) to achieve all transformations ? 34 | self.z_transform = Compose([RandomStretch(), 35 | CenterCrop((255 - 8, 255 - 8)), 36 | RandomCrop(255 - 2 * 8), 37 | CenterCrop((255 - 2 * 8, 255 - 2 * 8))]) 38 | self.x_transform = Compose([RandomStretch(), 39 | CenterCrop((255 - 8, 255 - 8)), 40 | RandomCrop(255 - 2 * 8), ]) 41 | elif preprocess_name == 'siamese_fc_gray': 42 | self.v_transform = RandomGray() 43 | self.z_transform = Compose([RandomStretch(), 44 | CenterCrop((255 - 8, 255 - 8)), 45 | RandomCrop(255 - 2 * 8), 46 | CenterCrop((255 - 2 * 8, 255 - 2 * 8))]) 47 | self.x_transform = Compose([RandomStretch(), 48 | CenterCrop((255 - 8, 255 - 8)), 49 | RandomCrop(255 - 2 * 8), ]) 50 | elif preprocess_name == 'None': 51 | self.v_transform = None 52 | self.z_transform = CenterCrop((255, 255)) 53 | self.x_transform = CenterCrop((255, 255)) 54 | else: 55 | raise ValueError('Preprocessing name {} was not recognized.'.format(preprocess_name)) 56 | 57 | self.dataset_py = VID(config['input_imdb'], config['max_frame_dist']) 58 | self.sampler = Sampler(self.dataset_py, shuffle=is_training) 59 | 60 | def build(self): 61 | self.build_dataset() 62 | self.build_iterator() 63 | 64 | def build_dataset(self): 65 | def sample_generator(): 66 | for video_id in self.sampler: 67 | sample = self.dataset_py[video_id] 68 | yield sample 69 | 70 | def transform_fn(video): 71 | exemplar_file = tf.read_file(video[0]) 72 | instance_file = tf.read_file(video[1]) 73 | exemplar_image = tf.image.decode_jpeg(exemplar_file, channels=3, dct_method="INTEGER_ACCURATE") 74 | instance_image = tf.image.decode_jpeg(instance_file, channels=3, dct_method="INTEGER_ACCURATE") 75 | 76 | if self.v_transform is not None: 77 | video = tf.stack([exemplar_image, instance_image]) 78 | video = self.v_transform(video) 79 | exemplar_image = video[0] 80 | instance_image = video[1] 81 | 82 | if self.z_transform is not None: 83 | exemplar_image = self.z_transform(exemplar_image) 84 | 85 | if self.x_transform is not None: 86 | instance_image = self.x_transform(instance_image) 87 | 88 | return exemplar_image, instance_image 89 | 90 | dataset = tf.data.Dataset.from_generator(sample_generator, 91 | output_types=(tf.string), 92 | output_shapes=(tf.TensorShape([2]))) 93 | dataset = dataset.map(transform_fn, num_parallel_calls=self.config['prefetch_threads']) 94 | dataset = dataset.prefetch(self.config['prefetch_capacity']) 95 | dataset = dataset.repeat() 96 | dataset = dataset.batch(self.config['batch_size']) 97 | 
self.dataset_tf = dataset 98 | 99 | def build_iterator(self): 100 | self.iterator = self.dataset_tf.make_one_shot_iterator() 101 | 102 | def get_one_batch(self): 103 | return self.iterator.get_next() 104 | -------------------------------------------------------------------------------- /scripts/preprocess_VID_data.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import os 13 | import os.path as osp 14 | import sys 15 | import xml.etree.ElementTree as ET 16 | from glob import glob 17 | from multiprocessing.pool import ThreadPool 18 | 19 | import cv2 20 | from cv2 import imread, imwrite 21 | 22 | CURRENT_DIR = osp.dirname(__file__) 23 | ROOT_DIR = osp.join(CURRENT_DIR, '..') 24 | sys.path.append(ROOT_DIR) 25 | 26 | from utils.infer_utils import get_crops, Rectangle, convert_bbox_format 27 | from utils.misc_utils import mkdir_p 28 | 29 | 30 | def get_track_save_directory(save_dir, split, subdir, video): 31 | subdir_map = {'ILSVRC2015_VID_train_0000': 'a', 32 | 'ILSVRC2015_VID_train_0001': 'b', 33 | 'ILSVRC2015_VID_train_0002': 'c', 34 | 'ILSVRC2015_VID_train_0003': 'd', 35 | '': 'e'} 36 | return osp.join(save_dir, 'Data', 'VID', split, subdir_map[subdir], video) 37 | 38 | 39 | def process_split(root_dir, save_dir, split, subdir='', ): 40 | data_dir = osp.join(root_dir, 'Data', 'VID', split) 41 | anno_dir = osp.join(root_dir, 'Annotations', 'VID', split, subdir) 42 | video_names = os.listdir(anno_dir) 43 | 44 | for idx, video in enumerate(video_names): 45 | print('{split}-{subdir} ({idx}/{total}): Processing {video}...'.format(split=split, subdir=subdir, 46 | idx=idx, total=len(video_names), 47 | video=video)) 48 | video_path = osp.join(anno_dir, video) 49 | xml_files = glob(osp.join(video_path, '*.xml')) 50 | 51 | for xml in xml_files: 52 | tree = ET.parse(xml) 53 | root = tree.getroot() 54 | 55 | folder = root.find('folder').text 56 | filename = root.find('filename').text 57 | 58 | # Read image 59 | img_file = osp.join(data_dir, folder, filename + '.JPEG') 60 | img = None 61 | 62 | # Get all object bounding boxes 63 | bboxs = [] 64 | for object in root.iter('object'): 65 | bbox = object.find('bndbox') 66 | xmax = float(bbox.find('xmax').text) 67 | xmin = float(bbox.find('xmin').text) 68 | ymax = float(bbox.find('ymax').text) 69 | ymin = float(bbox.find('ymin').text) 70 | width = xmax - xmin + 1 71 | height = ymax - ymin + 1 72 | bboxs.append([xmin, ymin, width, height]) 73 | 74 | for idx, object in enumerate(root.iter('object')): 75 | id = object.find('trackid').text 76 | class_name = object.find('name').text 77 | 78 | track_save_dir = get_track_save_directory(save_dir, 'train', subdir, video) 79 | mkdir_p(track_save_dir) 80 | savename = osp.join(track_save_dir, '{}.{:02d}.crop.x.jpg'.format(filename, int(id))) 81 | if osp.isfile(savename): continue # skip existing images 82 | 83 | if img is None: 84 | img = imread(img_file) 85 | 86 | # Get crop 87 | target_box = convert_bbox_format(Rectangle(*bboxs[idx]), 'center-based') 88 | crop, _ = get_crops(img, target_box, 89 | size_z=127, size_x=255, 90 | context_amount=0.5, ) 91 | 92 | imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90]) 93 | 94 | 95 | if __name__ == '__main__': 96 | 
vid_dir = osp.join(ROOT_DIR, 'data/ILSVRC2015') 97 | 98 | # Or, you could save the actual curated data to a disk with sufficient space 99 | # then create a soft link in `data/ILSVRC2015-VID-Curation` 100 | save_dir = 'data/ILSVRC2015-VID-Curation' 101 | 102 | pool = ThreadPool(processes=5) 103 | 104 | one_work = lambda a, b: process_split(vid_dir, save_dir, a, b) 105 | 106 | results = [] 107 | results.append(pool.apply_async(one_work, ['val', ''])) 108 | results.append(pool.apply_async(one_work, ['train', 'ILSVRC2015_VID_train_0000'])) 109 | results.append(pool.apply_async(one_work, ['train', 'ILSVRC2015_VID_train_0001'])) 110 | results.append(pool.apply_async(one_work, ['train', 'ILSVRC2015_VID_train_0002'])) 111 | results.append(pool.apply_async(one_work, ['train', 'ILSVRC2015_VID_train_0003'])) 112 | ans = [res.get() for res in results] 113 | -------------------------------------------------------------------------------- /scripts/build_VID2015_imdb.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | 8 | """Save the paths of crops from the ImageNet VID 2015 dataset in pickle format""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import glob 14 | import os 15 | import os.path as osp 16 | import pickle 17 | import sys 18 | 19 | import numpy as np 20 | import tensorflow as tf 21 | 22 | CURRENT_DIR = osp.dirname(__file__) 23 | sys.path.append(osp.join(CURRENT_DIR, '..')) 24 | 25 | from utils.misc_utils import sort_nicely 26 | 27 | 28 | class Config: 29 | ### Dataset 30 | # directory where curated dataset is stored 31 | dataset_dir = 'data/ILSVRC2015-VID-Curation' 32 | save_dir = 'data/' 33 | 34 | # percentage of all videos for validation 35 | validation_ratio = 0.1 36 | 37 | 38 | class DataIter: 39 | """Container for dataset of one iteration""" 40 | pass 41 | 42 | 43 | class Dataset: 44 | def __init__(self, config): 45 | self.config = config 46 | 47 | def _get_unique_trackids(self, video_dir): 48 | """Get unique trackids within video_dir""" 49 | x_image_paths = glob.glob(video_dir + '/*.crop.x.jpg') 50 | trackids = [os.path.basename(path).split('.')[1] for path in x_image_paths] 51 | unique_trackids = set(trackids) 52 | return unique_trackids 53 | 54 | def dataset_iterator(self, video_dirs): 55 | video_num = len(video_dirs) 56 | iter_size = 150 57 | iter_num = int(np.ceil(video_num / float(iter_size))) 58 | for iter_ in range(iter_num): 59 | iter_start = iter_ * iter_size 60 | iter_videos = video_dirs[iter_start: iter_start + iter_size] 61 | 62 | data_iter = DataIter() 63 | num_videos = len(iter_videos) 64 | instance_videos = [] 65 | for index in range(num_videos): 66 | print('Processing {}/{}...'.format(iter_start + index, video_num)) 67 | video_dir = iter_videos[index] 68 | trackids = self._get_unique_trackids(video_dir) 69 | 70 | for trackid in trackids: 71 | instance_image_paths = glob.glob(video_dir + '/*' + trackid + '.crop.x.jpg') 72 | 73 | # sort image paths by frame number 74 | instance_image_paths = sort_nicely(instance_image_paths) 75 | 76 | # get image absolute path 77 | instance_image_paths = [os.path.abspath(p) for p in instance_image_paths] 78 | instance_videos.append(instance_image_paths) 79 | data_iter.num_videos = len(instance_videos) 80 | 
data_iter.instance_videos = instance_videos 81 | yield data_iter 82 | 83 | def get_all_video_dirs(self): 84 | ann_dir = os.path.join(self.config.dataset_dir, 'Data', 'VID') 85 | all_video_dirs = [] 86 | 87 | # We have already combined all training and validation videos in ILSVRC2015 and put them in the `train` directory. 88 | # The file structure is like: 89 | # train 90 | # |- a 91 | # |- b 92 | # |_ c 93 | # |- ILSVRC2015_train_00024001 94 | # |- ILSVRC2015_train_00024002 95 | # |_ ILSVRC2015_train_00024003 96 | # |- 000045.00.crop.x.jpg 97 | # |- 000046.00.crop.x.jpg 98 | # |- ... 99 | train_dirs = os.listdir(os.path.join(ann_dir, 'train')) 100 | for dir_ in train_dirs: 101 | train_sub_dir = os.path.join(ann_dir, 'train', dir_) 102 | video_names = os.listdir(train_sub_dir) 103 | train_video_dirs = [os.path.join(train_sub_dir, name) for name in video_names] 104 | all_video_dirs = all_video_dirs + train_video_dirs 105 | 106 | return all_video_dirs 107 | 108 | 109 | def main(): 110 | # Get the data. 111 | config = Config() 112 | dataset = Dataset(config) 113 | all_video_dirs = dataset.get_all_video_dirs() 114 | num_validation = int(len(all_video_dirs) * config.validation_ratio) 115 | 116 | ### validation 117 | validation_dirs = all_video_dirs[:num_validation] 118 | validation_imdb = dict() 119 | validation_imdb['videos'] = [] 120 | for i, data_iter in enumerate(dataset.dataset_iterator(validation_dirs)): 121 | validation_imdb['videos'] += data_iter.instance_videos 122 | validation_imdb['n_videos'] = len(validation_imdb['videos']) 123 | validation_imdb['image_shape'] = (255, 255, 3) 124 | 125 | ### train 126 | train_dirs = all_video_dirs[num_validation:] 127 | train_imdb = dict() 128 | train_imdb['videos'] = [] 129 | for i, data_iter in enumerate(dataset.dataset_iterator(train_dirs)): 130 | train_imdb['videos'] += data_iter.instance_videos 131 | train_imdb['n_videos'] = len(train_imdb['videos']) 132 | train_imdb['image_shape'] = (255, 255, 3) 133 | 134 | if not tf.gfile.IsDirectory(config.save_dir): 135 | tf.logging.info('Creating training directory: %s', config.save_dir) 136 | tf.gfile.MakeDirs(config.save_dir) 137 | 138 | with open(os.path.join(config.save_dir, 'validation_imdb.pickle'), 'wb') as f: 139 | pickle.dump(validation_imdb, f) 140 | with open(os.path.join(config.save_dir, 'train_imdb.pickle'), 'wb') as f: 141 | pickle.dump(train_imdb, f) 142 | 143 | 144 | if __name__ == '__main__': 145 | main() 146 | -------------------------------------------------------------------------------- /configuration.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright @ 2017 bily Huazhong University of Science and Technology 5 | # 6 | 7 | """Default configurations of model specification, training and tracking 8 | 9 | For most of the time, DO NOT modify the configurations within this file. 10 | Use the configurations here as the default configurations and only update 11 | them following the examples in the `experiments` directory. 
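For example, experiments/train-semantic-network.py overrides these defaults through the
experiment object `ex` exported by train_siamese_model.py (paths shortened here):

    ex.run(config_updates={'train_config': {'train_dir': '<LOG_DIR>/track_model_checkpoints/SA-Siam-Semantic'},
                           'track_config': {'log_dir': '<LOG_DIR>/track_model_inference/SA-Siam-Semantic'},
                           'model_config': {'sa_siam_config': {'en_semantic': True}}},
           options={'--name': 'SA-Siam-Semantic', '--force': True, '--enforce_clean': False})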
12 | """ 13 | 14 | from __future__ import absolute_import 15 | from __future__ import division 16 | from __future__ import print_function 17 | 18 | import os.path as osp 19 | 20 | WORKSPACE_DIR = './' 21 | LOG_DIR = osp.join(WORKSPACE_DIR, 'Logs/SA-Siam') # where checkpoints, logs are saved 22 | DATA_DIR = osp.join(WORKSPACE_DIR, 'data') 23 | RUN_NAME = '' # identifier of the experiment 24 | OTB_DATA_DIR = '/data/anfeng/tracking/data/OTB/' 25 | 26 | MODEL_CONFIG = { 27 | 'z_image_size': 127, # Exemplar image size 28 | 29 | 'embed_config': {'embedding_name': 'sa_siam', 30 | 'embedding_checkpoint_file': None, # mat file path of the pretrained embedding model. 31 | 'train_embedding': True, 32 | 'init_method': None, 33 | 'use_bn': True, 34 | 'bn_scale': True, 35 | 'bn_momentum': 0.05, 36 | 'bn_epsilon': 1e-6, 37 | 'weight_decay': 5e-4, 38 | 'stride': 8, }, 39 | 'sa_siam_config':{'en_appearance': False, 40 | 'en_semantic': False, 41 | 'n_out': 256, 42 | 'all_combine_layers_appearance': {'conv5':1.0}, 43 | 'all_combine_layers_semantic': {'conv5':1.0, 'conv4':0.1}, 44 | 'sz_conv5_z': 6, 45 | 'en_semantic_att': True, 46 | }, 47 | 48 | 'adjust_response_config': {'train_bias': True, 49 | 'scale': 1e-3, }, 50 | } 51 | 52 | TRAIN_CONFIG = { 53 | 'train_dir': osp.join(LOG_DIR, 'track_model_checkpoints', RUN_NAME), 54 | 'caffenet_dir': osp.join(DATA_DIR, 'caffenet.npy'), 55 | 56 | 'seed': 0, # fix seed for reproducing experiments 57 | 58 | 'train_data_config': {'input_imdb': osp.join(DATA_DIR, 'train_imdb.pickle'), 59 | 'preprocessing_name': 'siamese_fc_color', 60 | 'num_examples_per_epoch': 5.32e4, 61 | 'epoch': 30, 62 | 'batch_size': 8, 63 | 'max_frame_dist': 100, # Maximum distance between any two random frames draw from videos. 64 | 'prefetch_threads': 4, 65 | 'prefetch_capacity': 15 * 8, }, # The maximum elements number in the data loading queue 66 | 67 | 'validation_data_config': {'input_imdb': osp.join(DATA_DIR, 'validation_imdb.pickle'), 68 | 'preprocessing_name': 'None', 69 | 'batch_size': 8, 70 | 'max_frame_dist': 100, # Maximum distance between any two random frames draw from videos. 71 | 'prefetch_threads': 1, 72 | 'prefetch_capacity': 15 * 8, }, # The maximum elements number in the data loading queue 73 | 74 | # Configurations for generating groundtruth maps 75 | 'gt_config': {'rPos': 16, 76 | 'rNeg': 0, }, 77 | 78 | # Optimizer for training the model. 79 | # 'optimizer_config': {'optimizer': 'MOMENTUM', # SGD and MOMENTUM are supported 80 | # 'momentum': 0.9, 81 | # 'use_nesterov': False, }, 82 | 'optimizer_config': {'optimizer': 'SGD'}, 83 | 84 | # Learning rate configs 85 | 'lr_config': {'policy': 'exponential', 86 | 'initial_lr': 0.01, 87 | 'num_epochs_per_decay': 25, 88 | 'lr_decay_factor': 0.1, 89 | 'staircase': True, }, 90 | 91 | # If not None, clip gradients to this value. 92 | 'clip_gradients': None, 93 | 94 | # Frequency at which loss and global step are logged 95 | 'log_every_n_steps': 10, 96 | 97 | # Frequency to save model 98 | 'save_model_every_n_step': 5.32e4 // 8, # save model every epoch 99 | 100 | # How many model checkpoints to keep. No limit if None. 101 | 'max_checkpoints_to_keep': None, 102 | } 103 | 104 | TRACK_CONFIG = { 105 | # Directory for saving log files during tracking. 106 | 'log_dir': osp.join(LOG_DIR, 'track_model_inference', RUN_NAME), 107 | 108 | # Logging level of inference, use 1 for detailed inspection. 0 for speed. 
109 | 'log_level': 0, 110 | 111 | 'x_image_size': 255, # Search image size during tracking 112 | 113 | # Configurations for upsampling score maps 114 | 'upsample_method': 'bicubic', 115 | 'upsample_factor': 16, 116 | 117 | # Configurations for searching scales 118 | 'num_scales': 3, # Number of scales to search 119 | 'scale_step': 1.0375, # Scale changes between different scale search 120 | 'scale_damp': 0.59, # Damping factor for scale update 121 | 'scale_penalty': 0.9745, # Score penalty for scale change 122 | 123 | # Configurations for penalizing large displacement from the center 124 | 'window_influence': 0.176, 125 | 126 | 'include_first': False, # If track the first frame 127 | } 128 | -------------------------------------------------------------------------------- /utils/misc_utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | 8 | """Miscellaneous Utilities.""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import errno 14 | import json 15 | import logging 16 | logging.getLogger().setLevel(logging.INFO) 17 | import os 18 | import re 19 | import sys 20 | from os import path as osp 21 | 22 | try: 23 | import pynvml # nvidia-ml provides utility for NVIDIA management 24 | 25 | HAS_NVML = True 26 | except: 27 | HAS_NVML = False 28 | 29 | 30 | def auto_select_gpu(): 31 | """Select gpu which has largest free memory""" 32 | if HAS_NVML: 33 | pynvml.nvmlInit() 34 | deviceCount = pynvml.nvmlDeviceGetCount() 35 | largest_free_mem = 0 36 | largest_free_idx = 0 37 | for i in range(deviceCount): 38 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 39 | info = pynvml.nvmlDeviceGetMemoryInfo(handle) 40 | if info.free > largest_free_mem: 41 | largest_free_mem = info.free 42 | largest_free_idx = i 43 | pynvml.nvmlShutdown() 44 | largest_free_mem = largest_free_mem / 1024. / 1024. # Convert to MB 45 | 46 | idx_to_gpu_id = {} 47 | for i in range(deviceCount): 48 | idx_to_gpu_id[i] = '{}'.format(i) 49 | 50 | gpu_id = idx_to_gpu_id[largest_free_idx] 51 | logging.info('Using largest free memory GPU {} with free memory {}MB'.format(gpu_id, largest_free_mem)) 52 | return gpu_id 53 | else: 54 | logging.info('nvidia-ml-py is not installed, automatically select gpu is disabled!') 55 | return '0' 56 | 57 | 58 | def get_center(x): 59 | return (x - 1.) / 2. 60 | 61 | 62 | def get(config, key, default): 63 | """Get value in config by key, use default if key is not set 64 | 65 | This little function is useful for dynamical experimental settings. 66 | For example, we can add a new configuration without worrying compatibility with older versions. 
67 | You can also achieve this by just calling config.get(key, default), but add a warning is even better : ) 68 | """ 69 | val = config.get(key) 70 | if val is None: 71 | logging.warning('{} is not explicitly specified, using default value: {}'.format(key, default)) 72 | val = default 73 | return val 74 | 75 | 76 | def mkdir_p(path): 77 | """mimic the behavior of mkdir -p in bash""" 78 | try: 79 | os.makedirs(path) 80 | except OSError as exc: # Python >2.5 81 | if exc.errno == errno.EEXIST and os.path.isdir(path): 82 | pass 83 | else: 84 | raise 85 | 86 | 87 | def tryfloat(s): 88 | try: 89 | return float(s) 90 | except: 91 | return s 92 | 93 | 94 | def alphanum_key(s): 95 | """ Turn a string into a list of string and number chunks. 96 | "z23a" -> ["z", 23, "a"] 97 | """ 98 | return [tryfloat(c) for c in re.split('([0-9.]+)', s)] 99 | 100 | 101 | def sort_nicely(l): 102 | """Sort the given list in the way that humans expect.""" 103 | return sorted(l, key=alphanum_key) 104 | 105 | 106 | class Tee(object): 107 | """Mimic the behavior of tee in bash 108 | 109 | From: http://web.archive.org/web/20141016185743/https://mail.python.org/pipermail/python-list/2007-May/460639.html 110 | Usage: 111 | tee=Tee('logfile', 'w') 112 | print 'abcdefg' 113 | print 'another line' 114 | tee.close() 115 | print 'screen only' 116 | del tee # should do nothing 117 | """ 118 | 119 | def __init__(self, name, mode): 120 | self.file = open(name, mode) 121 | self.stdout = sys.stdout 122 | sys.stdout = self 123 | 124 | def close(self): 125 | if self.stdout is not None: 126 | sys.stdout = self.stdout 127 | self.stdout = None 128 | if self.file is not None: 129 | self.file.close() 130 | self.file = None 131 | 132 | def write(self, data): 133 | self.file.write(data) 134 | self.stdout.write(data) 135 | 136 | def flush(self): 137 | self.file.flush() 138 | self.stdout.flush() 139 | 140 | def __del__(self): 141 | self.close() 142 | 143 | 144 | def save_cfgs(train_dir, model_config, train_config, track_config): 145 | """Save all configurations in JSON format for future reference""" 146 | with open(osp.join(train_dir, 'model_config.json'), 'w') as f: 147 | json.dump(model_config, f, indent=2) 148 | with open(osp.join(train_dir, 'train_config.json'), 'w') as f: 149 | json.dump(train_config, f, indent=2) 150 | with open(osp.join(train_dir, 'track_config.json'), 'w') as f: 151 | json.dump(track_config, f, indent=2) 152 | 153 | 154 | def load_cfgs(checkpoint): 155 | if osp.isdir(checkpoint): 156 | train_dir = checkpoint 157 | else: 158 | train_dir = osp.dirname(checkpoint) 159 | 160 | with open(osp.join(train_dir, 'model_config.json'), 'r') as f: 161 | model_config = json.load(f) 162 | with open(osp.join(train_dir, 'train_config.json'), 'r') as f: 163 | train_config = json.load(f) 164 | with open(osp.join(train_dir, 'track_config.json'), 'r') as f: 165 | track_config = json.load(f) 166 | return model_config, train_config, track_config 167 | 168 | shape_of = lambda x: x.get_shape().as_list() 169 | same_hw = lambda a: (shape_of(a)[1] == shape_of(a)[2]) -------------------------------------------------------------------------------- /metrics/track_metrics.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 
7 | 8 | 9 | import tensorflow as tf 10 | from tensorflow.python.framework import ops 11 | from tensorflow.python.ops.metrics_impl import _confusion_matrix_at_thresholds 12 | 13 | 14 | def _auc(labels, predictions, weights=None, num_thresholds=200, 15 | metrics_collections=None, updates_collections=None, 16 | curve='ROC', name=None, summation_method='trapezoidal'): 17 | """Computes the approximate AUC via a Riemann sum. 18 | 19 | Modified version of tf.metrics.auc. Add support for AUC computation 20 | of the recall curve. 21 | """ 22 | with tf.variable_scope( 23 | name, 'auc', (labels, predictions, weights)): 24 | if curve != 'ROC' and curve != 'PR' and curve != 'R': 25 | raise ValueError('curve must be either ROC, PR or R, %s unknown' % 26 | (curve)) 27 | kepsilon = 1e-7 # to account for floating point imprecisions 28 | thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) 29 | for i in range(num_thresholds - 2)] 30 | thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] 31 | 32 | values, update_ops = _confusion_matrix_at_thresholds( 33 | labels, predictions, thresholds, weights) 34 | 35 | # Add epsilons to avoid dividing by 0. 36 | epsilon = 1.0e-6 37 | 38 | def compute_auc(tp, fn, tn, fp, name): 39 | """Computes the roc-auc or pr-auc based on confusion counts.""" 40 | rec = tf.div(tp + epsilon, tp + fn + epsilon) 41 | if curve == 'ROC': 42 | fp_rate = tf.div(fp, fp + tn + epsilon) 43 | x = fp_rate 44 | y = rec 45 | elif curve == 'R': # recall auc 46 | x = tf.linspace(1., 0., num_thresholds) 47 | y = rec 48 | else: # curve == 'PR'. 49 | prec = tf.div(tp + epsilon, tp + fp + epsilon) 50 | x = rec 51 | y = prec 52 | if summation_method == 'trapezoidal': 53 | return tf.reduce_sum( 54 | tf.multiply(x[:num_thresholds - 1] - x[1:], 55 | (y[:num_thresholds - 1] + y[1:]) / 2.), 56 | name=name) 57 | elif summation_method == 'minoring': 58 | return tf.reduce_sum( 59 | tf.multiply(x[:num_thresholds - 1] - x[1:], 60 | tf.minimum(y[:num_thresholds - 1], y[1:])), 61 | name=name) 62 | elif summation_method == 'majoring': 63 | return tf.reduce_sum( 64 | tf.multiply(x[:num_thresholds - 1] - x[1:], 65 | tf.maximum(y[:num_thresholds - 1], y[1:])), 66 | name=name) 67 | else: 68 | raise ValueError('Invalid summation_method: %s' % summation_method) 69 | 70 | # sum up the areas of all the trapeziums 71 | auc_value = compute_auc( 72 | values['tp'], values['fn'], values['tn'], values['fp'], 'value') 73 | update_op = compute_auc( 74 | update_ops['tp'], update_ops['fn'], update_ops['tn'], update_ops['fp'], 75 | 'update_op') 76 | 77 | if metrics_collections: 78 | ops.add_to_collections(metrics_collections, auc_value) 79 | 80 | if updates_collections: 81 | ops.add_to_collections(updates_collections, update_op) 82 | 83 | return auc_value, update_op 84 | 85 | 86 | def get_center_index(response): 87 | """Get the index of the center in the response map""" 88 | shape = tf.shape(response) 89 | c1 = tf.to_int32((shape[1] - 1) / 2) 90 | c2 = tf.to_int32((shape[2] - 1) / 2) 91 | return c1, c2 92 | 93 | 94 | def center_score_error(response): 95 | """Center score error. 96 | 97 | The error is low when the center of the response map is classified as target. 
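    Concretely, this is the streaming fraction of examples whose response value at the
    center location is negative, i.e. whose true-target position is scored as background.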
98 | """ 99 | with tf.name_scope('CS-err'): 100 | r, c = get_center_index(response) 101 | center_score = response[:, r, c] 102 | mean, update_op = tf.metrics.mean(tf.to_float(center_score < 0)) 103 | with tf.control_dependencies([update_op]): 104 | mean = tf.identity(mean) 105 | return mean 106 | 107 | 108 | def get_maximum_index(response): 109 | """Get the index of the maximum value in the response map""" 110 | response_shape = response.get_shape().as_list() 111 | response_spatial_size = response_shape[-2:] # e.g. [29, 29] 112 | length = response_spatial_size[0] * response_spatial_size[1] 113 | 114 | # Get maximum response index (note index starts from zero) 115 | ind_max = tf.argmax(tf.reshape(response, [-1, length]), 1) 116 | ind_row = tf.div(ind_max, response_spatial_size[1]) 117 | ind_col = tf.mod(ind_max, response_spatial_size[1]) 118 | return ind_row, ind_col 119 | 120 | 121 | def center_dist_error(response): 122 | """Center distance error. 123 | 124 | The error is low when the maximum response is at the center of the response map. 125 | """ 126 | with tf.name_scope('CD-err'): 127 | radius_in_pixel = 50. 128 | total_stride = 8. 129 | num_thresholds = 100 130 | radius_in_response = radius_in_pixel / total_stride 131 | 132 | gt_r, gt_c = get_center_index(response) 133 | max_r, max_c = get_maximum_index(response) 134 | gt_r = tf.to_float(gt_r) 135 | gt_c = tf.to_float(gt_c) 136 | max_r = tf.to_float(max_r) 137 | max_c = tf.to_float(max_c) 138 | distances = tf.sqrt((gt_r - max_r) ** 2 + (gt_c - max_c) ** 2) 139 | 140 | # We cast distances as prediction accuracies in the range [0, 1] where 0 means fail and 141 | # 1 means success. In this way, we can readily use streaming_auc to compute area 142 | # under curve. 143 | dist_norm = distances / radius_in_response 144 | dist_norm = tf.minimum(dist_norm, 1.) 145 | predictions = 1. - dist_norm 146 | labels = tf.ones_like(predictions) 147 | 148 | auc, update_op = _auc(labels, predictions, num_thresholds=num_thresholds, curve='R') 149 | with tf.control_dependencies([update_op]): 150 | err = 1. - auc 151 | return err 152 | -------------------------------------------------------------------------------- /utils/infer_utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 
7 | 8 | """ 9 | Inference Utilities 10 | """ 11 | 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | import collections 17 | 18 | import numpy as np 19 | import tensorflow as tf 20 | from cv2 import resize 21 | import logging 22 | 23 | from utils.misc_utils import get_center 24 | 25 | Rectangle = collections.namedtuple('Rectangle', ['x', 'y', 'width', 'height']) 26 | 27 | 28 | def im2rgb(im): 29 | if len(im.shape) != 3: 30 | im = np.stack([im, im, im], -1) 31 | return im 32 | 33 | 34 | def convert_bbox_format(bbox, to): 35 | x, y, target_width, target_height = bbox.x, bbox.y, bbox.width, bbox.height 36 | if to == 'top-left-based': 37 | x -= get_center(target_width) 38 | y -= get_center(target_height) 39 | elif to == 'center-based': 40 | y += get_center(target_height) 41 | x += get_center(target_width) 42 | else: 43 | raise ValueError("Bbox format: {} was not recognized".format(to)) 44 | return Rectangle(x, y, target_width, target_height) 45 | 46 | 47 | def get_exemplar_images(images, exemplar_size, targets_pos=None): 48 | """Crop exemplar image from input images""" 49 | with tf.name_scope('get_exemplar_image'): 50 | batch_size, x_height, x_width = images.get_shape().as_list()[:3] 51 | z_height, z_width = exemplar_size 52 | 53 | if targets_pos is None: 54 | target_pos_single = [[get_center(x_height), get_center(x_width)]] 55 | targets_pos_ = tf.tile(target_pos_single, [batch_size, 1]) 56 | else: 57 | targets_pos_ = targets_pos 58 | 59 | # convert to top-left corner based coordinates 60 | top = tf.to_int32(tf.round(targets_pos_[:, 0] - get_center(z_height))) 61 | bottom = tf.to_int32(top + z_height) 62 | left = tf.to_int32(tf.round(targets_pos_[:, 1] - get_center(z_width))) 63 | right = tf.to_int32(left + z_width) 64 | 65 | def _slice(x): 66 | f, t, l, b, r = x 67 | c = f[t:b, l:r] 68 | return c 69 | 70 | exemplar_img = tf.map_fn(_slice, (images, top, left, bottom, right), dtype=images.dtype) 71 | exemplar_img.set_shape([batch_size, z_height, z_width, 3]) 72 | return exemplar_img 73 | 74 | 75 | def get_crops(im, bbox, size_z, size_x, context_amount): 76 | """Obtain image sub-window, padding with avg channel if area goes outside of border 77 | 78 | Adapted from https://github.com/bertinetto/siamese-fc/blob/master/ILSVRC15-curation/save_crops.m#L46 79 | 80 | Args: 81 | im: Image ndarray 82 | bbox: Named tuple (x, y, width, height) x, y corresponds to the crops center 83 | size_z: Target + context size 84 | size_x: The resultant crop size 85 | context_amount: The amount of context 86 | 87 | Returns: 88 | image crop: Image ndarray 89 | """ 90 | cy, cx, h, w = bbox.y, bbox.x, bbox.height, bbox.width 91 | wc_z = w + context_amount * (w + h) 92 | hc_z = h + context_amount * (w + h) 93 | s_z = np.sqrt(wc_z * hc_z) 94 | scale_z = size_z / s_z 95 | 96 | d_search = (size_x - size_z) / 2 97 | pad = d_search / scale_z 98 | s_x = s_z + 2 * pad 99 | scale_x = size_x / s_x 100 | 101 | image_crop_x, _, _, _, _ = get_subwindow_avg(im, [cy, cx], 102 | [size_x, size_x], 103 | [np.round(s_x), np.round(s_x)]) 104 | 105 | return image_crop_x, scale_x 106 | 107 | 108 | def get_subwindow_avg(im, pos, model_sz, original_sz): 109 | # avg_chans = np.mean(im, axis=(0, 1)) # This version is 3x slower 110 | avg_chans = [np.mean(im[:, :, 0]), np.mean(im[:, :, 1]), np.mean(im[:, :, 2])] 111 | if not original_sz: 112 | original_sz = model_sz 113 | sz = original_sz 114 | im_sz = im.shape 115 | # make sure the size is not too small 116 | assert 
im_sz[0] > 2 and im_sz[1] > 2 117 | c = [get_center(s) for s in sz] 118 | 119 | # check out-of-bounds coordinates, and set them to avg_chans 120 | context_xmin = np.int(np.round(pos[1] - c[1])) 121 | context_xmax = np.int(context_xmin + sz[1] - 1) 122 | context_ymin = np.int(np.round(pos[0] - c[0])) 123 | context_ymax = np.int(context_ymin + sz[0] - 1) 124 | left_pad = np.int(np.maximum(0, -context_xmin)) 125 | top_pad = np.int(np.maximum(0, -context_ymin)) 126 | right_pad = np.int(np.maximum(0, context_xmax - im_sz[1] + 1)) 127 | bottom_pad = np.int(np.maximum(0, context_ymax - im_sz[0] + 1)) 128 | 129 | context_xmin = context_xmin + left_pad 130 | context_xmax = context_xmax + left_pad 131 | context_ymin = context_ymin + top_pad 132 | context_ymax = context_ymax + top_pad 133 | if top_pad > 0 or bottom_pad > 0 or left_pad > 0 or right_pad > 0: 134 | R = np.pad(im[:, :, 0], ((top_pad, bottom_pad), (left_pad, right_pad)), 135 | 'constant', constant_values=(avg_chans[0])) 136 | G = np.pad(im[:, :, 1], ((top_pad, bottom_pad), (left_pad, right_pad)), 137 | 'constant', constant_values=(avg_chans[1])) 138 | B = np.pad(im[:, :, 2], ((top_pad, bottom_pad), (left_pad, right_pad)), 139 | 'constant', constant_values=(avg_chans[2])) 140 | 141 | im = np.stack((R, G, B), axis=2) 142 | 143 | im_patch_original = im[context_ymin:context_ymax + 1, 144 | context_xmin:context_xmax + 1, :] 145 | if not (model_sz[0] == original_sz[0] and model_sz[1] == original_sz[1]): 146 | im_patch = resize(im_patch_original, tuple(model_sz)) 147 | else: 148 | im_patch = im_patch_original 149 | return im_patch, left_pad, top_pad, right_pad, bottom_pad 150 | 151 | def get_saver(keyword, removes, excepts,repl=[]): 152 | vars_need_load = {} 153 | for v in (tf.global_variables()): 154 | vname = v.name 155 | if vname.find(keyword)!=-1: 156 | for eeexxx in excepts: 157 | if vname.find(eeexxx)!=-1: 158 | logging.warning('No Load: '+vname) 159 | break 160 | else: 161 | vname_ori = vname 162 | for r in removes: 163 | vname = vname.replace(r,'') 164 | for r in repl: 165 | vname = vname.replace(r[0],r[1]) 166 | vars_need_load[vname] = v 167 | logging.warning('Load: ' + vname + ' as ' + vname_ori) 168 | else: 169 | logging.warning('No Load: '+vname) 170 | return tf.train.Saver(vars_need_load) -------------------------------------------------------------------------------- /train_siamese_model.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 
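# --- Editor's note: illustrative invocation (hypothetical command line) ---
# This script is wrapped as a `sacred` experiment (see @ex.automain below), so the
# configuration entries defined in configurations() can be overridden from the
# command line using sacred's `with` syntax, e.g.:
#
#     python train_siamese_model.py with train_config.train_data_config.batch_size=8
#
# The exact keys and default values come from configuration.py, which is not
# shown in this section.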
7 | 8 | """Train the model""" 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import logging 15 | logging.getLogger().setLevel(logging.INFO) 16 | import os 17 | import os.path as osp 18 | import random 19 | import time 20 | from datetime import datetime 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | from sacred import Experiment 25 | from sacred.observers import FileStorageObserver 26 | 27 | import configuration 28 | import siamese_model 29 | from utils.misc_utils import auto_select_gpu, mkdir_p, save_cfgs 30 | from utils.train_utils import load_caffenet 31 | 32 | ex = Experiment(configuration.RUN_NAME) 33 | ex.observers.append(FileStorageObserver.create(osp.join(configuration.LOG_DIR, 'sacred'))) 34 | 35 | 36 | @ex.config 37 | def configurations(): 38 | # Add configurations for current script, for more details please see the documentation of `sacred`. 39 | # REFER: http://sacred.readthedocs.io/en/latest/index.html 40 | model_config = configuration.MODEL_CONFIG 41 | train_config = configuration.TRAIN_CONFIG 42 | track_config = configuration.TRACK_CONFIG 43 | 44 | 45 | def _configure_learning_rate(train_config, global_step): 46 | lr_config = train_config['lr_config'] 47 | 48 | num_batches_per_epoch = \ 49 | int(train_config['train_data_config']['num_examples_per_epoch'] / train_config['train_data_config']['batch_size']) 50 | 51 | lr_policy = lr_config['policy'] 52 | if lr_policy == 'piecewise_constant': 53 | lr_boundaries = [int(e * num_batches_per_epoch) for e in lr_config['lr_boundaries']] 54 | return tf.train.piecewise_constant(global_step, 55 | lr_boundaries, 56 | lr_config['lr_values']) 57 | elif lr_policy == 'exponential': 58 | decay_steps = int(num_batches_per_epoch) * lr_config['num_epochs_per_decay'] 59 | return tf.train.exponential_decay(lr_config['initial_lr'], 60 | global_step, 61 | decay_steps=decay_steps, 62 | decay_rate=lr_config['lr_decay_factor'], 63 | staircase=lr_config['staircase']) 64 | elif lr_policy == 'cosine': 65 | T_total = train_config['train_data_config']['epoch'] * num_batches_per_epoch 66 | return 0.5 * lr_config['initial_lr'] * (1 + tf.cos(np.pi * tf.to_float(global_step) / T_total)) 67 | else: 68 | raise ValueError('Learning rate policy [%s] was not recognized', lr_policy) 69 | 70 | 71 | def _configure_optimizer(train_config, learning_rate): 72 | optimizer_config = train_config['optimizer_config'] 73 | optimizer_name = optimizer_config['optimizer'].upper() 74 | if optimizer_name == 'MOMENTUM': 75 | optimizer = tf.train.MomentumOptimizer( 76 | learning_rate, 77 | momentum=optimizer_config['momentum'], 78 | use_nesterov=optimizer_config['use_nesterov'], 79 | name='Momentum') 80 | elif optimizer_name == 'SGD': 81 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) 82 | else: 83 | raise ValueError('Optimizer [%s] was not recognized', optimizer_config['optimizer']) 84 | return optimizer 85 | 86 | 87 | @ex.automain 88 | def main(model_config, train_config, track_config): 89 | logging.getLogger().setLevel(logging.INFO) 90 | os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu() 91 | 92 | # Create training directory which will be used to save: configurations, model files, TensorBoard logs 93 | train_dir = train_config['train_dir'] 94 | if not osp.isdir(train_dir): 95 | logging.info('Creating training directory: %s', train_dir) 96 | mkdir_p(train_dir) 97 | 98 | g = tf.Graph() 99 | with g.as_default(): 100 | # Set fixed seed for reproducible experiments 
101 | random.seed(train_config['seed']) 102 | np.random.seed(train_config['seed']) 103 | tf.set_random_seed(train_config['seed']) 104 | 105 | # Build the training and validation model 106 | model = siamese_model.SiameseModel(model_config, train_config, mode='train') 107 | model.build() 108 | model_va = siamese_model.SiameseModel(model_config, train_config, mode='validation') 109 | model_va.build(reuse=True) 110 | 111 | # Save configurations for future reference 112 | save_cfgs(train_dir, model_config, train_config, track_config) 113 | 114 | learning_rate = _configure_learning_rate(train_config, model.global_step) 115 | optimizer = _configure_optimizer(train_config, learning_rate) 116 | tf.summary.scalar('learning_rate', learning_rate) 117 | logging.info('Trainable variables:') 118 | for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES): 119 | logging.info('-- {}'.format(v)) 120 | # Set up the training ops 121 | opt_op = tf.contrib.layers.optimize_loss( 122 | loss=model.total_loss, 123 | global_step=model.global_step, 124 | learning_rate=learning_rate, 125 | optimizer=optimizer, 126 | clip_gradients=train_config['clip_gradients'], 127 | learning_rate_decay_fn=None, 128 | summaries=['learning_rate']) 129 | 130 | with tf.control_dependencies([opt_op]): 131 | train_op = tf.no_op(name='train') 132 | 133 | saver = tf.train.Saver(tf.global_variables(), 134 | max_to_keep=train_config['max_checkpoints_to_keep']) 135 | 136 | summary_writer = tf.summary.FileWriter(train_dir, g) 137 | summary_op = tf.summary.merge_all() 138 | 139 | global_variables_init_op = tf.global_variables_initializer() 140 | local_variables_init_op = tf.local_variables_initializer() 141 | 142 | # Dynamically allocate GPU memory 143 | gpu_options = tf.GPUOptions(allow_growth=True) 144 | sess_config = tf.ConfigProto(gpu_options=gpu_options) 145 | 146 | sess = tf.Session(config=sess_config) 147 | model_path = tf.train.latest_checkpoint(train_config['train_dir']) 148 | 149 | if not model_path: 150 | sess.run(global_variables_init_op) 151 | sess.run(local_variables_init_op) 152 | start_step = 0 153 | if model_config['sa_siam_config']['en_semantic']: 154 | load_caffenet(train_config['caffenet_dir'], sess) 155 | if model_config['embed_config']['embedding_checkpoint_file']: 156 | model.init_fn(sess) 157 | 158 | else: 159 | logging.info('Restore from last checkpoint: {}'.format(model_path)) 160 | sess.run(local_variables_init_op) 161 | saver.restore(sess, model_path) 162 | start_step = tf.train.global_step(sess, model.global_step.name) + 1 163 | 164 | g.finalize() # Finalize graph to avoid adding ops by mistake 165 | # Training loop 166 | data_config = train_config['train_data_config'] 167 | total_steps = int(data_config['epoch'] * 168 | data_config['num_examples_per_epoch'] / 169 | data_config['batch_size']) 170 | logging.info('Train for {} steps'.format(total_steps)) 171 | for step in range(start_step, total_steps): 172 | start_time = time.time() 173 | _, loss, batch_loss = sess.run([train_op, model.total_loss, model.batch_loss]) 174 | duration = time.time() - start_time 175 | 176 | if step % 10 == 0: 177 | examples_per_sec = data_config['batch_size'] / float(duration) 178 | time_remain = data_config['batch_size'] * (total_steps - step) / examples_per_sec 179 | m, s = divmod(time_remain, 60) 180 | h, m = divmod(m, 60) 181 | format_str = ('%s: step %d, total loss = %.2f, batch loss = %.2f (%.1f examples/sec; %.3f ' 182 | 'sec/batch; %dh:%02dm:%02ds remains)') 183 | logging.info(format_str % (datetime.now(), step, loss, 
batch_loss, 184 | examples_per_sec, duration, h, m, s)) 185 | 186 | if step % 100 == 0: 187 | summary_str = sess.run(summary_op) 188 | summary_writer.add_summary(summary_str, step) 189 | 190 | if (step + 1) % train_config['save_model_every_n_step'] == 0 or (step + 1) == total_steps: 191 | checkpoint_path = osp.join(train_config['train_dir'], 'model.ckpt') 192 | saver.save(sess, checkpoint_path, global_step=step) 193 | -------------------------------------------------------------------------------- /inference/tracker.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | """Class for tracking using a track model.""" 8 | 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import logging 14 | logging.getLogger().setLevel(logging.INFO) 15 | import os.path as osp 16 | 17 | import numpy as np 18 | import cv2 19 | from cv2 import imwrite 20 | 21 | from utils.infer_utils import convert_bbox_format, Rectangle 22 | from utils.misc_utils import get_center, get 23 | 24 | 25 | class TargetState(object): 26 | """Represent the target state.""" 27 | 28 | def __init__(self, bbox, search_pos, scale_idx): 29 | self.bbox = bbox # (cx, cy, w, h) in the original image 30 | self.search_pos = search_pos # target center position in the search image 31 | self.scale_idx = scale_idx # scale index in the searched scales 32 | 33 | 34 | class Tracker(object): 35 | """Tracker based on the siamese model.""" 36 | 37 | def __init__(self, siamese_model, model_config, track_config): 38 | self.siamese_model = siamese_model 39 | self.model_config = model_config 40 | self.track_config = track_config 41 | 42 | self.num_scales = track_config['num_scales'] 43 | logging.info('track num scales -- {}'.format(self.num_scales)) 44 | scales = np.arange(self.num_scales) - get_center(self.num_scales) 45 | self.search_factors = [self.track_config['scale_step'] ** x for x in scales] 46 | 47 | self.x_image_size = track_config['x_image_size'] # Search image size 48 | self.window = None # Cosine window 49 | self.log_level = track_config['log_level'] 50 | 51 | def track(self, sess, first_bbox, frames, logdir='/tmp'): 52 | """Runs tracking on a single image sequence.""" 53 | # Get initial target bounding box and convert to center based 54 | bbox = convert_bbox_format(first_bbox, 'center-based') 55 | 56 | # Feed in the first frame image to set initial state. 57 | bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width] 58 | input_feed = [frames[0], bbox_feed] 59 | frame2crop_scale = self.siamese_model.initialize(sess, input_feed) 60 | 61 | # Storing target state 62 | original_target_height = bbox.height 63 | original_target_width = bbox.width 64 | search_center = np.array([get_center(self.x_image_size), 65 | get_center(self.x_image_size)]) 66 | current_target_state = TargetState(bbox=bbox, 67 | search_pos=search_center, 68 | scale_idx=int(get_center(self.num_scales))) 69 | 70 | include_first = get(self.track_config, 'include_first', False) 71 | logging.info('Tracking include first -- {}'.format(include_first)) 72 | 73 | # Run tracking loop 74 | reported_bboxs = [] 75 | for i, filename in enumerate(frames): 76 | if i > 0 or include_first: # We don't really want to process the first image unless intended to do so. 
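        # (Editor's note) Each iteration feeds the previous target state together with
        # the current frame to the model, picks the best scale from the penalized
        # multi-scale response below, and then updates the target position and size.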
77 | bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x, 78 | current_target_state.bbox.height, current_target_state.bbox.width] 79 | input_feed = [filename, bbox_feed] 80 | 81 | outputs, metadata = self.siamese_model.inference_step(sess, input_feed) 82 | search_scale_list = outputs['scale_xs'] 83 | response = outputs['response'] 84 | response_size = response.shape[1] 85 | 86 | # Choose the scale whole response map has the highest peak 87 | if self.num_scales > 1: 88 | response_max = np.max(response, axis=(1, 2)) 89 | penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales)) 90 | current_scale_idx = int(get_center(self.num_scales)) 91 | penalties[current_scale_idx] = 1.0 92 | response_penalized = response_max * penalties 93 | best_scale = np.argmax(response_penalized) 94 | if np.max(response_max)<0: 95 | logging.warning('MAX_RESPONSE LESS THAN ZERO!') 96 | # best_scale = current_scale_idx 97 | else: 98 | best_scale = 0 99 | 100 | response = response[best_scale] 101 | 102 | with np.errstate(all='raise'): # Raise error if something goes wrong 103 | response = response - np.min(response) 104 | response = response / np.sum(response) 105 | 106 | if self.window is None: 107 | window = np.dot(np.expand_dims(np.hanning(response_size), 1), 108 | np.expand_dims(np.hanning(response_size), 0)) 109 | self.window = window / np.sum(window) # normalize window 110 | window_influence = self.track_config['window_influence'] 111 | response = (1 - window_influence) * response + window_influence * self.window 112 | 113 | # Find maximum response 114 | r_max, c_max = np.unravel_index(response.argmax(), 115 | response.shape) 116 | 117 | # Convert from crop-relative coordinates to frame coordinates 118 | p_coor = np.array([r_max, c_max]) 119 | # displacement from the center in instance final representation ... 120 | disp_instance_final = p_coor - get_center(response_size) 121 | # ... in instance feature space ... 122 | upsample_factor = self.track_config['upsample_factor'] 123 | disp_instance_feat = disp_instance_final / upsample_factor 124 | # ... Avoid empty position ... 125 | r_radius = int(response_size / upsample_factor / 2) 126 | disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius) 127 | # ... in instance input ... 128 | disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride'] 129 | # ... 
in instance original crop (in frame coordinates) 130 | disp_instance_frame = disp_instance_input / search_scale_list[best_scale] 131 | # Position within frame in frame coordinates 132 | y = current_target_state.bbox.y 133 | x = current_target_state.bbox.x 134 | y += disp_instance_frame[0] 135 | x += disp_instance_frame[1] 136 | 137 | # Target scale damping and saturation 138 | target_scale = current_target_state.bbox.height / original_target_height 139 | search_factor = self.search_factors[best_scale] 140 | scale_damp = self.track_config['scale_damp'] # damping factor for scale update 141 | target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor) 142 | target_scale = np.maximum(0.2, np.minimum(5.0, target_scale)) 143 | 144 | # Some book keeping 145 | height = original_target_height * target_scale 146 | width = original_target_width * target_scale 147 | current_target_state.bbox = Rectangle(x, y, width, height) 148 | current_target_state.scale_idx = best_scale 149 | current_target_state.search_pos = search_center + disp_instance_input 150 | 151 | assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \ 152 | 'target position in feature space should be no larger than input image size' 153 | assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \ 154 | 'target position in feature space should be no larger than input image size' 155 | 156 | if self.log_level > 0: 157 | np.save(osp.join(logdir, 'num_frames.npy'), [i + 1]) 158 | 159 | # Select the image with the highest score scale and convert it to uint8 160 | image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8) 161 | # Note that imwrite in cv2 assumes the image is in BGR format. 162 | # However, the cropped image returned by TensorFlow is RGB. 163 | # Therefore, we convert color format using cv2.cvtColor 164 | imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)), 165 | cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR)) 166 | 167 | np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale]) 168 | np.save(osp.join(logdir, 'response{}.npy'.format(i)), response) 169 | 170 | y_search, x_search = current_target_state.search_pos 171 | search_scale = search_scale_list[best_scale] 172 | target_height_search = height * search_scale 173 | target_width_search = width * search_scale 174 | bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search) 175 | bbox_search = convert_bbox_format(bbox_search, 'top-left-based') 176 | np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), 177 | [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height]) 178 | 179 | reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based') 180 | reported_bboxs.append(reported_bbox) 181 | return reported_bboxs 182 | -------------------------------------------------------------------------------- /utils/train_utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 
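# --- Editor's note: illustrative usage (hypothetical shapes and config values) ---
# construct_gt_score_maps() below is called from siamese_model.build_loss(), e.g.:
#
#     gt = construct_gt_score_maps(response_size=[15, 15], batch_size=8, stride=8,
#                                  gt_config={'rPos': 16, 'rNeg': 0})
#
# The real response_size, batch_size and gt_config values are taken from
# configuration.py at run time.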
7 | 8 | """Utilities for model construction""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import re 14 | 15 | import numpy as np 16 | import tensorflow as tf 17 | from scipy import io as sio 18 | import logging 19 | logging.getLogger().setLevel(logging.INFO) 20 | from utils.misc_utils import get_center 21 | 22 | 23 | def construct_gt_score_maps(response_size, batch_size, stride, gt_config=None): 24 | """Construct a batch of groundtruth score maps 25 | 26 | Args: 27 | response_size: A list or tuple with two elements [ho, wo] 28 | batch_size: An integer e.g., 16 29 | stride: Embedding stride e.g., 8 30 | gt_config: Configurations for groundtruth generation 31 | 32 | Return: 33 | A float tensor of shape [batch_size] + response_size 34 | """ 35 | with tf.name_scope('construct_gt'): 36 | ho = response_size[0] 37 | wo = response_size[1] 38 | y = tf.cast(tf.range(0, ho), dtype=tf.float32) - get_center(ho) 39 | x = tf.cast(tf.range(0, wo), dtype=tf.float32) - get_center(wo) 40 | [Y, X] = tf.meshgrid(y, x) 41 | 42 | def _logistic_label(X, Y, rPos, rNeg): 43 | # dist_to_center = tf.sqrt(tf.square(X) + tf.square(Y)) # L2 metric 44 | dist_to_center = tf.abs(X) + tf.abs(Y) # Block metric 45 | Z = tf.where(dist_to_center <= rPos, 46 | tf.ones_like(X), 47 | tf.where(dist_to_center < rNeg, 48 | 0.5 * tf.ones_like(X), 49 | tf.zeros_like(X))) 50 | return Z 51 | 52 | rPos = gt_config['rPos'] / stride 53 | rNeg = gt_config['rNeg'] / stride 54 | gt = _logistic_label(X, Y, rPos, rNeg) 55 | 56 | # Duplicate a batch of maps 57 | gt_expand = tf.reshape(gt, [1] + response_size) 58 | gt = tf.tile(gt_expand, [batch_size, 1, 1]) 59 | return gt 60 | 61 | 62 | def get_params_from_mat(matpath): 63 | """Get parameter from .mat file into parms(dict)""" 64 | 65 | def squeeze(vars_): 66 | # Matlab save some params with shape (*, 1) 67 | # However, we don't need the trailing dimension in TensorFlow. 
68 | if isinstance(vars_, (list, tuple)): 69 | return [np.squeeze(v, 1) for v in vars_] 70 | else: 71 | return np.squeeze(vars_, 1) 72 | 73 | netparams = sio.loadmat(matpath)["net"]["params"][0][0] 74 | params = dict() 75 | 76 | for i in range(netparams.size): 77 | param = netparams[0][i] 78 | name = param["name"][0] 79 | value = param["value"] 80 | value_size = param["value"].shape[0] 81 | 82 | match = re.match(r"([a-z]+)([0-9]+)([a-z]+)", name, re.I) 83 | if match: 84 | items = match.groups() 85 | elif name == 'adjust_f': 86 | params['detection/weights'] = squeeze(value) 87 | continue 88 | elif name == 'adjust_b': 89 | params['detection/biases'] = squeeze(value) 90 | continue 91 | else: 92 | raise Exception('unrecognized layer params') 93 | 94 | op, layer, types = items 95 | layer = int(layer) 96 | if layer in [1, 3]: 97 | if op == 'conv': # convolution 98 | if types == 'f': 99 | params['conv%d/weights' % layer] = value 100 | elif types == 'b': 101 | value = squeeze(value) 102 | params['conv%d/biases' % layer] = value 103 | elif op == 'bn': # batch normalization 104 | if types == 'x': 105 | m, v = squeeze(np.split(value, 2, 1)) 106 | params['conv%d/BatchNorm/moving_mean' % layer] = m 107 | params['conv%d/BatchNorm/moving_variance' % layer] = np.square(v) 108 | elif types == 'm': 109 | value = squeeze(value) 110 | params['conv%d/BatchNorm/gamma' % layer] = value 111 | elif types == 'b': 112 | value = squeeze(value) 113 | params['conv%d/BatchNorm/beta' % layer] = value 114 | else: 115 | raise Exception 116 | elif layer in [2, 4]: 117 | if op == 'conv' and types == 'f': 118 | b1, b2 = np.split(value, 2, 3) 119 | else: 120 | b1, b2 = np.split(value, 2, 0) 121 | if op == 'conv': 122 | if types == 'f': 123 | params['conv%d/b1/weights' % layer] = b1 124 | params['conv%d/b2/weights' % layer] = b2 125 | elif types == 'b': 126 | b1, b2 = squeeze(np.split(value, 2, 0)) 127 | params['conv%d/b1/biases' % layer] = b1 128 | params['conv%d/b2/biases' % layer] = b2 129 | elif op == 'bn': 130 | if types == 'x': 131 | m1, v1 = squeeze(np.split(b1, 2, 1)) 132 | m2, v2 = squeeze(np.split(b2, 2, 1)) 133 | params['conv%d/b1/BatchNorm/moving_mean' % layer] = m1 134 | params['conv%d/b2/BatchNorm/moving_mean' % layer] = m2 135 | params['conv%d/b1/BatchNorm/moving_variance' % layer] = np.square(v1) 136 | params['conv%d/b2/BatchNorm/moving_variance' % layer] = np.square(v2) 137 | elif types == 'm': 138 | params['conv%d/b1/BatchNorm/gamma' % layer] = squeeze(b1) 139 | params['conv%d/b2/BatchNorm/gamma' % layer] = squeeze(b2) 140 | elif types == 'b': 141 | params['conv%d/b1/BatchNorm/beta' % layer] = squeeze(b1) 142 | params['conv%d/b2/BatchNorm/beta' % layer] = squeeze(b2) 143 | else: 144 | raise Exception 145 | 146 | elif layer in [5]: 147 | if op == 'conv' and types == 'f': 148 | b1, b2 = np.split(value, 2, 3) 149 | else: 150 | b1, b2 = squeeze(np.split(value, 2, 0)) 151 | assert op == 'conv', 'layer5 contains only convolution' 152 | if types == 'f': 153 | params['conv%d/b1/weights' % layer] = b1 154 | params['conv%d/b2/weights' % layer] = b2 155 | elif types == 'b': 156 | params['conv%d/b1/biases' % layer] = b1 157 | params['conv%d/b2/biases' % layer] = b2 158 | 159 | return params 160 | 161 | 162 | def load_mat_model(matpath, embed_scope, detection_scope=None): 163 | """Restore SiameseFC models from .mat model files""" 164 | params = get_params_from_mat(matpath) 165 | 166 | assign_ops = [] 167 | 168 | def _assign(ref_name, params, scope=embed_scope): 169 | var_in_model = 
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 170 | scope + ref_name)[0] 171 | var_in_mat = params[ref_name] 172 | op = tf.assign(var_in_model, var_in_mat) 173 | assign_ops.append(op) 174 | 175 | for l in range(1, 6): 176 | logging.info('Loading layer {} from mat.'.format(l)) 177 | if l in [1, 3]: 178 | _assign('conv%d/weights' % l, params) 179 | # _assign('conv%d/biases' % l, params) 180 | _assign('conv%d/BatchNorm/beta' % l, params) 181 | _assign('conv%d/BatchNorm/gamma' % l, params) 182 | _assign('conv%d/BatchNorm/moving_mean' % l, params) 183 | _assign('conv%d/BatchNorm/moving_variance' % l, params) 184 | elif l in [2, 4]: 185 | # Branch 1 186 | _assign('conv%d/b1/weights' % l, params) 187 | # _assign('conv%d/b1/biases' % l, params) 188 | _assign('conv%d/b1/BatchNorm/beta' % l, params) 189 | _assign('conv%d/b1/BatchNorm/gamma' % l, params) 190 | _assign('conv%d/b1/BatchNorm/moving_mean' % l, params) 191 | _assign('conv%d/b1/BatchNorm/moving_variance' % l, params) 192 | # Branch 2 193 | _assign('conv%d/b2/weights' % l, params) 194 | # _assign('conv%d/b2/biases' % l, params) 195 | _assign('conv%d/b2/BatchNorm/beta' % l, params) 196 | _assign('conv%d/b2/BatchNorm/gamma' % l, params) 197 | _assign('conv%d/b2/BatchNorm/moving_mean' % l, params) 198 | _assign('conv%d/b2/BatchNorm/moving_variance' % l, params) 199 | elif l in [5]: 200 | # Branch 1 201 | _assign('conv%d/b1/weights' % l, params) 202 | _assign('conv%d/b1/biases' % l, params) 203 | # Branch 2 204 | _assign('conv%d/b2/weights' % l, params) 205 | _assign('conv%d/b2/biases' % l, params) 206 | else: 207 | raise Exception('layer number must below 5') 208 | 209 | if detection_scope: 210 | _assign(detection_scope + 'biases', params, scope='') 211 | 212 | initialize = tf.group(*assign_ops) 213 | return initialize 214 | def load_caffenet(path_caffenet, _sess): 215 | logging.info('Load object model from ' + path_caffenet) 216 | data_dict = np.load(path_caffenet, encoding='latin1').item() 217 | for op_name in data_dict: 218 | if op_name.find('fc')!=-1: 219 | continue 220 | full_op_name = 'sa_siam/semantic_net/'+op_name 221 | with tf.variable_scope(full_op_name, reuse=True): 222 | if op_name in ['conv2','conv4','conv5']: 223 | for param_name, data in data_dict[op_name].items(): 224 | d1, d2 = tf.split(data, 2, -1+len(data.shape)) # Last dim is selected to split 225 | for [d_, b_] in [[d1,'b1'],[d2,'b2']]: 226 | with tf.variable_scope(b_, reuse=True): 227 | logging.info('Loading: ' + full_op_name + ' ' + b_ + ' ' + param_name) 228 | var = tf.get_variable(param_name) 229 | _sess.run(var.assign(d_)) 230 | else: 231 | for param_name, data in data_dict[op_name].items(): 232 | logging.info('Loading: ' + full_op_name + ' ' + param_name) 233 | var = tf.get_variable(param_name) 234 | _sess.run(var.assign(data)) -------------------------------------------------------------------------------- /siamese_model.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # 6 | # Distributed under terms of the MIT license. 7 | 8 | """Construct the computational graph of siamese model for training. 
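
Typical use (editor's sketch; the concrete configs come from configuration.py):

  model = SiameseModel(model_config, train_config, mode='train')
  model.build()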
""" 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import functools 15 | 16 | import tensorflow as tf 17 | 18 | from datasets.dataloader import DataLoader 19 | from embeddings.sa_siam import sa_siam_arg_scope, sa_siam 20 | from metrics.track_metrics import center_dist_error, center_score_error 21 | from utils.train_utils import construct_gt_score_maps, load_mat_model 22 | 23 | slim = tf.contrib.slim 24 | 25 | 26 | class SiameseModel: 27 | def __init__(self, model_config, train_config, mode='train'): 28 | self.model_config = model_config 29 | self.train_config = train_config 30 | self.mode = mode 31 | assert mode in ['train', 'validation', 'inference'] 32 | 33 | if self.mode == 'train': 34 | self.data_config = self.train_config['train_data_config'] 35 | elif self.mode == 'validation': 36 | self.data_config = self.train_config['validation_data_config'] 37 | 38 | self.dataloader = None 39 | self.exemplars = None 40 | self.instances = None 41 | self.response = None 42 | self.batch_loss = None 43 | self.total_loss = None 44 | self.init_fn = None 45 | self.global_step = None 46 | 47 | def is_training(self): 48 | """Returns true if the model is built for training mode""" 49 | return self.mode == 'train' 50 | 51 | def build_inputs(self): 52 | """Input fetching and batching 53 | 54 | Outputs: 55 | self.exemplars: image batch of shape [batch, hz, wz, 3] 56 | self.instances: image batch of shape [batch, hx, wx, 3] 57 | """ 58 | if self.mode in ['train', 'validation']: 59 | with tf.device("/cpu:0"): # Put data loading and preprocessing in CPU is substantially faster 60 | self.dataloader = DataLoader(self.data_config, self.is_training()) 61 | self.dataloader.build() 62 | exemplars, instances = self.dataloader.get_one_batch() 63 | 64 | exemplars = tf.to_float(exemplars) 65 | instances = tf.to_float(instances) 66 | else: 67 | self.examplar_feed = tf.placeholder(shape=[None, None, None, 3], 68 | dtype=tf.uint8, 69 | name='examplar_input') 70 | self.instance_feed = tf.placeholder(shape=[None, None, None, 3], 71 | dtype=tf.uint8, 72 | name='instance_input') 73 | exemplars = tf.to_float(self.examplar_feed) 74 | instances = tf.to_float(self.instance_feed) 75 | 76 | self.exemplars = exemplars 77 | self.instances = instances 78 | 79 | def build_image_embeddings(self, reuse=False): 80 | """Builds the image model subgraph and generates image embeddings 81 | 82 | Inputs: 83 | self.exemplars: A tensor of shape [batch, hz, wz, 3] 84 | self.instances: A tensor of shape [batch, hx, wx, 3] 85 | 86 | Outputs: 87 | self.exemplar_embeds: A Tensor of shape [batch, hz_embed, wz_embed, embed_dim] 88 | self.instance_embeds: A Tensor of shape [batch, hx_embed, wx_embed, embed_dim] 89 | """ 90 | config = self.model_config['embed_config'] 91 | arg_scope = sa_siam_arg_scope(config, 92 | trainable=config['train_embedding'], 93 | is_training=self.is_training()) 94 | with slim.arg_scope(arg_scope): 95 | self.exemplar_embeds, _ = sa_siam(inputs=self.exemplars, is_example=True, reuse=reuse, sa_siam_config=self.model_config['sa_siam_config']) 96 | self.instance_embeds, _ = sa_siam(inputs=self.instances, is_example=False, reuse=True, sa_siam_config=self.model_config['sa_siam_config']) 97 | 98 | def build_template(self): 99 | # The template is simply the feature of the exemplar image in SiamFC. 
100 | self.templates = self.exemplar_embeds 101 | 102 | def build_detection(self, reuse=False): 103 | with tf.variable_scope('detection', reuse=reuse): 104 | def _translation_match(x, z): # translation match for one example within a batch 105 | x = tf.expand_dims(x, 0) # [1, in_height, in_width, in_channels] 106 | z = tf.expand_dims(z, -1) # [filter_height, filter_width, in_channels, 1] 107 | return tf.nn.conv2d(x, z, strides=[1, 1, 1, 1], padding='VALID', name='translation_match') 108 | 109 | output = tf.map_fn(lambda x: _translation_match(x[0], x[1]), 110 | (self.instance_embeds, self.templates), 111 | dtype=self.instance_embeds.dtype) 112 | output = tf.squeeze(output, [1, 4]) # of shape e.g., [8, 15, 15] 113 | 114 | # Adjust score, this is required to make training possible. 115 | config = self.model_config['adjust_response_config'] 116 | bias = tf.get_variable('biases', [1], 117 | dtype=tf.float32, 118 | initializer=tf.constant_initializer(0.0, dtype=tf.float32), 119 | trainable=config['train_bias']) 120 | response = config['scale'] * output + bias 121 | self.response = response 122 | 123 | def build_loss(self): 124 | response = self.response 125 | response_size = response.get_shape().as_list()[1:3] # [height, width] 126 | 127 | gt = construct_gt_score_maps(response_size, 128 | self.data_config['batch_size'], 129 | self.model_config['embed_config']['stride'], 130 | self.train_config['gt_config']) 131 | 132 | with tf.name_scope('Loss'): 133 | loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=response, 134 | labels=gt) 135 | 136 | with tf.name_scope('Balance_weights'): 137 | n_pos = tf.reduce_sum(tf.to_float(tf.equal(gt[0], 1))) 138 | n_neg = tf.reduce_sum(tf.to_float(tf.equal(gt[0], 0))) 139 | w_pos = 0.5 / n_pos 140 | w_neg = 0.5 / n_neg 141 | class_weights = tf.where(tf.equal(gt, 1), 142 | w_pos * tf.ones_like(gt), 143 | tf.ones_like(gt)) 144 | class_weights = tf.where(tf.equal(gt, 0), 145 | w_neg * tf.ones_like(gt), 146 | class_weights) 147 | loss = loss * class_weights 148 | 149 | # Note that we use reduce_sum instead of reduce_mean since the loss has 150 | # already been normalized by class_weights in spatial dimension. 
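      # (Editor's note) With the 0/1 labels weighted to a total of 0.5 each, the
      # spatial sum is already balanced between positive and negative locations,
      # so only the batch mean is taken afterwards.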
151 | loss = tf.reduce_sum(loss, [1, 2]) 152 | 153 | batch_loss = tf.reduce_mean(loss, name='batch_loss') 154 | tf.losses.add_loss(batch_loss) 155 | 156 | total_loss = tf.losses.get_total_loss() 157 | self.batch_loss = batch_loss 158 | self.total_loss = total_loss 159 | 160 | tf.summary.image('exemplar', self.exemplars, family=self.mode) 161 | tf.summary.image('instance', self.instances, family=self.mode) 162 | 163 | mean_batch_loss, update_op1 = tf.metrics.mean(batch_loss) 164 | mean_total_loss, update_op2 = tf.metrics.mean(total_loss) 165 | with tf.control_dependencies([update_op1, update_op2]): 166 | tf.summary.scalar('batch_loss', mean_batch_loss, family=self.mode) 167 | tf.summary.scalar('total_loss', mean_total_loss, family=self.mode) 168 | 169 | if self.mode == 'train': 170 | tf.summary.image('GT', tf.reshape(gt[0], [1] + response_size + [1]), family='GT') 171 | tf.summary.image('Response', tf.expand_dims(tf.sigmoid(response), -1), family=self.mode) 172 | tf.summary.histogram('Response', self.response, family=self.mode) 173 | 174 | # Two more metrics to monitor the performance of training 175 | tf.summary.scalar('center_score_error', center_score_error(response), family=self.mode) 176 | tf.summary.scalar('center_dist_error', center_dist_error(response), family=self.mode) 177 | 178 | def setup_global_step(self): 179 | global_step = tf.Variable( 180 | initial_value=0, 181 | name='global_step', 182 | trainable=False, 183 | collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES]) 184 | 185 | self.global_step = global_step 186 | 187 | def setup_embedding_initializer(self): 188 | """Sets up the function to restore embedding variables from checkpoint.""" 189 | embed_config = self.model_config['embed_config'] 190 | if embed_config['embedding_checkpoint_file']: 191 | # Restore Siamese FC models from .mat model files 192 | initialize = load_mat_model(embed_config['embedding_checkpoint_file'], 193 | 'sa_siam/appearance_net/', 'detection/') 194 | 195 | def restore_fn(sess): 196 | tf.logging.info("Restoring embedding variables from checkpoint file %s", 197 | embed_config['embedding_checkpoint_file']) 198 | sess.run([initialize]) 199 | 200 | self.init_fn = restore_fn 201 | 202 | def build(self, reuse=False): 203 | """Creates all ops for training and evaluation""" 204 | with tf.name_scope(self.mode): 205 | self.build_inputs() 206 | self.build_image_embeddings(reuse=reuse) 207 | self.build_template() 208 | self.build_detection(reuse=reuse) 209 | self.setup_embedding_initializer() 210 | 211 | if self.mode in ['train', 'validation']: 212 | self.build_loss() 213 | 214 | if self.is_training(): 215 | self.setup_global_step() 216 | -------------------------------------------------------------------------------- /inference/inference_wrapper.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 
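# --- Editor's note: illustrative wiring (hedged sketch, variable names are ours) ---
# A typical inference setup, as suggested by inference/tracker.py:
#
#     model = InferenceWrapper()
#     restore_fn = model.build_graph_from_config(model_config, track_config, checkpoint)
#     with tf.Session() as sess:
#         restore_fn(sess)
#         tracker = Tracker(model, model_config, track_config)
#         tracker.track(sess, first_bbox, frames, logdir)
#
# The actual driver scripts (e.g. under benchmarks/OTB_Toolkit/scripts/bscripts/)
# are not shown in this section.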
9 | 10 | """Model Wrapper class for performing inference with a SiameseModel""" 11 | 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | import functools 17 | import logging 18 | logging.getLogger().setLevel(logging.INFO) 19 | import os 20 | import os.path as osp 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from embeddings.sa_siam import sa_siam_arg_scope, sa_siam 26 | from utils.infer_utils import get_exemplar_images 27 | from utils.misc_utils import get_center, get, shape_of 28 | 29 | slim = tf.contrib.slim 30 | 31 | 32 | class InferenceWrapper(): 33 | """Model wrapper class for performing inference with a siamese model.""" 34 | 35 | def __init__(self): 36 | self.image = None 37 | self.target_bbox_feed = None 38 | self.search_images = None 39 | self.embeds = None 40 | self.templates = None 41 | self.init = None 42 | self.model_config = None 43 | self.track_config = None 44 | self.response_up = None 45 | 46 | def build_graph_from_config(self, model_config, track_config, checkpoint_path): 47 | """Build the inference graph and return a restore function.""" 48 | self.build_model(model_config, track_config) 49 | ema = tf.train.ExponentialMovingAverage(0) 50 | variables_to_restore = ema.variables_to_restore(moving_avg_variables=[]) 51 | 52 | # Filter out State variables 53 | variables_to_restore_filterd = {} 54 | for key, value in variables_to_restore.items(): 55 | if key.split('/')[1] != 'State': 56 | variables_to_restore_filterd[key] = value 57 | 58 | saver = tf.train.Saver(variables_to_restore_filterd) 59 | 60 | if osp.isdir(checkpoint_path): 61 | checkpoint_path = tf.train.latest_checkpoint(checkpoint_path) 62 | if not checkpoint_path: 63 | raise ValueError("No checkpoint file found in: {}".format(checkpoint_path)) 64 | 65 | def _restore_fn(sess): 66 | logging.info("Loading model from checkpoint: %s", checkpoint_path) 67 | saver.restore(sess, checkpoint_path) 68 | logging.info("Successfully loaded checkpoint: %s", os.path.basename(checkpoint_path)) 69 | 70 | return _restore_fn 71 | 72 | def build_model(self, model_config, track_config): 73 | self.model_config = model_config 74 | self.track_config = track_config 75 | 76 | self.build_inputs() 77 | self.build_search_images() 78 | self.build_template() 79 | self.build_detection() 80 | self.build_upsample() 81 | self.dumb_op = tf.no_op('dumb_operation') 82 | 83 | def build_inputs(self): 84 | filename = tf.placeholder(tf.string, [], name='filename') 85 | image_file = tf.read_file(filename) 86 | image = tf.image.decode_jpeg(image_file, channels=3, dct_method="INTEGER_ACCURATE") 87 | image = tf.to_float(image) 88 | self.image = image 89 | self.target_bbox_feed = tf.placeholder(dtype=tf.float32, 90 | shape=[4], 91 | name='target_bbox_feed') # center's y, x, height, width 92 | 93 | def build_search_images(self): 94 | """Crop search images from the input image based on the last target position 95 | 96 | 1. The input image is scaled such that the area of target&context takes up to (scale_factor * z_image_size) ^ 2 97 | 2. Crop an image patch as large as x_image_size centered at the target center. 98 | 3. If the cropped image region is beyond the boundary of the input image, mean values are padded. 
99 | """ 100 | model_config = self.model_config 101 | track_config = self.track_config 102 | 103 | size_z = model_config['z_image_size'] 104 | size_x = track_config['x_image_size'] 105 | context_amount = 0.5 106 | 107 | num_scales = track_config['num_scales'] 108 | scales = np.arange(num_scales) - get_center(num_scales) 109 | assert np.sum(scales) == 0, 'scales should be symmetric' 110 | search_factors = [track_config['scale_step'] ** x for x in scales] 111 | 112 | frame_sz = tf.shape(self.image) 113 | target_yx = self.target_bbox_feed[0:2] 114 | target_size = self.target_bbox_feed[2:4] 115 | avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan') 116 | 117 | # Compute base values 118 | base_z_size = target_size 119 | base_z_context_size = base_z_size + context_amount * tf.reduce_sum(base_z_size) 120 | base_s_z = tf.sqrt(tf.reduce_prod(base_z_context_size)) # Canonical size 121 | base_scale_z = tf.div(tf.to_float(size_z), base_s_z) 122 | d_search = (size_x - size_z) / 2.0 123 | base_pad = tf.div(d_search, base_scale_z) 124 | base_s_x = base_s_z + 2 * base_pad 125 | base_scale_x = tf.div(tf.to_float(size_x), base_s_x) 126 | 127 | boxes = [] 128 | for factor in search_factors: 129 | s_x = factor * base_s_x 130 | frame_sz_1 = tf.to_float(frame_sz[0:2] - 1) 131 | topleft = tf.div(target_yx - get_center(s_x), frame_sz_1) 132 | bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1) 133 | box = tf.concat([topleft, bottomright], axis=0) 134 | boxes.append(box) 135 | boxes = tf.stack(boxes) 136 | 137 | scale_xs = [] 138 | for factor in search_factors: 139 | scale_x = base_scale_x / factor 140 | scale_xs.append(scale_x) 141 | self.scale_xs = tf.stack(scale_xs) 142 | 143 | # Note we use different padding values for each image 144 | # while the original implementation uses only the average value 145 | # of the first image for all images. 
146 | image_minus_avg = tf.expand_dims(self.image - avg_chan, 0) 147 | image_cropped = tf.image.crop_and_resize(image_minus_avg, boxes, 148 | box_ind=tf.zeros((track_config['num_scales']), tf.int32), 149 | crop_size=[size_x, size_x]) 150 | self.search_images = image_cropped + avg_chan 151 | 152 | def get_image_embedding(self, images, is_example, sa_siam_config, reuse=None): 153 | config = self.model_config['embed_config'] 154 | arg_scope = sa_siam_arg_scope(config, 155 | trainable=config['train_embedding'], 156 | is_training=False) 157 | 158 | @functools.wraps(sa_siam) 159 | def embedding_fn(images, is_example, sa_siam_config, reuse=False): 160 | with slim.arg_scope(arg_scope): 161 | return sa_siam(images, is_example, sa_siam_config, reuse=reuse) 162 | 163 | embed, _ = embedding_fn(images=images, is_example=is_example, sa_siam_config=sa_siam_config, reuse=reuse) 164 | return embed 165 | 166 | def build_template(self): 167 | model_config = self.model_config 168 | track_config = self.track_config 169 | 170 | # Exemplar image lies at the center of the search image in the first frame 171 | exemplar_images = get_exemplar_images(self.search_images, [track_config['x_image_size'], 172 | track_config['x_image_size']]) 173 | templates = self.get_image_embedding(exemplar_images, is_example=True, sa_siam_config=self.model_config['sa_siam_config']) 174 | center_scale = int(get_center(track_config['num_scales'])) 175 | center_template = tf.identity(templates[center_scale]) 176 | templates = tf.stack([center_template for _ in range(track_config['num_scales'])]) 177 | 178 | with tf.variable_scope('target_template'): 179 | # Store template in Variable such that we don't have to feed this template every time. 180 | with tf.variable_scope('State'): 181 | state = tf.get_variable('exemplar', 182 | initializer=tf.zeros(templates.get_shape().as_list(), dtype=templates.dtype), 183 | trainable=False) 184 | with tf.control_dependencies([templates]): 185 | self.init = tf.assign(state, templates, validate_shape=True) 186 | self.templates = state 187 | 188 | def build_detection(self): 189 | self.embeds = self.get_image_embedding(self.search_images, reuse=True, is_example=False, sa_siam_config=self.model_config['sa_siam_config']) 190 | with tf.variable_scope('detection'): 191 | def _get_mask_any(shape_mask, _u, _d, _l, _r): 192 | _mask = np.zeros(shape_mask, dtype='float32') 193 | _mask[_u:_d, _l:_r] = 1.0 194 | return _mask 195 | def _get_center_mask(shape_mask, _sz): # mask center a _sz x _sz patch 196 | _u = int((shape_mask[0] - _sz) / 2) 197 | _d = _u + _sz 198 | _l = int((shape_mask[1] - _sz) / 2) 199 | _r = _l + _sz 200 | return _get_mask_any(shape_mask, _u, _d, _l, _r) 201 | def _translation_match(x, z, mask_center=np.array([[1.0]], dtype='float32')): 202 | x = tf.expand_dims(x, 0) # [batch, in_height, in_width, in_channels] 203 | z = tf.expand_dims(z, -1) # [filter_height, filter_width, in_channels, out_channels] 204 | mask_center = tf.expand_dims(mask_center, -1) 205 | mask_center = tf.expand_dims(mask_center, -1) 206 | return tf.nn.conv2d(x, z * mask_center, strides=[1, 1, 1, 1], padding='VALID', name='translation_match') 207 | logging.info('Shape of templates: {}'.format(self.templates.shape)) 208 | logging.info('Shape of embeds: {}'.format(self.embeds.shape)) 209 | en_appearance = get(self.model_config['sa_siam_config'], 'en_appearance', False) 210 | en_semantic = get(self.model_config['sa_siam_config'], 'en_semantic', False) 211 | if en_appearance and en_semantic: 212 | c_appearance = 
get(self.model_config['sa_siam_config'], 'c_appearance', 0.3) 213 | out_scale = self.model_config['adjust_response_config']['scale'] 214 | temp_appearance, temp_semantic = tf.split(self.templates, 2, 3) 215 | inst_appearance, inst_semantic = tf.split(self.embeds, 2, 3) 216 | bias_semantic = tf.get_variable('biases_semantic', [1], 217 | dtype=tf.float32, 218 | initializer=tf.constant_initializer(0.0, dtype=tf.float32), 219 | trainable=False) 220 | bias_appearance = tf.get_variable('biases_appearance', [1], 221 | dtype=tf.float32, 222 | initializer=tf.constant_initializer(0.0, dtype=tf.float32), 223 | trainable=False) 224 | sz_feat = shape_of(temp_appearance)[1:3] # [h,w] 225 | self.mask_all = { 226 | 'keep_all': 1 - _get_center_mask(sz_feat, 0) 227 | } 228 | self.response_all = {} 229 | for k in sorted(self.mask_all.keys()): 230 | logging.info('Make match: {}'.format(k)) 231 | match_k = lambda x: _translation_match(x[0], x[1], mask_center=self.mask_all[k]) 232 | out_appearance_mask_k = tf.map_fn(match_k, (inst_appearance, temp_appearance), dtype=inst_appearance.dtype) 233 | out_semantic_mask_k = tf.map_fn(match_k, (inst_semantic, temp_semantic), dtype=inst_semantic.dtype) 234 | 235 | out_appearance_mask_k = tf.squeeze(out_appearance_mask_k, [1,4]) 236 | out_semantic_mask_k = tf.squeeze(out_semantic_mask_k, [1,4]) 237 | 238 | response_appearance_mask_k = out_scale * out_appearance_mask_k 239 | response_semantic_mask_k = out_scale * out_semantic_mask_k 240 | 241 | self.response_all[k] = (response_appearance_mask_k + bias_appearance) * c_appearance + (response_semantic_mask_k + bias_semantic) * (1-c_appearance) 242 | response = self.response_all['keep_all'] 243 | else: 244 | output = tf.map_fn( 245 | lambda x: _translation_match(x[0], x[1]), 246 | (self.embeds, self.templates), dtype=self.embeds.dtype) # of shape [16, 1, 17, 17, 1] 247 | output = tf.squeeze(output, [1, 4]) # of shape e.g. 
[16, 17, 17] 248 | bias = tf.get_variable('biases', [1], 249 | dtype=tf.float32, 250 | initializer=tf.constant_initializer(0.0, dtype=tf.float32), 251 | trainable=False) 252 | response = (self.model_config['adjust_response_config']['scale'] * output + bias) 253 | self.response = response 254 | 255 | def build_upsample(self): 256 | """Upsample response to obtain finer target position""" 257 | with tf.variable_scope('upsample'): 258 | response = tf.expand_dims(self.response, 3) 259 | up_method = self.track_config['upsample_method'] 260 | methods = {'bilinear': tf.image.ResizeMethod.BILINEAR, 261 | 'bicubic': tf.image.ResizeMethod.BICUBIC} 262 | up_method = methods[up_method] 263 | response_spatial_size = self.response.get_shape().as_list()[1:3] 264 | up_size = [s * self.track_config['upsample_factor'] for s in response_spatial_size] 265 | response_up = tf.image.resize_images(response, 266 | up_size, 267 | method=up_method, 268 | align_corners=True) 269 | response_up = tf.squeeze(response_up, [3]) 270 | self.response_up = response_up 271 | 272 | def initialize(self, sess, input_feed): 273 | image_path, target_bbox = input_feed 274 | scale_xs, _ = sess.run([self.scale_xs, self.init], 275 | feed_dict={'filename:0': image_path, 276 | "target_bbox_feed:0": target_bbox, }) 277 | return scale_xs 278 | 279 | def inference_step(self, sess, input_feed): 280 | image_path, target_bbox = input_feed 281 | log_level = self.track_config['log_level'] 282 | image_cropped_op = self.search_images if log_level > 0 else self.dumb_op 283 | image_cropped, scale_xs, response_output = sess.run( 284 | fetches=[image_cropped_op, self.scale_xs, self.response_up], 285 | feed_dict={ 286 | "filename:0": image_path, 287 | "target_bbox_feed:0": target_bbox, }) 288 | 289 | output = { 290 | 'image_cropped': image_cropped, 291 | 'scale_xs': scale_xs, 292 | 'response': response_output} 293 | return output, None 294 | -------------------------------------------------------------------------------- /embeddings/sa_siam.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright © 2017 bily Huazhong University of Science and Technology 5 | # Copyright © 2018 Anfeng He Microsoft Research Asia. University of Science and Technology of China. 6 | # Copyright (c) Microsoft. All rights reserved. 7 | # 8 | # Distributed under terms of the MIT license. 9 | 10 | """Contains definitions of the network in [1][2]. 11 | 12 | [1] Bertinetto, L., et al. (2016). 13 | "Fully-Convolutional Siamese Networks for Object Tracking." 14 | arXiv preprint arXiv:1606.09549. 15 | [2] Anfeng He, et al. (2018). 16 | "A Twofold Siamese Network for Real-Time Object Tracking." 17 | arXiv preprint arXiv:1802.08817. 18 | 19 | Typical use: 20 | 21 | import sa_siam 22 | with slim.arg_scope(sa_siam.sa_siam_arg_scope()): 23 | net, end_points = sa_siam.sa_siam(inputs, is_training=False) 24 | """ 25 | from __future__ import absolute_import 26 | from __future__ import division 27 | from __future__ import print_function 28 | 29 | import logging 30 | logging.getLogger().setLevel(logging.INFO) 31 | 32 | import tensorflow as tf 33 | 34 | from utils.misc_utils import get, shape_of, same_hw 35 | from functools import reduce 36 | 37 | slim = tf.contrib.slim 38 | 39 | 40 | def sa_siam_arg_scope(embed_config, 41 | trainable=True, 42 | is_training=False): 43 | """Defines the default arg scope. 
44 | 45 | Args: 46 | embed_config: A dictionary which contains configurations for the embedding function. 47 | trainable: If the weights in the embedding function is trainable. 48 | is_training: If the embedding function is built for training. 49 | 50 | Returns: 51 | An `arg_scope` to use for the SA-Siam models. 52 | """ 53 | # Only consider the model to be in training mode if it's trainable. 54 | # This is vital for batch_norm since moving_mean and moving_variance 55 | # will get updated even if not trainable. 56 | is_model_training = trainable and is_training 57 | 58 | if get(embed_config, 'use_bn', True): 59 | batch_norm_scale = get(embed_config, 'bn_scale', True) 60 | batch_norm_decay = 1 - get(embed_config, 'bn_momentum', 3e-4) 61 | batch_norm_epsilon = get(embed_config, 'bn_epsilon', 1e-6) 62 | batch_norm_params = { 63 | "scale": batch_norm_scale, 64 | # Decay for the moving averages. 65 | "decay": batch_norm_decay, 66 | # Epsilon to prevent 0s in variance. 67 | "epsilon": batch_norm_epsilon, 68 | "trainable": trainable, 69 | "is_training": is_model_training, 70 | # Collection containing the moving mean and moving variance. 71 | "variables_collections": { 72 | "beta": None, 73 | "gamma": None, 74 | "moving_mean": ["moving_vars"], 75 | "moving_variance": ["moving_vars"], 76 | }, 77 | 'updates_collections': None, # Ensure that updates are done within a frame 78 | } 79 | normalizer_fn = slim.batch_norm 80 | else: 81 | batch_norm_params = {} 82 | normalizer_fn = None 83 | 84 | weight_decay = get(embed_config, 'weight_decay', 5e-4) 85 | if trainable: 86 | weights_regularizer = slim.l2_regularizer(weight_decay) 87 | else: 88 | weights_regularizer = None 89 | 90 | init_method = get(embed_config, 'init_method', None) 91 | if is_model_training: 92 | logging.info('embedding init method -- {}'.format(init_method)) 93 | if init_method == 'kaiming_normal': 94 | # The same setting as siamese-fc 95 | initializer = slim.variance_scaling_initializer(factor=2.0, mode='FAN_OUT', uniform=False) 96 | else: 97 | initializer = slim.xavier_initializer() 98 | 99 | with slim.arg_scope( 100 | [slim.conv2d], 101 | weights_regularizer=weights_regularizer, 102 | weights_initializer=initializer, 103 | padding='VALID', 104 | trainable=trainable, 105 | activation_fn=tf.nn.relu, 106 | normalizer_fn=normalizer_fn, 107 | normalizer_params=batch_norm_params): 108 | with slim.arg_scope([slim.batch_norm], **batch_norm_params): 109 | with slim.arg_scope([slim.batch_norm], is_training=is_model_training) as arg_sc: 110 | return arg_sc 111 | def appearance_net(layer_in): 112 | logging.info('Building Appearence branch of SA-Siam') 113 | layers_all = [] 114 | layer_cur = slim.conv2d(layer_in, 96, [11, 11], 2, scope='conv1') 115 | layer_cur = slim.max_pool2d(layer_cur, [3, 3], 2, scope='pool1') 116 | layers_all.append(layer_cur) 117 | with tf.variable_scope('conv2'): 118 | b1, b2 = tf.split(layer_cur, 2, 3) 119 | b1 = slim.conv2d(b1, 128, [5, 5], scope='b1') 120 | # The original implementation has bias terms for all convolution, but 121 | # it actually isn't necessary if the convolution layer is followed by a batch 122 | # normalization layer since batch norm will subtract the mean. 
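    # (Editor's note) The b1/b2 split mirrors the two-group (AlexNet-style) convolution
    # of the SiamFC backbone; the corresponding per-branch weights are restored in
    # utils/train_utils.load_mat_model().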
123 | b2 = slim.conv2d(b2, 128, [5, 5], scope='b2') 124 | layer_cur = tf.concat([b1, b2], 3) 125 | layer_cur = slim.max_pool2d(layer_cur, [3, 3], 2, scope='pool2') 126 | layers_all.append(layer_cur) 127 | layer_cur = slim.conv2d(layer_cur, 384, [3, 3], 1, scope='conv3') 128 | layers_all.append(layer_cur) 129 | with tf.variable_scope('conv4'): 130 | b1, b2 = tf.split(layer_cur, 2, 3) 131 | b1 = slim.conv2d(b1, 192, [3, 3], 1, scope='b1') 132 | b2 = slim.conv2d(b2, 192, [3, 3], 1, scope='b2') 133 | layer_cur = tf.concat([b1, b2], 3) 134 | layers_all.append(layer_cur) 135 | # Conv 5 with only convolution 136 | with tf.variable_scope('conv5'): 137 | with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None): 138 | b1, b2 = tf.split(layer_cur, 2, 3) 139 | b1 = slim.conv2d(b1, 128, [3, 3], 1, scope='b1') 140 | b2 = slim.conv2d(b2, 128, [3, 3], 1, scope='b2') 141 | layer_cur = tf.concat([b1, b2], 3) 142 | layers_all.append(layer_cur) 143 | return layer_cur, layers_all 144 | 145 | def semantic_net(layer_in): 146 | logging.info('Building Semantic branch of SA-Siam..') 147 | layers_all = [] 148 | with slim.arg_scope([slim.conv2d], normalizer_fn=None, trainable=False, normalizer_params=False): 149 | layer_cur = layer_in - [123.0,117.0,104.0] # RGB sub mean 150 | layer_cur = tf.reverse(layer_cur,[3]) # convert img to BGR 151 | layer_cur = slim.conv2d(layer_cur, 96, [11, 11], 2, scope='conv1') 152 | layer_cur = slim.max_pool2d(layer_cur, [3, 3], 2, scope='pool1') 153 | layer_cur = tf.nn.local_response_normalization(layer_cur,depth_radius=2,alpha=2e-5,beta=0.75,bias=1.0,name='norm1') 154 | layers_all.append(layer_cur) 155 | with tf.variable_scope('conv2'): 156 | b1, b2 = tf.split(layer_cur, 2, 3) 157 | b1 = slim.conv2d(b1, 128, [5, 5], scope='b1') 158 | b2 = slim.conv2d(b2, 128, [5, 5], scope='b2') 159 | layer_cur = tf.concat([b1, b2], 3) 160 | layer_cur = slim.max_pool2d(layer_cur, [3, 3], 2, scope='pool2') 161 | layer_cur = tf.nn.local_response_normalization(layer_cur,depth_radius=2,alpha=2e-5,beta=0.75,bias=1.0,name='norm2') 162 | layers_all.append(layer_cur) 163 | layer_cur = slim.conv2d(layer_cur, 384, [3, 3], 1, scope='conv3') 164 | layers_all.append(layer_cur) 165 | with tf.variable_scope('conv4'): 166 | b1, b2 = tf.split(layer_cur, 2, 3) 167 | b1 = slim.conv2d(b1, 192, [3, 3], 1, scope='b1') 168 | b2 = slim.conv2d(b2, 192, [3, 3], 1, scope='b2') 169 | layer_cur = tf.concat([b1, b2], 3) 170 | layers_all.append(layer_cur) 171 | # Conv 5 with only convolution 172 | with tf.variable_scope('conv5'): 173 | with slim.arg_scope([slim.conv2d],activation_fn=tf.nn.relu): 174 | b1, b2 = tf.split(layer_cur, 2, 3) 175 | b1 = slim.conv2d(b1, 128, [3, 3], 1, scope='b1') 176 | b2 = slim.conv2d(b2, 128, [3, 3], 1, scope='b2') 177 | layer_cur = tf.concat([b1, b2], 3) 178 | layers_all.append(layer_cur) 179 | return layer_cur, layers_all 180 | def combine_sa_net(a_net, s_net): 181 | all_feat = a_net + s_net 182 | assert(all(list(map(same_hw, all_feat)))) 183 | max_feat_size = max(list(map(lambda a: shape_of(a)[1], all_feat))) 184 | logging.info('Max_feat_size={}'.format(max_feat_size)) 185 | def pad_feat(feat): 186 | if max_feat_size is None and shape_of(feat)[1] is None: 187 | return feat 188 | pad_size = max_feat_size - shape_of(feat)[1] 189 | pad_l = pad_size // 2 190 | pad_r = pad_size - pad_l 191 | return tf.pad(feat,[[0,0],[pad_l,pad_r],[pad_l,pad_r],[0,0]]) 192 | all_feat = list(map(pad_feat, all_feat)) 193 | return tf.concat(all_feat, axis=3) 194 | 195 | def sa_siam(inputs, 196 | is_example, 
197 | sa_siam_config={}, 198 | reuse=None, 199 | scope='sa_siam'): 200 | en_appearance = get(sa_siam_config, 'en_appearance', False) 201 | en_semantic = get(sa_siam_config, 'en_semantic', False) 202 | n_out = get(sa_siam_config, 'n_out', 256) 203 | all_combine_layers_appearance = get(sa_siam_config, 'all_combine_layers_appearance', {'conv5':1.0}) 204 | all_combine_layers_semantic = get(sa_siam_config, 'all_combine_layers_semantic', {'conv5':1.0, 'conv4':0.1}) 205 | sz_conv5_z = get(sa_siam_config, 'sz_conv5_z', 6) 206 | en_semantic_att = get(sa_siam_config, 'en_semantic_att', True) 207 | 208 | with tf.variable_scope(scope, 'sa_siam', [inputs], reuse=reuse) as sc: 209 | end_points_collection = sc.name + '_end_points' 210 | with slim.arg_scope([slim.conv2d, slim.max_pool2d], 211 | outputs_collections=end_points_collection): 212 | def proc_raw_all_feat(feat, is_appearance, n_out_cur, all_combine_layers): 213 | res = [] 214 | max_feat_size = 0 215 | for l in range(1,6): 216 | for k in all_combine_layers.keys(): 217 | if k.find(str(l)) != -1: 218 | if shape_of(feat[l-1])[3] is None: 219 | res.append(feat[l-1]) 220 | break 221 | if l == 5 and is_appearance and abs(n_out_cur - shape_of(feat[l-1])[3]) < 0.1: 222 | res.append(feat[l-1]) 223 | else: 224 | if not is_appearance: 225 | feat[l-1] *= all_combine_layers[k] # Multiply by the combine weight to aid convergence during training 226 | with slim.arg_scope([slim.conv2d],activation_fn=None, normalizer_fn=None): 227 | c1x1 = slim.conv2d(feat[l-1], n_out_cur, [1,1], 1, scope='c1x1_' + k) 228 | res.append(c1x1) 229 | logging.info('Keep {} .. is_appearance={} shape={}'.format(k,is_appearance,shape_of(res[-1]))) 230 | return res 231 | def re_weight_crop(feat, all_combine_layers, only_crop=False): 232 | feat_shape = list(map(shape_of, feat)) 233 | res = [] 234 | for l in range(1,6): # process layers 1 to 5 in order 235 | for k in all_combine_layers.keys(): # find the corresponding layer in all layers 236 | if k.find(str(l)) != -1: 237 | logging.info('For layer {} ...'.format(k)) 238 | cur_ly_idx = l - 1 239 | if feat_shape[cur_ly_idx][2] is None and feat_shape[4][2] is None: 240 | res.append(feat[cur_ly_idx]) 241 | break 242 | pad_val = feat_shape[cur_ly_idx][2] - feat_shape[4][2] 243 | sz_conv5_z_cur = pad_val + sz_conv5_z 244 | sz_conv5_x_cur = feat_shape[cur_ly_idx][2] 245 | n_left = int((sz_conv5_x_cur - sz_conv5_z_cur) / 2 + 0.5) 246 | div_left_st = [0, n_left, n_left + sz_conv5_z_cur, sz_conv5_x_cur] 247 | logging.info('.. Crop as {}'.format(div_left_st)) # crop 9 patches and max pool each patch 248 | if not only_crop: 249 | all_max = [] 250 | for j in [0,1,2]: 251 | for i in [0,1,2]: 252 | l_crop = div_left_st[i] 253 | r_crop = div_left_st[i + 1] 254 | u_crop = div_left_st[j] 255 | d_crop = div_left_st[j+1] 256 | max_patch = tf.reduce_max(feat[cur_ly_idx][:, u_crop:d_crop, l_crop:r_crop, :], axis=[1, 2]) #shape = [n, c] 257 | all_max.append(max_patch) 258 | max_map = tf.stack(all_max, axis=2) #shape = [n, c, 9] 259 | logging.info('.. Max_map.shape = {}'.format(max_map.shape)) 260 | max_map = slim.fully_connected(max_map, 9, scope='att_fc1_' + k) # fully_connected is applied only to the last dim 261 | logging.info('.. Max_map_fc1.shape = {}'.format(max_map.shape)) 262 | max_map = slim.fully_connected(max_map, 1, scope='att_fc2_' + k, activation_fn=None, normalizer_fn=None,) 263 | logging.info('.. Max_map_fc2.shape = {}'.format(max_map.shape)) # shape = [n, c, 1]
264 | att_map = tf.reshape(max_map, [-1, 1, 1, feat_shape[cur_ly_idx][3]]) 265 | logging.info('.. att_map.shape = {}'.format(att_map.shape)) 266 | att_map = tf.sigmoid(att_map) + 0.5 # important bias to avoid losing too much feature information 267 | feat[cur_ly_idx] = att_map * feat[cur_ly_idx] 268 | feat[cur_ly_idx] = feat[cur_ly_idx][:, div_left_st[1]:div_left_st[2], div_left_st[1]:div_left_st[2], :] # crop center feat 269 | res.append(feat[cur_ly_idx]) 270 | break 271 | else: 272 | res.append(None) 273 | return res 274 | layer_cur = inputs 275 | if en_appearance: 276 | n_out_appearance = n_out / len(all_combine_layers_appearance.keys()) 277 | with tf.variable_scope('appearance_net'): 278 | _, feat_appearance_all = appearance_net(layer_cur) 279 | if is_example: 280 | feat_appearance_all = re_weight_crop(feat_appearance_all, all_combine_layers_appearance, only_crop=True) 281 | net_appearance = proc_raw_all_feat(feat_appearance_all, is_appearance=True, n_out_cur=n_out_appearance, all_combine_layers=all_combine_layers_appearance) 282 | if en_semantic: 283 | n_out_semantic = n_out / len(all_combine_layers_semantic.keys()) 284 | with tf.variable_scope('semantic_net'): 285 | _, feat_semantic_all = semantic_net(layer_cur) 286 | if is_example: 287 | feat_semantic_all = re_weight_crop(feat_semantic_all, all_combine_layers_semantic, only_crop=not en_semantic_att) 288 | net_semantic = proc_raw_all_feat(feat_semantic_all, is_appearance=False, n_out_cur=n_out_semantic, all_combine_layers=all_combine_layers_semantic) 289 | if en_appearance and en_semantic: 290 | layer_cur = combine_sa_net(net_appearance, net_semantic) 291 | elif en_appearance: layer_cur = combine_sa_net(net_appearance, []) 292 | elif en_semantic: layer_cur = combine_sa_net(net_semantic, []) 293 | else: raise ValueError('At least one of the semantic and appearance branches must be enabled!') 294 | # Convert end_points_collection into a dictionary of end_points. 295 | end_points = slim.utils.convert_collection_to_dict(end_points_collection) 296 | return layer_cur, end_points 297 | 298 | sa_siam.stride = 8 299 | --------------------------------------------------------------------------------
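A minimal usage sketch for embeddings/sa_siam.py, following the "Typical use" note in its docstring: it shows how sa_siam_arg_scope and sa_siam could be combined to embed an exemplar/search pair with shared weights. The patch sizes (127/255), placeholder names, and config values below are illustrative assumptions, not the project's actual settings; those come from configuration.py and the experiment scripts.

    import tensorflow as tf
    from embeddings import sa_siam

    slim = tf.contrib.slim

    # Illustrative configs only; real values are defined in configuration.py.
    embed_config = {'use_bn': True, 'weight_decay': 5e-4}
    sa_siam_config = {'en_appearance': True, 'en_semantic': True, 'n_out': 256}

    # Assumed SiamFC-style patch sizes for exemplar and search images.
    exemplars = tf.placeholder(tf.float32, [None, 127, 127, 3], name='exemplars')
    instances = tf.placeholder(tf.float32, [None, 255, 255, 3], name='instances')

    with slim.arg_scope(sa_siam.sa_siam_arg_scope(embed_config,
                                                  trainable=False,
                                                  is_training=False)):
        # Exemplar branch: is_example=True enables center cropping / channel attention.
        templates, _ = sa_siam.sa_siam(exemplars, is_example=True,
                                       sa_siam_config=sa_siam_config, scope='sa_siam')
        # Search branch reuses the variables created by the exemplar branch.
        search_feat, _ = sa_siam.sa_siam(instances, is_example=False,
                                         sa_siam_config=sa_siam_config,
                                         scope='sa_siam', reuse=True)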