├── ILSVRC15-curation ├── gen_image_crops_VID.py └── gen_imdb_VID.py ├── LICENSE ├── README.md ├── Tracking ├── Config.py ├── SiamNet.py ├── Tracking_Utils.py └── run_SiamFC.py ├── Train ├── Config.py ├── DataAugmentation.py ├── SiamNet.py ├── Utils.py ├── VIDDataset.py ├── model │ └── SiamFC_50_model.pth └── run_Train_SiamFC.py └── imgs └── result.PNG /ILSVRC15-curation/gen_image_crops_VID.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Written by Heng Fan 3 | ''' 4 | import numpy as np 5 | import os 6 | import glob 7 | import xml.etree.ElementTree as ET 8 | import cv2 9 | import datetime 10 | 11 | 12 | ''' 13 | # default setting for cropping 14 | ''' 15 | examplar_size = 127.0 16 | instance_size = 255.0 17 | context_amount = 0.5 18 | 19 | 20 | def get_subwindow_avg(im, pos, model_sz, original_sz): 21 | ''' 22 | # obtain image patch, padding with avg channel if area goes outside of border 23 | ''' 24 | avg_chans = [np.mean(im[:, :, 0]), np.mean(im[:, :, 1]), np.mean(im[:, :, 2])] 25 | 26 | if original_sz is None: 27 | original_sz = model_sz 28 | 29 | sz = original_sz 30 | im_sz = im.shape 31 | # make sure the size is not too small 32 | assert (im_sz[0] > 2) & (im_sz[1] > 2), "The size of image is too small!" 33 | c = (sz + 1) / 2 34 | 35 | # check out-of-bounds coordinates, and set them to black 36 | context_xmin = round(pos[1] - c) # floor(pos(2) - sz(2) / 2); 37 | context_xmax = context_xmin + sz - 1 38 | context_ymin = round(pos[0] - c) # floor(pos(1) - sz(1) / 2); 39 | context_ymax = context_ymin + sz - 1 40 | left_pad = max(0, 1 - context_xmin) # in python, index starts from 0 41 | top_pad = max(0, 1 - context_ymin) 42 | right_pad = max(0, context_xmax - im_sz[1]) 43 | bottom_pad = max(0, context_ymax - im_sz[0]) 44 | 45 | context_xmin = context_xmin + left_pad 46 | context_xmax = context_xmax + left_pad 47 | context_ymin = context_ymin + top_pad 48 | context_ymax = context_ymax + top_pad 49 | 50 | im_R = im[:, :, 0] 51 | im_G = im[:, :, 1] 52 | im_B = im[:, :, 2] 53 | 54 | # padding 55 | if (top_pad != 0) | (bottom_pad != 0) | (left_pad != 0) | (right_pad != 0): 56 | im_R = np.pad(im_R, ((int(top_pad), int(bottom_pad)), (int(left_pad), int(right_pad))), 'constant', 57 | constant_values=avg_chans[0]) 58 | im_G = np.pad(im_G, ((int(top_pad), int(bottom_pad)), (int(left_pad), int(right_pad))), 'constant', 59 | constant_values=avg_chans[1]) 60 | im_B = np.pad(im_B, ((int(top_pad), int(bottom_pad)), (int(left_pad), int(right_pad))), 'constant', 61 | constant_values=avg_chans[2]) 62 | 63 | im = np.stack((im_R, im_G, im_B), axis=2) 64 | 65 | im_patch_original = im[int(context_ymin) - 1:int(context_ymax), int(context_xmin) - 1:int(context_xmax), :] 66 | 67 | if model_sz != original_sz: 68 | im_patch = cv2.resize(im_patch_original, (int(model_sz), int(model_sz)), interpolation=cv2.INTER_CUBIC) 69 | else: 70 | im_patch = im_patch_original 71 | 72 | return im_patch 73 | 74 | 75 | def get_crops(img, bbox, size_z, size_x, context_amount): 76 | ''' 77 | # get examplar and search region crops 78 | ''' 79 | cx = bbox[0] + bbox[2]/2 80 | cy = bbox[1] + bbox[3]/2 81 | w = bbox[2] 82 | h = bbox[3] 83 | 84 | # for examplar 85 | wc_z = w + context_amount * (w + h) 86 | hc_z = h + context_amount * (w + h) 87 | s_z = np.sqrt(wc_z * hc_z) 88 | scale_z = size_z / s_z 89 | im_crop_z = get_subwindow_avg(img, np.array([cy, cx]), size_z, round(s_z)) 90 | 91 | # for search region 92 | d_search = (size_x - size_z) / 2 93 | pad = d_search / scale_z 94 | s_x = s_z + 
2 * pad 95 | scale_x = size_x / s_x 96 | im_crop_x = get_subwindow_avg(img, np.array([cy, cx]), size_x, round(s_x)) 97 | 98 | return im_crop_z, im_crop_x 99 | 100 | 101 | def generate_image_crops(vid_root_path, vid_curated_path): 102 | ''' 103 | # save image crops to the vid_curated_path 104 | ''' 105 | anno_str = "Annotations/VID/train/" 106 | data_str = "Data/VID/train/" 107 | vid_anno_path = os.path.join(vid_root_path, anno_str) 108 | vid_data_path = os.path.join(vid_root_path, data_str) 109 | 110 | cur_procesed_fraem = 0 111 | start_time = datetime.datetime.now() 112 | total_time = 0 113 | 114 | # dirs of level1: e.g., a/, b/, ... 115 | all_dirs_level1 = os.listdir(vid_anno_path) 116 | for i in range(len(all_dirs_level1)): 117 | all_dirs_level2 = os.listdir(os.path.join(vid_anno_path, all_dirs_level1[i])) 118 | 119 | # dirs of level2: e.g., a/ILSVRC2015_train_00000000/, a/ILSVRC2015_train_00001000/, ... 120 | for j in range(len(all_dirs_level2)): 121 | frame_list = glob.glob(os.path.join(vid_anno_path, all_dirs_level1[i], all_dirs_level2[j], "*.xml")) 122 | frame_list.sort() 123 | 124 | # level3: frame level 125 | for k in range(len(frame_list)): 126 | frame_xml_name = os.path.join(vid_anno_path, all_dirs_level1[i], all_dirs_level2[j], frame_list[k]) 127 | frame_xml_tree = ET.parse(frame_xml_name) 128 | frame_xml_root = frame_xml_tree.getroot() 129 | 130 | # image file path 131 | frame_img_name = (frame_list[k].replace(".xml", ".JPEG")).replace(vid_anno_path, vid_data_path) 132 | img = cv2.imread(frame_img_name) 133 | if img is None: 134 | print("Cannot find %s!"%frame_img_name) 135 | exit(0) 136 | 137 | # image file name 138 | frame_filename = frame_xml_root.find('filename').text 139 | 140 | # process (all objects in) each frame 141 | for object in frame_xml_root.iter("object"): 142 | # get trackid 143 | id = object.find("trackid").text 144 | 145 | # get bounding box 146 | bbox_node = object.find("bndbox") 147 | xmax = float(bbox_node.find('xmax').text) 148 | xmin = float(bbox_node.find('xmin').text) 149 | ymax = float(bbox_node.find('ymax').text) 150 | ymin = float(bbox_node.find('ymin').text) 151 | width = xmax - xmin + 1 152 | height = ymax - ymin + 1 153 | bbox = np.array([xmin, ymin, width, height]) 154 | 155 | # print("processing %s, %s, %s, %s ..." % (all_dirs_level1[i], all_dirs_level2[j], frame_filename+".JPEG", id)) 156 | 157 | # get crops 158 | im_crop_z, im_crop_x = get_crops(img, bbox, examplar_size, instance_size, context_amount) 159 | 160 | # save crops 161 | save_path = os.path.join(vid_curated_path, data_str, all_dirs_level1[i], all_dirs_level2[j]) 162 | if not os.path.exists(save_path): 163 | os.makedirs(save_path) 164 | 165 | savename_crop_z = os.path.join(save_path, '{}.{:02d}.crop.z.jpg'.format(frame_filename, int(id))) 166 | savename_crop_x = os.path.join(save_path, '{}.{:02d}.crop.x.jpg'.format(frame_filename, int(id))) 167 | 168 | cv2.imwrite(savename_crop_z, im_crop_z, [int(cv2.IMWRITE_JPEG_QUALITY), 90]) 169 | cv2.imwrite(savename_crop_x, im_crop_x, [int(cv2.IMWRITE_JPEG_QUALITY), 90]) 170 | 171 | cur_procesed_fraem = cur_procesed_fraem + 1 172 | 173 | if cur_procesed_fraem % 1000 == 0: 174 | end_time = datetime.datetime.now() 175 | total_time = total_time + int((end_time-start_time).seconds) 176 | print("finished processing %d frames in %d seconds (FPS: %d ) ..." 
% (cur_procesed_fraem, total_time, int(1000/(end_time-start_time).seconds))) 177 | start_time = datetime.datetime.now() 178 | 179 | 180 | if __name__ == "__main__": 181 | # path to your VID dataset 182 | vid_root_path = "/home/hfan/Dataset/ILSVRC2015" 183 | vid_curated_path = "/home/hfan/Dataset/ILSVRC2015_crops" 184 | if not os.path.exists(vid_curated_path): 185 | os.mkdir(vid_curated_path) 186 | generate_image_crops(vid_root_path, vid_curated_path) 187 | -------------------------------------------------------------------------------- /ILSVRC15-curation/gen_imdb_VID.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Written by Heng Fan 3 | ''' 4 | import numpy as np 5 | import os 6 | import glob 7 | import xml.etree.ElementTree as ET 8 | import json 9 | 10 | 11 | validation_ratio = 0.1 12 | 13 | 14 | def generate_image_imdb(vid_root_path, vid_curated_path): 15 | ''' 16 | # save image crops to the vid_curated_path 17 | ''' 18 | anno_str = "Annotations/VID/train/" 19 | data_str = "Data/VID/train/" 20 | vid_anno_path = os.path.join(vid_root_path, anno_str) 21 | vid_data_path = os.path.join(vid_root_path, data_str) 22 | 23 | num_videos = 0 24 | 25 | # dirs of level1: e.g., a/, b/, ... 26 | all_dirs_level1 = os.listdir(vid_anno_path) 27 | 28 | for i in range(len(all_dirs_level1)): 29 | all_dirs_level2 = os.listdir(os.path.join(vid_anno_path, all_dirs_level1[i])) 30 | num_videos = num_videos + len(all_dirs_level2) 31 | 32 | train_video_num = round(num_videos * (1-validation_ratio)) 33 | val_video_num = num_videos - train_video_num 34 | 35 | imdb_video_train = dict() 36 | imdb_video_train['num_videos'] = train_video_num 37 | imdb_video_train['data_str'] = data_str 38 | 39 | imdb_video_val = dict() 40 | imdb_video_val['num_videos'] = val_video_num 41 | imdb_video_val['data_str'] = data_str 42 | 43 | videos_train = dict() 44 | videos_val = dict() 45 | 46 | vid_idx = 0 47 | 48 | for i in range(len(all_dirs_level1)): 49 | all_dirs_level2 = os.listdir(os.path.join(vid_anno_path, all_dirs_level1[i])) 50 | 51 | # dirs of level2: e.g., a/ILSVRC2015_train_00000000/, a/ILSVRC2015_train_00001000/, ... 52 | for j in range(len(all_dirs_level2)): 53 | 54 | if vid_idx < train_video_num: 55 | if not videos_train.has_key(all_dirs_level2[j]): 56 | videos_train[all_dirs_level2[j]] = [] 57 | else: 58 | if not videos_val.has_key(all_dirs_level2[j]): 59 | videos_val[all_dirs_level2[j]] = [] 60 | 61 | 62 | 63 | frame_list = glob.glob(os.path.join(vid_anno_path, all_dirs_level1[i], all_dirs_level2[j], "*.xml")) 64 | frame_list.sort() 65 | 66 | video_ids = dict() # store frame level information 67 | 68 | # level3: frame level 69 | for k in range(len(frame_list)): 70 | # read xml file 71 | frame_id = k 72 | frame_xml_name = os.path.join(vid_anno_path, all_dirs_level1[i], all_dirs_level2[j], frame_list[k]) 73 | frame_xml_tree = ET.parse(frame_xml_name) 74 | frame_xml_root = frame_xml_tree.getroot() 75 | 76 | # crop_path = os.path.join(vid_curated_path, data_str, all_dirs_level1[i], all_dirs_level2[j]) 77 | crop_path = os.path.join(all_dirs_level1[i], all_dirs_level2[j]) 78 | frame_filename = frame_xml_root.find('filename').text 79 | 80 | print ("processing: %s, %s, %s ..." 
% (all_dirs_level1[i], all_dirs_level2[j], frame_filename)) 81 | 82 | for object in frame_xml_root.iter("object"): 83 | # get trackid 84 | id = object.find("trackid").text 85 | if not video_ids.has_key(id): 86 | video_ids[id] = [] 87 | # get bounding box 88 | bbox_node = object.find("bndbox") 89 | xmax = float(bbox_node.find('xmax').text) 90 | xmin = float(bbox_node.find('xmin').text) 91 | ymax = float(bbox_node.find('ymax').text) 92 | ymin = float(bbox_node.find('ymin').text) 93 | width = xmax - xmin + 1 94 | height = ymax - ymin + 1 95 | bbox = np.array([xmin, ymin, width, height]) 96 | 97 | tmp_instance = dict() 98 | tmp_instance['instance_path'] = os.path.join(all_dirs_level1[i], all_dirs_level2[j], '{}.{:02d}.crop.x.jpg'.format(frame_filename, int(id))) 99 | tmp_instance['bbox'] =bbox.tolist() 100 | 101 | video_ids[id].append(tmp_instance) 102 | 103 | # delete the object_id with less than 1 frame 104 | tmp_keys = video_ids.keys() 105 | for ki in range(len(tmp_keys)): 106 | if len(video_ids[tmp_keys[ki]]) < 2: 107 | del video_ids[tmp_keys[ki]] 108 | 109 | tmp_keys = video_ids.keys() 110 | 111 | if len(tmp_keys) > 0: 112 | 113 | if vid_idx < train_video_num: 114 | videos_train[all_dirs_level2[j]].append(video_ids) 115 | else: 116 | videos_val[all_dirs_level2[j]].append(video_ids) 117 | 118 | vid_idx = vid_idx + 1 119 | 120 | imdb_video_train['videos'] = videos_train 121 | imdb_video_val['videos'] = videos_val 122 | 123 | # save imdb information 124 | json.dump(imdb_video_train, open('imdb_video_train.json', 'w'), indent=2) 125 | json.dump(imdb_video_val, open('imdb_video_val.json', 'w'), indent=2) 126 | 127 | 128 | if __name__ == "__main__": 129 | vid_root_path = "/home/hfan/Dataset/ILSVRC2015" 130 | vid_curated_path = "/home/hfan/Dataset/ILSVRC2015_crops" 131 | generate_image_imdb(vid_root_path, vid_curated_path) 132 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## SiamFC-PyTorch 2 | This is a PyTorch (0.4.0) implementation of the SiamFC tracker [1], which was originally implemented in MATLAB using MatConvNet [2]. Our implementation obtains better performance than the original one. 3 | 4 | ## Goal 5 | 6 | * A more compact implementation of SiamFC [1] 7 | * Reproduce the results of SiamFC [1], including data generation, training and tracking 8 | 9 | ## Requirements 10 | 11 | * Python 2.7 (I use Anaconda 2.* here. If you use Python 3, you may get very different results!) 12 | * Python-opencv 13 | * PyTorch 0.4.0 14 | * other common packages such as `numpy`, etc. 15 | 16 | ## Data curation 17 | 18 | * Download ILSVRC15 and unzip it (let's assume that `$ILSVRC2015_Root` is the path to your ILSVRC2015) 19 | * Move `$ILSVRC2015_Root/Data/VID/val` into `$ILSVRC2015_Root/Data/VID/train/`, so we have five sub-folders in `$ILSVRC2015_Root/Data/VID/train/` 20 | * It is a good idea to change the names of the five sub-folders in `$ILSVRC2015_Root/Data/VID/train/` to `a`, `b`, `c`, `d`, and `e` 21 | * Move `$ILSVRC2015_Root/Annotations/VID/val` into `$ILSVRC2015_Root/Annotations/VID/train/`, so we have five sub-folders in `$ILSVRC2015_Root/Annotations/VID/train/` 22 | * Change the names of the five sub-folders in `$ILSVRC2015_Root/Annotations/VID/train/` to `a`, `b`, `c`, `d` and `e`, respectively 23 | 24 | * Generate image crops 25 | * cd `$SiamFC-PyTorch/ILSVRC15-curation/` (assume you've downloaded the repo and its path is `$SiamFC-PyTorch`) 26 | * change `vid_curated_path` in `gen_image_crops_VID.py` to the path where you want to save your crops 27 | * run `$python gen_image_crops_VID.py` (I run it in PyCharm), then you can check the cropped images in your saving path (i.e., `vid_curated_path`) 28 | 29 | * Generate imdb for training and validation 30 | * cd `$SiamFC-PyTorch/ILSVRC15-curation/` 31 | * change `vid_root_path` and `vid_curated_path` to your custom paths in `gen_imdb_VID.py` 32 | * run `$python gen_imdb_VID.py`, then you will get two json files, `imdb_video_train.json` (~430MB) and `imdb_video_val.json` (~28MB), in the current folder, which are used for training and validation (a quick sanity check is sketched below) 33 | 
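If you want a quick sanity check of the generated imdb files, a minimal sketch (assuming the two json files produced by `gen_imdb_VID.py` sit in the current folder) is:

```python
import json

# minimal sanity check of the imdb produced by gen_imdb_VID.py
imdb = json.load(open("imdb_video_train.json", "r"))
print(imdb["num_videos"])   # number of training videos
print(imdb["data_str"])     # relative data prefix, i.e. "Data/VID/train/"

# imdb["videos"] maps a video folder name to a list of per-track dicts;
# each track id maps to a list of {"instance_path", "bbox"} entries
first_video = sorted(imdb["videos"].keys())[0]
print(first_video, len(imdb["videos"][first_video]))
```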
34 | ## Train 35 | 36 | * cd `$SiamFC-PyTorch/Train/` 37 | * Change `data_dir`, `train_imdb` and `val_imdb` to your crop path and your training and validation json files 38 | * run `$python run_Train_SiamFC.py` 39 | * some notes on training 40 | * the parameters for training are in `Config.py` 41 | * by default, I use the GPU for training, and you can check the details in the function `train(data_dir, train_imdb, val_imdb, model_save_path="./model/", use_gpu=True)` (a calling sketch is shown after this section) 42 | * by default, the trained models will be saved to `$SiamFC-PyTorch/Train/model/` 43 | * each epoch (50 in total) may take 7-8 minutes (Nvidia 1080 GPU), and you can use the parallel utilities in PyTorch to speed this up 44 | * I tried to use fixed random seeds to get reproducible results, but it didn't work, so the results of each training run may be slightly different (still better than the original) 45 | * only color images are used for training, and better performance is expected if using color+gray images as in the original paper 46 | 
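If you prefer not to edit the script, a minimal sketch of calling the training entry point directly is shown below; it assumes `train()` is importable from `run_Train_SiamFC.py`, and all paths are placeholders for your own setup.

```python
# minimal sketch: call the training entry point with custom paths
# (assumes train() is defined at module level in run_Train_SiamFC.py;
#  the paths below are placeholders)
from run_Train_SiamFC import train

data_dir = "/path/to/ILSVRC2015_crops/Data/VID/train/"
train_imdb = "/path/to/imdb_video_train.json"
val_imdb = "/path/to/imdb_video_val.json"

train(data_dir, train_imdb, val_imdb, model_save_path="./model/", use_gpu=True)
```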
47 | ## Test (Tracking) 48 | 49 | * cd `$SiamFC-PyTorch/Tracking/` 50 | * First, take a look at `Config.py`, which contains all parameters for tracking 51 | * Change `self.net_base_path` to the path where your trained models are saved 52 | * Change `self.seq_base_path` to the path storing your test sequences (OTB format; otherwise you need to revise the function `load_sequence()` in `Tracking_Utils.py`) 53 | * Change `self.net` to indicate which model you want for evaluation (by default, the last one); I've uploaded a trained model `SiamFC_50_model.pth` in this repo (located in `$SiamFC-PyTorch/Train/model/`) 54 | * Change other parameters as you like :) 55 | * Now, let's run `$python run_SiamFC.py` 56 | * some notes on tracking 57 | * two evaluation types are provided: a single-video demo and evaluation on the whole OTB-100 benchmark 58 | * you can also change which net is used for evaluation in `run_SiamFC.py` 59 | 60 | ## Results 61 | I tested the trained model on OTB-100 using an Nvidia 1080 GPU. The results and a comparison to the original implementation are shown in the image below. The running speed of our implementation is 82 fps. Note that both models are trained from scratch. 62 | 63 | ![image](/imgs/result.PNG) 64 | 65 | ## References 66 | 67 | [1] L. Bertinetto, J. Valmadre, J. F. Henriques, A. Vedaldi, and P. H. Torr. Fully-convolutional Siamese networks for object tracking. In ECCV Workshops, 2016. 68 | 69 | [2] A. Vedaldi and K. Lenc. MatConvNet – convolutional neural networks for MATLAB. In ACM MM, 2015. 70 | 71 | ## Contact 72 | 73 | Any questions are welcome at hengfan@temple.edu. 74 | -------------------------------------------------------------------------------- /Tracking/Config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration for training SiamFC and tracking evaluation 3 | Written by Heng Fan 4 | """ 5 | 6 | 7 | class Config: 8 | def __init__(self): 9 | # parameters for training 10 | self.pos_pair_range = 100 11 | self.num_pairs = 5.32e4 12 | self.val_ratio = 0.1 13 | self.num_epoch = 50 14 | self.batch_size = 8 15 | self.examplar_size = 127 16 | self.instance_size = 255 17 | self.sub_mean = 0 18 | self.train_num_workers = 12 # number of threads to load data when training 19 | self.val_num_workers = 8 20 | self.stride = 8 21 | self.rPos = 16 22 | self.rNeg = 0 23 | self.label_weight_method = "balanced" 24 | 25 | self.lr = 1e-2 # learning rate of SGD 26 | self.momentum = 0.9 # momentum of SGD 27 | self.weight_decay = 5e-4 # weight decay of the optimizer 28 | self.step_size = 1 # step size of the LR scheduler 29 | self.gamma = 0.8685 # decay rate of the LR scheduler 30 | 31 | # parameters for tracking (SiamFC-3s by default) 32 | self.num_scale = 3 33 | self.scale_step = 1.0375 34 | self.scale_penalty = 0.9745 35 | self.scale_LR = 0.59 36 | self.response_UP = 16 37 | self.windowing = "cosine" 38 | self.w_influence = 0.176 39 | 40 | self.video = "Lemming" 41 | self.visualization = 0 42 | self.bbox_output = True 43 | self.bbox_output_path = "./tracking_result/" 44 | 45 | self.context_amount = 0.5 46 | self.scale_min = 0.2 47 | self.scale_max = 5 48 | self.score_size = 17 49 | 50 | # path to your trained model 51 | self.net_base_path = "/home/hfan/Desktop/PyTorch-SiamFC/Train/model/" 52 | # path to your sequences (sequence should be in OTB format) 53 | self.seq_base_path = "/home/hfan/Desktop/demo-sequences/" 54 | # which model to use 55 | self.net = "SiamFC_50_model.pth" -------------------------------------------------------------------------------- /Tracking/SiamNet.py: -------------------------------------------------------------------------------- 1 | """ 2 | The architecture of SiamFC 3 | Written by Heng Fan 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from Config import * 10 | 11 | 12 | class SiamNet(nn.Module): 13 | 14 | def __init__(self): 15 | super(SiamNet, self).__init__() 16 | 17 | # architecture (AlexNet like) 18 | self.feat_extraction = nn.Sequential( 19 | nn.Conv2d(3, 96, 11, 2), # conv1 20 | nn.BatchNorm2d(96), 21 | nn.ReLU(inplace=True), 22 | nn.MaxPool2d(3, 2), 23 | nn.Conv2d(96, 256, 
5, 1, groups=2), # conv2, group convolution 24 | nn.BatchNorm2d(256), 25 | nn.ReLU(inplace=True), 26 | nn.MaxPool2d(3, 2), 27 | nn.Conv2d(256, 384, 3, 1), # conv3 28 | nn.BatchNorm2d(384), 29 | nn.ReLU(inplace=True), 30 | nn.Conv2d(384, 384, 3, 1, groups=2), # conv4 31 | nn.BatchNorm2d(384), 32 | nn.ReLU(inplace=True), 33 | nn.Conv2d(384, 256, 3, 1, groups=2) # conv5 34 | ) 35 | 36 | # adjust layer as in the original SiamFC in matconvnet 37 | self.adjust = nn.Conv2d(1, 1, 1, 1) 38 | 39 | # initialize weights 40 | self._initialize_weight() 41 | 42 | self.config = Config() 43 | 44 | def forward(self, z, x): 45 | """ 46 | forward pass 47 | z: examplare, BxCxHxW 48 | x: search region, BxCxH1xW1 49 | """ 50 | # get features for z and x 51 | z_feat = self.feat_extraction(z) 52 | x_feat = self.feat_extraction(x) 53 | 54 | # correlation of z and x 55 | xcorr_out = self.xcorr(z_feat, x_feat) 56 | 57 | score = self.adjust(xcorr_out) 58 | 59 | return score 60 | 61 | def xcorr(self, z, x): 62 | """ 63 | correlation layer as in the original SiamFC (convolution process in fact) 64 | """ 65 | batch_size_x, channel_x, w_x, h_x = x.shape 66 | x = torch.reshape(x, (1, batch_size_x * channel_x, w_x, h_x)) 67 | 68 | # group convolution 69 | out = F.conv2d(x, z, groups = batch_size_x) 70 | 71 | batch_size_out, channel_out, w_out, h_out = out.shape 72 | xcorr_out = torch.reshape(out, (channel_out, batch_size_out, w_out, h_out)) 73 | 74 | return xcorr_out 75 | 76 | def _initialize_weight(self): 77 | """ 78 | initialize network parameters 79 | """ 80 | tmp_layer_idx = 0 81 | for m in self.modules(): 82 | if isinstance(m, nn.Conv2d): 83 | tmp_layer_idx = tmp_layer_idx + 1 84 | if tmp_layer_idx < 6: 85 | # kaiming initialization 86 | nn.init.kaiming_normal_(m.weight.data, mode='fan_out', nonlinearity='relu') 87 | else: 88 | # initialization for adjust layer as in the original paper 89 | m.weight.data.fill_(1e-3) 90 | m.bias.data.zero_() 91 | elif isinstance(m, nn.BatchNorm2d): 92 | m.weight.data.fill_(1) 93 | m.bias.data.zero_() 94 | 95 | def weight_loss(self, prediction, label, weight): 96 | """ 97 | weighted cross entropy loss 98 | """ 99 | return F.binary_cross_entropy_with_logits(prediction, 100 | label, 101 | weight, 102 | size_average=False) / self.config.batch_size 103 | -------------------------------------------------------------------------------- /Tracking/Tracking_Utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tool functiond for tracking evaluation 3 | Written by Heng Fan 4 | """ 5 | 6 | import os 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | import matplotlib.patches as patches 10 | import cv2 11 | import glob 12 | 13 | 14 | def cat_img(image_cat1, image_cat2, image_cat3): 15 | """ 16 | concatenate three 1-channel images to one 3-channel image 17 | """ 18 | image = np.zeros(shape = (image_cat1.shape[0], image_cat1.shape[1], 3), dtype=np.double) 19 | image[:, :, 0] = image_cat1 20 | image[:, :, 1] = image_cat2 21 | image[:, :, 2] = image_cat3 22 | 23 | return image 24 | 25 | 26 | def load_sequence(seq_root_path, seq_name): 27 | """ 28 | load sequences; 29 | sequences should be in OTB format, or you can custom this function by yourself 30 | """ 31 | img_dir = os.path.join(seq_root_path, seq_name, 'img/') 32 | gt_path = os.path.join(seq_root_path, seq_name, 'groundtruth_rect.txt') 33 | 34 | img_list = glob.glob(img_dir + "*.jpg") 35 | img_list.sort() 36 | img_list = [os.path.join(img_dir, x) for x in img_list] 37 | 38 | gt = 
np.loadtxt(gt_path, delimiter=',') 39 | 40 | init_bbox = gt[0] 41 | if seq_name == "Tiger1": 42 | init_bbox = gt[5] 43 | 44 | init_x = init_bbox[0] 45 | init_y = init_bbox[1] 46 | init_w = init_bbox[2] 47 | init_h = init_bbox[3] 48 | 49 | target_position = np.array([init_y + init_h/2, init_x + init_w/2], dtype = np.double) 50 | target_sz = np.array([init_h, init_w], dtype = np.double) 51 | 52 | if seq_name == "David": 53 | img_list = img_list[299:] 54 | if seq_name == "Tiger1": 55 | img_list = img_list[5:] 56 | if seq_name == "Football1": 57 | img_list = img_list[0:74] 58 | 59 | return img_list, target_position, target_sz 60 | 61 | 62 | def visualize_tracking_result(img, bbox, fig_n): 63 | """ 64 | visualize tracking result 65 | """ 66 | fig = plt.figure(fig_n) 67 | ax = plt.Axes(fig, [0., 0., 1., 1.]) 68 | ax.set_axis_off() 69 | fig.add_axes(ax) 70 | r = patches.Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], linewidth = 3, edgecolor = "#00ff00", zorder = 1, fill = False) 71 | ax.imshow(img) 72 | ax.add_patch(r) 73 | plt.ion() 74 | plt.show() 75 | plt.pause(0.00001) 76 | plt.clf() 77 | 78 | 79 | def get_subwindow_tracking(im, pos, model_sz, original_sz, avg_chans): 80 | """ 81 | extract image crop 82 | """ 83 | if original_sz is None: 84 | original_sz = model_sz 85 | 86 | sz = original_sz 87 | im_sz = im.shape 88 | # make sure the size is not too small 89 | assert (im_sz[0] > 2) & (im_sz[1] > 2), "The size of image is too small!" 90 | c = (sz+1) / 2 91 | 92 | # check out-of-bounds coordinates, and set them to black 93 | context_xmin = round(pos[1] - c) # floor(pos(2) - sz(2) / 2); 94 | context_xmax = context_xmin + sz - 1 95 | context_ymin = round(pos[0] - c) # floor(pos(1) - sz(1) / 2); 96 | context_ymax = context_ymin + sz - 1 97 | left_pad = max(0, 1-context_xmin) # in python, index starts from 0 98 | top_pad = max(0, 1-context_ymin) 99 | right_pad = max(0, context_xmax - im_sz[1]) 100 | bottom_pad = max(0, context_ymax - im_sz[0]) 101 | 102 | context_xmin = context_xmin + left_pad 103 | context_xmax = context_xmax + left_pad 104 | context_ymin = context_ymin + top_pad 105 | context_ymax = context_ymax + top_pad 106 | 107 | im_R = im[:, :, 0] 108 | im_G = im[:, :, 1] 109 | im_B = im[:, :, 2] 110 | 111 | # padding 112 | if (top_pad !=0) | (bottom_pad !=0) | (left_pad !=0) | (right_pad !=0): 113 | im_R = np.pad(im_R, ((int(top_pad), int(bottom_pad)), (int(left_pad), int(right_pad))), 'constant', constant_values = avg_chans[0]) 114 | im_G = np.pad(im_G, ((int(top_pad), int(bottom_pad)), (int(left_pad), int(right_pad))), 'constant', constant_values = avg_chans[1]) 115 | im_B = np.pad(im_B, ((int(top_pad), int(bottom_pad)), (int(left_pad), int(right_pad))), 'constant', constant_values = avg_chans[2]) 116 | 117 | im = cat_img(im_R, im_G, im_B) 118 | 119 | im_patch_original = im[int(context_ymin)-1:int(context_ymax), int(context_xmin)-1:int(context_xmax), :] 120 | 121 | if model_sz != original_sz: 122 | im_patch = cv2.resize(im_patch_original, (int(model_sz), int(model_sz)), interpolation = cv2.INTER_CUBIC) 123 | else: 124 | im_patch = im_patch_original 125 | 126 | return im_patch 127 | 128 | 129 | def make_scale_pyramid(im, target_position, in_side_scaled, out_side, avg_chans, p): 130 | """ 131 | extract multi-scale image crops 132 | """ 133 | in_side_scaled = np.round(in_side_scaled) 134 | pyramid = np.zeros((out_side, out_side, 3, p.num_scale), dtype = np.double) 135 | max_target_side = in_side_scaled[in_side_scaled.size-1] 136 | min_target_side = in_side_scaled[0] 137 | beta = out_side / 
min_target_side 138 | # size_in_search_area = beta * size_in_image 139 | # e.g. out_side = beta * min_target_side 140 | search_side = round(beta * max_target_side) 141 | 142 | search_region = get_subwindow_tracking(im, target_position, search_side, max_target_side, avg_chans) 143 | 144 | assert (round(beta * min_target_side) == out_side), "Error!" 145 | 146 | for s in range(p.num_scale): 147 | target_side = round(beta * in_side_scaled[s]) 148 | search_target_position = np.array([1 + search_side/2, 1 + search_side/2], dtype = np.double) 149 | pyramid[:, :, :, s] = get_subwindow_tracking(search_region, search_target_position, out_side, 150 | target_side, avg_chans) 151 | 152 | return pyramid 153 | 154 | 155 | def tracker_eval(net, s_x, z_features, x_features, target_position, window, p): 156 | """ 157 | do evaluation (i.e., a forward pass for search region) 158 | (This part is implemented as in the original Matlab version) 159 | """ 160 | # compute scores search regions of different scales 161 | scores = net.xcorr(z_features, x_features) 162 | scores = scores.to("cpu") 163 | 164 | response_maps = scores.squeeze().permute(1, 2, 0).data.numpy() 165 | # for this one, the opencv resize function works fine 166 | response_maps_up = cv2.resize(response_maps, (response_maps.shape[0]*p.response_UP, response_maps.shape[0]*p.response_UP), interpolation=cv2.INTER_CUBIC) 167 | 168 | # choose the scale whose response map has the highest peak 169 | if p.num_scale > 1: 170 | current_scale_id =np.ceil(p.num_scale/2) 171 | best_scale = current_scale_id 172 | best_peak = float("-inf") 173 | for s in range(p.num_scale): 174 | this_response = response_maps_up[:, :, s] 175 | # penalize change of scale 176 | if s != current_scale_id: 177 | this_response = this_response * p.scale_penalty 178 | this_peak = np.max(this_response) 179 | if this_peak > best_peak: 180 | best_peak = this_peak 181 | best_scale = s 182 | response_map = response_maps_up[:, :, int(best_scale)] 183 | else: 184 | response_map = response_maps_up 185 | best_scale = 1 186 | # make the response map sum to 1 187 | response_map = response_map - np.min(response_map) 188 | response_map = response_map / sum(sum(response_map)) 189 | 190 | # apply windowing 191 | response_map = (1 - p.w_influence) * response_map + p.w_influence * window 192 | p_corr = np.asarray(np.unravel_index(np.argmax(response_map), np.shape(response_map))) 193 | 194 | # avoid empty 195 | if p_corr[0] is None: 196 | p_corr[0] = np.ceil(p.score_size/2) 197 | if p_corr[1] is None: 198 | p_corr[1] = np.ceil(p.score_size/2) 199 | 200 | # Convert to crop-relative coordinates to frame coordinates 201 | # displacement from the center in instance final representation ... 202 | disp_instance_final = p_corr - np.ceil(p.score_size * p.response_UP / 2) 203 | # ... in instance input ... 204 | disp_instance_input = disp_instance_final * p.stride / p.response_UP 205 | # ... 
in instance original crop (in frame coordinates) 206 | disp_instance_frame = disp_instance_input * s_x / p.instance_size 207 | # position within frame in frame coordinates 208 | new_target_position = target_position + disp_instance_frame 209 | 210 | return new_target_position, best_scale -------------------------------------------------------------------------------- /Tracking/run_SiamFC.py: -------------------------------------------------------------------------------- 1 | from Config import * 2 | from Tracking_Utils import * 3 | from SiamNet import * 4 | import os 5 | import numpy as np 6 | import torchvision.transforms.functional as F 7 | import cv2 8 | import datetime 9 | import torch 10 | from torch.autograd import Variable 11 | 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | # entry to evaluation of SiamFC 16 | def run_tracker(p): 17 | """ 18 | run tracker, return bounding result and speed 19 | """ 20 | # load model 21 | net = torch.load(os.path.join(p.net_base_path, p.net)) 22 | net = net.to(device) 23 | 24 | # evaluation mode 25 | net.eval() 26 | 27 | # load sequence 28 | img_list, target_position, target_size = load_sequence(p.seq_base_path, p.video) 29 | 30 | # first frame 31 | img_uint8 = cv2.imread(img_list[0]) 32 | img_uint8 = cv2.cvtColor(img_uint8, cv2.COLOR_BGR2RGB) 33 | img_double = np.double(img_uint8) # uint8 to float 34 | 35 | # compute avg for padding 36 | avg_chans = np.mean(img_double, axis=(0, 1)) 37 | 38 | wc_z = target_size[1] + p.context_amount * sum(target_size) 39 | hc_z = target_size[0] + p.context_amount * sum(target_size) 40 | s_z = np.sqrt(wc_z * hc_z) 41 | scale_z = p.examplar_size / s_z 42 | 43 | # crop examplar z in the first frame 44 | z_crop = get_subwindow_tracking(img_double, target_position, p.examplar_size, round(s_z), avg_chans) 45 | 46 | z_crop = np.uint8(z_crop) # you need to convert it to uint8 47 | # convert image to tensor 48 | z_crop_tensor = 255.0 * F.to_tensor(z_crop).unsqueeze(0) 49 | 50 | d_search = (p.instance_size - p.examplar_size) / 2 51 | pad = d_search / scale_z 52 | s_x = s_z + 2 * pad 53 | # arbitrary scale saturation 54 | min_s_x = p.scale_min * s_x 55 | max_s_x = p.scale_max * s_x 56 | 57 | # generate cosine window 58 | if p.windowing == 'cosine': 59 | window = np.outer(np.hanning(p.score_size * p.response_UP), np.hanning(p.score_size * p.response_UP)) 60 | elif p.windowing == 'uniform': 61 | window = np.ones((p.score_size * p.response_UP, p.score_size * p.response_UP)) 62 | window = window / sum(sum(window)) 63 | 64 | # pyramid scale search 65 | scales = p.scale_step**np.linspace(-np.ceil(p.num_scale/2), np.ceil(p.num_scale/2), p.num_scale) 66 | 67 | # extract feature for examplar z 68 | z_features = net.feat_extraction(Variable(z_crop_tensor).to(device)) 69 | z_features = z_features.repeat(p.num_scale, 1, 1, 1) 70 | 71 | # do tracking 72 | bboxes = np.zeros((len(img_list), 4), dtype=np.double) # save tracking result 73 | start_time = datetime.datetime.now() 74 | for i in range(0, len(img_list)): 75 | if i > 0: 76 | # do detection 77 | # currently, we only consider RGB images for tracking 78 | img_uint8 = cv2.imread(img_list[i]) 79 | img_uint8 = cv2.cvtColor(img_uint8, cv2.COLOR_BGR2RGB) 80 | img_double = np.double(img_uint8) # uint8 to float 81 | 82 | scaled_instance = s_x * scales 83 | scaled_target = np.zeros((2, scales.size), dtype = np.double) 84 | scaled_target[0, :] = target_size[0] * scales 85 | scaled_target[1, :] = target_size[1] * scales 86 | 87 | # extract scaled crops for 
search region x at previous target position 88 | x_crops = make_scale_pyramid(img_double, target_position, scaled_instance, p.instance_size, avg_chans, p) 89 | 90 | # get features of search regions 91 | x_crops_tensor = torch.FloatTensor(x_crops.shape[3], x_crops.shape[2], x_crops.shape[1], x_crops.shape[0]) 92 | # response_map = SiameseNet.get_response_map(z_features, x_crops) 93 | for k in range(x_crops.shape[3]): 94 | tmp_x_crop = x_crops[:, :, :, k] 95 | tmp_x_crop = np.uint8(tmp_x_crop) 96 | # numpy array to tensor 97 | x_crops_tensor[k, :, :, :] = 255.0 * F.to_tensor(tmp_x_crop).unsqueeze(0) 98 | 99 | # get features of search regions 100 | x_features = net.feat_extraction(Variable(x_crops_tensor).to(device)) 101 | 102 | # evaluate the offline-trained network for exemplar x features 103 | target_position, new_scale = tracker_eval(net, round(s_x), z_features, x_features, target_position, window, p) 104 | 105 | # scale damping and saturation 106 | s_x = max(min_s_x, min(max_s_x, (1 - p.scale_LR) * s_x + p.scale_LR * scaled_instance[int(new_scale)])) 107 | target_size = (1 - p.scale_LR) * target_size + p.scale_LR * np.array([scaled_target[0, int(new_scale)], scaled_target[1, int(new_scale)]]) 108 | 109 | rect_position = np.array([target_position[1]-target_size[1]/2, target_position[0]-target_size[0]/2, target_size[1], target_size[0]]) 110 | 111 | if p.visualization: 112 | visualize_tracking_result(img_uint8, rect_position, 1) 113 | 114 | # output bbox in the original frame coordinates 115 | o_target_position = target_position 116 | o_target_size = target_size 117 | bboxes[i,:] = np.array([o_target_position[1]-o_target_size[1]/2, o_target_position[0]-o_target_size[0]/2, o_target_size[1], o_target_size[0]]) 118 | 119 | end_time = datetime.datetime.now() 120 | fps = len(img_list)/max(1.0, (end_time-start_time).seconds) 121 | 122 | return bboxes, fps 123 | 124 | 125 | if __name__ == "__main__": 126 | 127 | # get the default parameters 128 | p = Config() 129 | 130 | # choose which model to run 131 | p.net = "SiamFC_50_model.pth" 132 | 133 | # choose demo type, single or all 134 | demp_type = "all" 135 | 136 | if demp_type == "single": # single video demo 137 | video = "Lemming" 138 | p.video = video 139 | print("Processing %s ... " % p.video) 140 | bbox_result, fps = run_tracker(p) 141 | print("FPS: %d " % fps) 142 | else: # evaluation the whole OTB benchmark 143 | # load all videos 144 | all_videos = os.listdir(p.seq_base_path) 145 | 146 | if p.bbox_output: 147 | if not os.path.exists(p.bbox_output_path): 148 | os.makedirs(p.bbox_output_path) 149 | 150 | fps_all = .0 151 | 152 | for video in all_videos: 153 | p.video = video 154 | print("Processing %s ... 
" % p.video) 155 | bbox_result, fps = run_tracker(p) 156 | # fps for this video 157 | print("FPS: %d " % fps) 158 | # saving tracking results 159 | if p.bbox_output: 160 | np.savetxt(p.bbox_output_path + p.video.lower() + '_SiamFC.txt', bbox_result, fmt='%.3f') 161 | fps_all = fps_all + fps 162 | 163 | avg_fps = fps_all / len(all_videos) 164 | print("Average FPS: %f" % avg_fps) 165 | -------------------------------------------------------------------------------- /Train/Config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration for training SiamFC and tracking evaluation 3 | Written by Heng Fan 4 | """ 5 | 6 | 7 | class Config: 8 | def __init__(self): 9 | # parameters for training 10 | self.pos_pair_range = 100 11 | self.num_pairs = 5.32e4 12 | self.val_ratio = 0.1 13 | self.num_epoch = 50 14 | self.batch_size = 8 15 | self.examplar_size = 127 16 | self.instance_size = 255 17 | self.sub_mean = 0 18 | self.train_num_workers = 12 # number of threads to load data when training 19 | self.val_num_workers = 8 20 | self.stride = 8 21 | self.rPos = 16 22 | self.rNeg = 0 23 | self.label_weight_method = "balanced" 24 | 25 | self.lr = 1e-2 # learning rate of SGD 26 | self.momentum = 0.9 # momentum of SGD 27 | self.weight_decay = 5e-4 # weight decay of optimizator 28 | self.step_size = 1 # step size of LR_Schedular 29 | self.gamma = 0.8685 # decay rate of LR_Schedular 30 | 31 | # parameters for tracking (SiamFC-3s by default) 32 | self.num_scale = 3 33 | self.scale_step = 1.0375 34 | self.scale_penalty = 0.9745 35 | self.scale_LR = 0.59 36 | self.response_UP = 16 37 | self.windowing = "cosine" 38 | self.w_influence = 0.176 39 | 40 | self.video = "Lemming" 41 | self.visualization = 1 42 | self.bbox_output = True 43 | self.bbox_output_path = "./tracking_result/" 44 | 45 | self.context_amount = 0.5 46 | self.scale_min = 0.2 47 | self.scale_max = 5 48 | self.score_size = 17 49 | 50 | # path to your trained model 51 | self.net_base_path = "/home/hfan/Desktop/PyTorch-SiamFC/Train/model/" 52 | # path to your sequences (sequence should be in OTB format) 53 | self.seq_base_path = "/home/hfan/Desktop/demo-sequences/" 54 | # which model to use 55 | self.net = "SiamFC_50_model.pth" -------------------------------------------------------------------------------- /Train/DataAugmentation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import cv2 4 | 5 | 6 | class RandomStretch(object): 7 | def __init__(self, max_stretch=0.05): 8 | self.max_stretch = max_stretch 9 | 10 | def __call__(self, sample): 11 | scale_h = 1.0 + np.random.uniform(-self.max_stretch, self.max_stretch) 12 | scale_w = 1.0 + np.random.uniform(-self.max_stretch, self.max_stretch) 13 | h, w = sample.shape[:2] 14 | shape = (int(h * scale_h), int(w * scale_w)) 15 | return cv2.resize(sample, shape, cv2.INTER_CUBIC) 16 | 17 | 18 | class CenterCrop(object): 19 | ''' 20 | center crop for examplar z 21 | ''' 22 | def __init__(self, size): 23 | self.size = size 24 | 25 | def __call__(self, sample): 26 | shape = sample.shape[:2] 27 | cy, cx = shape[0] // 2, shape[1] // 2 28 | ymin, xmin = cy - self.size[0]//2, cx - self.size[1] // 2 29 | ymax, xmax = cy + self.size[0]//2 + 1, cx + self.size[1] // 2 + 1 30 | left = right = top = bottom = 0 31 | im_h, im_w = shape 32 | if xmin < 0: 33 | left = int(abs(xmin)) 34 | if xmax > im_w: 35 | right = int(xmax - im_w) 36 | if ymin < 0: 37 | top = int(abs(ymin)) 38 | if ymax > im_h: 
39 | bottom = int(ymax - im_h) 40 | 41 | xmin = int(max(0, xmin)) 42 | xmax = int(min(im_w, xmax)) 43 | ymin = int(max(0, ymin)) 44 | ymax = int(min(im_h, ymax)) 45 | im_patch = sample[ymin:ymax, xmin:xmax] 46 | if left != 0 or right !=0 or top!=0 or bottom!=0: 47 | im_patch = cv2.copyMakeBorder(im_patch, top, bottom, left, right, 48 | cv2.BORDER_CONSTANT, value=0) 49 | return im_patch 50 | 51 | 52 | class RandomCrop(object): 53 | def __init__(self, size): 54 | self.size = size 55 | 56 | def __call__(self, sample): 57 | shape = sample.shape[:2] 58 | y1 = np.random.randint(0, shape[0] - self.size[0]) 59 | x1 = np.random.randint(0, shape[1] - self.size[1]) 60 | y2 = y1 + self.size[0] 61 | x2 = x1 + self.size[1] 62 | img_patch = sample[y1:y2, x1:x2] 63 | return img_patch 64 | 65 | 66 | class Normalize(object): 67 | def __init__(self): 68 | self.mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) 69 | self.std = np.array([0.229, 0.224, 0.225], dtype=np.float32) 70 | 71 | def __call__(self, sample): 72 | return (sample / 255. - self.mean) / self.std 73 | 74 | 75 | class ToTensor(object): 76 | def __call__(self, sample): 77 | sample = sample.transpose(2, 0, 1) 78 | return torch.from_numpy(sample.astype(np.float32)) 79 | -------------------------------------------------------------------------------- /Train/SiamNet.py: -------------------------------------------------------------------------------- 1 | """ 2 | The architecture of SiamFC 3 | Written by Heng Fan 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from Config import * 10 | 11 | 12 | class SiamNet(nn.Module): 13 | 14 | def __init__(self): 15 | super(SiamNet, self).__init__() 16 | 17 | # architecture (AlexNet like) 18 | self.feat_extraction = nn.Sequential( 19 | nn.Conv2d(3, 96, 11, 2), # conv1 20 | nn.BatchNorm2d(96), 21 | nn.ReLU(inplace=True), 22 | nn.MaxPool2d(3, 2), 23 | nn.Conv2d(96, 256, 5, 1, groups=2), # conv2, group convolution 24 | nn.BatchNorm2d(256), 25 | nn.ReLU(inplace=True), 26 | nn.MaxPool2d(3, 2), 27 | nn.Conv2d(256, 384, 3, 1), # conv3 28 | nn.BatchNorm2d(384), 29 | nn.ReLU(inplace=True), 30 | nn.Conv2d(384, 384, 3, 1, groups=2), # conv4 31 | nn.BatchNorm2d(384), 32 | nn.ReLU(inplace=True), 33 | nn.Conv2d(384, 256, 3, 1, groups=2) # conv5 34 | ) 35 | 36 | # adjust layer as in the original SiamFC in matconvnet 37 | self.adjust = nn.Conv2d(1, 1, 1, 1) 38 | 39 | # initialize weights 40 | self._initialize_weight() 41 | 42 | self.config = Config() 43 | 44 | def forward(self, z, x): 45 | """ 46 | forward pass 47 | z: examplare, BxCxHxW 48 | x: search region, BxCxH1xW1 49 | """ 50 | # get features for z and x 51 | z_feat = self.feat_extraction(z) 52 | x_feat = self.feat_extraction(x) 53 | 54 | # correlation of z and z 55 | xcorr_out = self.xcorr(z_feat, x_feat) 56 | 57 | score = self.adjust(xcorr_out) 58 | 59 | return score 60 | 61 | def xcorr(self, z, x): 62 | """ 63 | correlation layer as in the original SiamFC (convolution process in fact) 64 | """ 65 | batch_size_x, channel_x, w_x, h_x = x.shape 66 | x = torch.reshape(x, (1, batch_size_x * channel_x, w_x, h_x)) 67 | 68 | # group convolution 69 | out = F.conv2d(x, z, groups = batch_size_x) 70 | 71 | batch_size_out, channel_out, w_out, h_out = out.shape 72 | xcorr_out = torch.reshape(out, (channel_out, batch_size_out, w_out, h_out)) 73 | 74 | return xcorr_out 75 | 76 | def _initialize_weight(self): 77 | """ 78 | initialize network parameters 79 | """ 80 | tmp_layer_idx = 0 81 | for m in self.modules(): 82 | if isinstance(m, 
nn.Conv2d): 83 | tmp_layer_idx = tmp_layer_idx + 1 84 | if tmp_layer_idx < 6: 85 | # kaiming initialization 86 | nn.init.kaiming_normal_(m.weight.data, mode='fan_out', nonlinearity='relu') 87 | else: 88 | # initialization for adjust layer as in the original paper 89 | m.weight.data.fill_(1e-3) 90 | m.bias.data.zero_() 91 | elif isinstance(m, nn.BatchNorm2d): 92 | m.weight.data.fill_(1) 93 | m.bias.data.zero_() 94 | 95 | def weight_loss(self, prediction, label, weight): 96 | """ 97 | weighted cross entropy loss 98 | """ 99 | return F.binary_cross_entropy_with_logits(prediction, 100 | label, 101 | weight, 102 | size_average=False) / self.config.batch_size 103 | -------------------------------------------------------------------------------- /Train/Utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Define some tool functions 3 | Written by Heng Fan 4 | """ 5 | 6 | import torch 7 | import numpy as np 8 | import cv2 9 | 10 | 11 | def create_logisticloss_label(label_size, rPos, rNeg): 12 | """ 13 | construct label for logistic loss (same for all pairs) 14 | """ 15 | label_side = int(label_size[0]) 16 | logloss_label = torch.zeros(label_side, label_side) 17 | label_origin = np.array([np.ceil(label_side / 2), np.ceil(label_side / 2)]) 18 | for i in range(label_side): 19 | for j in range(label_side): 20 | dist_from_origin = np.sqrt((i - label_origin[0]) ** 2 + (j - label_origin[1]) ** 2) 21 | if dist_from_origin <= rPos: 22 | logloss_label[i, j] = +1 23 | else: 24 | if dist_from_origin <= rNeg: 25 | logloss_label[i, j] = 0 26 | 27 | return logloss_label 28 | 29 | 30 | def create_label(fixed_label_size, config, use_gpu): 31 | """ 32 | create label with weight 33 | """ 34 | rPos = config.rPos / config.stride 35 | rNeg = config.rNeg / config.stride 36 | 37 | half = int(np.floor(fixed_label_size[0] / 2) + 1) 38 | 39 | if config.label_weight_method == "balanced": 40 | fixed_label = create_logisticloss_label(fixed_label_size, rPos, rNeg) 41 | # plt.imshow(fixed_label) 42 | # plt.colorbar() 43 | # plt.show() 44 | instance_weight = torch.ones(fixed_label.shape[0], fixed_label.shape[1]) 45 | tmp_idx_P = np.where(fixed_label == 1) 46 | sumP = tmp_idx_P[0].size 47 | tmp_idx_N = np.where(fixed_label == 0) 48 | sumN = tmp_idx_N[0].size 49 | instance_weight[tmp_idx_P] = 0.5 * instance_weight[tmp_idx_P] / sumP 50 | instance_weight[tmp_idx_N] = 0.5 * instance_weight[tmp_idx_N] / sumN 51 | # plt.imshow(instance_weight) 52 | # plt.colorbar() 53 | # plt.show() 54 | 55 | # reshape label 56 | fixed_label = torch.reshape(fixed_label, (1, 1, fixed_label.shape[0], fixed_label.shape[1])) 57 | # copy label to match batchsize 58 | fixed_label = fixed_label.repeat(config.batch_size, 1, 1, 1) 59 | 60 | # reshape weight 61 | instance_weight = torch.reshape(instance_weight, (1, instance_weight.shape[0], instance_weight.shape[1])) 62 | 63 | if use_gpu: 64 | return fixed_label.cuda(), instance_weight.cuda() 65 | else: 66 | return fixed_label, instance_weight 67 | 68 | 69 | def cv2_brg2rgb(bgr_img): 70 | """ 71 | convert brg image to rgb 72 | """ 73 | b, g, r = cv2.split(bgr_img) 74 | rgb_img = cv2.merge([r, g, b]) 75 | 76 | return rgb_img 77 | 78 | 79 | def float32_to_uint8(img): 80 | """ 81 | convert float32 array to uint8 82 | """ 83 | beyong_255 = np.where(img > 255) 84 | img[beyong_255] = 255 85 | less_0 = np.where(img < 0) 86 | img[less_0] = 0 87 | img = np.round(img) 88 | 89 | return img.astype(np.uint8) 90 | 
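A small usage sketch for the label helpers above (an editorial addition, not part of the original file): the 17x17 label size is an assumption, chosen because it is the score-map size SiamNet produces for a 127x127 exemplar and a 255x255 search region; substitute whatever score-map size your training pipeline actually outputs.

```python
# hedged usage sketch for create_label / create_logisticloss_label
from Config import Config
from Utils import create_label

config = Config()  # batch_size = 8, stride = 8, rPos = 16, rNeg = 0 by default
label, weight = create_label((17, 17), config, use_gpu=False)
print(label.shape)    # torch.Size([8, 1, 17, 17]) -- one label per pair in the batch
print(weight.shape)   # torch.Size([1, 17, 17])    -- per-pixel balancing weights
```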
--------------------------------------------------------------------------------
/Train/VIDDataset.py:
--------------------------------------------------------------------------------
1 | """
2 | Dataset for VID
3 | Written by Heng Fan
4 | """
5 | 
6 | from torch.utils.data.dataset import Dataset
7 | import json
8 | from Utils import *
9 | import os
10 | 
11 | 
12 | class VIDDataset(Dataset):
13 | 
14 |     def __init__(self, imdb, data_dir, config, z_transforms, x_transforms, mode="Train"):
15 |         imdb_video = json.load(open(imdb, 'r'))
16 |         self.videos = imdb_video['videos']
17 |         self.data_dir = data_dir
18 |         self.config = config
19 |         self.num_videos = int(imdb_video['num_videos'])
20 | 
21 |         self.z_transforms = z_transforms
22 |         self.x_transforms = x_transforms
23 | 
24 |         if mode == "Train":
25 |             self.num = self.config.num_pairs
26 |         else:
27 |             self.num = self.num_videos
28 | 
29 |     def __getitem__(self, rand_vid):
30 |         '''
31 |         read a pair of images z and x
32 |         '''
33 |         # randomly decide the id of video to get z and x
34 |         rand_vid = rand_vid % self.num_videos
35 | 
36 |         video_keys = list(self.videos.keys())   # list() so the keys can be indexed
37 |         video = self.videos[video_keys[rand_vid]]
38 | 
39 |         # get ids of this video
40 |         video_ids = video[0]
41 |         # how many ids in this video
42 |         video_id_keys = list(video_ids.keys())
43 | 
44 |         # randomly pick an id for z
45 |         rand_trackid_z = np.random.choice(list(range(len(video_id_keys))))
46 |         # get the video for this id
47 |         video_id_z = video_ids[video_id_keys[rand_trackid_z]]
48 | 
49 |         # pick a valid examplar z in the video
50 |         rand_z = np.random.choice(range(len(video_id_z)))
51 | 
52 |         # pick a valid instance within frame_range frames from the examplar, excluding the examplar itself
53 |         possible_x_pos = list(range(len(video_id_z)))
54 |         rand_x = np.random.choice(possible_x_pos[max(rand_z - self.config.pos_pair_range, 0):rand_z] + possible_x_pos[(rand_z + 1):min(rand_z + self.config.pos_pair_range, len(video_id_z))])
55 | 
56 |         z = video_id_z[rand_z].copy()   # use copy() here to avoid changing dictionary
57 |         x = video_id_z[rand_x].copy()
58 | 
59 |         # read z and x
60 |         img_z = cv2.imread(os.path.join(self.data_dir, z['instance_path']))
61 |         img_z = cv2.cvtColor(img_z, cv2.COLOR_BGR2RGB)
62 | 
63 |         img_x = cv2.imread(os.path.join(self.data_dir, x['instance_path']))
64 |         img_x = cv2.cvtColor(img_x, cv2.COLOR_BGR2RGB)
65 | 
66 |         # do data augmentation;
67 |         # note that we have done center crop for z in the data augmentation
68 |         img_z = self.z_transforms(img_z)
69 |         img_x = self.x_transforms(img_x)
70 | 
71 |         return img_z, img_x
72 | 
73 |     def __len__(self):
74 |         return self.num
75 | 
--------------------------------------------------------------------------------
/Train/model/SiamFC_50_model.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HengLan/SiamFC-PyTorch/6cb921f5ac58b612ebface176d9a84d5e033150a/Train/model/SiamFC_50_model.pth
--------------------------------------------------------------------------------
/Train/run_Train_SiamFC.py:
--------------------------------------------------------------------------------
1 | """
2 | PyTorch implementation of SiamFC (Luca Bertinetto, et al., ECCVW, 2016)
3 | Written by Heng Fan
4 | """
5 | 
6 | from SiamNet import *
7 | from VIDDataset import *
8 | from torch.autograd import Variable
9 | from torch.optim.lr_scheduler import StepLR
10 | from Config import *
11 | from Utils import *
12 | import torchvision.transforms as transforms
13 | from DataAugmentation import *
14 | import os
15 | from tqdm import tqdm
16 | from torch.utils.data import DataLoader
17 | 
18 | # fix random seed
19 | np.random.seed(1357)
20 | torch.manual_seed(1234)
21 | 
22 | 
23 | def train(data_dir, train_imdb, val_imdb, model_save_path="./model/", use_gpu=True):
24 | 
25 |     # initialize training configuration
26 |     config = Config()
27 | 
28 |     # do data augmentation in PyTorch;
29 |     # you can also do complex data augmentation as in the original paper
30 |     center_crop_size = config.instance_size - config.stride
31 |     random_crop_size = config.instance_size - 2 * config.stride
32 | 
33 |     train_z_transforms = transforms.Compose([
34 |         RandomStretch(),
35 |         CenterCrop((config.examplar_size, config.examplar_size)),
36 |         ToTensor()
37 |     ])
38 |     train_x_transforms = transforms.Compose([
39 |         RandomStretch(),
40 |         CenterCrop((center_crop_size, center_crop_size)),
41 |         RandomCrop((random_crop_size, random_crop_size)),
42 |         ToTensor()
43 |     ])
44 |     valid_z_transforms = transforms.Compose([
45 |         CenterCrop((config.examplar_size, config.examplar_size)),
46 |         ToTensor(),
47 |     ])
48 |     valid_x_transforms = transforms.Compose([
49 |         ToTensor()
50 |     ])
51 | 
52 |     # load data (see details in VIDDataset.py)
53 |     train_dataset = VIDDataset(train_imdb, data_dir, config, train_z_transforms, train_x_transforms)
54 |     val_dataset = VIDDataset(val_imdb, data_dir, config, valid_z_transforms, valid_x_transforms, "Validation")
55 | 
56 |     # create dataloader
57 |     train_loader = DataLoader(train_dataset, batch_size=config.batch_size,
58 |                               shuffle=True, num_workers=config.train_num_workers, drop_last=True)
59 |     val_loader = DataLoader(val_dataset, batch_size=config.batch_size,
60 |                             shuffle=True, num_workers=config.val_num_workers, drop_last=True)
61 | 
62 |     # create SiamFC network architecture (see details in SiamNet.py)
63 |     net = SiamNet()
64 |     # move network to GPU if using GPU
65 |     if use_gpu:
66 |         net.cuda()
67 | 
68 |     # define training strategy;
69 |     # the learning rate of adjust layer (i.e., a conv layer)
70 |     # is set to 0 as in the original paper
71 |     optimizer = torch.optim.SGD([
72 |         {'params': net.feat_extraction.parameters()},
73 |         {'params': net.adjust.bias},
74 |         {'params': net.adjust.weight, 'lr': 0},
75 |     ], lr=config.lr, momentum=config.momentum, weight_decay=config.weight_decay)
76 | 
77 |     # adjust learning rate in each epoch
78 |     scheduler = StepLR(optimizer, config.step_size, config.gamma)
79 | 
80 |     # used to control generating label for training;
81 |     # once generated, they are fixed since the labels for each
82 |     # pair of images (examplar z and search region x) are the same
83 |     train_response_flag = False
84 |     valid_response_flag = False
85 | 
86 |     # ------------------------ training & validation process ------------------------
87 |     for i in range(config.num_epoch):
88 | 
89 |         # adjusting learning rate
90 |         scheduler.step()
91 | 
92 |         # ------------------------------ training ------------------------------
93 |         # indicating training (very important for batch normalization)
94 |         net.train()
95 | 
96 |         # used to collect loss
97 |         train_loss = []
98 | 
99 |         for j, data in enumerate(tqdm(train_loader)):
100 | 
101 |             # fetch data, i.e., B x C x H x W (batch size x channel x height x width)
102 |             exemplar_imgs, instance_imgs = data
103 | 
104 |             # forward pass
105 |             if use_gpu:
106 |                 exemplar_imgs = exemplar_imgs.cuda()
107 |                 instance_imgs = instance_imgs.cuda()
108 |             output = net.forward(Variable(exemplar_imgs), Variable(instance_imgs))
109 | 
110 |             # create label for training (only do it one time)
111 |             if not train_response_flag:
112 |                 # change control flag
113 |                 train_response_flag = True
114 |                 # get shape of output (i.e., response map)
115 |                 response_size = output.shape[2:4]
116 |                 # generate label and weight
117 |                 train_eltwise_label, train_instance_weight = create_label(response_size, config, use_gpu)
118 | 
119 |             # clear the gradient
120 |             optimizer.zero_grad()
121 | 
122 |             # loss
123 |             loss = net.weight_loss(output, train_eltwise_label, train_instance_weight)
124 | 
125 |             # backward
126 |             loss.backward()
127 | 
128 |             # update parameter
129 |             optimizer.step()
130 | 
131 |             # collect training loss
132 |             train_loss.append(loss.data)
133 | 
134 |         # ------------------------------ saving model ------------------------------
135 |         if not os.path.exists(model_save_path):
136 |             os.makedirs(model_save_path)
137 |         torch.save(net, model_save_path + "SiamFC_" + str(i + 1) + "_model.pth")
138 | 
139 |         # ------------------------------ validation ------------------------------
140 |         # indicate validation
141 |         net.eval()
142 | 
143 |         # used to collect validation loss
144 |         val_loss = []
145 | 
146 |         for j, data in enumerate(tqdm(val_loader)):
147 | 
148 |             exemplar_imgs, instance_imgs = data
149 | 
150 |             # forward pass
151 |             if use_gpu:
152 |                 exemplar_imgs = exemplar_imgs.cuda()
153 |                 instance_imgs = instance_imgs.cuda()
154 |             output = net.forward(Variable(exemplar_imgs), Variable(instance_imgs))
155 | 
156 |             # create label for validation (only do it one time)
157 |             if not valid_response_flag:
158 |                 valid_response_flag = True
159 |                 response_size = output.shape[2:4]
160 |                 valid_eltwise_label, valid_instance_weight = create_label(response_size, config, use_gpu)
161 | 
162 |             # loss
163 |             loss = net.weight_loss(output, valid_eltwise_label, valid_instance_weight)
164 | 
165 |             # collect validation loss
166 |             val_loss.append(loss.data)
167 | 
168 |         print("Epoch %d training loss: %f, validation loss: %f" % (i+1, np.mean(train_loss), np.mean(val_loss)))
169 | 
170 | 
171 | if __name__ == "__main__":
172 | 
173 |     data_dir = "/home/hfan/Dataset/ILSVRC2015_crops/Data/VID/train"
174 |     train_imdb = "/home/hfan/Desktop/PyTorch-SiamFC/ILSVRC15-curation/imdb_video_train.json"
175 |     val_imdb = "/home/hfan/Desktop/PyTorch-SiamFC/ILSVRC15-curation/imdb_video_val.json"
176 | 
177 |     # training SiamFC network, using GPU by default
178 |     train(data_dir, train_imdb, val_imdb)
179 | 
--------------------------------------------------------------------------------
/imgs/result.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HengLan/SiamFC-PyTorch/6cb921f5ac58b612ebface176d9a84d5e033150a/imgs/result.PNG
--------------------------------------------------------------------------------
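As a closing note, the grouped convolution inside SiamNet.xcorr computes all exemplar/search correlations of a batch in one call. The standalone sketch below (an editorial addition, not a repository file) checks that trick against per-sample cross-correlation; the 6x6 and 22x22 feature-map sizes are simply the shapes a SiamFC-style backbone typically produces for 127x127 and 255x255 inputs, and all tensor values here are random stand-ins:

import torch
import torch.nn.functional as F

B, C = 8, 256
z_feat = torch.randn(B, C, 6, 6)     # exemplar features (stand-in values)
x_feat = torch.randn(B, C, 22, 22)   # search-region features (stand-in values)

# grouped-convolution trick (same idea as SiamNet.xcorr): fold the batch into the
# channel dimension and use one group per sample, giving a (1, B, 17, 17) response
x_merged = torch.reshape(x_feat, (1, B * C, 22, 22))
out = F.conv2d(x_merged, z_feat, groups=B)
batched = torch.reshape(out, (B, 1, 17, 17))

# reference: correlate each exemplar/search pair separately and stack the results
reference = torch.cat([F.conv2d(x_feat[i:i + 1], z_feat[i:i + 1]) for i in range(B)], dim=0)

print(torch.allclose(batched, reference, atol=1e-4))   # True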