├── 0一些基本工具.py
├── 1数据集管理.py
├── 2从数据集中导入视频图片.py
├── 3随机采样,形成pk批次.py
├── 4随机图像裁剪.py
├── 5损失函数.py
├── 6评估模型的方法.py
├── README.md
├── info
│   ├── query_IDX.mat
│   ├── test_name.txt
│   ├── tracks_test_info.mat
│   ├── tracks_train_info.mat
│   └── train_name.txt
├── main_video_person_reid.py
├── models
│   ├── ResNet.py
│   ├── __init__.py
│   └── resnet3d.py
└── test.py

/0一些基本工具.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | import os
3 | import sys
4 | import errno
5 | import shutil
6 | import json
7 | import os.path as osp
8 | 
9 | import torch
10 | 
11 | 
12 | def mkdir_if_missing(directory):  # create the directory if it does not already exist
13 |     if not osp.exists(directory):  # the directory is missing
14 |         try:
15 |             os.makedirs(directory)  # create it, including any missing parent directories
16 |         except OSError as e:
17 |             if e.errno != errno.EEXIST:  # ignore the race where another process created it first
18 |                 raise
19 | 
20 | 
21 | class AverageMeter(object):
22 |     """Computes and stores the average and current value.
23 | 
24 |     Code imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
25 |     """
26 |     def __init__(self, val=0, avg=0, sum=0, count=0):
27 |         self.val = val
28 |         self.avg = avg
29 |         self.sum = sum
30 |         self.count = count
31 |         # self.reset()
32 | 
33 |     def reset(self):  # reset all statistics to zero
34 |         self.val = 0
35 |         self.avg = 0
36 |         self.sum = 0
37 |         self.count = 0
38 | 
39 |     def update(self, val, n=1):  # record a value observed n times and refresh the running average
40 |         self.val = val
41 |         self.sum += val * n
42 |         self.count += n
43 |         self.avg = self.sum / self.count
44 | 
45 | 
46 | def save_checkpoint(state, is_best, fpath='checkpoint.pth.tar'):  # save a training checkpoint
47 |     mkdir_if_missing(osp.dirname(fpath))  # create the target directory if it does not exist yet
48 |     torch.save(state, fpath)  # serialize the model state to fpath
49 |     if is_best:  # if this is the best model so far, keep an extra copy
50 |         shutil.copy(fpath, osp.join(osp.dirname(fpath), 'best_model.pth.tar'))
51 |         # shutil.copy(src, dst): copies the file contents from src to dst, here as best_model.pth.tar
52 | 
53 | 
54 | class Logger(object):
55 |     """
56 |     Write console output to both stdout and an external text file (a tee-style logger).
57 |     Code imported from https://github.com/Cysu/open-reid/blob/master/reid/utils/logging.py.
58 |     """
59 |     def __init__(self, fpath=None):
60 |         self.console = sys.stdout  # keep a handle on the real console stream
61 |         self.file = None
62 |         if fpath is not None:  # if a log path is given, create and open the log file
63 |             mkdir_if_missing(os.path.dirname(fpath))
64 |             self.file = open(fpath, 'w')  # open the log file in write mode
65 | 
66 |     def __del__(self):  # close the streams when the logger is garbage-collected
67 |         self.close()
68 | 
69 |     def __enter__(self):
70 |         pass
71 | 
72 |     def __exit__(self, *args):  # close the streams when leaving a `with` block
73 |         self.close()
74 | 
75 |     def write(self, msg):  # write a message to the console and, if open, to the log file
76 |         self.console.write(msg)  # same as sys.stdout.write(msg)
77 |         if self.file is not None:
78 |             self.file.write(msg)  # mirror the message into the log file
79 | 
80 |     def flush(self):  # flush stdout so output appears on screen in real time
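        # Aside: AverageMeter above is used in main_video_person_reid.py's train()
        # roughly as follows (a condensed sketch, not the verbatim code):
        #     losses = AverageMeter()
        #     for batch_idx, (imgs, pids, _) in enumerate(trainloader):
        #         loss = ...                                 # xent + htri, see train()
        #         losses.update(loss.data[0], pids.size(0))  # running sum / count
        #     # losses.avg then holds the average loss per sample so far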
81 | self.console.flush() 82 | if self.file is not None: 83 | self.file.flush() # 刷新缓冲区,即将缓冲区中的数据立刻写入文件,同时清空缓冲区,不需要是被动的等待输出缓冲区写入。 84 | os.fsync(self.file.fileno()) # fileno()返回一个整型的文件描述符(file descriptor FD 整型),可用于底层操作系统的I/O 操作 85 | # fsync()强制将文件描述符为fd的文件写入硬盘 86 | 87 | def close(self): 88 | self.console.close() # 关闭控制台输出 89 | if self.file is not None: 90 | self.file.close() # 关闭文件 91 | 92 | 93 | def read_json(fpath): 94 | with open(fpath, 'r') as f: # 打开fpath文件,以只读模式,作为f 95 | obj = json.load(f) # 读取json信息,取得指定文件中内容,参数为要读取的文件对象 96 | return obj 97 | 98 | 99 | def write_json(obj, fpath): 100 | mkdir_if_missing(osp.dirname(fpath)) # 先建立文件 101 | with open(fpath, 'w') as f: # 打开fpath文件,以写入模式,作为f 102 | json.dump(obj, f, indent=4, separators=(',', ': ')) # 存入到指定文件,第一个参数为要存入的内容,第二个为文件的对象 103 | # indent=4,换行且按照indent的数值显示前面的空白分行显示 104 | # separators:分隔符,这表示dictionary内keys之间用“,”隔开,而KEY和value之间用“:”隔开。 105 | -------------------------------------------------------------------------------- /1数据集管理.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, absolute_import 2 | import os 3 | import glob 4 | import re 5 | import sys 6 | import urllib 7 | import tarfile 8 | import zipfile 9 | import os.path as osp 10 | from scipy.io import loadmat 11 | import numpy as np 12 | 13 | from utils import mkdir_if_missing, write_json, read_json 14 | 15 | """Dataset classes 数据集的种类""" 16 | 17 | 18 | class Mars(object): 19 | """ 20 | MARS 21 | 22 | Reference: 23 | Zheng et al. MARS: A Video Benchmark for Large-Scale Person Re-identification. ECCV 2016. 24 | 25 | Dataset statistics: 26 | # identities: 1261 27 | # tracklets: 8298 (train) + 1980 (query) + 9330 (gallery) 28 | # cameras: 6 29 | 30 | Args: 31 | min_seq_len (int): tracklet with length shorter than this value will be discarded (default: 0). 32 | 长度小于此值的tracklet将被丢弃 33 | """ 34 | root = '/media/ying/0BDD17830BDD1783/ReIdDataset/Mars' # 数据集的相对路径,下面是数据集的拆分信息 35 | train_name_path = osp.join(root, 'info/train_name.txt') # 包含训练集图片名称的路径 36 | test_name_path = osp.join(root, 'info/test_name.txt') # 包含测试集图片名称的路径 37 | track_train_info_path = osp.join(root, 'info/tracks_train_info.mat') # .mat是matlab格式的数据文件 38 | track_test_info_path = osp.join(root, 'info/tracks_test_info.mat') # 跟踪测试信息的路径 39 | query_IDX_path = osp.join(root, 'info/query_IDX.mat') # 查询身份的路径 40 | 41 | def __init__(self, min_seq_len=0): 42 | self._check_before_run() # 在运行前检查 43 | 44 | # prepare meta data 准备元数据 45 | train_names = self._get_names(self.train_name_path) # 获得训练集图片的所有名称 46 | test_names = self._get_names(self.test_name_path) # 获得测试集所有图片的名称 47 | track_train = loadmat(self.track_train_info_path) # 从scipy.io.loadmat读取.mat文件. 
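        # For reference: loadmat returns a dict keyed by the MATLAB variable name.
        # Each row of 'track_train_info' is [start_index, end_index, pid, camid]
        # with 1-based frame indices (see _process_data below), e.g. a hypothetical
        # row [1, 24, 5, 2] means frames 1..24 show person 5 as seen by camera 2.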
48 | track_train = track_train['track_train_info'] # 所有的元素都封装在ndarray 中。 49 | # 数据结构为 numpy.ndarray (8298, 4) 50 | track_test = loadmat(self.track_test_info_path)['track_test_info'] 51 | # numpy.ndarray (12180, 4) 52 | query_IDX = loadmat(self.query_IDX_path)['query_IDX'].squeeze() # squeeze()去除size为1的维度 53 | # numpy.ndarray (1980,) 54 | query_IDX -= 1 55 | # index from 0 56 | track_query = track_test[query_IDX, :] # 测试的视频段,从0开始 57 | gallery_IDX = [i for i in range(track_test.shape[0]) if i not in query_IDX] # 如果测试的索引不在查询中,那么它在图库中 58 | track_gallery = track_test[gallery_IDX, :] # 图库的视频段 59 | 60 | train, num_train_tracklets, num_train_pids, num_train_imgs = \ 61 | self._process_data(train_names, track_train, home_dir='bbox_train', relabel=True, min_seq_len=min_seq_len) 62 | 63 | query, num_query_tracklets, num_query_pids, num_query_imgs = \ 64 | self._process_data(test_names, track_query, home_dir='bbox_test', relabel=False, min_seq_len=min_seq_len) 65 | 66 | gallery, num_gallery_tracklets, num_gallery_pids, num_gallery_imgs = \ 67 | self._process_data(test_names, track_gallery, home_dir='bbox_test', relabel=False, min_seq_len=min_seq_len) 68 | 69 | num_imgs_per_tracklet = num_train_imgs + num_query_imgs + num_gallery_imgs # 每个视频段中包含的图片数 70 | min_num = np.min(num_imgs_per_tracklet) # 最小图片数 71 | max_num = np.max(num_imgs_per_tracklet) # 最大图片数 72 | avg_num = np.mean(num_imgs_per_tracklet) # 平均图片数 73 | 74 | num_total_pids = num_train_pids + num_query_pids # 行人身份的总数=训练+测试 75 | num_total_tracklets = num_train_tracklets + num_query_tracklets + num_gallery_tracklets # 视频段的总数 76 | 77 | print("=> MARS loaded") 78 | print("Dataset statistics:") 79 | print(" ------------------------------") 80 | print(" subset | # ids | # tracklets") 81 | print(" ------------------------------") 82 | print(" train | {:5d} | {:8d}".format(num_train_pids, num_train_tracklets)) 83 | print(" query | {:5d} | {:8d}".format(num_query_pids, num_query_tracklets)) 84 | print(" gallery | {:5d} | {:8d}".format(num_gallery_pids, num_gallery_tracklets)) 85 | print(" ------------------------------") 86 | print(" total | {:5d} | {:8d}".format(num_total_pids, num_total_tracklets)) 87 | print(" number of images per tracklet: {} ~ {}, average {:.1f}".format(min_num, max_num, avg_num)) 88 | print(" ------------------------------") 89 | 90 | self.train = train 91 | self.query = query 92 | self.gallery = gallery 93 | 94 | self.num_train_pids = num_train_pids 95 | self.num_query_pids = num_query_pids 96 | self.num_gallery_pids = num_gallery_pids 97 | 98 | def _check_before_run(self): 99 | """Check if all files are available before going deeper检查所有的文件是否在文件夹(路径)中""" 100 | if not osp.exists(self.root): 101 | raise RuntimeError("'{}' is not available".format(self.root)) 102 | if not osp.exists(self.train_name_path): 103 | raise RuntimeError("'{}' is not available".format(self.train_name_path)) 104 | if not osp.exists(self.test_name_path): 105 | raise RuntimeError("'{}' is not available".format(self.test_name_path)) 106 | if not osp.exists(self.track_train_info_path): 107 | raise RuntimeError("'{}' is not available".format(self.track_train_info_path)) 108 | if not osp.exists(self.track_test_info_path): 109 | raise RuntimeError("'{}' is not available".format(self.track_test_info_path)) 110 | if not osp.exists(self.query_IDX_path): 111 | raise RuntimeError("'{}' is not available".format(self.query_IDX_path)) 112 | 113 | def _get_names(self, fpath): 114 | names = [] 115 | with open(fpath, 'r') as f: # 以只读的方式打开路径中的文件,作为f 116 | for line in f: # 
遍历文件中的每一行 117 | new_line = line.rstrip() # rstrip()删除 string 字符串末尾的指定字符(默认为空格). 118 | names.append(new_line) # 将字符串添加到names列表的末尾 119 | return names 120 | 121 | def _process_data(self, names, meta_data, home_dir=None, relabel=False, min_seq_len=0): # 处理数据 122 | assert home_dir in ['bbox_train', 'bbox_test'] # home_dir为'bbox_train', 'bbox_test'两者之一 123 | num_tracklets = meta_data.shape[0] # 小视频段的数目 = 元数据的行数 124 | pid_list = list(set(meta_data[:, 2].tolist())) # meta_data[:, 2]读取第二列(列数从0开始)的数组元素 125 | # tolist()将数组转换成列表 set() 函数创建一个无序不重复元素集 126 | num_pids = len(pid_list) 127 | 128 | if relabel: # 重新标记 129 | pid2label = {pid: label for label, pid in enumerate(pid_list)} # pid: label 在pid_list列表中 130 | tracklets = [] # 小段视频 131 | num_imgs_per_tracklet = [] # 每个小段视频里的图片数 132 | 133 | for tracklet_idx in range(num_tracklets): # 遍历小段视频 134 | data = meta_data[tracklet_idx, ...] # 从元数据中获得数据 135 | start_index, end_index, pid, camid = data # 这些数据是图片起始索引,结束索引,身份,摄像头的id 136 | if pid == -1: # 如果不存在身份id 137 | continue # junk images are just ignored 忽略这些无用图片 138 | assert 1 <= camid <= 6 # 摄像头id 1~6之间 139 | if relabel: # 如果重新标记 140 | pid = pid2label[pid] # 获得原来pid对应的relabel后的label 141 | camid -= 1 # index starts from 0 摄像头的索引从0开始 142 | img_names = names[start_index-1:end_index] # 根据索引,获得对应图片的名字 143 | 144 | # make sure image names correspond to the same person 确保图片名字对应于同一个人 145 | pnames = [img_name[:4] for img_name in img_names] 146 | assert len(set(pnames)) == 1, "Error: a single tracklet contains different person images" 147 | 148 | # make sure all images are captured under the same camera # 确保所有的图片由同一个摄像头捕获 149 | camnames = [img_name[5] for img_name in img_names] 150 | assert len(set(camnames)) == 1, "Error: images are captured under different cameras!" 151 | 152 | # append image names with directory information 附加包含目录信息的图像名称 153 | img_paths = [osp.join(self.root, home_dir, img_name[:4], img_name) for img_name in img_names] 154 | if len(img_paths) >= min_seq_len: 155 | img_paths = tuple(img_paths) # 图像路径,以元组形成 156 | tracklets.append((img_paths, pid, camid)) # 将(图像路径,行人id,摄像头id)以列表形式,添加到tracklets列表后面 157 | num_imgs_per_tracklet.append(len(img_paths)) # 小段视频中包含的图像数 158 | 159 | num_tracklets = len(tracklets) 160 | 161 | return tracklets, num_tracklets, num_pids, num_imgs_per_tracklet 162 | # 返回小段视频,小段视频的数目,行人id的个数,每段视频中的图像数 163 | 164 | 165 | class iLIDSVID(object): 166 | """ 167 | iLIDS-VID 168 | 169 | Reference: 170 | Wang et al. Person Re-Identification by Video Ranking. ECCV 2014. 171 | 172 | Dataset statistics: 173 | # identities: 300 174 | # tracklets: 600 175 | # cameras: 2 176 | 177 | Args: 178 | split_id (int): indicates which split to use. There are totally 10 splits. 
179 | """ 180 | root = './data/ilids-vid' 181 | dataset_url = 'http://www.eecs.qmul.ac.uk/~xiatian/iLIDS-VID/iLIDS-VID.tar' 182 | data_dir = osp.join(root, 'i-LIDS-VID') 183 | split_dir = osp.join(root, 'train-test people splits') 184 | split_mat_path = osp.join(split_dir, 'train_test_splits_ilidsvid.mat') 185 | split_path = osp.join(root, 'splits.json') 186 | cam_1_path = osp.join(root, 'i-LIDS-VID/sequences/cam1') 187 | cam_2_path = osp.join(root, 'i-LIDS-VID/sequences/cam2') 188 | 189 | def __init__(self, split_id=0): 190 | self._download_data() 191 | self._check_before_run() 192 | 193 | self._prepare_split() 194 | splits = read_json(self.split_path) 195 | if split_id >= len(splits): 196 | raise ValueError("split_id exceeds range, received {}, but expected between 0 and {}".format(split_id, 197 | len(splits)-1)) 198 | split = splits[split_id] 199 | train_dirs, test_dirs = split['train'], split['test'] 200 | print("# train identites: {}, # test identites {}".format(len(train_dirs), len(test_dirs))) 201 | 202 | train, num_train_tracklets, num_train_pids, num_imgs_train = \ 203 | self._process_data(train_dirs, cam1=True, cam2=True) 204 | query, num_query_tracklets, num_query_pids, num_imgs_query = \ 205 | self._process_data(test_dirs, cam1=True, cam2=False) 206 | gallery, num_gallery_tracklets, num_gallery_pids, num_imgs_gallery = \ 207 | self._process_data(test_dirs, cam1=False, cam2=True) 208 | 209 | num_imgs_per_tracklet = num_imgs_train + num_imgs_query + num_imgs_gallery 210 | min_num = np.min(num_imgs_per_tracklet) 211 | max_num = np.max(num_imgs_per_tracklet) 212 | avg_num = np.mean(num_imgs_per_tracklet) 213 | 214 | num_total_pids = num_train_pids + num_query_pids 215 | num_total_tracklets = num_train_tracklets + num_query_tracklets + num_gallery_tracklets 216 | 217 | print("=> iLIDS-VID loaded") 218 | print("Dataset statistics:") 219 | print(" ------------------------------") 220 | print(" subset | # ids | # tracklets") 221 | print(" ------------------------------") 222 | print(" train | {:5d} | {:8d}".format(num_train_pids, num_train_tracklets)) 223 | print(" query | {:5d} | {:8d}".format(num_query_pids, num_query_tracklets)) 224 | print(" gallery | {:5d} | {:8d}".format(num_gallery_pids, num_gallery_tracklets)) 225 | print(" ------------------------------") 226 | print(" total | {:5d} | {:8d}".format(num_total_pids, num_total_tracklets)) 227 | print(" number of images per tracklet: {} ~ {}, average {:.1f}".format(min_num, max_num, avg_num)) 228 | print(" ------------------------------") 229 | 230 | self.train = train 231 | self.query = query 232 | self.gallery = gallery 233 | 234 | self.num_train_pids = num_train_pids 235 | self.num_query_pids = num_query_pids 236 | self.num_gallery_pids = num_gallery_pids 237 | 238 | def _download_data(self): 239 | if osp.exists(self.root): 240 | print("This dataset has been downloaded.") 241 | return 242 | 243 | mkdir_if_missing(self.root) 244 | fpath = osp.join(self.root, osp.basename(self.dataset_url)) 245 | 246 | print("Downloading iLIDS-VID dataset") 247 | url_opener = urllib.URLopener() 248 | url_opener.retrieve(self.dataset_url, fpath) 249 | 250 | print("Extracting files") 251 | tar = tarfile.open(fpath) 252 | tar.extractall(path=self.root) 253 | tar.close() 254 | 255 | def _check_before_run(self): 256 | """Check if all files are available before going deeper""" 257 | if not osp.exists(self.root): 258 | raise RuntimeError("'{}' is not available".format(self.root)) 259 | if not osp.exists(self.data_dir): 260 | raise RuntimeError("'{}' is 
not available".format(self.data_dir)) 261 | if not osp.exists(self.split_dir): 262 | raise RuntimeError("'{}' is not available".format(self.split_dir)) 263 | 264 | def _prepare_split(self): 265 | if not osp.exists(self.split_path): 266 | print("Creating splits") 267 | mat_split_data = loadmat(self.split_mat_path)['ls_set'] 268 | 269 | num_splits = mat_split_data.shape[0] 270 | num_total_ids = mat_split_data.shape[1] 271 | assert num_splits == 10 272 | assert num_total_ids == 300 273 | num_ids_each = num_total_ids/2 274 | 275 | # pids in mat_split_data are indices, so we need to transform them 276 | # to real pids 277 | person_cam1_dirs = os.listdir(self.cam_1_path) 278 | person_cam2_dirs = os.listdir(self.cam_2_path) 279 | 280 | # make sure persons in one camera view can be found in the other camera view 281 | assert set(person_cam1_dirs) == set(person_cam2_dirs) 282 | 283 | splits = [] 284 | for i_split in range(num_splits): 285 | # first 50% for testing and the remaining for training, following Wang et al. ECCV'14. 286 | train_idxs = sorted(list(mat_split_data[i_split, num_ids_each:])) 287 | test_idxs = sorted(list(mat_split_data[i_split, :num_ids_each])) 288 | 289 | train_idxs = [int(i)-1 for i in train_idxs] 290 | test_idxs = [int(i)-1 for i in test_idxs] 291 | 292 | # transform pids to person dir names 293 | train_dirs = [person_cam1_dirs[i] for i in train_idxs] 294 | test_dirs = [person_cam1_dirs[i] for i in test_idxs] 295 | 296 | split = {'train': train_dirs, 'test': test_dirs} 297 | splits.append(split) 298 | 299 | print("Totally {} splits are created, following Wang et al. ECCV'14".format(len(splits))) 300 | print("Split file is saved to {}".format(self.split_path)) 301 | write_json(splits, self.split_path) 302 | 303 | print("Splits created") 304 | 305 | def _process_data(self, dirnames, cam1=True, cam2=True): 306 | tracklets = [] 307 | num_imgs_per_tracklet = [] 308 | dirname2pid = {dirname: i for i, dirname in enumerate(dirnames)} 309 | 310 | for dirname in dirnames: 311 | if cam1: 312 | person_dir = osp.join(self.cam_1_path, dirname) 313 | img_names = glob.glob(osp.join(person_dir, '*.png')) 314 | assert len(img_names) > 0 315 | img_names = tuple(img_names) 316 | pid = dirname2pid[dirname] 317 | tracklets.append((img_names, pid, 0)) 318 | num_imgs_per_tracklet.append(len(img_names)) 319 | 320 | if cam2: 321 | person_dir = osp.join(self.cam_2_path, dirname) 322 | img_names = glob.glob(osp.join(person_dir, '*.png')) 323 | assert len(img_names) > 0 324 | img_names = tuple(img_names) 325 | pid = dirname2pid[dirname] 326 | tracklets.append((img_names, pid, 1)) 327 | num_imgs_per_tracklet.append(len(img_names)) 328 | 329 | num_tracklets = len(tracklets) 330 | num_pids = len(dirnames) 331 | 332 | return tracklets, num_tracklets, num_pids, num_imgs_per_tracklet 333 | 334 | 335 | class PRID(object): 336 | """ 337 | PRID 338 | 339 | Reference: 340 | Hirzer et al. Person Re-Identification by Descriptive and Discriminative Classification. SCIA 2011. 341 | 342 | Dataset statistics: 343 | # identities: 200 344 | # tracklets: 400 345 | # cameras: 2 346 | 347 | Args: 348 | split_id (int): indicates which split to use. There are totally 10 splits. 349 | min_seq_len (int): tracklet with length shorter than this value will be discarded (default: 0). 
350 | """ 351 | root = './data/prid2011' 352 | dataset_url = 'https://files.icg.tugraz.at/f/6ab7e8ce8f/?raw=1' 353 | split_path = osp.join(root, 'splits_prid2011.json') 354 | cam_a_path = osp.join(root, 'prid_2011', 'multi_shot', 'cam_a') 355 | cam_b_path = osp.join(root, 'prid_2011', 'multi_shot', 'cam_b') 356 | 357 | def __init__(self, split_id=0, min_seq_len=0): 358 | self._check_before_run() 359 | splits = read_json(self.split_path) 360 | if split_id >= len(splits): 361 | raise ValueError("split_id exceeds range, received {}, but expected between 0 and {}".format(split_id, 362 | len(splits)-1)) 363 | split = splits[split_id] 364 | train_dirs, test_dirs = split['train'], split['test'] 365 | print("# train identites: {}, # test identites {}".format(len(train_dirs), len(test_dirs))) 366 | 367 | train, num_train_tracklets, num_train_pids, num_imgs_train = \ 368 | self._process_data(train_dirs, cam1=True, cam2=True) 369 | query, num_query_tracklets, num_query_pids, num_imgs_query = \ 370 | self._process_data(test_dirs, cam1=True, cam2=False) 371 | gallery, num_gallery_tracklets, num_gallery_pids, num_imgs_gallery = \ 372 | self._process_data(test_dirs, cam1=False, cam2=True) 373 | 374 | num_imgs_per_tracklet = num_imgs_train + num_imgs_query + num_imgs_gallery 375 | min_num = np.min(num_imgs_per_tracklet) 376 | max_num = np.max(num_imgs_per_tracklet) 377 | avg_num = np.mean(num_imgs_per_tracklet) 378 | 379 | num_total_pids = num_train_pids + num_query_pids 380 | num_total_tracklets = num_train_tracklets + num_query_tracklets + num_gallery_tracklets 381 | 382 | print("=> PRID-2011 loaded") 383 | print("Dataset statistics:") 384 | print(" ------------------------------") 385 | print(" subset | # ids | # tracklets") 386 | print(" ------------------------------") 387 | print(" train | {:5d} | {:8d}".format(num_train_pids, num_train_tracklets)) 388 | print(" query | {:5d} | {:8d}".format(num_query_pids, num_query_tracklets)) 389 | print(" gallery | {:5d} | {:8d}".format(num_gallery_pids, num_gallery_tracklets)) 390 | print(" ------------------------------") 391 | print(" total | {:5d} | {:8d}".format(num_total_pids, num_total_tracklets)) 392 | print(" number of images per tracklet: {} ~ {}, average {:.1f}".format(min_num, max_num, avg_num)) 393 | print(" ------------------------------") 394 | 395 | self.train = train 396 | self.query = query 397 | self.gallery = gallery 398 | 399 | self.num_train_pids = num_train_pids 400 | self.num_query_pids = num_query_pids 401 | self.num_gallery_pids = num_gallery_pids 402 | 403 | def _check_before_run(self): 404 | """Check if all files are available before going deeper""" 405 | if not osp.exists(self.root): 406 | raise RuntimeError("'{}' is not available".format(self.root)) 407 | 408 | def _process_data(self, dirnames, cam1=True, cam2=True): 409 | tracklets = [] 410 | num_imgs_per_tracklet = [] 411 | dirname2pid = {dirname: i for i, dirname in enumerate(dirnames)} 412 | 413 | for dirname in dirnames: 414 | if cam1: 415 | person_dir = osp.join(self.cam_a_path, dirname) 416 | img_names = glob.glob(osp.join(person_dir, '*.png')) 417 | assert len(img_names) > 0 418 | img_names = tuple(img_names) 419 | pid = dirname2pid[dirname] 420 | tracklets.append((img_names, pid, 0)) 421 | num_imgs_per_tracklet.append(len(img_names)) 422 | 423 | if cam2: 424 | person_dir = osp.join(self.cam_b_path, dirname) 425 | img_names = glob.glob(osp.join(person_dir, '*.png')) 426 | assert len(img_names) > 0 427 | img_names = tuple(img_names) 428 | pid = dirname2pid[dirname] 429 | 
tracklets.append((img_names, pid, 1)) 430 | num_imgs_per_tracklet.append(len(img_names)) 431 | 432 | num_tracklets = len(tracklets) 433 | num_pids = len(dirnames) 434 | 435 | return tracklets, num_tracklets, num_pids, num_imgs_per_tracklet 436 | 437 | 438 | """Create dataset""" 439 | 440 | __factory = { 441 | 'mars': Mars, 442 | 'ilidsvid': iLIDSVID, 443 | 'prid': PRID, 444 | } 445 | 446 | 447 | def get_names(): 448 | return __factory.keys() 449 | 450 | 451 | def init_dataset(name, *args, **kwargs): 452 | if name not in __factory.keys(): 453 | raise KeyError("Unknown dataset: {}".format(name)) 454 | return __factory[name](*args, **kwargs) 455 | 456 | 457 | if __name__ == '__main__': 458 | # test 459 | # dataset = Market1501() 460 | dataset = Mars() 461 | # dataset = iLIDSVID() 462 | # dataset = PRID() 463 | -------------------------------------------------------------------------------- /2从数据集中导入视频图片.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, absolute_import 2 | import os 3 | from PIL import Image 4 | import numpy as np 5 | 6 | import torch 7 | from torch.utils.data import Dataset 8 | import random 9 | 10 | 11 | def read_image(img_path): 12 | """Keep reading image until succeed.一直进行读入图片操作,直到成功。 13 | This can avoid IOError incurred by heavy IO process.""" 14 | got_img = False # 读取图像的标志,初始为False 15 | if not os.path.exists(img_path): 16 | raise IOError('{} does not exist'.format(img_path)) 17 | while not got_img: 18 | try: 19 | # 从图像路径读取图片,并将图片转换为RGB格式 20 | img = Image.open(img_path).convert('RGB') 21 | got_img = True # 成功读取图片,则读取图像的标志置为TRUE 22 | except IOError: # 读取图片出错,报错 23 | print("IOError incurred when reading '{}'. Will redo. Don't worry. Just chill.".format(img_path)) 24 | pass 25 | return img 26 | 27 | 28 | class VideoDataset(Dataset): 29 | """Video Person ReID Dataset.基于视频reid的数据集。 30 | Note batch data has shape (batch, seq_len, channel, height, width). 31 | 注意,一个批次的数据的形状为(batch,序列长度,通道数,图片的高度,图片的宽度) 32 | """ 33 | # 三种采样方法,平等的,随机,所有 34 | sample_methods = ['evenly', 'random', 'all'] 35 | 36 | # 初始化数据集 37 | def __init__(self, dataset, seq_len=15, sample='evenly', transform=None): 38 | self.dataset = dataset 39 | self.seq_len = seq_len 40 | self.sample = sample 41 | self.transform = transform 42 | 43 | def __len__(self): 44 | return len(self.dataset) # 数据集长度 45 | 46 | def __getitem__(self, index): 47 | img_paths, pid, camid = self.dataset[index] # 从数据集中图片的索引(名称)获得图片路径,行人的身份,摄像头的id 48 | num = len(img_paths) 49 | 50 | # 采样方式:1.随机采用 51 | if self.sample == 'random': 52 | """ 53 | Randomly sample seq_len consecutive frames from num frames, 54 | if num is smaller than seq_len, then replicate items. 55 | This sampling strategy is used in training phase. 
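            Example: with num=10 and seq_len=15, the loop below selects indices
            [0..9] and then pads by re-appending from the front, giving the
            15 indices [0..9, 0, 1, 2, 3, 4] in temporal order.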
56 | 随机从num个视频帧中采样seq_len长度的连续帧,如果num小于seq_len,则复制它们,使之等于seq_len。在训练阶段采用这种采样策略。 57 | """ 58 | frame_indices = list(range(num)) # 视频帧的索引0-num 59 | rand_end = max(0, len(frame_indices) - self.seq_len - 1) # 随机范围的终止 60 | begin_index = random.randint(0, rand_end) # 开始的索引值 61 | end_index = min(begin_index + self.seq_len, len(frame_indices)) # 结束的索引值 62 | 63 | indices = frame_indices[begin_index:end_index] # 最终的索引范围 64 | 65 | for index in indices: # 遍历索引 66 | if len(indices) >= self.seq_len: # 如果索引的长度大于等于序列长度 67 | break 68 | indices.append(index) # 向列表末尾增加索引 69 | indices = np.array(indices) # 将列表转换成数组 70 | 71 | imgs = [] # 图像列表 72 | for index in indices: # 遍历新的索引 73 | index = int(index) # 首先保证索引是整数 74 | img_path = img_paths[index] # 从数据集中获得图像的路径,并结合索引,确定图像的具体路径 75 | img = read_image(img_path) # 根据路径读取图片 76 | if self.transform is not None: # 执行随机裁剪 77 | img = self.transform(img) 78 | img = img.unsqueeze(0) # unsqueeze(0)函数,增加维度,在第0维增加1个维度 79 | imgs.append(img) # 向图像列表末尾增加图像索引.存储这些索引,保证时间顺序 80 | imgs = torch.cat(imgs, dim=0) # cat([a,b],dim),若dim=0,则将a,b按行放在一起. 若dim=1,则a,b按列放在一起 81 | # imgs = imgs.permute(1,0,2,3) 82 | return imgs, pid, camid 83 | 84 | elif self.sample == 'dense': 85 | """ 86 | Sample all frames in a video into a list of clips, each clip contains seq_len frames, batch_size needs to 87 | be set to 1. 88 | 将视频中的所有帧采样到一系列的clips,每个clip包含seq_len个帧,批次大小需要设置为1 89 | This sampling strategy is used in test phase. 在测试阶段采用密集采样策略. 90 | """ 91 | cur_index = 0 # 初始索引为0 92 | frame_indices = list(range(num)) # 视频帧的索引范围 93 | indices_list = [] 94 | while num - cur_index > self.seq_len: # 如果视频帧的数目大于seq_len 95 | indices_list.append(frame_indices[cur_index:cur_index + self.seq_len]) 96 | # 将帧的索引[0,seq_len],添加到原来帧的索引列表的末尾 97 | cur_index += self.seq_len # 当前索引值=0+seq_len 98 | 99 | last_seq = frame_indices[cur_index:] # 最后留下的序列为当前索引值后面的列表值 100 | for index in last_seq: # 遍历剩下的索引值 101 | if len(last_seq) >= self.seq_len: # 如果剩下的索引值长度大于每个身份要采样的序列长度 102 | break 103 | last_seq.append(index) # 将索引加到列表后面 104 | indices_list.append(last_seq) # 存储这些索引,保证时间顺序 105 | 106 | imgs_list = [] 107 | for indices in indices_list: # 遍历所有索引中的视频段的索引,即有几个视频段 108 | imgs = [] 109 | for index in indices: # 在每个视频段中,遍历每一帧的索引 110 | index = int(index) 111 | img_path = img_paths[index] # 确定图像的路径 112 | img = read_image(img_path) # 读图 113 | if self.transform is not None: 114 | img = self.transform(img) 115 | img = img.unsqueeze(0) 116 | imgs.append(img) 117 | imgs = torch.cat(imgs, dim=0) 118 | # imgs = imgs.permute(1,0,2,3) 119 | imgs_list.append(imgs) # 将每个视频段的图像,存在一个总的list中 120 | imgs_array = torch.stack(imgs_list) # 沿着一个新维度对输入张量序列进行连接。 121 | return imgs_array, pid, camid 122 | 123 | else: 124 | raise KeyError("Unknown sample method: {}. Expected one of {}".format(self.sample, self.sample_methods)) 125 | -------------------------------------------------------------------------------- /3随机采样,形成pk批次.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from collections import defaultdict 3 | import numpy as np 4 | 5 | import torch 6 | 7 | 8 | class RandomIdentitySampler(object): # 随机为每个id选择一定数目的样本,形成一个批次 9 | """ 10 | Randomly sample N identities, then for each identity, 首先随机采样N个身份,然后对每个身份,随机选择K个例子,根据这种策略 11 | randomly sample K instances, therefore batch size is N*K. 形成一个N*K批次. 12 | 13 | Code imported from https://github.com/Cysu/open-reid/blob/master/reid/utils/data/sampler.py. 14 | 15 | Args: 16 | data_source (Dataset): dataset to sample from. 
一个包含样本的参数 17 | num_instances (int): number of instances per identity. 每个身份的样本数 18 | """ 19 | def __init__(self, data_source, num_instances=4): 20 | self.data_source = data_source # 数据集 21 | self.num_instances = num_instances # 默认给每个身份选择4帧图片 22 | self.index_dic = defaultdict(list) # 索引字典的初始化,采用defaultdict()方法 23 | # defaultdict的作用是在于,当字典里的key不存在但被查找时,返回的不是keyError而是一个默认值,[] 24 | for index, (_, pid, _) in enumerate(data_source): # 将数据集中的行人id和索引放进字典中 25 | self.index_dic[pid].append(index) 26 | self.pids = list(self.index_dic.keys()) # 行人id是字典的key值 27 | self.num_identities = len(self.pids) # 行人的个数是行人id的长度 28 | 29 | def __iter__(self): # 迭代器 30 | indices = torch.randperm(self.num_identities) # torch.randperm(n),给定参数n,返回一个从0到n-1的随机整数排列. 31 | ret = [] # 初始化个列表 32 | for i in indices: # 遍历数据集中的行人身份 33 | pid = self.pids[i] # 行人的身份 34 | t = self.index_dic[pid] # 获得字典中,行人身份对应的样本数 35 | replace = False if len(t) >= self.num_instances else True 36 | # 如果t>=4,则replace = False 如果t < 4,replace = True ,即样本数小于4,则复制前面的样本 37 | t = np.random.choice(t, size=self.num_instances, replace=replace) # np.random.choice(a,size,replace,p) 38 | # 从t中选择size个样本,replace = true的话有可能会出现重复的样本,就是将前面抽出来的样本重新放回去. 39 | ret.extend(t) # 在list的末尾一次性追加另一个序列中的多个值,即用新列表扩展原来的列表. 40 | return iter(ret) # iter()函数,迭代器. 41 | 42 | def __len__(self): 43 | return self.num_identities * self.num_instances # 一个批次的长度 44 | -------------------------------------------------------------------------------- /4随机图像裁剪.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from torchvision.transforms import * 4 | from PIL import Image 5 | import random 6 | import numpy as np 7 | 8 | 9 | class Random2DTranslation(object): 10 | """ 11 | With a probability, first increase image size to (1 + 1/8), and then perform random crop. 12 | 设定一个概率p,首先将图像尺寸扩大到(1 + 1/8),然后执行随机裁剪 13 | Args: 14 | height (int): target height. 目标图像的高度 15 | width (int): target width. 目标图像的宽度 16 | p (float): probability of performing this transformation. Default: 0.5. 执行改变图像大小,进行裁剪的概率 17 | """ 18 | def __init__(self, height, width, p=0.5, interpolation=Image.BILINEAR): 19 | self.height = height 20 | self.width = width 21 | self.p = p 22 | self.interpolation = interpolation # 插值方式:双线性插值 23 | 24 | def __call__(self, img): 25 | """ 26 | Args: 27 | img (PIL Image): Image to be cropped. 28 | 29 | Returns: 30 | PIL Image: Cropped image. 31 | """ 32 | if random.random() < self.p: # random.random()返回随机生成的一个实数,它在[0,1)范围内. 33 | return img.resize((self.width, self.height), self.interpolation) # 不改变图像的大小,((图像的宽, 高), 双线性插值) 34 | 35 | new_width, new_height = int(round(self.width * 1.125)), int(round(self.height * 1.125)) 36 | # 先将图像尺寸扩大到原来的1.125倍, round函数:返回浮点数的四舍五入值 37 | resized_img = img.resize((new_width, new_height), self.interpolation) # 改变图像的大小,采用双线性插值方法 38 | x_maxrange = new_width - self.width # 宽度的余量 39 | y_maxrange = new_height - self.height # 高度的余量 40 | # random.uniform(x,y) 随机生成一个实数,它在[x,y)范围内. 
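        # Worked example (height=224, width=112, the defaults in main): the image is
        # first resized to 126x252, so x_maxrange=14 and y_maxrange=28; (x1, y1) is
        # then drawn uniformly from that slack so the crop stays inside the image.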
41 |         x1 = int(round(random.uniform(0, x_maxrange)))
42 |         y1 = int(round(random.uniform(0, y_maxrange)))  # randomly choose the top-left corner of the crop
43 |         box = (x1, y1, x1 + self.width, y1 + self.height)  # crop region as (left, upper, right, lower)
44 |         # PIL places the origin (0, 0) at the top-left corner, so the last two coordinates must be the larger ones
45 |         cropped_img = resized_img.crop(box)  # crop the enlarged image back to the target size
46 | 
47 |         return cropped_img
48 | 
49 | 
50 | if __name__ == '__main__':
51 |     pass
52 | 
--------------------------------------------------------------------------------
/5损失函数.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | 
3 | import torch
4 | from torch import nn
5 | from torch.autograd import Variable
6 | 
7 | """
8 | Shorthands for the losses:
9 | - CrossEntropyLabelSmooth: xent  (cross entropy with label-smoothing regularization)
10 | - TripletLoss: htri  (triplet loss with hard example mining)
11 | - CenterLoss: cent  (center loss)
12 | """
13 | __all__ = ['CrossEntropyLabelSmooth', 'TripletLoss', 'CenterLoss']
14 | 
15 | 
16 | class CrossEntropyLabelSmooth(nn.Module):
17 |     """Cross entropy loss with label smoothing regularizer.
18 | 
19 |     Reference:
20 |     Szegedy et al. Rethinking the Inception Architecture for Computer Vision. CVPR 2016.
21 |     Equation: y = (1 - epsilon) * y + epsilon / K.
22 | 
23 |     Args:
24 |         num_classes (int): number of classes.
25 |         epsilon (float): smoothing weight.
26 |     """
27 |     def __init__(self, num_classes, epsilon=0.1, use_gpu=True):
28 |         super(CrossEntropyLabelSmooth, self).__init__()  # super() invokes the parent (nn.Module) initializer
29 |         self.num_classes = num_classes
30 |         self.epsilon = epsilon  # smoothing weight, 0.1 by default
31 |         self.use_gpu = use_gpu
32 |         self.logsoftmax = nn.LogSoftmax(dim=1)  # Log(Softmax(x)) along the class dimension
33 | 
34 |     def forward(self, inputs, targets):
35 |         """
36 |         Args:
37 |             inputs: prediction matrix (before softmax) with shape (batch_size, num_classes)
38 |             targets: ground truth labels with shape (batch_size)
39 |         """
40 |         log_probs = self.logsoftmax(inputs)  # log-probabilities, shape (batch_size, num_classes)
41 |         # Build the one-hot targets in a single expression: the index passed to
42 |         # scatter_ must be the original integer labels, so `targets` must not be
43 |         # overwritten with zeros before it is used as the index.
44 |         targets = torch.zeros(log_probs.size()).scatter_(1, targets.unsqueeze(1).data.cpu(), 1)
45 |         # scatter_(dim, index, src) writes src into the tensor at the positions given by index along dim
46 |         if self.use_gpu:
47 |             targets = targets.cuda()
48 |         targets = Variable(targets, requires_grad=False)
49 |         targets = (1 - self.epsilon) * targets + self.epsilon / self.num_classes  # smooth the one-hot labels
50 |         loss = (- targets * log_probs).mean(0).sum()
51 |         return loss
52 | 
53 | 
54 | class TripletLoss(nn.Module):
55 |     """Triplet loss with hard positive/negative mining.
56 | 
57 |     Reference:
58 |     Hermans et al. In Defense of the Triplet Loss for Person Re-Identification. arXiv:1703.07737.
59 | 
60 |     Code imported from https://github.com/Cysu/open-reid/blob/master/reid/loss/triplet.py.
61 | 
62 |     Args:
63 |         margin (float): margin for triplet.
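    For each anchor a, the batch-hard form implemented below is
        loss = max(0, max_p d(a, p) - min_n d(a, n) + margin),
    with p over same-identity samples and n over other identities in the
    batch, averaged over anchors by nn.MarginRankingLoss.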
64 | """ 65 | def __init__(self, margin=0.3): 66 | super(TripletLoss, self).__init__() 67 | self.margin = margin 68 | self.ranking_loss = nn.MarginRankingLoss(margin=margin) 69 | 70 | def forward(self, inputs, targets): 71 | """ 72 | Args: 73 | inputs: feature matrix with shape (batch_size, feat_dim) 74 | targets: ground truth labels with shape (num_classes) 75 | """ 76 | n = inputs.size(0) 77 | # Compute pairwise distance, replace by the official when merged 78 | dist = torch.pow(inputs, 2).sum(dim=1, keepdim=True).expand(n, n) 79 | dist = dist + dist.t() 80 | dist.addmm_(1, -2, inputs, inputs.t()) 81 | dist = dist.clamp(min=1e-12).sqrt() # for numerical stability 82 | # For each anchor, find the hardest positive and negative 83 | mask = targets.expand(n, n).eq(targets.expand(n, n).t()) 84 | dist_ap, dist_an = [], [] 85 | for i in range(n): 86 | dist_ap.append(dist[i][mask[i]].max()) 87 | dist_an.append(dist[i][mask[i] == 0].min()) 88 | dist_ap = torch.cat(dist_ap) 89 | dist_an = torch.cat(dist_an) 90 | # Compute ranking hinge loss 91 | y = dist_an.data.new() 92 | y.resize_as_(dist_an.data) 93 | y.fill_(1) 94 | y = Variable(y) 95 | loss = self.ranking_loss(dist_an, dist_ap, y) 96 | return loss 97 | 98 | 99 | class CenterLoss(nn.Module): 100 | """Center loss. 101 | 102 | Reference: 103 | Wen et al. A Discriminative Feature Learning Approach for Deep Face Recognition. ECCV 2016. 104 | 105 | Args: 106 | num_classes (int): number of classes. 107 | feat_dim (int): feature dimension. 108 | """ 109 | def __init__(self, num_classes=10, feat_dim=2, use_gpu=True): 110 | super(CenterLoss, self).__init__() 111 | self.num_classes = num_classes 112 | self.feat_dim = feat_dim 113 | self.use_gpu = use_gpu 114 | 115 | if self.use_gpu: 116 | self.centers = nn.Parameter(torch.randn(self.num_classes, self.feat_dim).cuda()) 117 | else: 118 | self.centers = nn.Parameter(torch.randn(self.num_classes, self.feat_dim)) 119 | 120 | def forward(self, x, labels): 121 | """ 122 | Args: 123 | x: feature matrix with shape (batch_size, feat_dim). 124 | labels: ground truth labels with shape (num_classes). 
125 | """ 126 | batch_size = x.size(0) 127 | distmat = torch.pow(x, 2).sum(dim=1, keepdim=True).expand(batch_size, self.num_classes) + \ 128 | torch.pow(self.centers, 2).sum(dim=1, keepdim=True).expand(self.num_classes, batch_size).t() 129 | distmat.addmm_(1, -2, x, self.centers.t()) 130 | 131 | classes = torch.arange(self.num_classes).long() 132 | if self.use_gpu: 133 | classes = classes.cuda() 134 | classes = Variable(classes) 135 | labels = labels.unsqueeze(1).expand(batch_size, self.num_classes) 136 | mask = labels.eq(classes.expand(batch_size, self.num_classes)) 137 | 138 | dist = [] 139 | for i in range(batch_size): 140 | value = distmat[i][mask[i]] 141 | value = value.clamp(min=1e-12, max=1e+12) # for numerical stability 142 | dist.append(value) 143 | dist = torch.cat(dist) 144 | loss = dist.mean() 145 | 146 | return loss 147 | 148 | 149 | if __name__ == '__main__': 150 | pass 151 | -------------------------------------------------------------------------------- /6评估模型的方法.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, absolute_import 2 | import numpy as np 3 | import copy 4 | 5 | 6 | def evaluate(distmat, q_pids, g_pids, q_camids, g_camids, max_rank=50): 7 | # 评估函数,参数有(距离矩阵,查询行人的id,图库行人的id,查询的摄像头,图库的摄像头,最大秩) 8 | num_q, num_g = distmat.shape # num_q查询的行人数=距离矩阵的行, num_g图库的行人数=距离矩阵的列 9 | if num_g < max_rank: # 图库的行人数 < 最大秩 10 | max_rank = num_g # 更改最大秩 = 图库的行人数 ,说明图库的样本数太小了 11 | print("Note: number of gallery samples is quite small, got {}".format(num_g)) 12 | indices = np.argsort(distmat, axis=1) # top_k 13 | # 将矩阵distmat按照axis排序,并返回排序后的下标 14 | matches = (g_pids[indices] == q_pids[:, np.newaxis]) # q_pids[:, np.newaxis] 在原有维度后面加一个维度 (n,1) 15 | # 匹配,相同返回true,不同返回false, 16 | matches = matches.astype(np.int32) # 将bool类型的true false转换成int类型的1, 0 17 | 18 | # compute cmc curve for each query 对于每个查询计算它的cmc曲线 19 | all_cmc = [] 20 | all_AP = [] 21 | num_valid_q = 0. 22 | for q_idx in range(num_q): # 遍历每个查询的行人 23 | # get query pid and camid 24 | q_pid = q_pids[q_idx] # 获得每个查询的行人身份 25 | q_camid = q_camids[q_idx] # 获得对应的摄像头id 26 | 27 | # remove gallery samples that have the same pid and camid with query 从图库样本中删除与查询集中有相同身份和摄像头id 28 | order = indices[q_idx] # 查询集索引 29 | remove = (g_pids[order] == q_pid) & (g_camids[order] == q_camid) # 相等为1,不等为0 30 | keep = np.invert(remove) # np.invert() 位非,对每一位取反 ,则删去了相等的,变成去除相等的为0 ,保留不等的为1 31 | 32 | # compute cmc curve 累积匹配曲线 33 | orig_cmc = matches[q_idx][keep] # binary vector, positions with value 1 are correct matches 34 | # 二进制向量,值为1的位置是正确的匹配 35 | if not np.any(orig_cmc): # np.any 相当于或运算.如果可迭代对象orig_cmc中任意存在每一个元素为True则返回True, 36 | # this condition is true when query identity does not appear in gallery 37 | # 当查询的身份未出现在图库中时,此条件为真 38 | continue 39 | 40 | cmc = orig_cmc.cumsum() # 返回累加和,不改变数据形状 41 | cmc[cmc > 1] = 1 # cmc最大为1,超过1的置为1 42 | 43 | all_cmc.append(cmc[:max_rank]) # 将前max_rank的cmc添加到all_cmc列表的后面 rank1到rank50 44 | num_valid_q += 1. # 有效的查询身份+1 45 | 46 | # compute average precision 计算平均精度 47 | # reference: https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Average_precision 48 | num_rel = orig_cmc.sum() # 所有元素求和 49 | tmp_cmc = orig_cmc.cumsum() # 累加和,不改变数据形状 50 | tmp_cmc = [x / (i+1.) for i, x in enumerate(tmp_cmc)] # enumerate() 函数,返回数据下标和数据(i,x) 51 | # 计算top_i的cmc ..x / (i+1.) 
52 | tmp_cmc = np.asarray(tmp_cmc) * orig_cmc # np.asarray(tmp_cmc),数据类型转换为数组, 只保留正确的匹配 , 错误的值为0 53 | AP = tmp_cmc.sum() / num_rel # 平均精度 = 正确匹配的元素求和 除以原来所有元素的总和 54 | all_AP.append(AP) 55 | 56 | assert num_valid_q > 0, "Error: all query identities do not appear in gallery" 57 | 58 | all_cmc = np.asarray(all_cmc).astype(np.float32) 59 | all_cmc = all_cmc.sum(0) / num_valid_q 60 | mAP = np.mean(all_AP) 61 | 62 | return all_cmc, mAP 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # something you should know for video_reid 2 | resnet50的结构图 https://blog.csdn.net/Seven_year_Promise/article/details/69358681 3 | 4 | resnet50的详细介绍 https://blog.csdn.net/Seven_year_Promise/article/details/69360488 5 | 6 | code forked from https://github.com/jiyanggao/Video-Person-ReID 7 | 8 | ###############video_reid 数据集介绍############### 9 | 1. MARS ----> https://blog.csdn.net/qq_34132310/article/details/83869605 10 | -------------------------------------------------------------------------------- /info/query_IDX.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AsuradaYuci/understand_videobased_reid/cef368960212808351dfcd1e65487b8683806257/info/query_IDX.mat -------------------------------------------------------------------------------- /info/tracks_test_info.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AsuradaYuci/understand_videobased_reid/cef368960212808351dfcd1e65487b8683806257/info/tracks_test_info.mat -------------------------------------------------------------------------------- /info/tracks_train_info.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AsuradaYuci/understand_videobased_reid/cef368960212808351dfcd1e65487b8683806257/info/tracks_train_info.mat -------------------------------------------------------------------------------- /main_video_person_reid.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, absolute_import 2 | import os 3 | import sys 4 | import time 5 | import datetime 6 | import argparse 7 | import os.path as osp 8 | import numpy as np 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.backends.cudnn as cudnn 13 | from torch.utils.data import DataLoader 14 | from torch.autograd import Variable 15 | from torch.optim import lr_scheduler 16 | 17 | import data_manager 18 | from video_loader import VideoDataset 19 | import transforms as T 20 | import models 21 | from models import resnet3d 22 | from losses import CrossEntropyLabelSmooth, TripletLoss 23 | from utils import AverageMeter, Logger, save_checkpoint 24 | from eval_metrics import evaluate 25 | from samplers import RandomIdentitySampler 26 | 27 | # 命令行参数 28 | parser = argparse.ArgumentParser(description='Train video model with cross entropy loss') 29 | # Datasets 数据集 30 | parser.add_argument('-d', '--dataset', type=str, default='mars', 31 | choices=data_manager.get_names()) 32 | parser.add_argument('-j', '--workers', default=4, type=int, 33 | help="number of data loading workers (default: 4)") 34 | parser.add_argument('--height', type=int, default=224, 35 | help="height of an image (default: 224)") 36 | parser.add_argument('--width', type=int, default=112, 37 | help="width of an image (default: 
112)") 38 | parser.add_argument('--seq-len', type=int, default=4, help="number of images to sample in a tracklet") 39 | # Optimization options 优化选择 40 | parser.add_argument('--max-epoch', default=800, type=int, 41 | help="maximum epochs to run") 42 | parser.add_argument('--start-epoch', default=0, type=int, 43 | help="manual epoch number (useful on restarts)") 44 | parser.add_argument('--train-batch', default=32, type=int, 45 | help="train batch size") 46 | parser.add_argument('--test-batch', default=1, type=int, help="has to be 1") 47 | parser.add_argument('--lr', '--learning-rate', default=0.0003, type=float, 48 | help="initial learning rate, use 0.0001 for rnn, use 0.0003 for pooling and attention") 49 | parser.add_argument('--stepsize', default=200, type=int, 50 | help="stepsize to decay learning rate (>0 means this is enabled)") 51 | parser.add_argument('--gamma', default=0.1, type=float, 52 | help="learning rate decay") 53 | parser.add_argument('--weight-decay', default=5e-04, type=float, 54 | help="weight decay (default: 5e-04)") 55 | parser.add_argument('--margin', type=float, default=0.3, help="margin for triplet loss") 56 | parser.add_argument('--num-instances', type=int, default=4, 57 | help="number of instances per identity") 58 | parser.add_argument('--htri-only', action='store_true', default=False, 59 | help="if this is True, only htri loss is used in training") 60 | # Architecture 网络结构 61 | parser.add_argument('-a', '--arch', type=str, default='resnet50tp', help="resnet503d, resnet50tp, resnet50ta, " 62 | "resnetrnn") 63 | parser.add_argument('--pool', type=str, default='avg', choices=['avg', 'max']) 64 | 65 | # Miscs 66 | parser.add_argument('--print-freq', type=int, default=80, help="print frequency") 67 | parser.add_argument('--seed', type=int, default=1, help="manual seed") 68 | parser.add_argument('--pretrained-model', type=str, 69 | default='/home/ying/Desktop/Video-Person-ReID-master/resnet-50-kinetics.pth', 70 | help='need to be set for resnet3d models') 71 | parser.add_argument('--evaluate', action='store_true', help="evaluation only") 72 | parser.add_argument('--eval-step', type=int, default=50, 73 | help="run evaluation for every N epochs (set to -1 to test after training)") 74 | parser.add_argument('--save-dir', type=str, default='log') 75 | parser.add_argument('--use-cpu', action='store_true', help="use cpu") 76 | parser.add_argument('--gpu-devices', default='0', type=str, help='gpu device ids for CUDA_VISIBLE_DEVICES') 77 | 78 | # 解析命令行参数 79 | args = parser.parse_args() 80 | 81 | 82 | def main(): 83 | torch.manual_seed(args.seed) # 为CPU设置种子用于生成随机数,以使得结果是确定的 84 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices # 在代码中指定需要使用的GPU 85 | use_gpu = torch.cuda.is_available() # 查看当前环境是否支持CUDA,支持返回true,不支持返回false 86 | if args.use_cpu: 87 | use_gpu = False 88 | 89 | if not args.evaluate: # 如果不是评估,那就是训练,输出训练日志;否则输出测试日志。 90 | sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt')) 91 | else: 92 | sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt')) 93 | print("==========\nArgs:{}\n==========".format(args)) # 打印所有参数 94 | 95 | if use_gpu: # 如果使用gpu,输出选定的gpu, 96 | print("Currently using GPU {}".format(args.gpu_devices)) 97 | cudnn.benchmark = True # 在程序刚开始加这条语句可以提升一点训练速度,没什么额外开销 98 | torch.cuda.manual_seed_all(args.seed) # 为GPU设置种子用于生成随机数,以使得结果是确定的 99 | else: 100 | print("Currently using CPU (GPU is highly recommended)") 101 | 102 | print("Initializing dataset {}".format(args.dataset)) 103 | dataset = data_manager.init_dataset(name=args.dataset) # 
初始化数据集,从data_manager.py文件中加载。 104 | 105 | # import transforms as T. 106 | # T.Compose=一起组合几个变换。 107 | transform_train = T.Compose([ 108 | T.Random2DTranslation(args.height, args.width), # 以一个概率进行,首先将图像大小增加到(1 + 1/8),然后执行随机裁剪。 109 | T.RandomHorizontalFlip(), # 以给定的概率(0.5)随机水平翻转给定的PIL图像。 110 | T.ToTensor(), # 将``PIL Image``或``numpy.ndarray``转换为张量。 111 | T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # 用平均值和标准偏差归一化张量图像。 112 | # input[channel] = (input[channel] - mean[channel]) / std[channel] 113 | ]) 114 | 115 | transform_test = T.Compose([ 116 | T.Resize((args.height, args.width)), # 将输入PIL图像的大小调整为给定大小。 117 | T.ToTensor(), 118 | T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 119 | ]) 120 | 121 | # 设置pin_memory=True,则意味着生成的Tensor数据最开始是属于内存中的锁页内存,这样将内存的Tensor转义到GPU的显存就会更快一些。 122 | pin_memory = True if use_gpu else False 123 | 124 | # DataLoader数据加载器。 组合数据集和采样器,并在数据集上提供单进程或多进程迭代器。 125 | trainloader = DataLoader( 126 | # VideoDataset:基于视频的person reid的数据集.(训练的数据集,视频序列长度,采样方法:随机,进行数据增强) 127 | VideoDataset(dataset.train, seq_len=args.seq_len, sample='random', transform=transform_train), 128 | # 随机抽样N个身份,然后对于每个身份,随机抽样K个实例,因此批量大小为N * K. 129 | sampler=RandomIdentitySampler(dataset.train, num_instances=args.num_instances), 130 | batch_size=args.train_batch, # 训练的批次大小 131 | num_workers=args.workers, # 多进程的数目 132 | pin_memory=pin_memory, 133 | drop_last=True, 134 | ) # 如果数据集大小不能被批量大小整除,则设置为“True”以删除最后一个不完整的批次。 135 | 136 | queryloader = DataLoader( 137 | VideoDataset(dataset.query, seq_len=args.seq_len, sample='dense', transform=transform_test), 138 | batch_size=args.test_batch, 139 | shuffle=False, # 设置为“True”以使数据在每个时期重新洗牌(默认值:False)。 140 | num_workers=args.workers, 141 | pin_memory=pin_memory, 142 | drop_last=False, # 如果“False”和数据集的大小不能被批量大小整除,那么最后一批将更小。 143 | ) 144 | 145 | galleryloader = DataLoader( 146 | VideoDataset(dataset.gallery, seq_len=args.seq_len, sample='dense', transform=transform_test), 147 | batch_size=args.test_batch, shuffle=False, num_workers=args.workers, 148 | pin_memory=pin_memory, drop_last=False, 149 | ) 150 | 151 | print("Initializing model: {}".format(args.arch)) # 模型的初始化 152 | 153 | if args.arch == 'resnet503d': 154 | model = resnet3d.resnet50(num_classes=dataset.num_train_pids, sample_width=args.width, 155 | sample_height=args.height, sample_duration=args.seq_len) 156 | # 如果不存在预训练模型,则报错 157 | if not os.path.exists(args.pretrained_model): 158 | raise IOError("Can't find pretrained model: {}".format(args.pretrained_model)) 159 | # 导入预训练的模型 160 | print("Loading checkpoint from '{}'".format(args.pretrained_model)) 161 | checkpoint = torch.load(args.pretrained_model) 162 | state_dict = {} # 状态字典,从checkpoint文件中加载参数 163 | for key in checkpoint['state_dict']: 164 | if 'fc' in key: 165 | continue 166 | state_dict[key.partition("module.")[2]] = checkpoint['state_dict'][key] 167 | model.load_state_dict(state_dict, strict=False) 168 | else: 169 | model = models.init_model(name=args.arch, num_classes=dataset.num_train_pids, loss={'xent', 'htri'}) 170 | print("Model size: {:.5f}M".format(sum(p.numel() for p in model.parameters())/1000000.0)) 171 | 172 | # 损失函数:xent:softmax交叉熵损失函数。htri:三元组损失函数。 173 | criterion_xent = CrossEntropyLabelSmooth(num_classes=dataset.num_train_pids, use_gpu=use_gpu) 174 | criterion_htri = TripletLoss(margin=args.margin) 175 | # 优化器:adam 176 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) 177 | # stepsize,逐步减少学习率(> 0表示已启用) 178 | if args.stepsize > 0: 179 | scheduler = 
lr_scheduler.StepLR(optimizer, step_size=args.stepsize, gamma=args.gamma) 180 | # lr_scheduler学习率计划,StepLR,将每个参数组的学习速率设置为每个步长时期由gamma衰减的初始lr. 181 | start_epoch = args.start_epoch # 手动时期编号(重启时有用) 182 | 183 | if use_gpu: 184 | model = nn.DataParallel(model).cuda() # 多GPU训练 185 | # DataParallel是torch.nn下的一个类,需要制定的参数是module(可以多gpu运行的类函数)和input(数据集) 186 | 187 | if args.evaluate: # 这里的evaluate没有意义,应该添加代码导入保存的checkpoint,再test 188 | print("Evaluate only") # 进行评估 189 | test(model, queryloader, galleryloader, args.pool, use_gpu) 190 | return 191 | 192 | start_time = time.time() # 开始的时间 193 | best_rank1 = -np.inf # 初始化,负无穷 194 | if args.arch == 'resnet503d': # 如果模型为resnet503d, 195 | torch.backends.cudnn.benchmark = False 196 | 197 | for epoch in range(start_epoch, args.max_epoch): # epoch,从开始到最大,进行训练。 198 | print("==> Epoch {}/{}".format(epoch+1, args.max_epoch)) 199 | 200 | train(model, criterion_xent, criterion_htri, optimizer, trainloader, use_gpu) 201 | 202 | if args.stepsize > 0: 203 | scheduler.step() 204 | 205 | # 如果运行一次评估的需要的epoch数大于0,并且当前epoch+1能整除这个epoch数,或者等于最大epoch数。那么就进行一次评估。 206 | if args.eval_step > 0 and (epoch+1) % args.eval_step == 0 or (epoch+1) == args.max_epoch: 207 | print("==> Test") 208 | rank1 = test(model, queryloader, galleryloader, args.pool, use_gpu) 209 | is_best = rank1 > best_rank1 # 比较,大于则返回true,否则返回false。 210 | if is_best: 211 | best_rank1 = rank1 212 | 213 | if use_gpu: 214 | state_dict = model.module.state_dict() 215 | # 函数static_dict()用于返回包含模块所有状态的字典,包括参数和缓存。 216 | else: 217 | state_dict = model.state_dict() 218 | # 保存checkpoint文件 219 | save_checkpoint({ 220 | 'state_dict': state_dict, 221 | 'rank1': rank1, 222 | 'epoch': epoch, 223 | }, is_best, osp.join(args.save_dir, 'checkpoint_ep' + str(epoch+1) + '.pth.tar')) 224 | # 经过的时间 225 | elapsed = round(time.time() - start_time) # round() 方法返回浮点数x的四舍五入值 226 | elapsed = str(datetime.timedelta(seconds=elapsed)) # 对象代表两个时间之间的时间差, 227 | print("Finished. 
Total elapsed time (h:m:s): {}".format(elapsed)) 228 | 229 | 230 | def train(model, criterion_xent, criterion_htri, optimizer, trainloader, use_gpu): 231 | 232 | model.train() # 选择训练的数据集 233 | losses = AverageMeter() # 计算和保存当前值和平均值。 234 | 235 | for batch_idx, (imgs, pids, _) in enumerate(trainloader): # trainloader,121行, 236 | # 从trainloader中获得批次的索引,图像和行人的id。 237 | if use_gpu: 238 | imgs, pids = imgs.cuda(), pids.cuda() # 将数据转到gpu上 239 | imgs, pids = Variable(imgs), Variable(pids) # 将imgs, pids装进Variable中 240 | outputs, features = model(imgs) # 喂给模型图片 241 | if args.htri_only: 242 | # only use hard triplet loss to train the network,只使用三元组损失训练网络 243 | loss = criterion_htri(features, pids) 244 | else: 245 | # combine hard triplet loss with cross entropy loss 三元组损失加交叉熵损失函数 246 | xent_loss = criterion_xent(outputs, pids) 247 | htri_loss = criterion_htri(features, pids) 248 | loss = xent_loss + htri_loss 249 | optimizer.zero_grad() # 将所有参数的梯度置为0 250 | loss.backward() # 梯度反向传播 251 | optimizer.step() # 进行adam优化 252 | losses.update(loss.data[0], pids.size(0)) # 参数更新 253 | 254 | if (batch_idx+1) % args.print_freq == 0: # 输出的频率,多少批次输出一次 255 | print("Batch {}/{}\t Loss {:.6f} ({:.6f})".format(batch_idx+1, len(trainloader), losses.val, losses.avg)) 256 | 257 | 258 | def test(model, queryloader, galleryloader, pool, use_gpu, ranks=[1, 5, 10, 20]): 259 | 260 | model.eval() # 模型的评估 261 | 262 | qf, q_pids, q_camids = [], [], [] 263 | for batch_idx, (imgs, pids, camids) in enumerate(queryloader): 264 | if use_gpu: 265 | imgs = imgs.cuda() 266 | imgs = Variable(imgs, volatile=True) # 将imgs装进Variable中, 267 | # volatile=True的节点不会求导,即使requires_grad=True,也不会进行反向传播,对于不需要反向传播的情景(inference,测试推断), 268 | # 该参数可以实现一定速度的提升,并节省一半的显存,因为其不需要保存梯度 269 | 270 | b, n, s, c, h, w = imgs.size() # b=1, n=batchs, s=图片的长度 271 | assert(b == 1) # 断言函数 272 | imgs = imgs.view(b*n, s, c, h, w) 273 | features = model(imgs) # 喂给模型图片,获得特征 274 | features = features.view(n, -1) # view()函数作用是将一个多行的Tensor,拼接成一行。 275 | features = torch.mean(features, 0) # 取平均值 276 | features = features.data.cpu() 277 | qf.append(features) # 向列表尾部追加一个新元素,序列特征 278 | q_pids.extend(pids) # 向列表尾部追加一个列表,人的身份person id 279 | q_camids.extend(camids) # 摄像机的id 280 | qf = torch.stack(qf) # 堆叠 281 | q_pids = np.asarray(q_pids) # 将列表转换为数组 282 | q_camids = np.asarray(q_camids) 283 | 284 | print("Extracted features for query set, obtained {}-by-{} matrix".format(qf.size(0), qf.size(1))) 285 | 286 | gf, g_pids, g_camids = [], [], [] # 图库,gallery 287 | for batch_idx, (imgs, pids, camids) in enumerate(galleryloader): 288 | if use_gpu: 289 | imgs = imgs.cuda() 290 | imgs = Variable(imgs, volatile=True) 291 | b, n, s, c, h, w = imgs.size() 292 | imgs = imgs.view(b*n, s, c, h, w) 293 | assert(b == 1) 294 | features = model(imgs) 295 | features = features.view(n, -1) 296 | if pool == 'avg': # 采用平均池化还是最大池化 297 | features = torch.mean(features, 0) 298 | else: 299 | features, _ = torch.max(features, 0) 300 | features = features.data.cpu() 301 | gf.append(features) 302 | g_pids.extend(pids) 303 | g_camids.extend(camids) 304 | gf = torch.stack(gf) 305 | g_pids = np.asarray(g_pids) 306 | g_camids = np.asarray(g_camids) 307 | 308 | print("Extracted features for gallery set, obtained {}-by-{} matrix".format(gf.size(0), gf.size(1))) 309 | print("Computing distance matrix") 310 | # 计算距离矩阵 311 | m, n = [gf.size(0), qf.size(0)] # 矩阵的行 312 | distmat = torch.pow(qf, 2).sum(dim=1, keepdim=True).expand(m, n) + \ 313 | torch.pow(gf, 2).sum(dim=1, keepdim=True).expand(n, m).t() 314 | distmat.addmm_(1, 
-2, qf, gf.t()) 315 | distmat = distmat.numpy() 316 | 317 | print("Computing CMC and mAP") 318 | cmc, mAP = evaluate(distmat, q_pids, g_pids, q_camids, g_camids) 319 | 320 | print("Results ----------") 321 | print("mAP: {:.1%}".format(mAP)) 322 | print("CMC curve") 323 | for r in ranks: 324 | print("Rank-{:<3}: {:.1%}".format(r, cmc[r-1])) 325 | print("------------------") 326 | 327 | return cmc[0] 328 | 329 | 330 | if __name__ == '__main__': 331 | main() 332 | -------------------------------------------------------------------------------- /models/ResNet.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from torch.autograd import Variable 7 | import torchvision 8 | 9 | __all__ = ['ResNet50TP', 'ResNet50TA', 'ResNet50RNN'] 10 | # 用 __all__ 在模块级别暴露接口,1.提供了哪些是公开接口的约定 11 | # 2.控制 from xxx import * 的行为,import * 就只会导入 __all__ 列出的成员。 12 | # 3.list类型,以字面量的形式显式写出来.同时应该写在所有 import 语句下面 13 | 14 | 15 | class ResNet50TP(nn.Module): # 定义时间池化网络模型 16 | def __init__(self, num_classes, loss={'xent'}, **kwargs): # **kwargs表示的就是形参中按照关键字传值把多余的传值以字典的方式呈现 17 | super(ResNet50TP, self).__init__() 18 | # 首先找到ResNet50TP的父类(nn.Module),然后把类ResNet50TP的对象转换为父类的对象 19 | self.loss = loss # 损失函数为标签平滑正则化交叉熵损失函数 20 | resnet50 = torchvision.models.resnet50(pretrained=True) # 调用封装好的resnet50模型, 21 | # pretrained=True 调用一个在imagenet上预训练过的模型 22 | self.base = nn.Sequential(*list(resnet50.children())[:-2]) # 快速构建基本网络,选择resnet50的子网络,去掉原模型的最后两层 23 | # nn.Sequential是一个Sequential容器,模块将按照构造函数中传递的顺序添加到模块中 24 | self.feat_dim = 2048 # 输出的特征维度为2048 25 | self.classifier = nn.Linear(self.feat_dim, num_classes) # 分类器 26 | # nn.Linear(x,y)一种线性变换y = Ax + b 27 | 28 | def forward(self, x): # 前向传播函数定义,输入为x,5维的tensor (batch_size,seq_len,channels,width,height,) 29 | b = x.size(0) # batch_size 30 | t = x.size(1) # seq_len 31 | x = x.view(b*t, x.size(2), x.size(3), x.size(4)) # b*t= 32 × 4 =128 图片总数 32 | # view()会将原有数据重新分配为一个新的张量 33 | x = self.base(x) # x输入到基本网络中 34 | x = F.avg_pool2d(x, x.size()[2:]) # avg_pool2d(x, x.size(2), x.size(3), x.size(4)) 35 | # 2d平均池化操作,input tensor (minibatch x in_channels x iH x iW) 36 | x = x.view(b, t, -1) # -1 表示维度从其他维度推断出来的 37 | # 将数据重新分配 38 | x = x.permute(0, 2, 1) 39 | # 调整数据的维度,将1,2维数据互换 40 | f = F.avg_pool1d(x, t) # 1d的平均池化 41 | f = f.view(b, self.feat_dim) # b为行人的身份数,也是矩阵的行数,矩阵有2048列 42 | if not self.training: # 如果是测试,返回特征向量 43 | return f 44 | y = self.classifier(f) # 返回分类器输出结果 45 | 46 | if self.loss == {'xent'}: # 如果损失函数是标签平滑正则化softmax交叉熵损失函数 47 | return y 48 | elif self.loss == {'xent', 'htri'}: # 损失函数是交叉熵损失函数+难样本挖掘三元组损失 49 | return y, f 50 | elif self.loss == {'cent'}: # 中心损失 51 | return y, f 52 | else: 53 | raise KeyError("Unsupported loss: {}".format(self.loss)) 54 | 55 | 56 | class ResNet50TA(nn.Module): # 时间注意力模型 57 | def __init__(self, num_classes, loss={'xent'}, **kwargs): 58 | super(ResNet50TA, self).__init__() 59 | self.loss = loss 60 | resnet50 = torchvision.models.resnet50(pretrained=True) 61 | self.base = nn.Sequential(*list(resnet50.children())[:-2]) 62 | self.att_gen = 'softmax' # method for attention generation: softmax or sigmoid 63 | # 注意力的生成,通过softmax or sigmoid 64 | self.feat_dim = 2048 # feature dimension 特征维度为2048 65 | self.middle_dim = 256 # middle layer dimension 中间层的维度为256 66 | self.classifier = nn.Linear(self.feat_dim, num_classes) # 线性输出 67 | self.attention_conv = nn.Conv2d(self.feat_dim, self.middle_dim, (7, 4)) # 
/models/ResNet.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | 
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 | from torch.autograd import Variable
7 | import torchvision
8 | 
9 | __all__ = ['ResNet50TP', 'ResNet50TA', 'ResNet50RNN']
10 | # __all__ declares the module's public interface: 1. it documents which names are meant to be public;
11 | # 2. it controls `from xxx import *`, which then imports only the members listed in __all__;
12 | # 3. it is a list, written out explicitly as a literal, and conventionally placed below all import statements
13 | 
14 | 
15 | class ResNet50TP(nn.Module):  # the temporal-pooling model
16 |     def __init__(self, num_classes, loss={'xent'}, **kwargs):  # **kwargs collects any extra keyword arguments into a dict
17 |         super(ResNet50TP, self).__init__()
18 |         # initialize the parent class (nn.Module) before registering any submodules
19 |         self.loss = loss  # the loss configuration (default: label-smoothing cross entropy)
20 |         resnet50 = torchvision.models.resnet50(pretrained=True)  # the stock torchvision ResNet-50;
21 |         # pretrained=True loads weights pretrained on ImageNet
22 |         self.base = nn.Sequential(*list(resnet50.children())[:-2])  # the backbone: ResNet-50 minus its last two layers (global pooling and fc)
23 |         # nn.Sequential is a container that chains modules in the order passed to its constructor
24 |         self.feat_dim = 2048  # output feature dimension
25 |         self.classifier = nn.Linear(self.feat_dim, num_classes)  # the classifier
26 |         # nn.Linear(x, y) is the affine map y = Ax + b
27 | 
28 |     def forward(self, x):  # forward pass; x is a 5-d tensor (batch_size, seq_len, channels, height, width)
29 |         b = x.size(0)  # batch_size
30 |         t = x.size(1)  # seq_len
31 |         x = x.view(b*t, x.size(2), x.size(3), x.size(4))  # b*t = 32 * 4 = 128 images in total
32 |         # view() returns a new tensor over the same data with the requested shape
33 |         x = self.base(x)  # run x through the backbone
34 |         x = F.avg_pool2d(x, x.size()[2:])  # global pooling: the kernel covers the whole (H, W) feature map
35 |         # 2-d average pooling; the input tensor is (minibatch, in_channels, iH, iW)
36 |         x = x.view(b, t, -1)  # -1 lets this dimension be inferred from the others
37 |         # regroup the frame features per tracklet
38 |         x = x.permute(0, 2, 1)
39 |         # swap dims 1 and 2, giving (b, feat_dim, t)
40 |         f = F.avg_pool1d(x, t)  # 1-d average pooling over the t frames (the temporal pooling)
41 |         f = f.view(b, self.feat_dim)  # one row per tracklet in the batch, 2048 columns
42 |         if not self.training:  # at test time, return the feature vector
43 |             return f
44 |         y = self.classifier(f)  # classification logits
45 | 
46 |         if self.loss == {'xent'}:  # label-smoothing softmax cross entropy only
47 |             return y
48 |         elif self.loss == {'xent', 'htri'}:  # cross entropy + hard-mining triplet loss
49 |             return y, f
50 |         elif self.loss == {'cent'}:  # center loss
51 |             return y, f
52 |         else:
53 |             raise KeyError("Unsupported loss: {}".format(self.loss))
54 | 
55 | 
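# Shape walk-through of ResNet50TP.forward with the default training batch
# (b=32 tracklets, t=4 frames, 224x112 RGB inputs) -- illustrative numbers only,
# assuming the default command-line settings:
#
#     x: (32, 4, 3, 224, 112)   -> view               -> (128, 3, 224, 112)
#        -> self.base           -> (128, 2048, 7, 4)
#        -> avg_pool2d + view   -> (32, 4, 2048)
#        -> permute             -> (32, 2048, 4)
#        -> avg_pool1d(t=4)     -> (32, 2048, 1)      -> f: (32, 2048)
#
# i.e. temporal pooling is simply an unweighted mean of the t frame features.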
56 | class ResNet50TA(nn.Module):  # the temporal-attention model
57 |     def __init__(self, num_classes, loss={'xent'}, **kwargs):
58 |         super(ResNet50TA, self).__init__()
59 |         self.loss = loss
60 |         resnet50 = torchvision.models.resnet50(pretrained=True)
61 |         self.base = nn.Sequential(*list(resnet50.children())[:-2])
62 |         self.att_gen = 'softmax'  # method for attention generation: softmax or sigmoid
63 |         # how the raw attention scores are turned into weights
64 |         self.feat_dim = 2048  # feature dimension
65 |         self.middle_dim = 256  # middle layer dimension
66 |         self.classifier = nn.Linear(self.feat_dim, num_classes)  # linear classifier
67 |         self.attention_conv = nn.Conv2d(self.feat_dim, self.middle_dim, (7, 4))  # 2048 input channels, 256 filters
68 |         # the (7, 4) kernel corresponds to the 224x112 input image size: it spans the backbone's whole output map
69 |         self.attention_tconv = nn.Conv1d(self.middle_dim, 1, 3, padding=1)
70 | 
71 |     def forward(self, x):
72 |         b = x.size(0)  # b = 32
73 |         t = x.size(1)  # t = 4
74 |         x = x.view(b*t, x.size(2), x.size(3), x.size(4))  # 128x3x224x112
75 |         x = self.base(x)  # run the images through the backbone: 128x2048x7x4
76 |         a = F.relu(self.attention_conv(x))  # spatial attention conv + ReLU: 128x256x1x1
77 |         a = a.view(b, t, self.middle_dim)  # regroup per tracklet: (b, t, 256) => 32x4x256
78 |         a = a.permute(0, 2, 1)  # swap dims 1 and 2: 32x256x4
79 |         a = F.relu(self.attention_tconv(a))  # temporal conv + ReLU: 32x1x4
80 |         a = a.view(b, t)  # one score per frame: (b, t) => 32x4
81 |         x = F.avg_pool2d(x, x.size()[2:])  # global 2-d average pooling: 128x2048x1x1
82 |         # a holds the attention scores
83 |         if self.att_gen == 'softmax':
84 |             a = F.softmax(a, dim=1)  # softmax along dim 1, i.e. over the t frames
85 |         elif self.att_gen == 'sigmoid':
86 |             a = F.sigmoid(a)
87 |             a = F.normalize(a, p=1, dim=1)  # L1-normalize (p=1) along dim 1 so the frame weights sum to 1
88 |         else:
89 |             raise KeyError("Unsupported attention generation function: {}".format(self.att_gen))
90 |         x = x.view(b, t, -1)  # regroup the frame features: 32x4x2048
91 |         a = torch.unsqueeze(a, -1)  # append a trailing dim of size 1: (b, t, 1) => 32x4x1
92 |         a = a.expand(b, t, self.feat_dim)  # broadcast the weights across the feature dim: (b, t, 2048) => 32x4x2048
93 |         att_x = torch.mul(x, a)  # element-wise product of the frame features and their attention weights: 32x4x2048
94 |         att_x = torch.sum(att_x, 1)  # sum over the time dimension: 32x2048
95 | 
96 |         f = att_x.view(b, self.feat_dim)  # the attended tracklet feature: (b, 2048) => 32x2048
97 |         if not self.training:
98 |             return f
99 |         y = self.classifier(f)  # classify the feature vector f: 32x625 logits
100 | 
101 |         if self.loss == {'xent'}:
102 |             return y
103 |         elif self.loss == {'xent', 'htri'}:
104 |             return y, f
105 |         elif self.loss == {'cent'}:
106 |             return y, f
107 |         else:
108 |             raise KeyError("Unsupported loss: {}".format(self.loss))
109 | 
110 | 
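# A tiny numeric illustration of the weighting above, with hypothetical values:
# for one tracklet with t=4 frame scores [2.0, 1.0, 1.0, 0.0], softmax gives
# weights of roughly [0.53, 0.20, 0.20, 0.07], and the tracklet feature is the
# weighted sum of the 4 frame features:
#
#     import torch
#     import torch.nn.functional as F
#     a = F.softmax(torch.Tensor([[2.0, 1.0, 1.0, 0.0]]), dim=1)  # (1, 4) weights
#     x = torch.randn(1, 4, 2048)                                 # 4 dummy frame features
#     f = (x * a.unsqueeze(-1).expand(1, 4, 2048)).sum(1)         # (1, 2048)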
111 | class ResNet50RNN(nn.Module):
112 |     def __init__(self, num_classes, loss={'xent'}, **kwargs):
113 |         super(ResNet50RNN, self).__init__()
114 |         self.loss = loss
115 |         resnet50 = torchvision.models.resnet50(pretrained=True)
116 |         self.base = nn.Sequential(*list(resnet50.children())[:-2])
117 |         self.hidden_dim = 512
118 |         self.feat_dim = 2048
119 |         self.classifier = nn.Linear(self.hidden_dim, num_classes)  # output layer
120 |         self.lstm = nn.LSTM(
121 |             input_size=self.feat_dim,  # feature dimension of each frame
122 |             hidden_size=self.hidden_dim,  # number of LSTM hidden units
123 |             num_layers=1,  # number of stacked LSTM layers
124 |             batch_first=True  # input and output tensors have batch_size as their first dimension
125 |         )
126 | 
127 |     def forward(self, x):
128 |         b = x.size(0)
129 |         t = x.size(1)
130 |         x = x.view(b*t, x.size(2), x.size(3), x.size(4))
131 |         x = self.base(x)  # run x through the backbone
132 |         x = F.avg_pool2d(x, x.size()[2:])  # global 2-d average pooling
133 |         x = x.view(b, t, -1)  # reshape to the LSTM's expected input (batch, time_step, input_size)
134 |         # besides the outputs, the LSTM returns its final hidden state h_n and cell state h_c
135 |         output, (h_n, h_c) = self.lstm(x)  # output has shape (batch, time_step, output_size)
136 |         output = output.permute(0, 2, 1)
137 |         f = F.avg_pool1d(output, t)  # 1-d average pooling over the time steps
138 |         f = f.view(b, self.hidden_dim)  # reshape to (b, 512)
139 |         if not self.training:
140 |             return f
141 |         y = self.classifier(f)
142 | 
143 |         if self.loss == {'xent'}:
144 |             return y
145 |         elif self.loss == {'xent', 'htri'}:
146 |             return y, f
147 |         elif self.loss == {'cent'}:
148 |             return y, f
149 |         else:
150 |             raise KeyError("Unsupported loss: {}".format(self.loss))
151 | 
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | 
3 | from .ResNet import *  # import the ResNet module's public names so they can be used directly here
4 | # `import *` brings in everything the module exports, i.e. exactly the members listed in its __all__
5 | __factory = {
6 |     'resnet50tp': ResNet50TP,
7 |     'resnet50ta': ResNet50TA,
8 |     'resnet50rnn': ResNet50RNN,
9 | }  # a dict mapping model names to model classes
10 | 
11 | 
12 | def get_names():  # return the available model names (the keys)
13 |     return __factory.keys()
14 | 
15 | 
16 | def init_model(name, *args, **kwargs):  # initialize a model by name
17 |     if name not in __factory.keys():  # unknown names raise KeyError: the mapping has no such key
18 |         raise KeyError("Unknown model: {}".format(name))
19 |     return __factory[name](*args, **kwargs)  # look up the class registered under `name` and instantiate it with the given args
20 | 
--------------------------------------------------------------------------------
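The __factory dict above is a small registry: command-line strings map to model classes, and init_model() forwards any extra arguments to the chosen class. A minimal usage sketch (625 matches the MARS training identity count noted in the ResNet50TA comments; the loss set must be one of the combinations the models understand):

    import models

    print(models.get_names())  # ['resnet50tp', 'resnet50ta', 'resnet50rnn']
    model = models.init_model(name='resnet50ta', num_classes=625, loss={'xent', 'htri'})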
/models/resnet3d.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import math
6 | from functools import partial
7 | 
8 | __all__ = [
9 |     'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
10 |     'resnet152', 'resnet200'
11 | ]
12 | 
13 | 
14 | def conv3x3x3(in_planes, out_planes, stride=1):
15 |     # 3x3x3 convolution with padding; see https://blog.csdn.net/weicao1990/article/details/80283443
16 |     # convolves d frames of h*w RGB images: for an input of size (d, h, w) with c channels and one f*f*f*c filter,
17 |     # the unpadded output would be (d-f+1)*(h-f+1)*(w-f+1)*1
18 |     return nn.Conv3d(
19 |         in_planes,  # input planes: the number of channels of the input signal
20 |         out_planes,  # output planes: the number of channels the convolution produces
21 |         kernel_size=3,  # 3x3x3 kernel
22 |         stride=stride,  # convolution stride (default 1)
23 |         padding=1,  # pad each border of the input with one layer of zeros
24 |         bias=False  # no bias term
25 |     )
26 | 
27 | 
28 | def downsample_basic_block(x, planes, stride):  # downsampling shortcut: one average pooling plus zero channel padding
29 |     out = F.avg_pool3d(x, kernel_size=1, stride=stride)  # 3-d average pooling; input (N, C, D_in, H_in, W_in)
30 |     # with kernel size 1, D_out = floor((D_in - 1) / stride) + 1, and likewise for H_out and W_out; output (N, C, D_out, H_out, W_out)
31 |     zero_pads = torch.Tensor(
32 |         out.size(0), planes - out.size(1), out.size(2), out.size(3),
33 |         out.size(4)).zero_()  # a zero-filled tensor covering the missing planes - C channels
34 |     if isinstance(out.data, torch.cuda.FloatTensor):  # isinstance(object, classinfo)
35 |         # returns True if the object's type matches classinfo, False otherwise
36 |         zero_pads = zero_pads.cuda()  # move the padding to the GPU as well
37 | 
38 |     out = Variable(torch.cat([out.data, zero_pads], dim=1))  # concatenate out.data and zero_pads along the channel dim
39 | 
40 |     return out
41 | 
42 | 
43 | class BasicBlock(nn.Module):  # the basic ResNet block: two 3x3x3 convolutions
44 |     expansion = 1
45 | 
46 |     def __init__(self, inplanes, planes, stride=1, downsample=None):
47 |         super(BasicBlock, self).__init__()
48 |         self.conv1 = conv3x3x3(inplanes, planes, stride)  # first 3x3x3 convolution
49 |         self.bn1 = nn.BatchNorm3d(planes)  # batch normalization over a 5-d input,
50 |         # i.e. a mini-batch of 4-d samples of shape (N, C, D, H, W)
51 |         self.relu = nn.ReLU(inplace=True)  # ReLU(x) = max(0, x); inplace=True overwrites the input to save memory
52 | 
53 |         self.conv2 = conv3x3x3(planes, planes)  # second 3x3x3 convolution
54 |         self.bn2 = nn.BatchNorm3d(planes)
55 |         self.downsample = downsample  # optional downsampling on the skip connection
56 |         self.stride = stride
57 | 
58 |     def forward(self, x):  # forward pass of the plain residual block (no bottleneck)
59 |         residual = x  # the identity (residual) branch
60 | 
61 |         out = self.conv1(x)
62 |         out = self.bn1(out)
63 |         out = self.relu(out)
64 | 
65 |         out = self.conv2(out)
66 |         out = self.bn2(out)
67 | 
68 |         if self.downsample is not None:
69 |             residual = self.downsample(x)
70 | 
71 |         out += residual  # H(x) = F(x) + x
72 |         out = self.relu(out)
73 | 
74 |         return out
75 | 
76 | 
77 | class Bottleneck(nn.Module):  # the bottleneck residual block
78 |     expansion = 4
79 | 
80 |     def __init__(self, inplanes, planes, stride=1, downsample=None):  # e.g. planes = 64
81 |         super(Bottleneck, self).__init__()
82 |         self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
83 |         self.bn1 = nn.BatchNorm3d(planes)
84 | 
85 |         self.conv2 = nn.Conv3d(
86 |             planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
87 |         self.bn2 = nn.BatchNorm3d(planes)
88 | 
89 |         self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False)  # widens to planes * 4, e.g. 256
90 |         self.bn3 = nn.BatchNorm3d(planes * 4)
91 | 
92 |         self.relu = nn.ReLU(inplace=True)
93 |         self.downsample = downsample
94 |         self.stride = stride
95 | 
96 |     def forward(self, x):
97 |         residual = x
98 | 
99 |         out = self.conv1(x)
100 |         out = self.bn1(out)
101 |         out = self.relu(out)
102 | 
103 |         out = self.conv2(out)
104 |         out = self.bn2(out)
105 |         out = self.relu(out)
106 | 
107 |         out = self.conv3(out)
108 |         out = self.bn3(out)
109 | 
110 |         if self.downsample is not None:
111 |             residual = self.downsample(x)
112 | 
113 |         out += residual
114 |         out = self.relu(out)
115 | 
116 |         return out
117 | 
118 | 
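# Shape sketch of shortcut type 'A' (downsample_basic_block above), with
# hypothetical sizes: the stride-2 pooling halves D/H/W, then zero channels are
# concatenated until the target width is reached, keeping the shortcut parameter-free:
#
#     import torch
#     import torch.nn.functional as F
#     x = torch.randn(2, 64, 8, 56, 56)                # (N, C, D, H, W)
#     out = F.avg_pool3d(x, kernel_size=1, stride=2)   # -> (2, 64, 4, 28, 28)
#     pad = torch.zeros(2, 128 - 64, 4, 28, 28)        # the missing channels, all zero
#     out = torch.cat([out, pad], dim=1)               # -> (2, 128, 4, 28, 28)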
119 | class ResNet(nn.Module):
120 | 
121 |     def __init__(self,
122 |                  block,
123 |                  layers,
124 |                  sample_height,
125 |                  sample_width,
126 |                  sample_duration,  # duration of a sample: the number of frames d
127 |                  shortcut_type='B',
128 |                  num_classes=400):
129 |         self.inplanes = 64  # the running input channel count, starting at 64
130 |         super(ResNet, self).__init__()
131 |         self.conv1 = nn.Conv3d(  # part 1: the stem convolution
132 |             3,  # 3 input channels: RGB frames of shape (d, h, w)
133 |             64,  # 64 output channels
134 |             kernel_size=7,  # 7*7*7 kernel
135 |             stride=(1, 2, 2),  # stride: no temporal downsampling, spatial stride 2
136 |             padding=(3, 3, 3),  # zero-padding of depth 3 on every side
137 |             bias=False)
138 |         self.bn1 = nn.BatchNorm3d(64)
139 |         self.relu = nn.ReLU(inplace=True)
140 |         # part 2: max pooling with a 3*3*3 kernel, stride 2, padding 1
141 |         self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
142 | 
143 |         self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type)  # a stage of bottleneck residual blocks;
144 |         # block width 64, output 256 channels; builds layers[0] blocks (3 for ResNet-50)
145 | 
146 |         self.layer2 = self._make_layer(
147 |             block, 128, layers[1], shortcut_type, stride=2)
148 |         # block width 128, output 512 channels; builds layers[1] blocks (4 for ResNet-50)
149 | 
150 |         self.layer3 = self._make_layer(
151 |             block, 256, layers[2], shortcut_type, stride=2)
152 |         # block width 256, output 1024 channels; builds layers[2] blocks (6 for ResNet-50)
153 | 
154 |         self.layer4 = self._make_layer(
155 |             block, 512, layers[3], shortcut_type, stride=2)
156 |         # block width 512, output 2048 channels; builds layers[3] blocks (3 for ResNet-50)
157 | 
158 |         last_duration = int(math.ceil(sample_duration / 16.0))  # kernel size of the final average pooling,
159 |         last_height = int(math.ceil(sample_height / 32.0))  # derived from the input size and the network's total stride
160 |         last_width = int(math.ceil(sample_width / 32.0))
161 |         self.avgpool = nn.AvgPool3d(
162 |             (last_duration, last_height, last_width), stride=1)
163 | 
164 |         self.fc = nn.Linear(512 * block.expansion, num_classes)  # fully connected layer: the classifier
165 | 
166 |         for m in self.modules():  # iterate over every module in the network
167 |             if isinstance(m, nn.Conv3d):  # He (Kaiming) initialization for conv weights
168 |                 m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')
169 |             elif isinstance(m, nn.BatchNorm3d):
170 |                 m.weight.data.fill_(1)
171 |                 m.bias.data.zero_()
172 | 
173 |     def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
174 |         # block = the residual block class (e.g. Bottleneck); planes = the block width
175 |         # blocks = how many residual blocks this stage should contain
176 |         downsample = None
177 |         if stride != 1 or self.inplanes != planes * block.expansion:  # if the stride is not 1, or the channel counts differ,
178 |             if shortcut_type == 'A':  # the skip connection must downsample; type 'A' uses average pooling
179 |                 downsample = partial(  # partial(func, *fixed_args, **fixed_kwargs)
180 |                     downsample_basic_block,  # the function to wrap is partial()'s first argument;
181 |                     planes=planes * block.expansion,  # the remaining arguments are pre-bound, here as keyword arguments,
182 |                     stride=stride)  # yielding a new function: downsample_basic_block with planes and stride fixed
183 |             else:  # shortcut type 'B': one 1x1x1 convolution plus batch normalization
184 |                 downsample = nn.Sequential(
185 |                     nn.Conv3d(  # a 1*1*1 convolution up to planes * expansion channels
186 |                         self.inplanes,
187 |                         planes * block.expansion,  # e.g. 256
188 |                         kernel_size=1,
189 |                         stride=stride,
190 |                         bias=False),
191 |                     nn.BatchNorm3d(planes * block.expansion)  # batch normalization
192 |                 )
193 | 
194 |         layers = []  # the list of blocks making up this stage
195 | 
196 |         layers.append(block(self.inplanes, planes, stride, downsample))  # the first block carries
197 |         # the (possibly downsampling) skip connection
198 |         self.inplanes = planes * block.expansion  # input channels for the next block = width * 4
199 |         for i in range(1, blocks):  # build the remaining blocks of the stage
200 |             layers.append(block(self.inplanes, planes))  # from the second block on, the skip connection needs no downsampling
201 | 
202 |         return nn.Sequential(*layers)  # return the assembled stage
203 | 
204 |     def load_matched_state_dict(self, state_dict):  # load the matching parameters from a pretrained state dict
205 | 
206 |         own_state = self.state_dict()
207 |         for name, param in state_dict.items():
208 |             if name not in own_state:
209 |                 continue
210 |             # if isinstance(param, Parameter):
211 |             # backwards compatibility for serialized parameters
212 |             param = param.data
213 |             print("loading "+name)
214 |             own_state[name].copy_(param)
215 | 
216 |     def forward(self, x):  # forward pass of the whole 3-d ResNet
217 |         # default size is (b, s, c, w, h), s for seq_len, c for channel
218 |         # convert for 3d cnn, (b, c, s, w, h)
219 |         x = x.permute(0, 2, 1, 3, 4)
220 |         x = self.conv1(x)
221 |         x = self.bn1(x)
222 |         x = self.relu(x)
223 |         x = self.maxpool(x)
224 | 
225 |         x = self.layer1(x)
226 |         x = self.layer2(x)
227 |         x = self.layer3(x)
228 |         x = self.layer4(x)
229 |         x = self.avgpool(x)
230 |         x = x.view(x.size(0), -1)
231 |         y = self.fc(x)
232 | 
233 |         return y, x
234 | 
235 | 
236 | def get_fine_tuning_parameters(model, ft_begin_index):
237 |     if ft_begin_index == 0:
238 |         return model.parameters()
239 | 
240 |     ft_module_names = []
241 |     for i in range(ft_begin_index, 5):
242 |         ft_module_names.append('layer{}'.format(i))
243 |     ft_module_names.append('fc')
244 | 
245 |     parameters = []
246 |     for k, v in model.named_parameters():
247 |         for ft_module in ft_module_names:
248 |             if ft_module in k:
249 |                 parameters.append({'params': v})
250 |                 break
251 |         else:
252 |             parameters.append({'params': v, 'lr': 0.0})
253 | 
254 |     return parameters
255 | 
256 | 
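# get_fine_tuning_parameters above freezes everything below stage ft_begin_index
# by giving those parameter groups a learning rate of 0.0. A usage sketch with
# hypothetical optimizer settings (here only layer4 and fc would be trained):
#
#     model = resnet50(sample_height=224, sample_width=112, sample_duration=4, num_classes=625)
#     param_groups = get_fine_tuning_parameters(model, ft_begin_index=4)
#     optimizer = torch.optim.SGD(param_groups, lr=0.01, momentum=0.9)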
257 | def resnet10(**kwargs):
258 |     """Constructs a ResNet-10 model.
259 |     """
260 |     model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs)
261 |     return model
262 | 
263 | 
264 | def resnet18(**kwargs):
265 |     """Constructs a ResNet-18 model.
266 |     """
267 |     model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
268 |     return model
269 | 
270 | 
271 | def resnet34(**kwargs):
272 |     """Constructs a ResNet-34 model.
273 |     """
274 |     model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
275 |     return model
276 | 
277 | 
278 | def resnet50(**kwargs):
279 |     """Constructs a ResNet-50 model.
280 |     """
281 |     model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
282 |     return model
283 | 
284 | 
285 | def resnet101(**kwargs):
286 |     """Constructs a ResNet-101 model.
287 |     """
288 |     model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
289 |     return model
290 | 
291 | 
292 | def resnet152(**kwargs):
293 |     """Constructs a ResNet-152 model.
294 |     """
295 |     model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
296 |     return model
297 | 
298 | 
299 | def resnet200(**kwargs):
300 |     """Constructs a ResNet-200 model.
301 |     """
302 |     model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs)
303 |     return model
304 | 
305 | 
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | # @author = yuci
4 | # Date: 18-10-2 12:44 PM
5 | 
6 | 
7 | from __future__ import print_function, absolute_import
8 | import os
9 | import sys
10 | import argparse
11 | import os.path as osp
12 | import numpy as np
13 | 
14 | import torch
15 | import torch.nn as nn
16 | import torch.backends.cudnn as cudnn
17 | from torch.utils.data import DataLoader
18 | from torch.autograd import Variable
19 | 
20 | 
21 | import data_manager
22 | from video_loader import VideoDataset
23 | import transforms as T
24 | import models
25 | from models import resnet3d
26 | from utils import Logger, visualize_ranked_results
27 | from eval_metrics import evaluate
28 | 
29 | 
30 | parser = argparse.ArgumentParser(description='Train video model with cross entropy loss')
31 | # Datasets
32 | parser.add_argument('-d', '--dataset', type=str, default='mars',
33 |                     choices=data_manager.get_names())
34 | parser.add_argument('-j', '--workers', default=4, type=int,
35 |                     help="number of data loading workers (default: 4)")
36 | parser.add_argument('--height', type=int, default=224,
37 |                     help="height of an image (default: 224)")
38 | parser.add_argument('--width', type=int, default=112,
39 |                     help="width of an image (default: 112)")
40 | parser.add_argument('--seq-len', type=int, default=4, help="number of images to sample in a tracklet")
41 | # Optimization options
42 | parser.add_argument('--max-epoch', default=800, type=int,
43 |                     help="maximum epochs to run")
44 | parser.add_argument('--start-epoch', default=0, type=int,
45 |                     help="manual epoch number (useful on restarts)")
46 | parser.add_argument('--train-batch', default=32, type=int,
47 |                     help="train batch size")
48 | parser.add_argument('--test-batch', default=1, type=int, help="has to be 1")
49 | parser.add_argument('--lr', '--learning-rate', default=0.0003, type=float,
50 |                     help="initial learning rate, use 0.0001 for rnn, use 0.0003 for pooling and attention")
51 | parser.add_argument('--stepsize', default=200, type=int,
52 |                     help="stepsize to decay learning rate (>0 means this is enabled)")
53 | parser.add_argument('--gamma', default=0.1, type=float,
54 |                     help="learning rate decay")
55 | parser.add_argument('--weight-decay', default=5e-04, type=float,
56 |                     help="weight decay (default: 5e-04)")
57 | parser.add_argument('--margin', type=float, default=0.3, help="margin for triplet loss")
58 | parser.add_argument('--num-instances', type=int, default=4,
59 |                     help="number of instances per identity")
60 | parser.add_argument('--htri-only', action='store_true', default=False,
61 |                     help="if this is True, only htri loss is used in training")
62 | # Architecture
63 | parser.add_argument('-a', '--arch', type=str, default='resnet50tp',
64 |                     help="resnet503d, resnet50tp, resnet50ta, resnet50rnn")
65 | parser.add_argument('--pool', type=str, default='avg', choices=['avg', 'max'])
66 | 
67 | # Miscs
68 | parser.add_argument('--print-freq', type=int, default=78, help="print frequency")
69 | parser.add_argument('--seed', type=int, default=1, help="manual seed")
70 | parser.add_argument('--pretrained-model', type=str, default='/home/ying/Desktop/video/resnet-50-kinetics.pth',
71 |                     help='needs to be set for resnet3d models')
72 | parser.add_argument('--best-model', type=str, default='/home/ying/Desktop/video/log/best_model.pth.tar',
73 |                     help='needs to be set to evaluate the model')
74 | parser.add_argument('--evaluate', action='store_true', help="evaluation only")
75 | parser.add_argument('--eval-step', type=int, default=50,
76 |                     help="run evaluation for every N epochs (set to -1 to test after training)")
77 | parser.add_argument('--save-dir', type=str, default='log')
78 | parser.add_argument('--use-cpu', action='store_true', help="use cpu")
79 | parser.add_argument('--gpu-devices', default='0', type=str, help='gpu device ids for CUDA_VISIBLE_DEVICES')
80 | parser.add_argument('--vis-ranked-res', action='store_true',
81 |                     help="visualize ranked results, only available in evaluation mode (default: False)")
82 | 
83 | args = parser.parse_args()
84 | 
85 | 
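# With these arguments, a typical evaluation run of this script looks something like
#
#     python test.py -d mars -a resnet50tp --evaluate --pool avg \
#         --best-model log/best_model.pth.tar --gpu-devices 0
#
# where --best-model points at whatever checkpoint save_checkpoint produced during training.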
86 | def main():
87 |     torch.manual_seed(args.seed)
88 |     os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
89 |     use_gpu = torch.cuda.is_available()
90 |     if args.use_cpu:
91 |         use_gpu = False
92 | 
93 |     sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt'))
94 |     print("==========\nArgs:{}\n==========".format(args))
95 | 
96 |     if use_gpu:
97 |         print("Currently using GPU {}".format(args.gpu_devices))
98 |         cudnn.benchmark = True
99 |         torch.cuda.manual_seed_all(args.seed)
100 |     else:
101 |         print("Currently using CPU (GPU is highly recommended)")
102 | 
103 |     print("Initializing dataset {}".format(args.dataset))
104 |     dataset = data_manager.init_dataset(name=args.dataset)
105 | 
106 |     transform_test = T.Compose([
107 |         T.Resize((args.height, args.width)),
108 |         T.ToTensor(),
109 |         T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
110 |     ])
111 | 
112 |     pin_memory = True if use_gpu else False
113 | 
114 |     queryloader = DataLoader(
115 |         VideoDataset(dataset.query, seq_len=args.seq_len, sample='dense', transform=transform_test),
116 |         batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
117 |         pin_memory=pin_memory, drop_last=False,
118 |     )
119 | 
120 |     galleryloader = DataLoader(
121 |         VideoDataset(dataset.gallery, seq_len=args.seq_len, sample='dense', transform=transform_test),
122 |         batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
123 |         pin_memory=pin_memory, drop_last=False,
124 |     )
125 |     if args.arch == 'resnet503d':
126 |         cudnn.benchmark = False
127 | 
128 |     print("Initializing model: {}".format(args.arch))
129 |     if args.arch == 'resnet503d':
130 |         model = resnet3d.resnet50(num_classes=dataset.num_train_pids, sample_width=args.width,
131 |                                   sample_height=args.height, sample_duration=args.seq_len)
132 |         if not os.path.exists(args.best_model):
133 |             raise IOError("Can't find best model: {}".format(args.best_model))
134 |         print("Loading checkpoint from '{}'".format(args.best_model))
135 |         checkpoint = torch.load(args.best_model)
136 |         state_dict = {}
137 |         for key in checkpoint['state_dict']:
138 |             state_dict[key] = checkpoint['state_dict'][key]
139 |         model.load_state_dict(state_dict, strict=False)
140 |     else:
141 |         model = models.init_model(name=args.arch, num_classes=dataset.num_train_pids, loss={'xent', 'htri'})
142 |         if not os.path.exists(args.best_model):
143 |             raise IOError("Can't find best model: {}".format(args.best_model))
144 |         print("Loading checkpoint from '{}'".format(args.best_model))
145 |         checkpoint = torch.load(args.best_model)
146 |         state_dict = {}
147 |         for key in checkpoint['state_dict']:
148 |             state_dict[key] = checkpoint['state_dict'][key]
149 |         model.load_state_dict(state_dict, strict=False)
150 |     print("Model size: {:.5f}M".format(sum(p.numel() for p in model.parameters()) / 1000000.0))
151 | 
152 |     if use_gpu:
153 |         model = nn.DataParallel(model).cuda()
154 | 
155 |     if args.evaluate:
156 |         print("Evaluate only")
157 |         test(model, queryloader, galleryloader, args.pool, use_gpu)
158 |         # distmat = test(model, queryloader, galleryloader, args.pool, use_gpu)  # don't do this with the rnn model, or it runs out of memory
159 |         # if args.vis_ranked_res:
160 |         #     visualize_ranked_results(
161 |         #         distmat, dataset,
162 |         #         save_dir=osp.join(args.save_dir, 'ranked_results'),
163 |         #         topk=20,
164 |         #     )
165 | 
166 |     return
167 | 
168 | 
169 | # def test(model, queryloader, galleryloader, pool, use_gpu, ranks=[1, 5, 10, 20], return_distmat=False):
170 | def test(model, queryloader, galleryloader, pool, use_gpu, ranks=[1, 5, 10, 20]):
171 |     model.eval()
172 | 
173 |     qf, q_pids, q_camids = [], [], []
174 |     for batch_idx, (imgs, pids, camids) in enumerate(queryloader):
175 |         if use_gpu:
176 |             imgs = imgs.cuda()
177 |         imgs = Variable(imgs, volatile=True)
178 |         # b=1, n=number of clips, s=16
179 |         b, n, s, c, h, w = imgs.size()
180 |         assert (b == 1)
181 |         imgs = imgs.view(b * n, s, c, h, w)
182 |         features = model(imgs)
183 |         features = features.view(n, -1)
184 |         features = torch.mean(features, 0)
185 |         features = features.data.cpu()
186 |         qf.append(features)
187 |         q_pids.extend(pids)
188 |         q_camids.extend(camids)
189 |     qf = torch.stack(qf)
190 |     q_pids = np.asarray(q_pids)
191 |     q_camids = np.asarray(q_camids)
192 | 
193 |     print("Extracted features for query set, obtained {}-by-{} matrix".format(qf.size(0), qf.size(1)))
194 | 
195 |     gf, g_pids, g_camids = [], [], []
196 |     for batch_idx, (imgs, pids, camids) in enumerate(galleryloader):
197 |         if use_gpu:
198 |             imgs = imgs.cuda()
199 |         imgs = Variable(imgs, volatile=True)
200 |         b, n, s, c, h, w = imgs.size()
201 |         imgs = imgs.view(b * n, s, c, h, w)
202 |         assert (b == 1)
203 |         features = model(imgs)
204 |         features = features.view(n, -1)
205 |         if pool == 'avg':
206 |             features = torch.mean(features, 0)
207 |         else:
208 |             features, _ = torch.max(features, 0)
209 |         features = features.data.cpu()
210 |         gf.append(features)
211 |         g_pids.extend(pids)
212 |         g_camids.extend(camids)
213 |     gf = torch.stack(gf)
214 |     g_pids = np.asarray(g_pids)
215 |     g_camids = np.asarray(g_camids)
216 | 
217 |     print("Extracted features for gallery set, obtained {}-by-{} matrix".format(gf.size(0), gf.size(1)))
218 |     print("Computing distance matrix")
219 | 
220 |     m, n = qf.size(0), gf.size(0)
221 |     distmat = torch.pow(qf, 2).sum(dim=1, keepdim=True).expand(m, n) + \
222 |               torch.pow(gf, 2).sum(dim=1, keepdim=True).expand(n, m).t()
223 |     distmat.addmm_(1, -2, qf, gf.t())
224 |     distmat = distmat.numpy()
225 | 
226 |     print("Computing CMC and mAP")
227 |     cmc, mAP = evaluate(distmat, q_pids, g_pids, q_camids, g_camids)
228 | 
229 |     print("Results ----------")
230 |     print("mAP: {:.1%}".format(mAP))
231 |     print("CMC curve")
232 |     for r in ranks:
233 |         print("Rank-{:<3}: {:.1%}".format(r, cmc[r - 1]))
234 |     print("------------------")
235 |     # if return_distmat:
236 |     #     return distmat
237 |     return cmc[0]
238 | 
239 | 
240 | if __name__ == '__main__':
241 |     main()
242 | 
--------------------------------------------------------------------------------