├── average_precision_calculator.py ├── config.py ├── data ├── __init__.py ├── make_train_test.py ├── organize_UCF101.py ├── process_features.py └── process_pca.py ├── dataloader.py ├── main.py ├── metrics.py ├── models ├── NeXtVLAD.py ├── __init__.py └── video_classifiers.py ├── notebooks └── arch_debug.ipynb ├── sample.py ├── train.py └── util.py /average_precision_calculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Calculate or keep track of the interpolated average precision. 16 | It provides an interface for calculating interpolated average precision for an 17 | entire list or the top-n ranked items. For the definition of the 18 | (non-)interpolated average precision: 19 | http://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf 20 | Example usages: 21 | 1) Use it as a static function call to directly calculate average precision for 22 | a short ranked list in the memory. 23 | ``` 24 | import random 25 | p = np.array([random.random() for _ in xrange(10)]) 26 | a = np.array([random.choice([0, 1]) for _ in xrange(10)]) 27 | ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a) 28 | ``` 29 | 2) Use it as an object for long ranked list that cannot be stored in memory or 30 | the case where partial predictions can be observed at a time (Tensorflow 31 | predictions). In this case, we first call the function accumulate many times 32 | to process parts of the ranked list. After processing all the parts, we call 33 | peek_interpolated_ap_at_n. 34 | ``` 35 | p1 = np.array([random.random() for _ in xrange(5)]) 36 | a1 = np.array([random.choice([0, 1]) for _ in xrange(5)]) 37 | p2 = np.array([random.random() for _ in xrange(5)]) 38 | a2 = np.array([random.choice([0, 1]) for _ in xrange(5)]) 39 | # interpolated average precision at 10 using 1000 break points 40 | calculator = average_precision_calculator.AveragePrecisionCalculator(10) 41 | calculator.accumulate(p1, a1) 42 | calculator.accumulate(p2, a2) 43 | ap3 = calculator.peek_ap_at_n() 44 | ``` 45 | """ 46 | 47 | import heapq 48 | import random 49 | import numbers 50 | 51 | import numpy 52 | 53 | 54 | class AveragePrecisionCalculator(object): 55 | """Calculate the average precision and average precision at n.""" 56 | 57 | def __init__(self, top_n=None): 58 | """Construct an AveragePrecisionCalculator to calculate average precision. 59 | This class is used to calculate the average precision for a single label. 60 | Args: 61 | top_n: A positive Integer specifying the average precision at n, or 62 | None to use all provided data points. 63 | Raises: 64 | ValueError: An error occurred when the top_n is not a positive integer. 
65 | """ 66 | if not ((isinstance(top_n, int) and top_n >= 0) or top_n is None): 67 | raise ValueError("top_n must be a positive integer or None.") 68 | 69 | self._top_n = top_n # average precision at n 70 | self._total_positives = 0 # total number of positives have seen 71 | self._heap = [] # max heap of (prediction, actual) 72 | 73 | @property 74 | def heap_size(self): 75 | """Gets the heap size maintained in the class.""" 76 | return len(self._heap) 77 | 78 | @property 79 | def num_accumulated_positives(self): 80 | """Gets the number of positive samples that have been accumulated.""" 81 | return self._total_positives 82 | 83 | def accumulate(self, predictions, actuals, num_positives=None): 84 | """Accumulate the predictions and their ground truth labels. 85 | After the function call, we may call peek_ap_at_n to actually calculate 86 | the average precision. 87 | Note predictions and actuals must have the same shape. 88 | Args: 89 | predictions: a list storing the prediction scores. 90 | actuals: a list storing the ground truth labels. Any value 91 | larger than 0 will be treated as positives, otherwise as negatives. 92 | num_positives = If the 'predictions' and 'actuals' inputs aren't complete, 93 | then it's possible some true positives were missed in them. In that case, 94 | you can provide 'num_positives' in order to accurately track recall. 95 | Raises: 96 | ValueError: An error occurred when the format of the input is not the 97 | numpy 1-D array or the shape of predictions and actuals does not match. 98 | """ 99 | if len(predictions) != len(actuals): 100 | raise ValueError("the shape of predictions and actuals does not match.") 101 | 102 | if not num_positives is None: 103 | if not isinstance(num_positives, numbers.Number) or num_positives < 0: 104 | raise ValueError("'num_positives' was provided but it wan't a nonzero number.") 105 | 106 | if not num_positives is None: 107 | self._total_positives += num_positives 108 | else: 109 | self._total_positives += numpy.size(numpy.where(actuals > 0)) 110 | topk = self._top_n 111 | heap = self._heap 112 | 113 | for i in range(numpy.size(predictions)): 114 | if topk is None or len(heap) < topk: 115 | heapq.heappush(heap, (predictions[i], actuals[i])) 116 | else: 117 | if predictions[i] > heap[0][0]: # heap[0] is the smallest 118 | heapq.heappop(heap) 119 | heapq.heappush(heap, (predictions[i], actuals[i])) 120 | 121 | def clear(self): 122 | """Clear the accumulated predictions.""" 123 | self._heap = [] 124 | self._total_positives = 0 125 | 126 | def peek_ap_at_n(self): 127 | """Peek the non-interpolated average precision at n. 128 | Returns: 129 | The non-interpolated average precision at n (default 0). 130 | If n is larger than the length of the ranked list, 131 | the average precision will be returned. 132 | """ 133 | if self.heap_size <= 0: 134 | return 0 135 | predlists = numpy.array(list(zip(*self._heap))) 136 | 137 | ap = self.ap_at_n(predlists[0], 138 | predlists[1], 139 | n=self._top_n, 140 | total_num_positives=self._total_positives) 141 | return ap 142 | 143 | @staticmethod 144 | def ap(predictions, actuals): 145 | """Calculate the non-interpolated average precision. 146 | Args: 147 | predictions: a numpy 1-D array storing the sparse prediction scores. 148 | actuals: a numpy 1-D array storing the ground truth labels. Any value 149 | larger than 0 will be treated as positives, otherwise as negatives. 150 | Returns: 151 | The non-interpolated average precision at n. 
152 | If n is larger than the length of the ranked list, 153 | the average precision will be returned. 154 | Raises: 155 | ValueError: An error occurred when the format of the input is not the 156 | numpy 1-D array or the shape of predictions and actuals does not match. 157 | """ 158 | return AveragePrecisionCalculator.ap_at_n(predictions, 159 | actuals, 160 | n=None) 161 | 162 | @staticmethod 163 | def ap_at_n(predictions, actuals, n=20, total_num_positives=None): 164 | """Calculate the non-interpolated average precision. 165 | Args: 166 | predictions: a numpy 1-D array storing the sparse prediction scores. 167 | actuals: a numpy 1-D array storing the ground truth labels. Any value 168 | larger than 0 will be treated as positives, otherwise as negatives. 169 | n: the top n items to be considered in ap@n. 170 | total_num_positives : (optionally) you can specify the number of total 171 | positive 172 | in the list. If specified, it will be used in calculation. 173 | Returns: 174 | The non-interpolated average precision at n. 175 | If n is larger than the length of the ranked list, 176 | the average precision will be returned. 177 | Raises: 178 | ValueError: An error occurred when 179 | 1) the format of the input is not the numpy 1-D array; 180 | 2) the shape of predictions and actuals does not match; 181 | 3) the input n is not a positive integer. 182 | """ 183 | if len(predictions) != len(actuals): 184 | raise ValueError("the shape of predictions and actuals does not match.") 185 | 186 | if n is not None: 187 | if not isinstance(n, int) or n <= 0: 188 | raise ValueError("n must be 'None' or a positive integer." 189 | " It was '%s'." % n) 190 | 191 | ap = 0.0 192 | 193 | predictions = numpy.array(predictions) 194 | actuals = numpy.array(actuals) 195 | 196 | # add a shuffler to avoid overestimating the ap 197 | predictions, actuals = AveragePrecisionCalculator._shuffle(predictions, 198 | actuals) 199 | sortidx = sorted( 200 | range(len(predictions)), 201 | key=lambda k: predictions[k], 202 | reverse=True) 203 | 204 | if total_num_positives is None: 205 | numpos = numpy.size(numpy.where(actuals > 0)) 206 | else: 207 | numpos = total_num_positives 208 | 209 | if numpos == 0: 210 | return 0 211 | 212 | if n is not None: 213 | numpos = min(numpos, n) 214 | delta_recall = 1.0 / numpos 215 | poscount = 0.0 216 | 217 | # calculate the ap 218 | r = len(sortidx) 219 | if n is not None: 220 | r = min(r, n) 221 | for i in range(r): 222 | if actuals[sortidx[i]] > 0: 223 | poscount += 1 224 | ap += poscount / (i + 1) * delta_recall 225 | return ap 226 | 227 | @staticmethod 228 | def _shuffle(predictions, actuals): 229 | random.seed(0) 230 | suffidx = random.sample(range(len(predictions)), len(predictions)) 231 | predictions = predictions[suffidx] 232 | actuals = actuals[suffidx] 233 | return predictions, actuals 234 | 235 | @staticmethod 236 | def _zero_one_normalize(predictions, epsilon=1e-7): 237 | """Normalize the predictions to the range between 0.0 and 1.0. 238 | For some predictions like SVM predictions, we need to normalize them before 239 | calculate the interpolated average precision. The normalization will not 240 | change the rank in the original list and thus won't change the average 241 | precision. 242 | Args: 243 | predictions: a numpy 1-D array storing the sparse prediction scores. 244 | epsilon: a small constant to avoid denominator being zero. 245 | Returns: 246 | The normalized prediction. 
247 | """ 248 | denominator = numpy.max(predictions) - numpy.min(predictions) 249 | ret = (predictions - numpy.min(predictions)) / numpy.max(denominator, 250 | epsilon) 251 | return ret -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | img_w = 224 2 | img_h = 224 3 | 4 | dataset_params = { 5 | 'batch_size': 128, 6 | 'shuffle': True, 7 | 'num_workers': 4, 8 | 'pin_memory': True 9 | } -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w-garcia/NeXtVLAD.pytorch/cc7235b578ad092cd180083397de7411c1fe4684/data/__init__.py -------------------------------------------------------------------------------- /data/make_train_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified from code here: https://github.com/Yidadaa/Pytorch-Video-Classification 3 | ''' 4 | 5 | import os 6 | import numpy as np 7 | from tqdm import tqdm 8 | import concurrent.futures 9 | import argparse 10 | import ffmpeg 11 | 12 | # 数据集的默认位置 13 | # default params 14 | default_output_dir = os.path.dirname(os.path.abspath(__file__)) 15 | default_src_dir = os.path.join(default_output_dir, 'UCF') 16 | default_test_size = 0.2 17 | 18 | 19 | def split(src_dir=default_src_dir, output_dir=default_src_dir, size=default_test_size): 20 | # 设置默认参数 21 | # set defaults 22 | src_dir = default_src_dir if src_dir is None else src_dir 23 | output_dir = default_output_dir if output_dir is None else output_dir 24 | size = default_test_size if size is None else size 25 | 26 | if not os.path.exists(output_dir): 27 | os.mkdir(output_dir) 28 | 29 | # 生成测试集和训练集目录 30 | # split into train and test 31 | for folder in ['train', 'test']: 32 | folder_path = os.path.join(output_dir, folder) 33 | if not os.path.exists(folder_path): 34 | os.mkdir(folder_path) 35 | print('Folder {} is created'.format(folder_path)) 36 | 37 | # 划分测试集和训练集 38 | train_set = [] 39 | test_set = [] 40 | classes = os.listdir(src_dir) 41 | num_classes = len(classes) 42 | for class_index, classname in enumerate(classes): 43 | print(f"Current class:\t{class_index+1}") 44 | # 读取所有视频路径 45 | videos = os.listdir(os.path.join(src_dir, classname)) 46 | # 打乱视频名称 47 | np.random.shuffle(videos) 48 | # 确定测试集划分点 49 | split_size = int(len(videos) * size) 50 | 51 | # 生成训练集和测试集的文件夹 52 | for i in range(2): 53 | part = ['train', 'test'][i] 54 | class_dir = os.path.join(output_dir, part, classname) 55 | if not os.path.exists(class_dir): 56 | os.mkdir(class_dir) 57 | 58 | jobs = [] 59 | # 遍历每个视频,将每个视频的图像帧提取出来 60 | for i in range(len(videos)): 61 | video_path = os.path.join(src_dir, classname, videos[i]) 62 | 63 | video_type = 'test' if i <= split_size else 'train' 64 | video_name = videos[i].rsplit('.')[0] 65 | 66 | img_dir = os.path.join(output_dir, video_type, classname, f'{video_name}') 67 | if not os.path.exists(img_dir): 68 | os.makedirs(img_dir) 69 | if len(os.listdir(img_dir)) > 0: 70 | continue 71 | 72 | img_path = os.path.join(output_dir, video_type, classname, f'{video_name}/%6d.jpg') 73 | jobs.append({'in': video_path, 'out': img_path}) 74 | 75 | info = [classname, video_name, img_path] 76 | # 将视频帧信息保存起来 77 | if video_type == 'test': 78 | test_set.append(info) 79 | else: 80 | train_set.append(info) 81 | 82 | def subproc_call(job): 83 | try: 84 | # sample at 
1fps: https://arxiv.org/pdf/1609.08675.pdf 85 | process = ( 86 | ffmpeg 87 | .input(job['in']) 88 | .output(job['out'], pattern_type='glob', r=1) 89 | .run_async(pipe_stdout=True, pipe_stderr=True) 90 | ) 91 | out, err = process.communicate() 92 | except ffmpeg.Error as e: 93 | print(e) 94 | print(err) 95 | 96 | # subproc_call(jobs[0]) 97 | with concurrent.futures.ThreadPoolExecutor() as executor: 98 | # wrap with list to run .map generator on execution 99 | _ = list(tqdm(executor.map(subproc_call, jobs), total=len(jobs))) 100 | 101 | # 将训练集和测试集数据保存到文件中,方便写dataloader 102 | datas = [train_set, test_set] 103 | names = ['train', 'test'] 104 | for i in range(2): 105 | with open(output_dir + '/' + names[i] + '.csv', 'w') as f: 106 | f.write('\n'.join([','.join(line) for line in datas[i]])) 107 | 108 | 109 | def parse_args(): 110 | parser = argparse.ArgumentParser(usage='python3 make_train_test.py -i path/to/UCF -o path/to/output -s 0.3') 111 | parser.add_argument('-i', '--src_dir', help='path to UCF datasets', default=default_src_dir) 112 | parser.add_argument('-o', '--output_dir', help='path to output', default=default_output_dir) 113 | parser.add_argument('-s', '--size', help='ratio of test sets', default=default_test_size) 114 | args = parser.parse_args() 115 | return args 116 | 117 | 118 | if __name__ == '__main__': 119 | args = parse_args() 120 | split(**vars(args)) 121 | -------------------------------------------------------------------------------- /data/organize_UCF101.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import fnmatch 4 | import re 5 | from tqdm import tqdm 6 | 7 | if __name__ == '__main__': 8 | ind_filepath = "/mnt/nfs/hgst-raid1/WD-Passport_4TB/dataset/UCF101/ucfTrainTestlist/classInd.txt" 9 | vids_dir = "/mnt/nfs/hgst-raid1/WD-Passport_4TB/dataset/UCF101/videos" 10 | 11 | with open(ind_filepath, 'r') as f: 12 | lines = f.readlines() 13 | classes = [l.strip().split(' ')[1] for l in lines] 14 | for c in tqdm(classes): 15 | c_dir = os.path.join(vids_dir, c) 16 | if not os.path.exists(c_dir): 17 | os.makedirs(c_dir) 18 | rx = re.compile(fnmatch.translate(f"*{c}*.avi"), re.IGNORECASE) 19 | class_videos = list(filter(rx.search, os.listdir(vids_dir))) 20 | if len(class_videos) != 0: 21 | # script was already ran 22 | # continue 23 | from_paths = [os.path.join(vids_dir, cv) for cv in sorted(class_videos)] 24 | to_paths = [os.path.join(c_dir, cv) for cv in sorted(class_videos)] 25 | for from_path, to_path in zip(from_paths, to_paths): 26 | shutil.move(from_path, to_path) 27 | -------------------------------------------------------------------------------- /data/process_features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Re-tooled version of the script found on VideoToTextDNN: 3 | https://github.com/OSUPCVLab/VideoToTextDNN/blob/master/data/py3_process_features.py 4 | 5 | Perform batched feature extract using Cadene pretrainedmodels 6 | """ 7 | import torch 8 | import argparse 9 | import time 10 | import os 11 | import numpy as np 12 | import logging 13 | 14 | from util import TransformImage, create_batches, process_batches, init_model 15 | 16 | logging.basicConfig() 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.DEBUG) 19 | 20 | available_features = ['nasnetalarge', 'resnet152', 'pnasnet5large', 'densenet121', 'senet154', 'polynet', 'vgg16'] 21 | 22 | args = None 23 | 24 | 25 | def extract_features(args): 26 | root_frames_dir = 
args.frames_dir 27 | root_feats_dir = args.feats_dir 28 | work = args.work 29 | autofill = int(args.autofill) 30 | ftype = args.type 31 | gpu_list = args.gpu_list 32 | 33 | class_dirs = os.listdir(root_frames_dir) 34 | 35 | # skip a level for UCF101 dataset 36 | for class_dir in class_dirs: 37 | class_frames_dir = os.path.join(root_frames_dir, class_dir) 38 | 39 | frames_dirs = os.listdir(class_frames_dir) 40 | 41 | class_feats_dir = os.path.join(root_feats_dir, class_dir) 42 | if not os.path.isdir(class_feats_dir): 43 | os.makedirs(class_feats_dir) 44 | 45 | # else: 46 | # if autofill: 47 | # logger.info('AUTOFILL ON: Attempting to autofill missing features.') 48 | # frames_dirs = validate_feats.go(featsd=root_feats_dir, framesd=root_frames_dir) 49 | 50 | # Difficulty of each job is measured by # of frames to process in each chunk. 51 | # Can't be randomized since autofill list woudld be no longer valid. 52 | # np.random.shuffle(frames_dirs) 53 | work = len(frames_dirs) if not work else work 54 | 55 | tf_img, model = init_model(args.gpu_list, args.type) 56 | 57 | work_done = 0 58 | while work_done != work: 59 | frames_dirs_avail = diff_feats(class_frames_dir, class_feats_dir) 60 | if len(frames_dirs_avail) == 0: 61 | break 62 | 63 | frames_dir = frames_dirs_avail.pop() 64 | feat_filename = frames_dir.split('/')[-1] + '.npy' 65 | video_feats_path = os.path.join(class_feats_dir, feat_filename) 66 | 67 | if os.path.exists(video_feats_path): 68 | logger.info('Features already extracted:\t{}'.format(video_feats_path)) 69 | continue 70 | 71 | try: 72 | frames_to_do = [os.path.join(args.frames_dir, class_dir, frames_dir, p) for p in 73 | os.listdir(os.path.join(args.frames_dir, class_dir, frames_dir))] 74 | except Exception as e: 75 | logger.exception(e) 76 | continue 77 | 78 | # Must sort so frames follow numerical order. os.listdir does not guarantee order. 79 | frames_to_do.sort() 80 | 81 | if len(frames_to_do) == 0: 82 | logger.warning("Frame folder has no frames! Skipping...") 83 | continue 84 | 85 | # Save a flag copy 86 | with open(video_feats_path, 'wb') as pf: 87 | np.save(pf, []) 88 | 89 | try: 90 | batches = create_batches(frames_to_do, tf_img, logger=logger, batch_size=args.batch_size) 91 | except OSError as e: 92 | logger.exception(e) 93 | logger.warning("Corrupt image file. Skipping...") 94 | os.remove(video_feats_path) 95 | continue 96 | 97 | logger.debug("Start video {}".format(work_done)) 98 | 99 | feats = process_batches(batches, ftype, gpu_list, model, logger=logger) 100 | 101 | with open(video_feats_path, 'wb') as pf: 102 | np.save(pf, feats) 103 | logger.info('Saved complete features to {}.'.format(video_feats_path)) 104 | work_done += 1 105 | 106 | 107 | def diff_feats(frames_dir, feats_dir): 108 | feats = ['.'.join(i.split('.')[:-1]) for i in os.listdir(feats_dir)] 109 | feats = set(feats) 110 | frames = set([fr for fr in os.listdir(frames_dir) if len(os.listdir(os.path.join(frames_dir, fr)))]) 111 | needed_feats = frames - feats 112 | return needed_feats 113 | 114 | 115 | if __name__ == '__main__': 116 | arg_parser = argparse.ArgumentParser() 117 | arg_parser.add_argument('frames_dir',help = 'Directory where there are frame directories.') 118 | arg_parser.add_argument('feats_dir',help = 'Root directory of dataset\'s processed videos.') 119 | arg_parser.add_argument('-w', '--work', help = 'Number of features to process. 
Defaults to all.', default=0, type=int) 120 | arg_parser.add_argument('-gl', '--gpu_list', required=True, nargs='+', type=int, help="Space delimited list of GPU indices to use. Example for 4 GPUs: -gl 0 1 2 3") 121 | arg_parser.add_argument('-bs', '--batch_size', type=int, help="Batch size to use during feature extraction. Larger batch size = more VRAM usage", default=8) 122 | arg_parser.add_argument('--type', required=True, help = 'ConvNet to use for processing features.', choices=available_features) 123 | arg_parser.add_argument('--autofill', action='store_true', default=False, help="Perform diff between frames_dir and feats_dir and fill them in.") 124 | 125 | args = arg_parser.parse_args() 126 | 127 | start_time = time.time() 128 | 129 | logger.info("Found {} GPUs, using {}.".format(torch.cuda.device_count(), len(args.gpu_list))) 130 | 131 | extract_features(args) 132 | 133 | logger.info("Job took %s mins" % ((time.time() - start_time)/60)) 134 | -------------------------------------------------------------------------------- /data/process_pca.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | 5 | from tqdm import tqdm 6 | from fnmatch import filter 7 | from sklearn.decomposition import PCA 8 | 9 | from util import feature_pca_whiten, feature_pca 10 | 11 | 12 | if __name__ == '__main__': 13 | opt = argparse.ArgumentParser() 14 | opt.add_argument('training_features_folder', help="Folder containing full-scale training features.") 15 | opt.add_argument('test_features_folder', help="Folder containing full-scale test features.") 16 | opt.add_argument('save_folder', help="Folder to save PCA params and features.") 17 | opt = vars(opt.parse_args()) 18 | 19 | D = [] 20 | for root, dirs, filenames in os.walk(opt['training_features_folder']): 21 | for npf_name in filter(filenames, "*.npy"): 22 | npf_path = os.path.join(root, npf_name) 23 | npo = np.load(npf_path) 24 | D.extend(npo) 25 | 26 | print(f"Generating PCA vectors...") 27 | pca = PCA(n_components=1024) 28 | pca.fit(D) 29 | eigenvecs = pca.components_ 30 | eigenvals = pca.explained_variance_ 31 | center = pca.mean_ 32 | np.save(os.path.join(opt['save_folder'], 'eigenvecss.npy'), eigenvecs) 33 | np.save(os.path.join(opt['save_folder'], 'eigenvals.npy'), eigenvals) 34 | np.save(os.path.join(opt['save_folder'], 'mean.npy'), center) 35 | 36 | for split_folder in [opt['training_features_folder'], opt['test_features_folder']]: 37 | if split_folder == opt['training_features_folder']: 38 | out_root = os.path.join(opt['save_folder'], 'train_PCA-1024') 39 | elif split_folder == opt['test_features_folder']: 40 | out_root = os.path.join(opt['save_folder'], 'test_PCA-1024') 41 | else: 42 | break 43 | 44 | print(f"Created {out_root}") 45 | class_dirs = os.listdir(split_folder) 46 | num_classes = len(class_dirs) 47 | for k, class_dir in enumerate(class_dirs): 48 | print(f"Class {k+1}/{num_classes}") 49 | class_feats_dir = os.path.join(split_folder, class_dir) 50 | class_out_dir = os.path.join(out_root, class_dir) 51 | if not os.path.isdir(class_out_dir): 52 | os.makedirs(class_out_dir) 53 | 54 | for npff in tqdm(os.listdir(class_feats_dir)): 55 | npf = os.path.join(class_feats_dir, npff) 56 | feats = np.load(npf) 57 | feats_pca = np.zeros((len(feats), 1024)) 58 | for i, feat in enumerate(feats): 59 | # TODO: toggle whitening on/off 60 | # pcaw = feature_pca_whiten(feat, center, eigenvals, eigenvecs) 61 | pcaw = feature_pca(feat, center, eigenvals, eigenvecs) 62 
| feats_pca[i] = pcaw 63 | 64 | feats_pca_path = os.path.join(class_out_dir, npff) 65 | np.save(feats_pca_path, feats_pca) 66 | -------------------------------------------------------------------------------- /dataloader.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import random 4 | import os 5 | import numpy as np 6 | import torch 7 | import torch.nn.functional as F 8 | from torch.utils.data import Dataset 9 | from torch.autograd import Variable 10 | from multiprocessing import Pool 11 | from multiprocessing import Queue 12 | from collections import defaultdict 13 | 14 | 15 | class CocoDataset(Dataset): 16 | 17 | def __init__(self, coco_labels): 18 | # python 3 19 | # super().__init__() 20 | super(CocoDataset, self).__init__() 21 | self.coco_labels = list(coco_labels['labels'].items()) 22 | self.num_classes = coco_labels['num_classes'] 23 | 24 | def __getitem__(self, ix): 25 | labels = torch.zeros(self.num_classes) 26 | image_id, labels_ids = self.coco_labels[ix] 27 | labels[labels_ids] = 1 28 | data = {} 29 | data['image_ids'] = image_id 30 | data['labels'] = labels 31 | return data 32 | 33 | def __len__(self): 34 | return len(self.coco_labels) 35 | 36 | 37 | pool_queue = Queue() 38 | work = [] 39 | 40 | 41 | def _threaded_sample_load(vid_id, fpath, n_frame_steps): 42 | fc_feat = load_and_subsample_feat(fpath, n_frame_steps) 43 | pool_queue.put((vid_id, fc_feat)) 44 | 45 | 46 | class VideoClassificationFolder: 47 | def __init__(self, feats_folder: str): 48 | """ 49 | Init the video classification folder with the following tree structure: 50 | - [train|test] 51 | |- class 0 52 | |- - video 0 53 | |- - ... 54 | |- - video_{0i} 55 | |- ... 56 | |- ... 57 | |- class k 58 | |- - video 0 59 | |- - ... 
60 | |- - video_{ki} 61 | 62 | $i$ is not guaranteed to be consistent between classes 63 | :param feats_folder: root directory where features are stored 64 | """ 65 | self.class_to_feats_map = defaultdict(list) 66 | self.feats_dir = feats_folder 67 | self.num_classes = len(os.listdir(self.feats_dir)) 68 | for c in os.listdir(self.feats_dir): 69 | self.class_to_feats_map[c] = [os.path.join(self.feats_dir, c, npf) for npf in 70 | os.listdir(os.path.join(self.feats_dir, c))] 71 | 72 | def flattened(self) -> dict: 73 | """ 74 | :return: a flattened tree as a dict of idx: 2-tuple (feats_path, class_id) with deterministic ordering 75 | """ 76 | l = {} 77 | i = 0 78 | for c in sorted(list(self.class_to_feats_map.keys())): 79 | for feats_path in sorted(self.class_to_feats_map[c]): 80 | l[i] = (feats_path, c) 81 | i += 1 82 | 83 | return l 84 | 85 | def __len__(self) -> int: 86 | return sum([len(self.class_to_feats_map[c]) for c in list(self.class_to_feats_map.keys())]) 87 | 88 | 89 | class VideoClassificationDataset(Dataset): 90 | 91 | # def get_vocab_size(self): 92 | # return len(self.get_vocab()) 93 | 94 | # def get_vocab(self): 95 | # return self.ix_to_word 96 | 97 | # def get_seq_length(self): 98 | # return self.seq_length 99 | 100 | def __init__(self, opt, mode): 101 | # python 3 102 | # super().__init__() 103 | super(VideoClassificationDataset, self).__init__() 104 | self.mode = mode # to load train/val/test data 105 | self.feats_dir = opt['feats_dir'] 106 | self.max_frames = opt['max_frames'] 107 | self.tree = VideoClassificationFolder(self.feats_dir) 108 | self.num_classes = self.tree.num_classes 109 | self.n = len(self.tree) 110 | # self.n_frame_steps = opt['n_frame_steps'] 111 | # load in the sequence data 112 | 113 | if self.mode != 'inference': 114 | print(f'load feats from {self.feats_dir}') 115 | # Memory cache for features 116 | print(f"Pre-cache {self.n} features in memory.") 117 | self._feat_cache = {} 118 | # pool = Pool(16) 119 | 120 | for idx, (fc_feat_path, c) in self.tree.flattened().items(): 121 | try: 122 | fc_feat, mask = load_and_subsample_feat(fc_feat_path, self.max_frames) 123 | self._feat_cache[idx] = (fc_feat, mask, c) 124 | except: 125 | print(f"{fc_feat_path} was not found") 126 | 127 | self.classes = sorted(list(self.tree.class_to_feats_map.keys())) 128 | self.tree = self.tree.flattened() 129 | print("Finished initializing dataloader.") 130 | 131 | def __getitem__(self, ix): 132 | """This function returns a tuple that is further passed to collate_fn 133 | """ 134 | ix = ix % self.n 135 | 136 | fc_feat = self._feat_cache.get(ix, None) 137 | if fc_feat is None: 138 | fc_feat_path, c = self.tree[ix] 139 | fc_feat, mask = load_and_subsample_feat(fc_feat_path, self.max_frames) 140 | self._feat_cache[ix] = (fc_feat, mask, c) 141 | else: 142 | fc_feat, mask, c = self._feat_cache[ix] 143 | 144 | label = self.classes.index(c) 145 | 146 | data = { 147 | 'fc_feats': Variable(torch.from_numpy(fc_feat).type(torch.FloatTensor)), 148 | 'ground_truth': Variable(torch.from_numpy(one_hot(label, self.num_classes)).type(torch.FloatTensor)), 149 | 'video_id': ix, 150 | 'mask': Variable(torch.from_numpy(mask).type(torch.FloatTensor)) 151 | } 152 | return data 153 | 154 | def __len__(self): 155 | return self.n 156 | 157 | 158 | def load_and_subsample_feat(fc_feat_path, max_frames, n_frame_steps=1): 159 | # fc_feat = np.load(fc_feat_path) 160 | # Subsampling 161 | # samples = np.round(np.linspace( 162 | # 0, fc_feat.shape[0] - 1, n_frame_steps)).astype(np.int32) 163 | try: 164 | fc_feat = 
np.load(fc_feat_path) 165 | n = min(max_frames, len(fc_feat)) 166 | padded = np.zeros((max_frames, fc_feat.shape[1])) 167 | padded[:n, :] = fc_feat[:n, :] 168 | mask = np.zeros((max_frames,)) 169 | mask[:n] = 1 170 | except Exception as e: 171 | print("Bad feature file in dataset: {}. Purge, re-process, and try again.".format(fc_feat_path)) 172 | raise e 173 | return padded, mask 174 | 175 | 176 | def one_hot(idx, num_classes): 177 | out = np.zeros(num_classes) 178 | out[idx] = 1 179 | return out 180 | 181 | 182 | if __name__ == '__main__': 183 | opt = { 184 | 'feats_dir': "/home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/train_PCA-1024", 185 | 'max_frames': 50 186 | } 187 | 188 | vd = VideoClassificationDataset(opt, 'train') 189 | data = vd.__getitem__(5) 190 | fc_feats = data['fc_feats'] 191 | print(fc_feats.shape) 192 | print(data['mask']) -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | from math import ceil 4 | import random, shutil, json 5 | from os.path import join, exists, isfile 6 | from os import makedirs, remove, environ 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | from torch.utils.data import DataLoader, SubsetRandomSampler 13 | from torch.utils.data.dataset import Subset 14 | from datetime import datetime 15 | import torchvision.models as models 16 | import h5py 17 | import faiss 18 | 19 | from tensorboardX import SummaryWriter 20 | import numpy as np 21 | from models import netvlad 22 | 23 | parser = argparse.ArgumentParser(description='pytorch-NetVlad') 24 | parser.add_argument('--mode', type=str, default='train', help='Mode', choices=['train', 'test', 'cluster']) 25 | parser.add_argument('--batchSize', type=int, default=4, 26 | help='Number of triplets (query, pos, negs). Each triplet consists of 12 images.') 27 | parser.add_argument('--cacheBatchSize', type=int, default=24, help='Batch size for caching and testing') 28 | parser.add_argument('--cacheRefreshRate', type=int, default=1000, 29 | help='How often to refresh cache, in number of queries. 
0 for off') 30 | parser.add_argument('--nEpochs', type=int, default=30, help='number of epochs to train for') 31 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 32 | help='manual epoch number (useful on restarts)') 33 | parser.add_argument('--nGPU', type=int, default=1, help='number of GPU to use.') 34 | parser.add_argument('--optim', type=str, default='SGD', help='optimizer to use', choices=['SGD', 'ADAM']) 35 | parser.add_argument('--lr', type=float, default=0.0001, help='Learning Rate.') 36 | parser.add_argument('--lrStep', type=float, default=5, help='Decay LR ever N steps.') 37 | parser.add_argument('--lrGamma', type=float, default=0.5, help='Multiply LR by Gamma for decaying.') 38 | parser.add_argument('--weightDecay', type=float, default=0.001, help='Weight decay for SGD.') 39 | parser.add_argument('--momentum', type=float, default=0.9, help='Momentum for SGD.') 40 | parser.add_argument('--nocuda', action='store_true', help='Dont use cuda') 41 | parser.add_argument('--threads', type=int, default=8, help='Number of threads for each data loader to use') 42 | parser.add_argument('--seed', type=int, default=123, help='Random seed to use.') 43 | parser.add_argument('--dataPath', type=str, default='/nfs/ibrahimi/data/', help='Path for centroid data.') 44 | parser.add_argument('--runsPath', type=str, default='/nfs/ibrahimi/runs/', help='Path to save runs to.') 45 | parser.add_argument('--savePath', type=str, default='checkpoints', 46 | help='Path to save checkpoints to in logdir. Default=checkpoints/') 47 | parser.add_argument('--cachePath', type=str, default=environ['TMPDIR'], help='Path to save cache to.') 48 | parser.add_argument('--resume', type=str, default='', help='Path to load checkpoint from, for resuming training or testing.') 49 | parser.add_argument('--ckpt', type=str, default='latest', 50 | help='Resume from latest or best checkpoint.', choices=['latest', 'best']) 51 | parser.add_argument('--evalEvery', type=int, default=1, 52 | help='Do a validation set run, and save, every N epochs.') 53 | parser.add_argument('--patience', type=int, default=10, help='Patience for early stopping. 0 is off.') 54 | parser.add_argument('--dataset', type=str, default='pittsburgh', 55 | help='Dataset to use', choices=['pittsburgh']) 56 | parser.add_argument('--arch', type=str, default='vgg16', 57 | help='basenetwork to use', choices=['vgg16', 'alexnet']) 58 | parser.add_argument('--vladv2', action='store_true', help='Use VLAD v2') 59 | parser.add_argument('--pooling', type=str, default='netvlad', help='type of pooling to use', 60 | choices=['netvlad', 'max', 'avg']) 61 | parser.add_argument('--num_clusters', type=int, default=64, help='Number of NetVlad clusters. Default=64') 62 | parser.add_argument('--margin', type=float, default=0.1, help='Margin for triplet loss. Default=0.1') 63 | parser.add_argument('--split', type=str, default='val', help='Data split to use for testing. Default is val', 64 | choices=['test', 'test250k', 'train', 'val']) 65 | parser.add_argument('--fromscratch', action='store_true', help='Train from scratch rather than using pretrained models') 66 | 67 | def train(epoch): 68 | epoch_loss = 0 69 | startIter = 1 # keep track of batch iter across subsets for logging 70 | 71 | if opt.cacheRefreshRate > 0: 72 | subsetN = ceil(len(train_set) / opt.cacheRefreshRate) 73 | #TODO randomise the arange before splitting? 
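# A minimal sketch for the TODO above, assuming it is acceptable to shuffle the
# query order between cache refreshes; np.random.permutation(len(train_set))
# yields each index exactly once, so the subsets still cover the whole training set:
#   subsetIdx = np.array_split(np.random.permutation(len(train_set)), subsetN)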
74 | subsetIdx = np.array_split(np.arange(len(train_set)), subsetN) 75 | else: 76 | subsetN = 1 77 | subsetIdx = [np.arange(len(train_set))] 78 | 79 | nBatches = (len(train_set) + opt.batchSize - 1) // opt.batchSize 80 | 81 | for subIter in range(subsetN): 82 | print('====> Building Cache') 83 | model.eval() 84 | train_set.cache = join(opt.cachePath, train_set.whichSet + '_feat_cache.hdf5') 85 | with h5py.File(train_set.cache, mode='w') as h5: 86 | pool_size = encoder_dim 87 | if opt.pooling.lower() == 'netvlad': pool_size *= opt.num_clusters 88 | h5feat = h5.create_dataset("features", 89 | [len(whole_train_set), pool_size], 90 | dtype=np.float32) 91 | with torch.no_grad(): 92 | for iteration, (input, indices) in enumerate(whole_training_data_loader, 1): 93 | input = input.to(device) 94 | image_encoding = model.encoder(input) 95 | vlad_encoding = model.pool(image_encoding) 96 | h5feat[indices.detach().numpy(), :] = vlad_encoding.detach().cpu().numpy() 97 | del input, image_encoding, vlad_encoding 98 | 99 | sub_train_set = Subset(dataset=train_set, indices=subsetIdx[subIter]) 100 | 101 | training_data_loader = DataLoader(dataset=sub_train_set, num_workers=opt.threads, 102 | batch_size=opt.batchSize, shuffle=True, 103 | collate_fn=dataset.collate_fn, pin_memory=cuda) 104 | 105 | print('Allocated:', torch.cuda.memory_allocated()) 106 | print('Cached:', torch.cuda.memory_cached()) 107 | 108 | model.train() 109 | for iteration, (query, positives, negatives, 110 | negCounts, indices) in enumerate(training_data_loader, startIter): 111 | # some reshaping to put query, pos, negs in a single (N, 3, H, W) tensor 112 | # where N = batchSize * (nQuery + nPos + nNeg) 113 | if query is None: continue # in case we get an empty batch 114 | 115 | B, C, H, W = query.shape 116 | nNeg = torch.sum(negCounts) 117 | input = torch.cat([query, positives, negatives]) 118 | 119 | input = input.to(device) 120 | image_encoding = model.encoder(input) 121 | vlad_encoding = model.pool(image_encoding) 122 | 123 | vladQ, vladP, vladN = torch.split(vlad_encoding, [B, B, nNeg]) 124 | 125 | optimizer.zero_grad() 126 | 127 | # calculate loss for each Query, Positive, Negative triplet 128 | # due to potential difference in number of negatives have to 129 | # do it per query, per negative 130 | loss = 0 131 | for i, negCount in enumerate(negCounts): 132 | for n in range(negCount): 133 | negIx = (torch.sum(negCounts[:i]) + n).item() 134 | loss += criterion(vladQ[i:i+1], vladP[i:i+1], vladN[negIx:negIx+1]) 135 | 136 | loss /= nNeg.float().to(device) # normalise by actual number of negatives 137 | loss.backward() 138 | optimizer.step() 139 | del input, image_encoding, vlad_encoding, vladQ, vladP, vladN 140 | del query, positives, negatives 141 | 142 | batch_loss = loss.item() 143 | epoch_loss += batch_loss 144 | 145 | if iteration % 50 == 0 or nBatches <= 10: 146 | print("==> Epoch[{}]({}/{}): Loss: {:.4f}".format(epoch, iteration, 147 | nBatches, batch_loss), flush=True) 148 | writer.add_scalar('Train/Loss', batch_loss, 149 | ((epoch-1) * nBatches) + iteration) 150 | writer.add_scalar('Train/nNeg', nNeg, 151 | ((epoch-1) * nBatches) + iteration) 152 | print('Allocated:', torch.cuda.memory_allocated()) 153 | print('Cached:', torch.cuda.memory_cached()) 154 | 155 | startIter += len(training_data_loader) 156 | del training_data_loader, loss 157 | optimizer.zero_grad() 158 | torch.cuda.empty_cache() 159 | remove(train_set.cache) # delete HDF5 cache 160 | 161 | avg_loss = epoch_loss / nBatches 162 | 163 | print("===> Epoch {} Complete: 
Avg. Loss: {:.4f}".format(epoch, avg_loss), 164 | flush=True) 165 | writer.add_scalar('Train/AvgLoss', avg_loss, epoch) 166 | 167 | def test(eval_set, epoch=0, write_tboard=False): 168 | # TODO what if features dont fit in memory? 169 | test_data_loader = DataLoader(dataset=eval_set, 170 | num_workers=opt.threads, batch_size=opt.cacheBatchSize, shuffle=False, 171 | pin_memory=cuda) 172 | 173 | model.eval() 174 | with torch.no_grad(): 175 | print('====> Extracting Features') 176 | pool_size = encoder_dim 177 | if opt.pooling.lower() == 'netvlad': pool_size *= opt.num_clusters 178 | dbFeat = np.empty((len(eval_set), pool_size)) 179 | 180 | for iteration, (input, indices) in enumerate(test_data_loader, 1): 181 | input = input.to(device) 182 | image_encoding = model.encoder(input) 183 | vlad_encoding = model.pool(image_encoding) 184 | 185 | dbFeat[indices.detach().numpy(), :] = vlad_encoding.detach().cpu().numpy() 186 | if iteration % 50 == 0 or len(test_data_loader) <= 10: 187 | print("==> Batch ({}/{})".format(iteration, 188 | len(test_data_loader)), flush=True) 189 | 190 | del input, image_encoding, vlad_encoding 191 | del test_data_loader 192 | 193 | # extracted for both db and query, now split in own sets 194 | qFeat = dbFeat[eval_set.dbStruct.numDb:].astype('float32') 195 | dbFeat = dbFeat[:eval_set.dbStruct.numDb].astype('float32') 196 | 197 | print('====> Building faiss index') 198 | faiss_index = faiss.IndexFlatL2(pool_size) 199 | faiss_index.add(dbFeat) 200 | 201 | print('====> Calculating recall @ N') 202 | n_values = [1,5,10,20] 203 | 204 | _, predictions = faiss_index.search(qFeat, max(n_values)) 205 | 206 | # for each query get those within threshold distance 207 | gt = eval_set.getPositives() 208 | 209 | correct_at_n = np.zeros(len(n_values)) 210 | #TODO can we do this on the matrix in one go? 
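# A hedged sketch for the TODO above (not a single matrix op, since gt[qIx] has a
# different length per query): compute the rank of the first correct prediction for
# each query, then count how many queries have that rank below each N. This assumes
# `predictions` has max(n_values) columns, as returned by faiss_index.search above.
#   first_hit = np.array([
#       next((r for r, p in enumerate(pred) if p in gt[qIx]), max(n_values))
#       for qIx, pred in enumerate(predictions)])
#   correct_at_n = np.array([np.sum(first_hit < n) for n in n_values], dtype=float)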
211 | for qIx, pred in enumerate(predictions): 212 | for i,n in enumerate(n_values): 213 | # if in top N then also in top NN, where NN > N 214 | if np.any(np.in1d(pred[:n], gt[qIx])): 215 | correct_at_n[i:] += 1 216 | break 217 | recall_at_n = correct_at_n / eval_set.dbStruct.numQ 218 | 219 | recalls = {} #make dict for output 220 | for i,n in enumerate(n_values): 221 | recalls[n] = recall_at_n[i] 222 | print("====> Recall@{}: {:.4f}".format(n, recall_at_n[i])) 223 | if write_tboard: writer.add_scalar('Val/Recall@' + str(n), recall_at_n[i], epoch) 224 | 225 | return recalls 226 | 227 | def get_clusters(cluster_set): 228 | nDescriptors = 50000 229 | nPerImage = 100 230 | nIm = ceil(nDescriptors/nPerImage) 231 | 232 | sampler = SubsetRandomSampler(np.random.choice(len(cluster_set), nIm, replace=False)) 233 | data_loader = DataLoader(dataset=cluster_set, 234 | num_workers=opt.threads, batch_size=opt.cacheBatchSize, shuffle=False, 235 | pin_memory=cuda, 236 | sampler=sampler) 237 | 238 | if not exists(join(opt.dataPath, 'centroids')): 239 | makedirs(join(opt.dataPath, 'centroids')) 240 | 241 | initcache = join(opt.dataPath, 'centroids', opt.arch + '_' + cluster_set.dataset + '_' + str(opt.num_clusters) + '_desc_cen.hdf5') 242 | with h5py.File(initcache, mode='w') as h5: 243 | with torch.no_grad(): 244 | model.eval() 245 | print('====> Extracting Descriptors') 246 | dbFeat = h5.create_dataset("descriptors", 247 | [nDescriptors, encoder_dim], 248 | dtype=np.float32) 249 | 250 | for iteration, (input, indices) in enumerate(data_loader, 1): 251 | input = input.to(device) 252 | image_descriptors = model.encoder(input).view(input.size(0), encoder_dim, -1).permute(0, 2, 1) 253 | 254 | batchix = (iteration-1)*opt.cacheBatchSize*nPerImage 255 | for ix in range(image_descriptors.size(0)): 256 | # sample different location for each image in batch 257 | sample = np.random.choice(image_descriptors.size(1), nPerImage, replace=False) 258 | startix = batchix + ix*nPerImage 259 | dbFeat[startix:startix+nPerImage, :] = image_descriptors[ix, sample, :].detach().cpu().numpy() 260 | 261 | if iteration % 50 == 0 or len(data_loader) <= 10: 262 | print("==> Batch ({}/{})".format(iteration, 263 | ceil(nIm/opt.cacheBatchSize)), flush=True) 264 | del input, image_descriptors 265 | 266 | print('====> Clustering..') 267 | niter = 100 268 | kmeans = faiss.Kmeans(encoder_dim, opt.num_clusters, niter, verbose=False) 269 | kmeans.train(dbFeat[...]) 270 | 271 | print('====> Storing centroids', kmeans.centroids.shape) 272 | h5.create_dataset('centroids', data=kmeans.centroids) 273 | print('====> Done!') 274 | 275 | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): 276 | model_out_path = join(opt.savePath, filename) 277 | torch.save(state, model_out_path) 278 | if is_best: 279 | shutil.copyfile(model_out_path, join(opt.savePath, 'model_best.pth.tar')) 280 | 281 | class Flatten(nn.Module): 282 | def forward(self, input): 283 | return input.view(input.size(0), -1) 284 | 285 | class L2Norm(nn.Module): 286 | def __init__(self, dim=1): 287 | super().__init__() 288 | self.dim = dim 289 | 290 | def forward(self, input): 291 | return F.normalize(input, p=2, dim=self.dim) 292 | 293 | if __name__ == "__main__": 294 | opt = parser.parse_args() 295 | 296 | restore_var = ['lr', 'lrStep', 'lrGamma', 'weightDecay', 'momentum', 297 | 'runsPath', 'savePath', 'arch', 'num_clusters', 'pooling', 'optim', 298 | 'margin', 'seed', 'patience'] 299 | if opt.resume: 300 | flag_file = join(opt.resume, 'checkpoints', 'flags.json') 301 | 
if exists(flag_file): 302 | with open(flag_file, 'r') as f: 303 | stored_flags = {'--'+k : str(v) for k,v in json.load(f).items() if k in restore_var} 304 | to_del = [] 305 | for flag, val in stored_flags.items(): 306 | for act in parser._actions: 307 | if act.dest == flag[2:]: 308 | # store_true / store_false args don't accept arguments, filter these 309 | if type(act.const) == type(True): 310 | if val == str(act.default): 311 | to_del.append(flag) 312 | else: 313 | stored_flags[flag] = '' 314 | for flag in to_del: del stored_flags[flag] 315 | 316 | train_flags = [x for x in list(sum(stored_flags.items(), tuple())) if len(x) > 0] 317 | print('Restored flags:', train_flags) 318 | opt = parser.parse_args(train_flags, namespace=opt) 319 | 320 | print(opt) 321 | 322 | if opt.dataset.lower() == 'pittsburgh': 323 | import pittsburgh as dataset 324 | else: 325 | raise Exception('Unknown dataset') 326 | 327 | cuda = not opt.nocuda 328 | if cuda and not torch.cuda.is_available(): 329 | raise Exception("No GPU found, please run with --nocuda") 330 | 331 | device = torch.device("cuda" if cuda else "cpu") 332 | 333 | random.seed(opt.seed) 334 | np.random.seed(opt.seed) 335 | torch.manual_seed(opt.seed) 336 | if cuda: 337 | torch.cuda.manual_seed(opt.seed) 338 | 339 | print('===> Loading dataset(s)') 340 | if opt.mode.lower() == 'train': 341 | whole_train_set = dataset.get_whole_training_set() 342 | whole_training_data_loader = DataLoader(dataset=whole_train_set, 343 | num_workers=opt.threads, batch_size=opt.cacheBatchSize, shuffle=False, 344 | pin_memory=cuda) 345 | 346 | train_set = dataset.get_training_query_set(opt.margin) 347 | 348 | print('====> Training query set:', len(train_set)) 349 | whole_test_set = dataset.get_whole_val_set() 350 | print('===> Evaluating on val set, query count:', whole_test_set.dbStruct.numQ) 351 | elif opt.mode.lower() == 'test': 352 | if opt.split.lower() == 'test': 353 | whole_test_set = dataset.get_whole_test_set() 354 | print('===> Evaluating on test set') 355 | elif opt.split.lower() == 'test250k': 356 | whole_test_set = dataset.get_250k_test_set() 357 | print('===> Evaluating on test250k set') 358 | elif opt.split.lower() == 'train': 359 | whole_test_set = dataset.get_whole_training_set() 360 | print('===> Evaluating on train set') 361 | elif opt.split.lower() == 'val': 362 | whole_test_set = dataset.get_whole_val_set() 363 | print('===> Evaluating on val set') 364 | else: 365 | raise ValueError('Unknown dataset split: ' + opt.split) 366 | print('====> Query count:', whole_test_set.dbStruct.numQ) 367 | elif opt.mode.lower() == 'cluster': 368 | whole_train_set = dataset.get_whole_training_set(onlyDB=True) 369 | 370 | print('===> Building model') 371 | 372 | pretrained = not opt.fromscratch 373 | if opt.arch.lower() == 'alexnet': 374 | encoder_dim = 256 375 | encoder = models.alexnet(pretrained=pretrained) 376 | # capture only features and remove last relu and maxpool 377 | layers = list(encoder.features.children())[:-2] 378 | 379 | if pretrained: 380 | # if using pretrained only train conv5 381 | for l in layers[:-1]: 382 | for p in l.parameters(): 383 | p.requires_grad = False 384 | 385 | elif opt.arch.lower() == 'vgg16': 386 | encoder_dim = 512 387 | encoder = models.vgg16(pretrained=pretrained) 388 | # capture only feature part and remove last relu and maxpool 389 | layers = list(encoder.features.children())[:-2] 390 | 391 | if pretrained: 392 | # if using pretrained then only train conv5_1, conv5_2, and conv5_3 393 | for l in layers[:-5]: 394 | for p in 
l.parameters(): 395 | p.requires_grad = False 396 | 397 | if opt.mode.lower() == 'cluster' and not opt.vladv2: 398 | layers.append(L2Norm()) 399 | 400 | encoder = nn.Sequential(*layers) 401 | model = nn.Module() 402 | model.add_module('encoder', encoder) 403 | 404 | if opt.mode.lower() != 'cluster': 405 | if opt.pooling.lower() == 'netvlad': 406 | net_vlad = netvlad.NetVLAD(num_clusters=opt.num_clusters, dim=encoder_dim, vladv2=opt.vladv2) 407 | if not opt.resume: 408 | if opt.mode.lower() == 'train': 409 | initcache = join(opt.dataPath, 'centroids', opt.arch + '_' + train_set.dataset + '_' + str(opt.num_clusters) +'_desc_cen.hdf5') 410 | else: 411 | initcache = join(opt.dataPath, 'centroids', opt.arch + '_' + whole_test_set.dataset + '_' + str(opt.num_clusters) +'_desc_cen.hdf5') 412 | 413 | if not exists(initcache): 414 | raise FileNotFoundError('Could not find clusters, please run with --mode=cluster before proceeding') 415 | 416 | with h5py.File(initcache, mode='r') as h5: 417 | clsts = h5.get("centroids")[...] 418 | traindescs = h5.get("descriptors")[...] 419 | net_vlad.init_params(clsts, traindescs) 420 | del clsts, traindescs 421 | 422 | model.add_module('pool', net_vlad) 423 | elif opt.pooling.lower() == 'max': 424 | global_pool = nn.AdaptiveMaxPool2d((1,1)) 425 | model.add_module('pool', nn.Sequential(*[global_pool, Flatten(), L2Norm()])) 426 | elif opt.pooling.lower() == 'avg': 427 | global_pool = nn.AdaptiveAvgPool2d((1,1)) 428 | model.add_module('pool', nn.Sequential(*[global_pool, Flatten(), L2Norm()])) 429 | else: 430 | raise ValueError('Unknown pooling type: ' + opt.pooling) 431 | 432 | isParallel = False 433 | if opt.nGPU > 1 and torch.cuda.device_count() > 1: 434 | model.encoder = nn.DataParallel(model.encoder) 435 | if opt.mode.lower() != 'cluster': 436 | model.pool = nn.DataParallel(model.pool) 437 | isParallel = True 438 | 439 | if not opt.resume: 440 | model = model.to(device) 441 | 442 | if opt.mode.lower() == 'train': 443 | if opt.optim.upper() == 'ADAM': 444 | optimizer = optim.Adam(filter(lambda p: p.requires_grad, 445 | model.parameters()), lr=opt.lr)#, betas=(0,0.9)) 446 | elif opt.optim.upper() == 'SGD': 447 | optimizer = optim.SGD(filter(lambda p: p.requires_grad, 448 | model.parameters()), lr=opt.lr, 449 | momentum=opt.momentum, 450 | weight_decay=opt.weightDecay) 451 | 452 | scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=opt.lrStep, gamma=opt.lrGamma) 453 | else: 454 | raise ValueError('Unknown optimizer: ' + opt.optim) 455 | 456 | # original paper/code doesn't sqrt() the distances, we do, so sqrt() the margin, I think :D 457 | criterion = nn.TripletMarginLoss(margin=opt.margin**0.5, 458 | p=2, reduction='sum').to(device) 459 | 460 | if opt.resume: 461 | if opt.ckpt.lower() == 'latest': 462 | resume_ckpt = join(opt.resume, 'checkpoints', 'checkpoint.pth.tar') 463 | elif opt.ckpt.lower() == 'best': 464 | resume_ckpt = join(opt.resume, 'checkpoints', 'model_best.pth.tar') 465 | 466 | if isfile(resume_ckpt): 467 | print("=> loading checkpoint '{}'".format(resume_ckpt)) 468 | checkpoint = torch.load(resume_ckpt, map_location=lambda storage, loc: storage) 469 | opt.start_epoch = checkpoint['epoch'] 470 | best_metric = checkpoint['best_score'] 471 | model.load_state_dict(checkpoint['state_dict']) 472 | model = model.to(device) 473 | if opt.mode == 'train': 474 | optimizer.load_state_dict(checkpoint['optimizer']) 475 | print("=> loaded checkpoint '{}' (epoch {})" 476 | .format(resume_ckpt, checkpoint['epoch'])) 477 | else: 478 | print("=> no checkpoint 
found at '{}'".format(resume_ckpt)) 479 | 480 | if opt.mode.lower() == 'test': 481 | print('===> Running evaluation step') 482 | epoch = 1 483 | recalls = test(whole_test_set, epoch, write_tboard=False) 484 | elif opt.mode.lower() == 'cluster': 485 | print('===> Calculating descriptors and clusters') 486 | get_clusters(whole_train_set) 487 | elif opt.mode.lower() == 'train': 488 | print('===> Training model') 489 | writer = SummaryWriter(log_dir=join(opt.runsPath, datetime.now().strftime('%b%d_%H-%M-%S')+'_'+opt.arch+'_'+opt.pooling)) 490 | 491 | # write checkpoints in logdir 492 | logdir = writer.file_writer.get_logdir() 493 | opt.savePath = join(logdir, opt.savePath) 494 | if not opt.resume: 495 | makedirs(opt.savePath) 496 | 497 | with open(join(opt.savePath, 'flags.json'), 'w') as f: 498 | f.write(json.dumps( 499 | {k:v for k,v in vars(opt).items()} 500 | )) 501 | print('===> Saving state to:', logdir) 502 | 503 | not_improved = 0 504 | best_score = 0 505 | for epoch in range(opt.start_epoch+1, opt.nEpochs + 1): 506 | if opt.optim.upper() == 'SGD': 507 | scheduler.step(epoch) 508 | train(epoch) 509 | if (epoch % opt.evalEvery) == 0: 510 | recalls = test(whole_test_set, epoch, write_tboard=True) 511 | is_best = recalls[5] > best_score 512 | if is_best: 513 | not_improved = 0 514 | best_score = recalls[5] 515 | else: 516 | not_improved += 1 517 | 518 | save_checkpoint({ 519 | 'epoch': epoch, 520 | 'state_dict': model.state_dict(), 521 | 'recalls': recalls, 522 | 'best_score': best_score, 523 | 'optimizer' : optimizer.state_dict(), 524 | 'parallel' : isParallel, 525 | }, is_best) 526 | 527 | if opt.patience > 0 and not_improved > (opt.patience / opt.evalEvery): 528 | print('Performance did not improve for', opt.patience, 'epochs. Stopping.') 529 | break 530 | 531 | print("=> Best Recall@5: {:.4f}".format(best_score), flush=True) 532 | writer.close() 533 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Based on: https://github.com/linrongc/youtube-8m/blob/master/eval_util.py 3 | """ 4 | 5 | import numpy as np 6 | from sklearn.metrics import average_precision_score 7 | from sklearn.preprocessing import MultiLabelBinarizer 8 | 9 | import average_precision_calculator as ap_calculator 10 | 11 | 12 | def flatten(l): 13 | """ Merges a list of lists into a single list. """ 14 | return [item for sublist in l for item in sublist] 15 | 16 | 17 | def calculate_gap(predictions, actuals, top_k=20): 18 | """Performs a local (numpy) calculation of the global average precision. 19 | Only the top_k predictions are taken for each of the videos. 20 | Args: 21 | predictions: Matrix containing the outputs of the model. 22 | Dimensions are 'batch' x 'num_classes'. 23 | actuals: Matrix containing the ground truth labels. 24 | Dimensions are 'batch' x 'num_classes'. 25 | top_k: How many predictions to use per video. 26 | Returns: 27 | float: The global average precision. 28 | """ 29 | gap_calculator = ap_calculator.AveragePrecisionCalculator() 30 | sparse_predictions, sparse_labels, num_positives = top_k_by_class(predictions, actuals, top_k) 31 | gap_calculator.accumulate(flatten(sparse_predictions), flatten(sparse_labels), sum(num_positives)) 32 | return gap_calculator.peek_ap_at_n() 33 | 34 | 35 | def top_k_by_class(predictions, labels, k=20): 36 | """Extracts the top k predictions for each video, sorted by class. 
37 | Args: 38 | predictions: A numpy matrix containing the outputs of the model. 39 | Dimensions are 'batch' x 'num_classes'. 40 | k: the top k non-zero entries to preserve in each prediction. 41 | Returns: 42 | A tuple (predictions,labels, true_positives). 'predictions' and 'labels' 43 | are lists of lists of floats. 'true_positives' is a list of scalars. The 44 | length of the lists are equal to the number of classes. The entries in the 45 | predictions variable are probability predictions, and 46 | the corresponding entries in the labels variable are the ground truth for 47 | those predictions. The entries in 'true_positives' are the number of true 48 | positives for each class in the ground truth. 49 | Raises: 50 | ValueError: An error occurred when the k is not a positive integer. 51 | """ 52 | if k <= 0: 53 | raise ValueError("k must be a positive integer.") 54 | k = min(k, predictions.shape[1]) 55 | num_classes = predictions.shape[1] 56 | prediction_triplets = [] 57 | for video_index in range(predictions.shape[0]): 58 | prediction_triplets.extend(top_k_triplets(predictions[video_index], labels[video_index], k)) 59 | out_predictions = [[] for v in range(num_classes)] 60 | out_labels = [[] for v in range(num_classes)] 61 | for triplet in prediction_triplets: 62 | out_predictions[triplet[0]].append(triplet[1]) 63 | out_labels[triplet[0]].append(triplet[2]) 64 | out_true_positives = [np.sum(labels[:, i]) for i in range(num_classes)] 65 | 66 | return out_predictions, out_labels, out_true_positives 67 | 68 | 69 | def top_k_triplets(predictions, labels, k=20): 70 | """Get the top_k for a 1-d numpy array. Returns a sparse list of tuples in 71 | (prediction, class) format""" 72 | m = len(predictions) 73 | k = min(k, m) 74 | indices = np.argpartition(predictions, -k)[-k:] 75 | 76 | return [(index, predictions[index], labels[index]) for index in indices] 77 | -------------------------------------------------------------------------------- /models/NeXtVLAD.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class NeXtVLAD(nn.Module): 7 | """NeXtVLAD layer implementation""" 8 | 9 | def __init__(self, dim=1024, num_clusters=64, lamb=2, groups=8, max_frames=300): 10 | super(NeXtVLAD, self).__init__() 11 | self.num_clusters = num_clusters 12 | self.dim = dim 13 | self.alpha = 0 14 | self.K = num_clusters 15 | self.G = groups 16 | self.group_size = int((lamb * dim) // self.G) 17 | # expansion FC 18 | self.fc0 = nn.Linear(dim, lamb * dim) 19 | # soft assignment FC (the cluster weights) 20 | self.fc_gk = nn.Linear(lamb * dim, self.G * self.K) 21 | # attention over groups FC 22 | self.fc_g = nn.Linear(lamb * dim, self.G) 23 | self.cluster_weights2 = nn.Parameter(torch.rand(1, self.group_size, self.K)) 24 | 25 | self.bn0 = nn.BatchNorm1d(max_frames) 26 | self.bn1 = nn.BatchNorm1d(1) 27 | 28 | def forward(self, x, mask=None): 29 | # print(f"x: {x.shape}") 30 | 31 | _, M, N = x.shape 32 | # expansion FC: B x M x N -> B x M x λN 33 | x_dot = self.fc0(x) 34 | 35 | # reshape into groups: B x M x λN -> B x M x G x (λN/G) 36 | x_tilde = x_dot.reshape(-1, M, self.G, self.group_size) 37 | 38 | # residuals across groups and clusters: B x M x λN -> B x M x (G*K) 39 | WgkX = self.fc_gk(x_dot) 40 | WgkX = self.bn0(WgkX) 41 | 42 | # residuals reshape across clusters: B x M x (G*K) -> B x (M*G) x K 43 | WgkX = WgkX.reshape(-1, M * self.G, self.K) 44 | 45 | # softmax over assignment: B x (M*G) x K 
-> B x (M*G) x K 46 | alpha_gk = F.softmax(WgkX, dim=-1) 47 | 48 | # attention across groups: B x M x λN -> B x M x G 49 | alpha_g = torch.sigmoid(self.fc_g(x_dot)) 50 | if mask is not None: 51 | alpha_g = torch.mul(alpha_g, mask.unsqueeze(2)) 52 | 53 | # reshape across time: B x M x G -> B x (M*G) x 1 54 | alpha_g = alpha_g.reshape(-1, M * self.G, 1) 55 | 56 | # apply attention: B x (M*G) x K (X) B x (M*G) x 1 -> B x (M*G) x K 57 | activation = torch.mul(alpha_gk, alpha_g) 58 | 59 | # sum over time and group: B x (M*G) x K -> B x 1 x K 60 | a_sum = torch.sum(activation, -2, keepdim=True) 61 | 62 | # calculate group centers: B x 1 x K (X) 1 x (λN/G) x K -> B x (λN/G) x K 63 | a = torch.mul(a_sum, self.cluster_weights2) 64 | 65 | # permute: B x (M*G) x K -> B x K x (M*G) 66 | activation = activation.permute(0, 2, 1) 67 | 68 | # reshape: B x M x G x (λN/G) -> B x (M*G) x (λN/G) 69 | reshaped_x_tilde = x_tilde.reshape(-1, M * self.G, self.group_size) 70 | 71 | # cluster activation: B x K x (M*G) (X) B x (M*G) x (λN/G) -> B x K x (λN/G) 72 | vlad = torch.matmul(activation, reshaped_x_tilde) 73 | # print(f"vlad: {vlad.shape}") 74 | 75 | # permute: B x K x (λN/G) -> B x (λN/G) x K 76 | vlad = vlad.permute(0, 2, 1) 77 | # distance to centers: B x (λN/G) x K (-) B x (λN/G) x K 78 | vlad = torch.sub(vlad, a) 79 | # normalize: B x (λN/G) x K 80 | vlad = F.normalize(vlad, 1) 81 | # reshape: B x (λN/G) x K -> B x 1 x (K * (λN/G)) 82 | vlad = vlad.reshape(-1, 1, self.K * self.group_size) 83 | vlad = self.bn1(vlad) 84 | # reshape: B x 1 x (K * (λN/G)) -> B x (K * (λN/G)) 85 | vlad = vlad.reshape(-1, self.K * self.group_size) 86 | 87 | return vlad 88 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w-garcia/NeXtVLAD.pytorch/cc7235b578ad092cd180083397de7411c1fe4684/models/__init__.py -------------------------------------------------------------------------------- /models/video_classifiers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .NeXtVLAD import NeXtVLAD 6 | 7 | 8 | class NeXtVLADModel(nn.Module): 9 | def __init__(self, num_classes, num_clusters=64, dim=1024, lamb=2, hidden_size=1024, 10 | groups=8, max_frames=300, drop_rate=0.5, gating_reduction=8): 11 | super(NeXtVLADModel, self).__init__() 12 | self.drop_rate = drop_rate 13 | self.group_size = int((lamb * dim) // groups) 14 | self.fc0 = nn.Linear(num_clusters * self.group_size, hidden_size) 15 | self.bn0 = nn.BatchNorm1d(1) 16 | self.fc1 = nn.Linear(hidden_size, hidden_size // gating_reduction) 17 | self.bn1 = nn.BatchNorm1d(1) 18 | self.fc2 = nn.Linear(hidden_size // gating_reduction, hidden_size) 19 | self.logistic = nn.Linear(hidden_size, num_classes) 20 | 21 | self.video_nextvlad = NeXtVLAD(1024, max_frames=max_frames, lamb=lamb, 22 | num_clusters=num_clusters, groups=groups) 23 | 24 | def forward(self, x, mask=None): 25 | # B x M x N -> B x (K * (λN/G)) 26 | vlad = self.video_nextvlad(x, mask=mask) 27 | 28 | # B x (K * (λN/G)) 29 | if self.drop_rate > 0.: 30 | vlad = F.dropout(vlad, p=self.drop_rate, training=self.training) 31 | 32 | # B x (K * (λN/G)) -> B x H0 33 | activation = self.fc0(vlad) 34 | activation = self.bn0(activation.unsqueeze(1)).squeeze() 35 | activation = F.relu(activation) 36 | # B x H0 -> B x Gr 37 | gates = self.fc1(activation) 38 | gates = 
self.bn1(gates.unsqueeze(1)).squeeze() 39 | # B x Gr -> B x H0 40 | gates = self.fc2(gates) 41 | gates = torch.sigmoid(gates) 42 | # B x H0 -> B x H0 43 | activation = torch.mul(activation, gates) 44 | # B x H0 -> B x k 45 | out = self.logistic(activation) 46 | out = torch.sigmoid(out) 47 | 48 | return out 49 | 50 | 51 | class ConvNeXtVLADModel(nn.Module): 52 | """ 53 | A full Conv + neXtVLAD video classifier pipeline 54 | """ 55 | 56 | def __init__(self, nextvlad_model, eigenvecs, eigenvals, center, device, opt): 57 | super(ConvNeXtVLADModel, self).__init__() 58 | import pretrainedmodels 59 | self.ftype = opt['type'] 60 | self.conv = pretrainedmodels.__dict__[opt['type']](num_classes=1000, pretrained='imagenet') 61 | self.device = device 62 | self.eigenvecs = torch.from_numpy(eigenvecs).type(torch.FloatTensor).to(device) 63 | # self.eigenvals = torch.from_numpy(eigenvals).type(torch.FloatTensor) 64 | self.center = torch.from_numpy(center).type(torch.FloatTensor).to(device) 65 | self.video_classifier = nextvlad_model 66 | 67 | def _process_batch(self, batch): 68 | output_features = self.conv.features(batch) 69 | # output_features = output_features.data.cpu() 70 | 71 | conv_size = output_features.shape[-1] 72 | 73 | if self.ftype == 'nasnetalarge' or self.ftype == 'pnasnet5large': 74 | relu = nn.ReLU() 75 | rf = relu(output_features) 76 | avg_pool = nn.AvgPool2d(conv_size, stride=1, padding=0) 77 | out_feats = avg_pool(rf) 78 | else: 79 | avg_pool = nn.AdaptiveAvgPool2d((1, 1)) 80 | # B x H0 x 1 x 1 81 | out_feats = avg_pool(output_features) 82 | # B x H0 83 | out_feats = out_feats.view(out_feats.size(0), -1) 84 | 85 | # PCA (no whiten): 86 | # B x H0 (-) B x H0 87 | out_feats = out_feats - self.center 88 | # B x H0 -> B x 1 x (H0/2) 89 | out_feats = out_feats.unsqueeze(1).matmul(torch.t(self.eigenvecs)) 90 | # verification: 91 | # (np) out_feats[0].detach().cpu().numpy().reshape(1, 2048).dot(self.eigenvecs.detach().cpu().numpy().T) 92 | # == 93 | # (torch) out_feats.unsqueeze(1).matmul(torch.t(self.eigenvecs))[0] 94 | 95 | # B x (H0/2) 96 | return out_feats.squeeze(1) 97 | 98 | def conv_forward(self, frame_batch): 99 | return self._process_batch(frame_batch) 100 | 101 | def nextvlad_model_forward(self, vid_feats, mask): 102 | return self.video_classifier.forward(vid_feats, mask) 103 | -------------------------------------------------------------------------------- /notebooks/arch_debug.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from importlib import reload" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import dataloader\n", 19 | "reload(dataloader)\n", 20 | "from dataloader import VideoClassificationDataset\n", 21 | "import argparse" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "opt = {\n", 31 | " 'feats_dir': \"/home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/train_PCA-1024\",\n", 32 | " 'max_frames': 50\n", 33 | "}" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "load feats from /home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/train_PCA-1024\n", 46 | "Pre-cache 309 features in 
memory.\n", 47 | "Finished initializing dataloader.\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "train_dataset = VideoClassificationDataset(opt, 'train')" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import torch\n", 62 | "import torch.nn as nn\n", 63 | "import torch.nn.functional as F\n", 64 | "import torch.optim as optim\n", 65 | "\n", 66 | "device = torch.device(\"cuda:0\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 6, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "from torch.utils.data import DataLoader" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "train_loader = DataLoader(train_dataset,\n", 85 | " batch_size=8,\n", 86 | " num_workers=4,\n", 87 | " shuffle=True)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 11, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "class NeXtVLAD(nn.Module):\n", 97 | " \"\"\"NeXtVLAD layer implementation\"\"\"\n", 98 | "\n", 99 | " def __init__(self, dim=1024, num_clusters=64, lamb=2, groups=8, max_frames=300):\n", 100 | " super(NeXtVLAD, self).__init__()\n", 101 | " self.num_clusters = num_clusters\n", 102 | " self.dim = dim\n", 103 | " self.alpha = 0\n", 104 | " self.K = num_clusters\n", 105 | " self.G = groups\n", 106 | " self.group_size = int((lamb*dim) // self.G)\n", 107 | " # expansion FC\n", 108 | " self.fc0 = nn.Linear(dim, lamb*dim)\n", 109 | " # soft assignment FC (the cluster weights)\n", 110 | " self.fc_gk = nn.Linear(lamb*dim, self.G * self.K)\n", 111 | " # attention over groups FC\n", 112 | " self.fc_g = nn.Linear(lamb*dim, self.G)\n", 113 | " self.cluster_weights2 = nn.Parameter(torch.rand(1, self.group_size, self.K))\n", 114 | " \n", 115 | " self.bn0 = nn.BatchNorm1d(max_frames)\n", 116 | " self.bn1 = nn.BatchNorm1d(1)\n", 117 | " \n", 118 | " \n", 119 | " def forward(self, x, mask=None):\n", 120 | "# print(f\"x: {x.shape}\")\n", 121 | " \n", 122 | " _, M, N = x.shape\n", 123 | " # expansion FC: B x M x N -> B x M x λN\n", 124 | " x_dot = self.fc0(x) \n", 125 | " \n", 126 | " # reshape into groups: B x M x λN -> B x M x G x (λN/G)\n", 127 | " x_tilde = x_dot.reshape(-1, M, self.G, self.group_size)\n", 128 | " \n", 129 | " # residuals across groups and clusters: B x M x λN -> B x M x (G*K) \n", 130 | " WgkX = self.fc_gk(x_dot)\n", 131 | " WgkX = self.bn0(WgkX)\n", 132 | " \n", 133 | " # residuals reshape across clusters: B x M x (G*K) -> B x (M*G) x K\n", 134 | " WgkX = WgkX.reshape(-1, M*self.G, self.K)\n", 135 | " \n", 136 | " # softmax over assignment: B x (M*G) x K -> B x (M*G) x K\n", 137 | " alpha_gk = F.softmax(WgkX, dim=-1)\n", 138 | " \n", 139 | " # attention across groups: B x M x λN -> B x M x G\n", 140 | " alpha_g = torch.sigmoid(self.fc_g(x_dot))\n", 141 | " if mask is not None:\n", 142 | " alpha_g = torch.mul(alpha_g, mask.unsqueeze(2))\n", 143 | " \n", 144 | " # reshape across time: B x M x G -> B x (M*G) x 1\n", 145 | " alpha_g = alpha_g.reshape(-1, M*self.G, 1)\n", 146 | " \n", 147 | " # apply attention: B x (M*G) x K (X) B x (M*G) x 1 -> B x (M*G) x K\n", 148 | " activation = torch.mul(alpha_gk, alpha_g)\n", 149 | " \n", 150 | " # sum over time and group: B x (M*G) x K -> B x 1 x K\n", 151 | " a_sum = torch.sum(activation, -2, keepdim=True)\n", 152 | " \n", 153 | " # calculate group centers: B x 1 x K (X) 1 x (λN/G) x K -> B x (λN/G) x K\n", 154 | " 
a = torch.mul(a_sum, self.cluster_weights2)\n", 155 | " \n", 156 | " # permute: B x (M*G) x K -> B x K x (M*G)\n", 157 | " activation = activation.permute(0, 2, 1)\n", 158 | " \n", 159 | " # reshape: B x M x G x (λN/G) -> B x (M*G) x (λN/G)\n", 160 | " reshaped_x_tilde = x_tilde.reshape(-1, M * self.G, self.group_size)\n", 161 | " \n", 162 | " # cluster activation: B x K x (M*G) (X) B x (M*G) x (λN/G) -> B x K x (λN/G)\n", 163 | " vlad = torch.matmul(activation, reshaped_x_tilde)\n", 164 | " # print(f\"vlad: {vlad.shape}\")\n", 165 | " \n", 166 | " # permute: B x K x (λN/G) (X) B x (λN/G) x K\n", 167 | " vlad = vlad.permute(0, 2, 1)\n", 168 | " # distance to centers: B x (λN/G) x K (-) B x (λN/G) x K\n", 169 | " vlad = torch.sub(vlad, a)\n", 170 | " # normalize: B x (λN/G) x K\n", 171 | " vlad = F.normalize(vlad, 1)\n", 172 | " # reshape: B x (λN/G) x K -> B x 1 x (K * (λN/G))\n", 173 | " vlad = vlad.reshape(-1, 1, self.K*self.group_size)\n", 174 | " vlad = self.bn1(vlad)\n", 175 | " # reshape: B x 1 x (K * (λN/G)) -> B x (K * (λN/G)) \n", 176 | " vlad = vlad.reshape(-1, self.K*self.group_size)\n", 177 | " \n", 178 | " return vlad" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 12, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "class NeXtVLADModel(nn.Module):\n", 188 | " def __init__(self, num_classes, num_clusters=64, dim=1024, lamb=2, hidden_size=1024, \n", 189 | " groups=8, max_frames=300, drop_rate=0.5, gating_reduction=8):\n", 190 | " super(NeXtVLADModel, self).__init__()\n", 191 | " self.drop_rate = drop_rate\n", 192 | " self.group_size = int((lamb*dim) // groups)\n", 193 | " self.fc0 = nn.Linear(num_clusters*self.group_size, hidden_size)\n", 194 | " self.bn0 = nn.BatchNorm1d(1)\n", 195 | " self.fc1 = nn.Linear(hidden_size, hidden_size // gating_reduction)\n", 196 | " self.bn1 = nn.BatchNorm1d(1)\n", 197 | " self.fc2 = nn.Linear(hidden_size // gating_reduction, hidden_size)\n", 198 | " self.logistic = nn.Linear(hidden_size, num_classes)\n", 199 | " \n", 200 | " self.video_nextvlad = NeXtVLAD(1024, max_frames=max_frames, lamb=lamb, \n", 201 | " num_clusters=num_clusters, groups=groups)\n", 202 | " \n", 203 | " def forward(self, x, mask=None):\n", 204 | " # B x M x N -> B x (K * (λN/G)) \n", 205 | " vlad = self.video_nextvlad(x, mask=mask)\n", 206 | " \n", 207 | " # B x (K * (λN/G)) \n", 208 | " if self.drop_rate > 0.:\n", 209 | " vlad = F.dropout(vlad, p=self.drop_rate)\n", 210 | " \n", 211 | " # B x (K * (λN/G)) -> B x H0\n", 212 | " activation = self.fc0(vlad)\n", 213 | " activation = self.bn0(activation.unsqueeze(1)).squeeze()\n", 214 | " activation = F.relu(activation)\n", 215 | " # B x H0 -> B x Gr\n", 216 | " gates = self.fc1(activation)\n", 217 | " gates = self.bn1(gates.unsqueeze(1)).squeeze()\n", 218 | " # B x Gr -> B x H0\n", 219 | " gates = self.fc2(gates)\n", 220 | " gates = torch.sigmoid(gates)\n", 221 | " # B x H0\n", 222 | " activation = torch.mul(activation, gates)\n", 223 | " out = self.logistic(activation)\n", 224 | " out = torch.sigmoid(out)\n", 225 | " \n", 226 | " return out\n", 227 | " \n", 228 | " " 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 22, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "model = NeXtVLADModel(train_dataset.num_classes, max_frames=opt['max_frames'])" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 23, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | 
"text": [ 249 | "epoch:\t0,\tloss:0.71150803565979\n", 250 | "epoch:\t0,\tloss:0.7146843075752258\n", 251 | "epoch:\t0,\tloss:0.5954272150993347\n", 252 | "epoch:\t0,\tloss:0.6422020196914673\n", 253 | "epoch:\t0,\tloss:0.5928784012794495\n", 254 | "epoch:\t0,\tloss:0.5122247338294983\n", 255 | "epoch:\t0,\tloss:0.6339268088340759\n", 256 | "epoch:\t0,\tloss:0.6461817026138306\n", 257 | "epoch:\t0,\tloss:0.5485246777534485\n", 258 | "epoch:\t0,\tloss:0.4685041010379791\n", 259 | "epoch:\t0,\tloss:0.47569671273231506\n", 260 | "epoch:\t0,\tloss:0.44309744238853455\n", 261 | "epoch:\t0,\tloss:0.4427070617675781\n", 262 | "epoch:\t0,\tloss:0.4889450967311859\n", 263 | "epoch:\t0,\tloss:0.4677741527557373\n", 264 | "epoch:\t0,\tloss:0.5337631106376648\n", 265 | "epoch:\t0,\tloss:0.26898637413978577\n", 266 | "epoch:\t0,\tloss:0.4407520294189453\n", 267 | "epoch:\t0,\tloss:0.3478405773639679\n", 268 | "epoch:\t0,\tloss:0.3842126131057739\n", 269 | "epoch:\t0,\tloss:0.4324081242084503\n", 270 | "epoch:\t0,\tloss:0.3600882589817047\n", 271 | "epoch:\t0,\tloss:0.23256240785121918\n", 272 | "epoch:\t0,\tloss:0.3121560513973236\n", 273 | "epoch:\t0,\tloss:0.30288824439048767\n", 274 | "epoch:\t0,\tloss:0.17022843658924103\n", 275 | "epoch:\t0,\tloss:0.2929340898990631\n", 276 | "epoch:\t0,\tloss:0.12832005321979523\n", 277 | "epoch:\t0,\tloss:0.27824535965919495\n", 278 | "epoch:\t0,\tloss:0.1415291577577591\n", 279 | "epoch:\t0,\tloss:0.10328144580125809\n", 280 | "epoch:\t0,\tloss:0.18037013709545135\n", 281 | "epoch:\t0,\tloss:0.09852627664804459\n", 282 | "epoch:\t0,\tloss:0.24034304916858673\n", 283 | "epoch:\t0,\tloss:0.19834274053573608\n", 284 | "epoch:\t0,\tloss:0.18170402944087982\n", 285 | "epoch:\t0,\tloss:0.09749788045883179\n", 286 | "epoch:\t0,\tloss:0.11607547849416733\n", 287 | "epoch:\t0,\tloss:0.0376109853386879\n", 288 | "epoch:\t1,\tloss:0.09225022047758102\n", 289 | "epoch:\t1,\tloss:0.02818330191075802\n", 290 | "epoch:\t1,\tloss:0.01991969160735607\n", 291 | "epoch:\t1,\tloss:0.027169905602931976\n", 292 | "epoch:\t1,\tloss:0.03678622841835022\n", 293 | "epoch:\t1,\tloss:0.025325099006295204\n", 294 | "epoch:\t1,\tloss:0.02467428892850876\n", 295 | "epoch:\t1,\tloss:0.02342064119875431\n", 296 | "epoch:\t1,\tloss:0.006782206241041422\n", 297 | "epoch:\t1,\tloss:0.01385602355003357\n", 298 | "epoch:\t1,\tloss:0.025996623560786247\n", 299 | "epoch:\t1,\tloss:0.05951960012316704\n", 300 | "epoch:\t1,\tloss:0.02473391406238079\n", 301 | "epoch:\t1,\tloss:0.009166402742266655\n", 302 | "epoch:\t1,\tloss:0.014516142196953297\n", 303 | "epoch:\t1,\tloss:0.008172743953764439\n", 304 | "epoch:\t1,\tloss:0.00869485829025507\n", 305 | "epoch:\t1,\tloss:0.007238415535539389\n", 306 | "epoch:\t1,\tloss:0.015679262578487396\n", 307 | "epoch:\t1,\tloss:0.005326947197318077\n", 308 | "epoch:\t1,\tloss:0.004706756677478552\n", 309 | "epoch:\t1,\tloss:0.0034857578575611115\n", 310 | "epoch:\t1,\tloss:0.004158268216997385\n", 311 | "epoch:\t1,\tloss:0.0059410869143903255\n", 312 | "epoch:\t1,\tloss:0.006137473974376917\n", 313 | "epoch:\t1,\tloss:0.027485201135277748\n", 314 | "epoch:\t1,\tloss:0.0030662603676319122\n", 315 | "epoch:\t1,\tloss:0.0027211287524551153\n", 316 | "epoch:\t1,\tloss:0.005441123154014349\n", 317 | "epoch:\t1,\tloss:0.002412184840068221\n", 318 | "epoch:\t1,\tloss:0.003415692364796996\n", 319 | "epoch:\t1,\tloss:0.002235227031633258\n", 320 | "epoch:\t1,\tloss:0.0033596300054341555\n", 321 | "epoch:\t1,\tloss:0.005315082613378763\n", 322 | 
"epoch:\t1,\tloss:0.002937655197456479\n", 323 | "epoch:\t1,\tloss:0.002290458185598254\n", 324 | "epoch:\t1,\tloss:0.007688215002417564\n", 325 | "epoch:\t1,\tloss:0.0027433286886662245\n", 326 | "epoch:\t1,\tloss:0.002007373608648777\n", 327 | "epoch:\t2,\tloss:0.0023477952927351\n", 328 | "epoch:\t2,\tloss:0.001861131633631885\n", 329 | "epoch:\t2,\tloss:0.0010836547007784247\n", 330 | "epoch:\t2,\tloss:0.0018228593980893493\n", 331 | "epoch:\t2,\tloss:0.002600134816020727\n", 332 | "epoch:\t2,\tloss:0.0016692288918420672\n", 333 | "epoch:\t2,\tloss:0.0022326342295855284\n", 334 | "epoch:\t2,\tloss:0.002286992035806179\n", 335 | "epoch:\t2,\tloss:0.0010390046518296003\n", 336 | "epoch:\t2,\tloss:0.0015219493070617318\n", 337 | "epoch:\t2,\tloss:0.0020285442005842924\n", 338 | "epoch:\t2,\tloss:0.001638404093682766\n", 339 | "epoch:\t2,\tloss:0.0016711241332814097\n", 340 | "epoch:\t2,\tloss:0.001292763277888298\n", 341 | "epoch:\t2,\tloss:0.0024440630804747343\n", 342 | "epoch:\t2,\tloss:0.0009746658033691347\n", 343 | "epoch:\t2,\tloss:0.0018626617966219783\n", 344 | "epoch:\t2,\tloss:0.0014337325701490045\n", 345 | "epoch:\t2,\tloss:0.0010064152302220464\n", 346 | "epoch:\t2,\tloss:0.0013640819815918803\n", 347 | "epoch:\t2,\tloss:0.0010625479044392705\n", 348 | "epoch:\t2,\tloss:0.0014420481165871024\n", 349 | "epoch:\t2,\tloss:0.0008912244811654091\n", 350 | "epoch:\t2,\tloss:0.001545345294289291\n", 351 | "epoch:\t2,\tloss:0.0010661480482667685\n", 352 | "epoch:\t2,\tloss:0.0008565050084143877\n", 353 | "epoch:\t2,\tloss:0.0006735824863426387\n", 354 | "epoch:\t2,\tloss:0.0008698648889549077\n", 355 | "epoch:\t2,\tloss:0.0017080713296309114\n", 356 | "epoch:\t2,\tloss:0.0010185097344219685\n", 357 | "epoch:\t2,\tloss:0.0010622901609167457\n", 358 | "epoch:\t2,\tloss:0.0012533634435385466\n", 359 | "epoch:\t2,\tloss:0.0009148705867119133\n", 360 | "epoch:\t2,\tloss:0.0006462182500399649\n", 361 | "epoch:\t2,\tloss:0.0005030953325331211\n", 362 | "epoch:\t2,\tloss:0.0015385170700028539\n", 363 | "epoch:\t2,\tloss:0.0006630505085922778\n", 364 | "epoch:\t2,\tloss:0.0007098554633557796\n", 365 | "epoch:\t2,\tloss:0.0009038225398398936\n", 366 | "epoch:\t3,\tloss:0.00042479473631829023\n", 367 | "epoch:\t3,\tloss:0.0005855750059708953\n", 368 | "epoch:\t3,\tloss:0.0007057485054247081\n", 369 | "epoch:\t3,\tloss:0.0008201799355447292\n", 370 | "epoch:\t3,\tloss:0.0005157763953320682\n", 371 | "epoch:\t3,\tloss:0.0008646969799883664\n", 372 | "epoch:\t3,\tloss:0.0009159119799733162\n", 373 | "epoch:\t3,\tloss:0.000726655765902251\n", 374 | "epoch:\t3,\tloss:0.0007182210683822632\n", 375 | "epoch:\t3,\tloss:0.0008668283117003739\n", 376 | "epoch:\t3,\tloss:0.0007231898489408195\n", 377 | "epoch:\t3,\tloss:0.0008738313335925341\n", 378 | "epoch:\t3,\tloss:0.0006608644616790116\n", 379 | "epoch:\t3,\tloss:0.0006918812287040055\n", 380 | "epoch:\t3,\tloss:0.00042746460530906916\n", 381 | "epoch:\t3,\tloss:0.0004623888526111841\n", 382 | "epoch:\t3,\tloss:0.00040710484609007835\n", 383 | "epoch:\t3,\tloss:0.0006149121909402311\n", 384 | "epoch:\t3,\tloss:0.0006414263043552637\n", 385 | "epoch:\t3,\tloss:0.0005345437093637884\n", 386 | "epoch:\t3,\tloss:0.0007223087013699114\n", 387 | "epoch:\t3,\tloss:0.0008337963372468948\n", 388 | "epoch:\t3,\tloss:0.0016031116247177124\n", 389 | "epoch:\t3,\tloss:0.0008548393961973488\n", 390 | "epoch:\t3,\tloss:0.0007479392806999385\n", 391 | "epoch:\t3,\tloss:0.0006933917757123709\n", 392 | "epoch:\t3,\tloss:0.0005947808967903256\n", 393 | 
"epoch:\t3,\tloss:0.00040444658952765167\n", 394 | "epoch:\t3,\tloss:0.0005790918366983533\n", 395 | "epoch:\t3,\tloss:0.0009108057129196823\n", 396 | "epoch:\t3,\tloss:0.0008470119792036712\n", 397 | "epoch:\t3,\tloss:0.0009477338171564043\n", 398 | "epoch:\t3,\tloss:0.00045438858796842396\n", 399 | "epoch:\t3,\tloss:0.0008903048583306372\n", 400 | "epoch:\t3,\tloss:0.0007609418244101107\n", 401 | "epoch:\t3,\tloss:0.001175822108052671\n", 402 | "epoch:\t3,\tloss:0.0005316018941812217\n", 403 | "epoch:\t3,\tloss:0.0006653064046986401\n", 404 | "epoch:\t3,\tloss:0.00032494479091838\n", 405 | "epoch:\t4,\tloss:0.0005201410385780036\n", 406 | "epoch:\t4,\tloss:0.0007186997099779546\n", 407 | "epoch:\t4,\tloss:0.00048609552322886884\n", 408 | "epoch:\t4,\tloss:0.0008609591168351471\n", 409 | "epoch:\t4,\tloss:0.0006337621598504484\n", 410 | "epoch:\t4,\tloss:0.00048226353828795254\n", 411 | "epoch:\t4,\tloss:0.0005028933519497514\n", 412 | "epoch:\t4,\tloss:0.00029791248380206525\n", 413 | "epoch:\t4,\tloss:0.0005183366592973471\n", 414 | "epoch:\t4,\tloss:0.00031539611518383026\n", 415 | "epoch:\t4,\tloss:0.00048409486771561205\n", 416 | "epoch:\t4,\tloss:0.00035559770185500383\n", 417 | "epoch:\t4,\tloss:0.0006230109720490873\n", 418 | "epoch:\t4,\tloss:0.0006612534634768963\n", 419 | "epoch:\t4,\tloss:0.00029597230604849756\n", 420 | "epoch:\t4,\tloss:0.0006362967542372644\n", 421 | "epoch:\t4,\tloss:0.00038377134478650987\n", 422 | "epoch:\t4,\tloss:0.0007281986181624234\n", 423 | "epoch:\t4,\tloss:0.0004282900772523135\n", 424 | "epoch:\t4,\tloss:0.00039028548053465784\n", 425 | "epoch:\t4,\tloss:0.0003747685404960066\n", 426 | "epoch:\t4,\tloss:0.0005309387925080955\n", 427 | "epoch:\t4,\tloss:0.000556213257368654\n", 428 | "epoch:\t4,\tloss:0.0005487402086146176\n", 429 | "epoch:\t4,\tloss:0.0003494209668133408\n", 430 | "epoch:\t4,\tloss:0.0006299832020886242\n", 431 | "epoch:\t4,\tloss:0.0004588236042764038\n", 432 | "epoch:\t4,\tloss:0.0005549622583203018\n", 433 | "epoch:\t4,\tloss:0.00018302483658771962\n", 434 | "epoch:\t4,\tloss:0.00024095167464111\n", 435 | "epoch:\t4,\tloss:0.0005101535934954882\n", 436 | "epoch:\t4,\tloss:0.00034454729757271707\n", 437 | "epoch:\t4,\tloss:0.00025429722154513\n", 438 | "epoch:\t4,\tloss:0.0002479896356817335\n", 439 | "epoch:\t4,\tloss:0.0007369245286099613\n", 440 | "epoch:\t4,\tloss:0.00034910847898572683\n", 441 | "epoch:\t4,\tloss:0.0005303460056893528\n", 442 | "epoch:\t4,\tloss:0.0005001642857678235\n", 443 | "epoch:\t4,\tloss:0.0002676190924830735\n" 444 | ] 445 | } 446 | ], 447 | "source": [ 448 | "optimizer = optim.Adam(model.parameters(), lr=0.001)\n", 449 | "exp_lr_schedulr = optim.lr_scheduler.StepLR(optimizer, step_size=25)\n", 450 | "\n", 451 | "model.train()\n", 452 | "model.to(device)\n", 453 | "\n", 454 | "for epoch in range(5):\n", 455 | " for data in train_loader:\n", 456 | " fc_feats = data['fc_feats'].to(device)\n", 457 | " labels = data['ground_truth'].to(device)\n", 458 | " masks = data['mask'].to(device)\n", 459 | "\n", 460 | " out = model(fc_feats, mask=masks)\n", 461 | " # print(f\"out: {out.shape}\")\n", 462 | " # print(f\"labels: {labels.shape}\")\n", 463 | " loss = F.binary_cross_entropy(out, labels)\n", 464 | "\n", 465 | " optimizer.zero_grad()\n", 466 | " loss.backward()\n", 467 | " optimizer.step()\n", 468 | " print(f\"epoch:\\t{epoch},\\tloss:{loss.cpu().data.numpy()}\")" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 65, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | 
"name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "torch.Size([16, 1024])\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "data = train_dataset.__getitem__(5)\n", 486 | "fc_feats = data['fc_feats']\n", 487 | "print(fc_feats.shape)" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": 36, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "import numpy as np\n", 497 | "import metrics\n", 498 | "reload(metrics)\n", 499 | "from metrics import calculate_gap" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 34, 505 | "metadata": {}, 506 | "outputs": [ 507 | { 508 | "name": "stdout", 509 | "output_type": "stream", 510 | "text": [ 511 | "load feats from /home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/test_PCA-1024\n", 512 | "Pre-cache 75 features in memory.\n", 513 | "Finished initializing dataloader.\n" 514 | ] 515 | } 516 | ], 517 | "source": [ 518 | "opt = {\n", 519 | " 'feats_dir': \"/home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/test_PCA-1024\",\n", 520 | " 'max_frames': 50\n", 521 | "}\n", 522 | "test_dataset = VideoClassificationDataset(opt, 'test')\n", 523 | "test_loader = DataLoader(test_dataset,\n", 524 | " batch_size=8,\n", 525 | " num_workers=4,\n", 526 | " shuffle=True)" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 43, 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "name": "stdout", 536 | "output_type": "stream", 537 | "text": [ 538 | "GAP(20): 0.9933333333333333\n" 539 | ] 540 | } 541 | ], 542 | "source": [ 543 | "preds = []\n", 544 | "actuals = []\n", 545 | "\n", 546 | "for data in test_loader:\n", 547 | " fc_feats = data['fc_feats'].to(device)\n", 548 | " labels = data['ground_truth']\n", 549 | " masks = data['mask'].to(device)\n", 550 | "\n", 551 | " out = model(fc_feats, mask=masks)\n", 552 | " out = out.cpu().data.numpy()\n", 553 | " labels = labels.cpu().data.numpy()\n", 554 | "# print(out.shape)\n", 555 | "# print(labels.shape)\n", 556 | " preds.extend(out)\n", 557 | " actuals.extend(labels)\n", 558 | " \n", 559 | "print(f\"GAP(20): {calculate_gap(np.asarray(preds), np.asarray(actuals), top_k=20)}\")" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [] 568 | } 569 | ], 570 | "metadata": { 571 | "kernelspec": { 572 | "display_name": "Python 3", 573 | "language": "python", 574 | "name": "python3" 575 | }, 576 | "language_info": { 577 | "codemirror_mode": { 578 | "name": "ipython", 579 | "version": 3 580 | }, 581 | "file_extension": ".py", 582 | "mimetype": "text/x-python", 583 | "name": "python", 584 | "nbconvert_exporter": "python", 585 | "pygments_lexer": "ipython3", 586 | "version": "3.6.10" 587 | } 588 | }, 589 | "nbformat": 4, 590 | "nbformat_minor": 4 591 | } 592 | -------------------------------------------------------------------------------- /sample.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | import numpy as np 5 | import logging 6 | import ffmpeg 7 | 8 | from models.video_classifiers import NeXtVLADModel 9 | from util import feature_pca, create_batches 10 | from torch.autograd import Variable 11 | 12 | from util import init_model as init_convnet 13 | from util import process_batches 14 | 15 | logging.basicConfig() 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.DEBUG) 18 | device = torch.device("cuda:0") 19 | 20 | available_features 
= ['nasnetalarge', 'resnet152', 'pnasnet5large', 'densenet121', 'senet154', 'polynet', 'vgg16'] 21 | 22 | 23 | if __name__ == '__main__': 24 | opt = argparse.ArgumentParser() 25 | opt.add_argument('ckpt_file', help='Path to the NeXtVLAD checkpoint file.') 26 | opt.add_argument('pca_dir', help='Directory containing PCA data.') 27 | opt.add_argument('files', nargs='+', help='List of files to process.') 28 | opt.add_argument('-gl', '--gpu_list', 29 | required=True, nargs='+', type=int, 30 | help="Space delimited list of GPU indices to use. Example for 4 GPUs: -gl 0 1 2 3") 31 | opt.add_argument('-bs', '--batch_size', type=int, 32 | help="Batch size to use during feature extraction. Larger batch size = more VRAM usage", 33 | default=8) 34 | opt.add_argument('--type', required=True, 35 | help='ConvNet to use for processing features.', 36 | choices=available_features) 37 | opt.add_argument('--max_frames', help="Max frames length of dataset.", default=50, type=int) 38 | opt.add_argument('--num_classes', help="Number of classes that was in train dataset.", default=5, type=int) 39 | 40 | opt = vars(opt.parse_args()) 41 | 42 | logger.info("Found {} GPUs, using {}.".format(torch.cuda.device_count(), len(opt['gpu_list']))) 43 | 44 | # Convnet 45 | tf_img, convnet = init_convnet(opt['gpu_list'], opt['type']) 46 | # PCA 47 | eigenvecs = np.load(os.path.join(opt['pca_dir'], 'eigenvecss.npy')) 48 | eigenvals = np.load(os.path.join(opt['pca_dir'], 'eigenvals.npy')) 49 | center = np.load(os.path.join(opt['pca_dir'], 'mean.npy')) 50 | # neXtVLAD 51 | model = NeXtVLADModel(opt['num_classes'], max_frames=opt['max_frames']) 52 | model.load_state_dict(torch.load(opt['ckpt_file'])) 53 | model.to(device) 54 | model.eval() 55 | 56 | for video in opt['files']: 57 | probe = ffmpeg.probe(video) 58 | video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None) 59 | width = int(video_stream['width']) 60 | height = int(video_stream['height']) 61 | 62 | out, _ = ( 63 | ffmpeg 64 | .input(video) 65 | # .output('pipe:', format='rawvideo', pix_fmt='rgb24') 66 | .output('pipe:', format='rawvideo', pix_fmt='rgb24', r=1) 67 | .run(capture_stdout=True) 68 | ) 69 | video_np = ( 70 | np 71 | .frombuffer(out, np.uint8) 72 | .reshape([-1, height, width, 3]) 73 | ) 74 | batches = create_batches(video_np, tf_img, batch_size=opt['batch_size']) 75 | feats = process_batches(batches, opt['type'], opt['gpu_list'], convnet) 76 | 77 | fpca = np.zeros((len(feats), eigenvecs.T.shape[1])) 78 | for i, feat in enumerate(feats): 79 | fpca[i] = feature_pca(feat, center, eigenvals, eigenvecs) 80 | 81 | n = min(opt['max_frames'], len(fpca)) 82 | padded = np.zeros((opt['max_frames'], fpca.shape[1])) 83 | padded[:n, :] = fpca[:n, :] 84 | mask = np.zeros((opt['max_frames'],)) 85 | mask[:n] = 1 86 | 87 | fc_feats = Variable(torch.from_numpy(padded).type(torch.FloatTensor)).to(device) 88 | mask = Variable(torch.from_numpy(mask).type(torch.FloatTensor)).to(device) 89 | 90 | out = model(fc_feats.unsqueeze(0), mask=mask.unsqueeze(0)) 91 | print(f"{video}: {out.argmax().detach().cpu().numpy()}") 92 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | from torch.utils.data import DataLoader 8 | 9 | from dataloader import VideoClassificationDataset 10 | from 
models.video_classifiers import NeXtVLADModel 11 | from metrics import calculate_gap 12 | from tqdm import tqdm 13 | 14 | device = torch.device("cuda:0") 15 | 16 | 17 | def train(opt, model, optimizer, scheduler, train_loader): 18 | with tqdm(total=len(train_loader)) as pb: 19 | for data in train_loader: 20 | fc_feats = data['fc_feats'].to(device) 21 | labels = data['ground_truth'].to(device) 22 | masks = data['mask'].to(device) 23 | 24 | out = model(fc_feats, mask=masks) 25 | loss = F.binary_cross_entropy(out, labels) 26 | 27 | optimizer.zero_grad() 28 | loss.backward() 29 | optimizer.step() 30 | str_loss = f"{loss.cpu().data.numpy():.4f}" 31 | pb.update(1) 32 | pb.set_postfix(epoch=epoch, loss=str_loss) 33 | 34 | 35 | def eval(opt, model, test_loader): 36 | preds = [] 37 | actuals = [] 38 | 39 | for data in test_loader: 40 | fc_feats = data['fc_feats'].to(device) 41 | labels = data['ground_truth'] 42 | masks = data['mask'].to(device) 43 | 44 | out = model(fc_feats, mask=masks) 45 | out = out.cpu().data.numpy() 46 | labels = labels.cpu().data.numpy() 47 | preds.extend(out) 48 | actuals.extend(labels) 49 | 50 | gap_score = calculate_gap(np.asarray(preds), np.asarray(actuals), top_k=opt['gapk']) 51 | return gap_score 52 | 53 | 54 | if __name__ == '__main__': 55 | opt = argparse.ArgumentParser() 56 | opt.add_argument('train_feats_dir', help="Directory where train features are stored.") 57 | opt.add_argument('test_feats_dir', help="Directory where test features are stored.") 58 | opt.add_argument('--max_frames', help="Max frames length of dataset.", default=50, type=int) 59 | opt.add_argument('--gapk', help="Value of K for computing GAP score.", default=20, type=int) 60 | opt.add_argument('--num_epochs', help="Number of epochs.", default=5, type=int) 61 | opt.add_argument('--ckpt_dir', help="Where to save checkpoints.", default='ckpt/') 62 | 63 | opt = vars(opt.parse_args()) 64 | 65 | if not os.path.isdir(opt['ckpt_dir']): 66 | os.mkdir(opt['ckpt_dir']) 67 | 68 | train_opts = { 69 | 'feats_dir': opt['train_feats_dir'], 70 | 'max_frames': opt['max_frames'] 71 | } 72 | train_dataset = VideoClassificationDataset(train_opts, 'train') 73 | train_loader = DataLoader(train_dataset, 74 | batch_size=8, 75 | num_workers=4, 76 | shuffle=True) 77 | 78 | test_opts = { 79 | 'feats_dir': opt['test_feats_dir'], 80 | 'max_frames': opt['max_frames'] 81 | } 82 | test_dataset = VideoClassificationDataset(test_opts, 'test') 83 | test_loader = DataLoader(test_dataset, 84 | batch_size=8, 85 | num_workers=4, 86 | shuffle=True) 87 | 88 | model = NeXtVLADModel(train_dataset.num_classes, max_frames=opt['max_frames']) 89 | optimizer = optim.Adam(model.parameters(), lr=0.001) 90 | exp_lr_schedulr = optim.lr_scheduler.StepLR(optimizer, step_size=25) 91 | 92 | model.to(device) 93 | 94 | for epoch in range(opt['num_epochs']): 95 | model.train() 96 | train(opt, model, optimizer, exp_lr_schedulr, train_loader) 97 | 98 | model.eval() 99 | gap_score = eval(opt, model, test_loader) 100 | print(f"GAP({opt['gapk']}): {gap_score:.3f}") 101 | 102 | model_path = os.path.join(opt['ckpt_dir'], f"model_e{epoch}_gap{opt['gapk']}-{gap_score:.3f}.pth") 103 | torch.save(model.state_dict(), model_path) 104 | print(f"Model saved to {model_path}") -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import math 5 | import torch.nn as nn 6 | import pretrainedmodels 7 | 8 
| from PIL import Image 9 | from pretrainedmodels.utils import ToRange255, ToSpaceBGR, transforms, munchify 10 | 11 | 12 | def feature_pca_whiten(feat, center, eigenvals, eigenvecs): 13 | epsilon = 1e-4 14 | d = feat.shape[0] 15 | 16 | # subtract mean 17 | fcen = feat - center 18 | # principal components 19 | fpca = fcen.reshape((1, d)).dot(eigenvecs.T).squeeze(0) 20 | # whiten 21 | pcaw = fpca / np.sqrt(eigenvals + epsilon) 22 | 23 | return pcaw 24 | 25 | 26 | def feature_pca(feat, center, eigenvals, eigenvecs): 27 | """ 28 | Skip whitening, as done by Lin et al. 29 | :param feat: 30 | :param center: 31 | :param eigenvals: 32 | :param eigenvecs: 33 | :return: 34 | """ 35 | d = feat.shape[0] 36 | 37 | # subtract mean 38 | fcen = feat - center 39 | # principal components 40 | fpca = fcen.reshape((1, d)).dot(eigenvecs.T).squeeze(0) 41 | 42 | return fpca 43 | 44 | 45 | def create_batches(frames_to_do, tf_img_fn, logger=None, batch_size=32): 46 | n = len(frames_to_do) 47 | if n < batch_size: 48 | if logger: logger.warning("Sample size less than batch size: Cutting batch size.") 49 | batch_size = n 50 | 51 | if logger: logger.info("Generating {} batches...".format(n // batch_size)) 52 | batches = [] 53 | frames_to_do = np.array(frames_to_do) 54 | 55 | for idx in range(0, n, batch_size): 56 | frames_idx = list(range(idx, min(idx+batch_size, n))) 57 | batch_frames = frames_to_do[frames_idx] 58 | 59 | batch_tensor = None 60 | for i, frame_ in enumerate(batch_frames): 61 | if type(frame_) is np.ndarray: 62 | input_frame = Image.fromarray(frame_).convert('RGB') 63 | else: # filename 64 | input_frame = Image.open(frame_).convert('RGB') 65 | input_tensor = tf_img_fn(input_frame) # 3x400x225 -> 3x299x299 size may differ 66 | # input_tensor = input_tensor.unsqueeze(0) # 3x299x299 -> 1x3x299x299 67 | if batch_tensor is None: 68 | batch_tensor = torch.zeros((len(batch_frames),) + input_tensor.shape) 69 | batch_tensor[i] = input_tensor 70 | 71 | batch_ag = torch.autograd.Variable(batch_tensor, requires_grad=False) 72 | batches.append(batch_ag) 73 | return batches 74 | 75 | 76 | class TransformImage(object): 77 | 78 | def __init__(self, opts, scale=0.875, random_crop=False, 79 | random_hflip=False, random_vflip=False, 80 | preserve_aspect_ratio=True): 81 | if type(opts) == dict: 82 | opts = munchify(opts) 83 | self.input_size = opts.input_size 84 | self.input_space = opts.input_space 85 | self.input_range = opts.input_range 86 | self.mean = opts.mean 87 | self.std = opts.std 88 | 89 | # https://github.com/tensorflow/models/blob/master/research/inception/inception/image_processing.py#L294 90 | self.scale = scale 91 | self.random_crop = random_crop 92 | self.random_hflip = random_hflip 93 | self.random_vflip = random_vflip 94 | 95 | tfs = [] 96 | if preserve_aspect_ratio: 97 | tfs.append(transforms.Resize(int(math.floor(max(self.input_size)/self.scale)))) 98 | else: 99 | height = int(self.input_size[1] / self.scale) 100 | width = int(self.input_size[2] / self.scale) 101 | tfs.append(transforms.Resize((height, width))) 102 | 103 | if random_crop: 104 | tfs.append(transforms.RandomCrop(max(self.input_size))) 105 | # else: 106 | # tfs.append(transforms.CenterCrop(max(self.input_size))) 107 | 108 | if random_hflip: 109 | tfs.append(transforms.RandomHorizontalFlip()) 110 | 111 | if random_vflip: 112 | tfs.append(transforms.RandomVerticalFlip()) 113 | 114 | tfs.append(transforms.ToTensor()) 115 | tfs.append(ToSpaceBGR(self.input_space=='BGR')) 116 | tfs.append(ToRange255(max(self.input_range)==255)) 117 | 
tfs.append(transforms.Normalize(mean=self.mean, std=self.std)) 118 | 119 | self.tf = transforms.Compose(tfs) 120 | 121 | def __call__(self, img): 122 | tensor = self.tf(img) 123 | return tensor 124 | 125 | 126 | def init_model(gpu_ids, model_name): 127 | 128 | # model_name = 'pnasnet5large' 129 | # could be fbresnet152 or inceptionresnetv2 130 | model = pretrainedmodels.__dict__[model_name](num_classes=1000, pretrained='imagenet') 131 | model.eval() 132 | 133 | # transformations depending on the model 134 | # rescale, center crop, normalize, and others (ex: ToBGR, ToRange255) 135 | tf_img = TransformImage(model) 136 | 137 | """ 138 | TODO(WG): Would be nice to use something like DataParallel, but that only does forward pass on given module. 139 | Need to stop before logits step. 140 | Should create wrapper for pretrainedmodels that does the MPI-like ops across GPUs on model.features modules: 141 | 1) replicated 142 | 2) scatter 143 | 3) parallel_apply 144 | 4) gather 145 | Would have to know what layers are being used on each model. 146 | """ 147 | if torch.cuda.is_available(): 148 | model = model.cuda(device=gpu_ids[0]) 149 | 150 | return tf_img, model 151 | 152 | 153 | def process_batches(batches, ftype, gpu_list, model, logger=None): 154 | done_batches = [] 155 | for i, batch in enumerate(batches): 156 | if torch.cuda.is_available(): 157 | batch = batch.cuda(device=gpu_list[0]) 158 | 159 | output_features = model.features(batch) 160 | output_features = output_features.data.cpu() 161 | 162 | conv_size = output_features.shape[-1] 163 | 164 | if ftype == 'nasnetalarge' or ftype == 'pnasnet5large': 165 | relu = nn.ReLU() 166 | rf = relu(output_features) 167 | avg_pool = nn.AvgPool2d(conv_size, stride=1, padding=0) 168 | out_feats = avg_pool(rf) 169 | else: 170 | avg_pool = nn.AdaptiveAvgPool2d((1, 1)) 171 | out_feats = avg_pool(output_features) 172 | 173 | out_feats = out_feats.view(out_feats.size(0), -1) 174 | if logger: logger.info('Processed {}/{} batches.\r'.format(i + 1, len(batches))) 175 | 176 | done_batches.append(out_feats) 177 | feats = np.concatenate(done_batches, axis=0) 178 | return feats --------------------------------------------------------------------------------
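A quick smoke test can tie the pieces above together. The sketch below is not part of the repository: it assumes the repo root is on PYTHONPATH so that `models.video_classifiers` imports the same way it does in train.py and sample.py, and the batch size, real-frame count, and num_classes=5 are illustrative values only (the 1024-dim input feature size is fixed by the NeXtVLAD(1024, ...) call inside NeXtVLADModel).

```
# Smoke-test sketch (not part of the repo): run NeXtVLADModel on random PCA-1024
# frame features with a padding mask, mirroring how train.py feeds the model.
import torch
from models.video_classifiers import NeXtVLADModel  # assumes repo root on PYTHONPATH

B, M, N = 4, 50, 1024                # batch, max_frames, feature dim (N=1024 is hard-coded in NeXtVLADModel)
model = NeXtVLADModel(num_classes=5, max_frames=M)
model.eval()

feats = torch.randn(B, M, N)         # padded frame features, shape B x M x N
mask = torch.ones(B, M)              # 1 for real frames, 0 for padding
mask[:, 40:] = 0                     # e.g. only 40 of the 50 slots hold real frames

with torch.no_grad():
    out = model(feats, mask=mask)
print(out.shape)                     # expected: torch.Size([4, 5]) of per-class sigmoid scores
```

With the default num_clusters=64, lamb=2, and groups=8, the pooled descriptor fed into fc0 is K * (λN/G) = 64 * 256 = 16384-dimensional, which matches nn.Linear(num_clusters * self.group_size, hidden_size) in NeXtVLADModel.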