├── average_precision_calculator.py ├── config.py ├── data ├── __init__.py ├── make_train_test.py ├── organize_UCF101.py ├── process_features.py └── process_pca.py ├── dataloader.py ├── main.py ├── metrics.py ├── models ├── NeXtVLAD.py ├── __init__.py └── video_classifiers.py ├── notebooks └── arch_debug.ipynb ├── sample.py ├── train.py └── util.py /average_precision_calculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Calculate or keep track of the interpolated average precision. 16 | It provides an interface for calculating interpolated average precision for an 17 | entire list or the top-n ranked items. For the definition of the 18 | (non-)interpolated average precision: 19 | http://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf 20 | Example usages: 21 | 1) Use it as a static function call to directly calculate average precision for 22 | a short ranked list in the memory. 23 | ``` 24 | import random 25 | p = np.array([random.random() for _ in xrange(10)]) 26 | a = np.array([random.choice([0, 1]) for _ in xrange(10)]) 27 | ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a) 28 | ``` 29 | 2) Use it as an object for long ranked list that cannot be stored in memory or 30 | the case where partial predictions can be observed at a time (Tensorflow 31 | predictions). In this case, we first call the function accumulate many times 32 | to process parts of the ranked list. After processing all the parts, we call 33 | peek_interpolated_ap_at_n. 34 | ``` 35 | p1 = np.array([random.random() for _ in xrange(5)]) 36 | a1 = np.array([random.choice([0, 1]) for _ in xrange(5)]) 37 | p2 = np.array([random.random() for _ in xrange(5)]) 38 | a2 = np.array([random.choice([0, 1]) for _ in xrange(5)]) 39 | # interpolated average precision at 10 using 1000 break points 40 | calculator = average_precision_calculator.AveragePrecisionCalculator(10) 41 | calculator.accumulate(p1, a1) 42 | calculator.accumulate(p2, a2) 43 | ap3 = calculator.peek_ap_at_n() 44 | ``` 45 | """ 46 | 47 | import heapq 48 | import random 49 | import numbers 50 | 51 | import numpy 52 | 53 | 54 | class AveragePrecisionCalculator(object): 55 | """Calculate the average precision and average precision at n.""" 56 | 57 | def __init__(self, top_n=None): 58 | """Construct an AveragePrecisionCalculator to calculate average precision. 59 | This class is used to calculate the average precision for a single label. 60 | Args: 61 | top_n: A positive Integer specifying the average precision at n, or 62 | None to use all provided data points. 63 | Raises: 64 | ValueError: An error occurred when the top_n is not a positive integer. 
65 | """ 66 | if not ((isinstance(top_n, int) and top_n >= 0) or top_n is None): 67 | raise ValueError("top_n must be a positive integer or None.") 68 | 69 | self._top_n = top_n # average precision at n 70 | self._total_positives = 0 # total number of positives have seen 71 | self._heap = [] # max heap of (prediction, actual) 72 | 73 | @property 74 | def heap_size(self): 75 | """Gets the heap size maintained in the class.""" 76 | return len(self._heap) 77 | 78 | @property 79 | def num_accumulated_positives(self): 80 | """Gets the number of positive samples that have been accumulated.""" 81 | return self._total_positives 82 | 83 | def accumulate(self, predictions, actuals, num_positives=None): 84 | """Accumulate the predictions and their ground truth labels. 85 | After the function call, we may call peek_ap_at_n to actually calculate 86 | the average precision. 87 | Note predictions and actuals must have the same shape. 88 | Args: 89 | predictions: a list storing the prediction scores. 90 | actuals: a list storing the ground truth labels. Any value 91 | larger than 0 will be treated as positives, otherwise as negatives. 92 | num_positives = If the 'predictions' and 'actuals' inputs aren't complete, 93 | then it's possible some true positives were missed in them. In that case, 94 | you can provide 'num_positives' in order to accurately track recall. 95 | Raises: 96 | ValueError: An error occurred when the format of the input is not the 97 | numpy 1-D array or the shape of predictions and actuals does not match. 98 | """ 99 | if len(predictions) != len(actuals): 100 | raise ValueError("the shape of predictions and actuals does not match.") 101 | 102 | if not num_positives is None: 103 | if not isinstance(num_positives, numbers.Number) or num_positives < 0: 104 | raise ValueError("'num_positives' was provided but it wan't a nonzero number.") 105 | 106 | if not num_positives is None: 107 | self._total_positives += num_positives 108 | else: 109 | self._total_positives += numpy.size(numpy.where(actuals > 0)) 110 | topk = self._top_n 111 | heap = self._heap 112 | 113 | for i in range(numpy.size(predictions)): 114 | if topk is None or len(heap) < topk: 115 | heapq.heappush(heap, (predictions[i], actuals[i])) 116 | else: 117 | if predictions[i] > heap[0][0]: # heap[0] is the smallest 118 | heapq.heappop(heap) 119 | heapq.heappush(heap, (predictions[i], actuals[i])) 120 | 121 | def clear(self): 122 | """Clear the accumulated predictions.""" 123 | self._heap = [] 124 | self._total_positives = 0 125 | 126 | def peek_ap_at_n(self): 127 | """Peek the non-interpolated average precision at n. 128 | Returns: 129 | The non-interpolated average precision at n (default 0). 130 | If n is larger than the length of the ranked list, 131 | the average precision will be returned. 132 | """ 133 | if self.heap_size <= 0: 134 | return 0 135 | predlists = numpy.array(list(zip(*self._heap))) 136 | 137 | ap = self.ap_at_n(predlists[0], 138 | predlists[1], 139 | n=self._top_n, 140 | total_num_positives=self._total_positives) 141 | return ap 142 | 143 | @staticmethod 144 | def ap(predictions, actuals): 145 | """Calculate the non-interpolated average precision. 146 | Args: 147 | predictions: a numpy 1-D array storing the sparse prediction scores. 148 | actuals: a numpy 1-D array storing the ground truth labels. Any value 149 | larger than 0 will be treated as positives, otherwise as negatives. 150 | Returns: 151 | The non-interpolated average precision at n. 
152 | If n is larger than the length of the ranked list, 153 | the average precision will be returned. 154 | Raises: 155 | ValueError: An error occurred when the format of the input is not the 156 | numpy 1-D array or the shape of predictions and actuals does not match. 157 | """ 158 | return AveragePrecisionCalculator.ap_at_n(predictions, 159 | actuals, 160 | n=None) 161 | 162 | @staticmethod 163 | def ap_at_n(predictions, actuals, n=20, total_num_positives=None): 164 | """Calculate the non-interpolated average precision. 165 | Args: 166 | predictions: a numpy 1-D array storing the sparse prediction scores. 167 | actuals: a numpy 1-D array storing the ground truth labels. Any value 168 | larger than 0 will be treated as positives, otherwise as negatives. 169 | n: the top n items to be considered in ap@n. 170 | total_num_positives : (optionally) you can specify the number of total 171 | positive 172 | in the list. If specified, it will be used in calculation. 173 | Returns: 174 | The non-interpolated average precision at n. 175 | If n is larger than the length of the ranked list, 176 | the average precision will be returned. 177 | Raises: 178 | ValueError: An error occurred when 179 | 1) the format of the input is not the numpy 1-D array; 180 | 2) the shape of predictions and actuals does not match; 181 | 3) the input n is not a positive integer. 182 | """ 183 | if len(predictions) != len(actuals): 184 | raise ValueError("the shape of predictions and actuals does not match.") 185 | 186 | if n is not None: 187 | if not isinstance(n, int) or n <= 0: 188 | raise ValueError("n must be 'None' or a positive integer." 189 | " It was '%s'." % n) 190 | 191 | ap = 0.0 192 | 193 | predictions = numpy.array(predictions) 194 | actuals = numpy.array(actuals) 195 | 196 | # add a shuffler to avoid overestimating the ap 197 | predictions, actuals = AveragePrecisionCalculator._shuffle(predictions, 198 | actuals) 199 | sortidx = sorted( 200 | range(len(predictions)), 201 | key=lambda k: predictions[k], 202 | reverse=True) 203 | 204 | if total_num_positives is None: 205 | numpos = numpy.size(numpy.where(actuals > 0)) 206 | else: 207 | numpos = total_num_positives 208 | 209 | if numpos == 0: 210 | return 0 211 | 212 | if n is not None: 213 | numpos = min(numpos, n) 214 | delta_recall = 1.0 / numpos 215 | poscount = 0.0 216 | 217 | # calculate the ap 218 | r = len(sortidx) 219 | if n is not None: 220 | r = min(r, n) 221 | for i in range(r): 222 | if actuals[sortidx[i]] > 0: 223 | poscount += 1 224 | ap += poscount / (i + 1) * delta_recall 225 | return ap 226 | 227 | @staticmethod 228 | def _shuffle(predictions, actuals): 229 | random.seed(0) 230 | suffidx = random.sample(range(len(predictions)), len(predictions)) 231 | predictions = predictions[suffidx] 232 | actuals = actuals[suffidx] 233 | return predictions, actuals 234 | 235 | @staticmethod 236 | def _zero_one_normalize(predictions, epsilon=1e-7): 237 | """Normalize the predictions to the range between 0.0 and 1.0. 238 | For some predictions like SVM predictions, we need to normalize them before 239 | calculate the interpolated average precision. The normalization will not 240 | change the rank in the original list and thus won't change the average 241 | precision. 242 | Args: 243 | predictions: a numpy 1-D array storing the sparse prediction scores. 244 | epsilon: a small constant to avoid denominator being zero. 245 | Returns: 246 | The normalized prediction. 
247 | """ 248 | denominator = numpy.max(predictions) - numpy.min(predictions) 249 | ret = (predictions - numpy.min(predictions)) / numpy.max(denominator, 250 | epsilon) 251 | return ret -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | img_w = 224 2 | img_h = 224 3 | 4 | dataset_params = { 5 | 'batch_size': 128, 6 | 'shuffle': True, 7 | 'num_workers': 4, 8 | 'pin_memory': True 9 | } -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w-garcia/NeXtVLAD.pytorch/cc7235b578ad092cd180083397de7411c1fe4684/data/__init__.py -------------------------------------------------------------------------------- /data/make_train_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified from code here: https://github.com/Yidadaa/Pytorch-Video-Classification 3 | ''' 4 | 5 | import os 6 | import numpy as np 7 | from tqdm import tqdm 8 | import concurrent.futures 9 | import argparse 10 | import ffmpeg 11 | 12 | # 数据集的默认位置 13 | # default params 14 | default_output_dir = os.path.dirname(os.path.abspath(__file__)) 15 | default_src_dir = os.path.join(default_output_dir, 'UCF') 16 | default_test_size = 0.2 17 | 18 | 19 | def split(src_dir=default_src_dir, output_dir=default_src_dir, size=default_test_size): 20 | # 设置默认参数 21 | # set defaults 22 | src_dir = default_src_dir if src_dir is None else src_dir 23 | output_dir = default_output_dir if output_dir is None else output_dir 24 | size = default_test_size if size is None else size 25 | 26 | if not os.path.exists(output_dir): 27 | os.mkdir(output_dir) 28 | 29 | # 生成测试集和训练集目录 30 | # split into train and test 31 | for folder in ['train', 'test']: 32 | folder_path = os.path.join(output_dir, folder) 33 | if not os.path.exists(folder_path): 34 | os.mkdir(folder_path) 35 | print('Folder {} is created'.format(folder_path)) 36 | 37 | # 划分测试集和训练集 38 | train_set = [] 39 | test_set = [] 40 | classes = os.listdir(src_dir) 41 | num_classes = len(classes) 42 | for class_index, classname in enumerate(classes): 43 | print(f"Current class:\t{class_index+1}") 44 | # 读取所有视频路径 45 | videos = os.listdir(os.path.join(src_dir, classname)) 46 | # 打乱视频名称 47 | np.random.shuffle(videos) 48 | # 确定测试集划分点 49 | split_size = int(len(videos) * size) 50 | 51 | # 生成训练集和测试集的文件夹 52 | for i in range(2): 53 | part = ['train', 'test'][i] 54 | class_dir = os.path.join(output_dir, part, classname) 55 | if not os.path.exists(class_dir): 56 | os.mkdir(class_dir) 57 | 58 | jobs = [] 59 | # 遍历每个视频,将每个视频的图像帧提取出来 60 | for i in range(len(videos)): 61 | video_path = os.path.join(src_dir, classname, videos[i]) 62 | 63 | video_type = 'test' if i <= split_size else 'train' 64 | video_name = videos[i].rsplit('.')[0] 65 | 66 | img_dir = os.path.join(output_dir, video_type, classname, f'{video_name}') 67 | if not os.path.exists(img_dir): 68 | os.makedirs(img_dir) 69 | if len(os.listdir(img_dir)) > 0: 70 | continue 71 | 72 | img_path = os.path.join(output_dir, video_type, classname, f'{video_name}/%6d.jpg') 73 | jobs.append({'in': video_path, 'out': img_path}) 74 | 75 | info = [classname, video_name, img_path] 76 | # 将视频帧信息保存起来 77 | if video_type == 'test': 78 | test_set.append(info) 79 | else: 80 | train_set.append(info) 81 | 82 | def subproc_call(job): 83 | try: 84 | # sample at 
1fps: https://arxiv.org/pdf/1609.08675.pdf 85 | process = ( 86 | ffmpeg 87 | .input(job['in']) 88 | .output(job['out'], pattern_type='glob', r=1) 89 | .run_async(pipe_stdout=True, pipe_stderr=True) 90 | ) 91 | out, err = process.communicate() 92 | except ffmpeg.Error as e: 93 | print(e) 94 | print(err) 95 | 96 | # subproc_call(jobs[0]) 97 | with concurrent.futures.ThreadPoolExecutor() as executor: 98 | # wrap with list to run .map generator on execution 99 | _ = list(tqdm(executor.map(subproc_call, jobs), total=len(jobs))) 100 | 101 | # 将训练集和测试集数据保存到文件中,方便写dataloader 102 | datas = [train_set, test_set] 103 | names = ['train', 'test'] 104 | for i in range(2): 105 | with open(output_dir + '/' + names[i] + '.csv', 'w') as f: 106 | f.write('\n'.join([','.join(line) for line in datas[i]])) 107 | 108 | 109 | def parse_args(): 110 | parser = argparse.ArgumentParser(usage='python3 make_train_test.py -i path/to/UCF -o path/to/output -s 0.3') 111 | parser.add_argument('-i', '--src_dir', help='path to UCF datasets', default=default_src_dir) 112 | parser.add_argument('-o', '--output_dir', help='path to output', default=default_output_dir) 113 | parser.add_argument('-s', '--size', help='ratio of test sets', default=default_test_size) 114 | args = parser.parse_args() 115 | return args 116 | 117 | 118 | if __name__ == '__main__': 119 | args = parse_args() 120 | split(**vars(args)) 121 | -------------------------------------------------------------------------------- /data/organize_UCF101.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import fnmatch 4 | import re 5 | from tqdm import tqdm 6 | 7 | if __name__ == '__main__': 8 | ind_filepath = "/mnt/nfs/hgst-raid1/WD-Passport_4TB/dataset/UCF101/ucfTrainTestlist/classInd.txt" 9 | vids_dir = "/mnt/nfs/hgst-raid1/WD-Passport_4TB/dataset/UCF101/videos" 10 | 11 | with open(ind_filepath, 'r') as f: 12 | lines = f.readlines() 13 | classes = [l.strip().split(' ')[1] for l in lines] 14 | for c in tqdm(classes): 15 | c_dir = os.path.join(vids_dir, c) 16 | if not os.path.exists(c_dir): 17 | os.makedirs(c_dir) 18 | rx = re.compile(fnmatch.translate(f"*{c}*.avi"), re.IGNORECASE) 19 | class_videos = list(filter(rx.search, os.listdir(vids_dir))) 20 | if len(class_videos) != 0: 21 | # script was already ran 22 | # continue 23 | from_paths = [os.path.join(vids_dir, cv) for cv in sorted(class_videos)] 24 | to_paths = [os.path.join(c_dir, cv) for cv in sorted(class_videos)] 25 | for from_path, to_path in zip(from_paths, to_paths): 26 | shutil.move(from_path, to_path) 27 | -------------------------------------------------------------------------------- /data/process_features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Re-tooled version of the script found on VideoToTextDNN: 3 | https://github.com/OSUPCVLab/VideoToTextDNN/blob/master/data/py3_process_features.py 4 | 5 | Perform batched feature extract using Cadene pretrainedmodels 6 | """ 7 | import torch 8 | import argparse 9 | import time 10 | import os 11 | import numpy as np 12 | import logging 13 | 14 | from util import TransformImage, create_batches, process_batches, init_model 15 | 16 | logging.basicConfig() 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.DEBUG) 19 | 20 | available_features = ['nasnetalarge', 'resnet152', 'pnasnet5large', 'densenet121', 'senet154', 'polynet', 'vgg16'] 21 | 22 | args = None 23 | 24 | 25 | def extract_features(args): 26 | root_frames_dir = 
args.frames_dir 27 | root_feats_dir = args.feats_dir 28 | work = args.work 29 | autofill = int(args.autofill) 30 | ftype = args.type 31 | gpu_list = args.gpu_list 32 | 33 | class_dirs = os.listdir(root_frames_dir) 34 | 35 | # skip a level for UCF101 dataset 36 | for class_dir in class_dirs: 37 | class_frames_dir = os.path.join(root_frames_dir, class_dir) 38 | 39 | frames_dirs = os.listdir(class_frames_dir) 40 | 41 | class_feats_dir = os.path.join(root_feats_dir, class_dir) 42 | if not os.path.isdir(class_feats_dir): 43 | os.makedirs(class_feats_dir) 44 | 45 | # else: 46 | # if autofill: 47 | # logger.info('AUTOFILL ON: Attempting to autofill missing features.') 48 | # frames_dirs = validate_feats.go(featsd=root_feats_dir, framesd=root_frames_dir) 49 | 50 | # Difficulty of each job is measured by # of frames to process in each chunk. 51 | # Can't be randomized since autofill list woudld be no longer valid. 52 | # np.random.shuffle(frames_dirs) 53 | work = len(frames_dirs) if not work else work 54 | 55 | tf_img, model = init_model(args.gpu_list, args.type) 56 | 57 | work_done = 0 58 | while work_done != work: 59 | frames_dirs_avail = diff_feats(class_frames_dir, class_feats_dir) 60 | if len(frames_dirs_avail) == 0: 61 | break 62 | 63 | frames_dir = frames_dirs_avail.pop() 64 | feat_filename = frames_dir.split('/')[-1] + '.npy' 65 | video_feats_path = os.path.join(class_feats_dir, feat_filename) 66 | 67 | if os.path.exists(video_feats_path): 68 | logger.info('Features already extracted:\t{}'.format(video_feats_path)) 69 | continue 70 | 71 | try: 72 | frames_to_do = [os.path.join(args.frames_dir, class_dir, frames_dir, p) for p in 73 | os.listdir(os.path.join(args.frames_dir, class_dir, frames_dir))] 74 | except Exception as e: 75 | logger.exception(e) 76 | continue 77 | 78 | # Must sort so frames follow numerical order. os.listdir does not guarantee order. 79 | frames_to_do.sort() 80 | 81 | if len(frames_to_do) == 0: 82 | logger.warning("Frame folder has no frames! Skipping...") 83 | continue 84 | 85 | # Save a flag copy 86 | with open(video_feats_path, 'wb') as pf: 87 | np.save(pf, []) 88 | 89 | try: 90 | batches = create_batches(frames_to_do, tf_img, logger=logger, batch_size=args.batch_size) 91 | except OSError as e: 92 | logger.exception(e) 93 | logger.warning("Corrupt image file. Skipping...") 94 | os.remove(video_feats_path) 95 | continue 96 | 97 | logger.debug("Start video {}".format(work_done)) 98 | 99 | feats = process_batches(batches, ftype, gpu_list, model, logger=logger) 100 | 101 | with open(video_feats_path, 'wb') as pf: 102 | np.save(pf, feats) 103 | logger.info('Saved complete features to {}.'.format(video_feats_path)) 104 | work_done += 1 105 | 106 | 107 | def diff_feats(frames_dir, feats_dir): 108 | feats = ['.'.join(i.split('.')[:-1]) for i in os.listdir(feats_dir)] 109 | feats = set(feats) 110 | frames = set([fr for fr in os.listdir(frames_dir) if len(os.listdir(os.path.join(frames_dir, fr)))]) 111 | needed_feats = frames - feats 112 | return needed_feats 113 | 114 | 115 | if __name__ == '__main__': 116 | arg_parser = argparse.ArgumentParser() 117 | arg_parser.add_argument('frames_dir',help = 'Directory where there are frame directories.') 118 | arg_parser.add_argument('feats_dir',help = 'Root directory of dataset\'s processed videos.') 119 | arg_parser.add_argument('-w', '--work', help = 'Number of features to process. 
Defaults to all.', default=0, type=int) 120 | arg_parser.add_argument('-gl', '--gpu_list', required=True, nargs='+', type=int, help="Space delimited list of GPU indices to use. Example for 4 GPUs: -gl 0 1 2 3") 121 | arg_parser.add_argument('-bs', '--batch_size', type=int, help="Batch size to use during feature extraction. Larger batch size = more VRAM usage", default=8) 122 | arg_parser.add_argument('--type', required=True, help = 'ConvNet to use for processing features.', choices=available_features) 123 | arg_parser.add_argument('--autofill', action='store_true', default=False, help="Perform diff between frames_dir and feats_dir and fill them in.") 124 | 125 | args = arg_parser.parse_args() 126 | 127 | start_time = time.time() 128 | 129 | logger.info("Found {} GPUs, using {}.".format(torch.cuda.device_count(), len(args.gpu_list))) 130 | 131 | extract_features(args) 132 | 133 | logger.info("Job took %s mins" % ((time.time() - start_time)/60)) 134 | -------------------------------------------------------------------------------- /data/process_pca.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | 5 | from tqdm import tqdm 6 | from fnmatch import filter 7 | from sklearn.decomposition import PCA 8 | 9 | from util import feature_pca_whiten, feature_pca 10 | 11 | 12 | if __name__ == '__main__': 13 | opt = argparse.ArgumentParser() 14 | opt.add_argument('training_features_folder', help="Folder containing full-scale training features.") 15 | opt.add_argument('test_features_folder', help="Folder containing full-scale test features.") 16 | opt.add_argument('save_folder', help="Folder to save PCA params and features.") 17 | opt = vars(opt.parse_args()) 18 | 19 | D = [] 20 | for root, dirs, filenames in os.walk(opt['training_features_folder']): 21 | for npf_name in filter(filenames, "*.npy"): 22 | npf_path = os.path.join(root, npf_name) 23 | npo = np.load(npf_path) 24 | D.extend(npo) 25 | 26 | print(f"Generating PCA vectors...") 27 | pca = PCA(n_components=1024) 28 | pca.fit(D) 29 | eigenvecs = pca.components_ 30 | eigenvals = pca.explained_variance_ 31 | center = pca.mean_ 32 | np.save(os.path.join(opt['save_folder'], 'eigenvecss.npy'), eigenvecs) 33 | np.save(os.path.join(opt['save_folder'], 'eigenvals.npy'), eigenvals) 34 | np.save(os.path.join(opt['save_folder'], 'mean.npy'), center) 35 | 36 | for split_folder in [opt['training_features_folder'], opt['test_features_folder']]: 37 | if split_folder == opt['training_features_folder']: 38 | out_root = os.path.join(opt['save_folder'], 'train_PCA-1024') 39 | elif split_folder == opt['test_features_folder']: 40 | out_root = os.path.join(opt['save_folder'], 'test_PCA-1024') 41 | else: 42 | break 43 | 44 | print(f"Created {out_root}") 45 | class_dirs = os.listdir(split_folder) 46 | num_classes = len(class_dirs) 47 | for k, class_dir in enumerate(class_dirs): 48 | print(f"Class {k+1}/{num_classes}") 49 | class_feats_dir = os.path.join(split_folder, class_dir) 50 | class_out_dir = os.path.join(out_root, class_dir) 51 | if not os.path.isdir(class_out_dir): 52 | os.makedirs(class_out_dir) 53 | 54 | for npff in tqdm(os.listdir(class_feats_dir)): 55 | npf = os.path.join(class_feats_dir, npff) 56 | feats = np.load(npf) 57 | feats_pca = np.zeros((len(feats), 1024)) 58 | for i, feat in enumerate(feats): 59 | # TODO: toggle whitening on/off 60 | # pcaw = feature_pca_whiten(feat, center, eigenvals, eigenvecs) 61 | pcaw = feature_pca(feat, center, eigenvals, eigenvecs) 62 
| feats_pca[i] = pcaw 63 | 64 | feats_pca_path = os.path.join(class_out_dir, npff) 65 | np.save(feats_pca_path, feats_pca) 66 | -------------------------------------------------------------------------------- /dataloader.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import random 4 | import os 5 | import numpy as np 6 | import torch 7 | import torch.nn.functional as F 8 | from torch.utils.data import Dataset 9 | from torch.autograd import Variable 10 | from multiprocessing import Pool 11 | from multiprocessing import Queue 12 | from collections import defaultdict 13 | 14 | 15 | class CocoDataset(Dataset): 16 | 17 | def __init__(self, coco_labels): 18 | # python 3 19 | # super().__init__() 20 | super(CocoDataset, self).__init__() 21 | self.coco_labels = list(coco_labels['labels'].items()) 22 | self.num_classes = coco_labels['num_classes'] 23 | 24 | def __getitem__(self, ix): 25 | labels = torch.zeros(self.num_classes) 26 | image_id, labels_ids = self.coco_labels[ix] 27 | labels[labels_ids] = 1 28 | data = {} 29 | data['image_ids'] = image_id 30 | data['labels'] = labels 31 | return data 32 | 33 | def __len__(self): 34 | return len(self.coco_labels) 35 | 36 | 37 | pool_queue = Queue() 38 | work = [] 39 | 40 | 41 | def _threaded_sample_load(vid_id, fpath, n_frame_steps): 42 | fc_feat = load_and_subsample_feat(fpath, n_frame_steps) 43 | pool_queue.put((vid_id, fc_feat)) 44 | 45 | 46 | class VideoClassificationFolder: 47 | def __init__(self, feats_folder: str): 48 | """ 49 | Init the video classification folder with the following tree structure: 50 | - [train|test] 51 | |- class 0 52 | |- - video 0 53 | |- - ... 54 | |- - video_{0i} 55 | |- ... 56 | |- ... 57 | |- class k 58 | |- - video 0 59 | |- - ... 
60 | |- - video_{ki} 61 | 62 | $i$ is not guaranteed to be consistent between classes 63 | :param feats_folder: root directory where features are stored 64 | """ 65 | self.class_to_feats_map = defaultdict(list) 66 | self.feats_dir = feats_folder 67 | self.num_classes = len(os.listdir(self.feats_dir)) 68 | for c in os.listdir(self.feats_dir): 69 | self.class_to_feats_map[c] = [os.path.join(self.feats_dir, c, npf) for npf in 70 | os.listdir(os.path.join(self.feats_dir, c))] 71 | 72 | def flattened(self) -> dict: 73 | """ 74 | :return: a flattened tree as a dict of idx: 2-tuple (feats_path, class_id) with deterministic ordering 75 | """ 76 | l = {} 77 | i = 0 78 | for c in sorted(list(self.class_to_feats_map.keys())): 79 | for feats_path in sorted(self.class_to_feats_map[c]): 80 | l[i] = (feats_path, c) 81 | i += 1 82 | 83 | return l 84 | 85 | def __len__(self) -> int: 86 | return sum([len(self.class_to_feats_map[c]) for c in list(self.class_to_feats_map.keys())]) 87 | 88 | 89 | class VideoClassificationDataset(Dataset): 90 | 91 | # def get_vocab_size(self): 92 | # return len(self.get_vocab()) 93 | 94 | # def get_vocab(self): 95 | # return self.ix_to_word 96 | 97 | # def get_seq_length(self): 98 | # return self.seq_length 99 | 100 | def __init__(self, opt, mode): 101 | # python 3 102 | # super().__init__() 103 | super(VideoClassificationDataset, self).__init__() 104 | self.mode = mode # to load train/val/test data 105 | self.feats_dir = opt['feats_dir'] 106 | self.max_frames = opt['max_frames'] 107 | self.tree = VideoClassificationFolder(self.feats_dir) 108 | self.num_classes = self.tree.num_classes 109 | self.n = len(self.tree) 110 | # self.n_frame_steps = opt['n_frame_steps'] 111 | # load in the sequence data 112 | 113 | if self.mode != 'inference': 114 | print(f'load feats from {self.feats_dir}') 115 | # Memory cache for features 116 | print(f"Pre-cache {self.n} features in memory.") 117 | self._feat_cache = {} 118 | # pool = Pool(16) 119 | 120 | for idx, (fc_feat_path, c) in self.tree.flattened().items(): 121 | try: 122 | fc_feat, mask = load_and_subsample_feat(fc_feat_path, self.max_frames) 123 | self._feat_cache[idx] = (fc_feat, mask, c) 124 | except: 125 | print(f"{fc_feat_path} was not found") 126 | 127 | self.classes = sorted(list(self.tree.class_to_feats_map.keys())) 128 | self.tree = self.tree.flattened() 129 | print("Finished initializing dataloader.") 130 | 131 | def __getitem__(self, ix): 132 | """This function returns a tuple that is further passed to collate_fn 133 | """ 134 | ix = ix % self.n 135 | 136 | fc_feat = self._feat_cache.get(ix, None) 137 | if fc_feat is None: 138 | fc_feat_path, c = self.tree[ix] 139 | fc_feat, mask = load_and_subsample_feat(fc_feat_path, self.max_frames) 140 | self._feat_cache[ix] = (fc_feat, mask, c) 141 | else: 142 | fc_feat, mask, c = self._feat_cache[ix] 143 | 144 | label = self.classes.index(c) 145 | 146 | data = { 147 | 'fc_feats': Variable(torch.from_numpy(fc_feat).type(torch.FloatTensor)), 148 | 'ground_truth': Variable(torch.from_numpy(one_hot(label, self.num_classes)).type(torch.FloatTensor)), 149 | 'video_id': ix, 150 | 'mask': Variable(torch.from_numpy(mask).type(torch.FloatTensor)) 151 | } 152 | return data 153 | 154 | def __len__(self): 155 | return self.n 156 | 157 | 158 | def load_and_subsample_feat(fc_feat_path, max_frames, n_frame_steps=1): 159 | # fc_feat = np.load(fc_feat_path) 160 | # Subsampling 161 | # samples = np.round(np.linspace( 162 | # 0, fc_feat.shape[0] - 1, n_frame_steps)).astype(np.int32) 163 | try: 164 | fc_feat = 
np.load(fc_feat_path) 165 | n = min(max_frames, len(fc_feat)) 166 | padded = np.zeros((max_frames, fc_feat.shape[1])) 167 | padded[:n, :] = fc_feat[:n, :] 168 | mask = np.zeros((max_frames,)) 169 | mask[:n] = 1 170 | except Exception as e: 171 | print("Bad feature file in dataset: {}. Purge, re-process, and try again.".format(fc_feat_path)) 172 | raise e 173 | return padded, mask 174 | 175 | 176 | def one_hot(idx, num_classes): 177 | out = np.zeros(num_classes) 178 | out[idx] = 1 179 | return out 180 | 181 | 182 | if __name__ == '__main__': 183 | opt = { 184 | 'feats_dir': "/home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/train_PCA-1024", 185 | 'max_frames': 50 186 | } 187 | 188 | vd = VideoClassificationDataset(opt, 'train') 189 | data = vd.__getitem__(5) 190 | fc_feats = data['fc_feats'] 191 | print(fc_feats.shape) 192 | print(data['mask']) -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | from math import ceil 4 | import random, shutil, json 5 | from os.path import join, exists, isfile 6 | from os import makedirs, remove, environ 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | from torch.utils.data import DataLoader, SubsetRandomSampler 13 | from torch.utils.data.dataset import Subset 14 | from datetime import datetime 15 | import torchvision.models as models 16 | import h5py 17 | import faiss 18 | 19 | from tensorboardX import SummaryWriter 20 | import numpy as np 21 | from models import netvlad 22 | 23 | parser = argparse.ArgumentParser(description='pytorch-NetVlad') 24 | parser.add_argument('--mode', type=str, default='train', help='Mode', choices=['train', 'test', 'cluster']) 25 | parser.add_argument('--batchSize', type=int, default=4, 26 | help='Number of triplets (query, pos, negs). Each triplet consists of 12 images.') 27 | parser.add_argument('--cacheBatchSize', type=int, default=24, help='Batch size for caching and testing') 28 | parser.add_argument('--cacheRefreshRate', type=int, default=1000, 29 | help='How often to refresh cache, in number of queries. 
0 for off') 30 | parser.add_argument('--nEpochs', type=int, default=30, help='number of epochs to train for') 31 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 32 | help='manual epoch number (useful on restarts)') 33 | parser.add_argument('--nGPU', type=int, default=1, help='number of GPU to use.') 34 | parser.add_argument('--optim', type=str, default='SGD', help='optimizer to use', choices=['SGD', 'ADAM']) 35 | parser.add_argument('--lr', type=float, default=0.0001, help='Learning Rate.') 36 | parser.add_argument('--lrStep', type=float, default=5, help='Decay LR ever N steps.') 37 | parser.add_argument('--lrGamma', type=float, default=0.5, help='Multiply LR by Gamma for decaying.') 38 | parser.add_argument('--weightDecay', type=float, default=0.001, help='Weight decay for SGD.') 39 | parser.add_argument('--momentum', type=float, default=0.9, help='Momentum for SGD.') 40 | parser.add_argument('--nocuda', action='store_true', help='Dont use cuda') 41 | parser.add_argument('--threads', type=int, default=8, help='Number of threads for each data loader to use') 42 | parser.add_argument('--seed', type=int, default=123, help='Random seed to use.') 43 | parser.add_argument('--dataPath', type=str, default='/nfs/ibrahimi/data/', help='Path for centroid data.') 44 | parser.add_argument('--runsPath', type=str, default='/nfs/ibrahimi/runs/', help='Path to save runs to.') 45 | parser.add_argument('--savePath', type=str, default='checkpoints', 46 | help='Path to save checkpoints to in logdir. Default=checkpoints/') 47 | parser.add_argument('--cachePath', type=str, default=environ['TMPDIR'], help='Path to save cache to.') 48 | parser.add_argument('--resume', type=str, default='', help='Path to load checkpoint from, for resuming training or testing.') 49 | parser.add_argument('--ckpt', type=str, default='latest', 50 | help='Resume from latest or best checkpoint.', choices=['latest', 'best']) 51 | parser.add_argument('--evalEvery', type=int, default=1, 52 | help='Do a validation set run, and save, every N epochs.') 53 | parser.add_argument('--patience', type=int, default=10, help='Patience for early stopping. 0 is off.') 54 | parser.add_argument('--dataset', type=str, default='pittsburgh', 55 | help='Dataset to use', choices=['pittsburgh']) 56 | parser.add_argument('--arch', type=str, default='vgg16', 57 | help='basenetwork to use', choices=['vgg16', 'alexnet']) 58 | parser.add_argument('--vladv2', action='store_true', help='Use VLAD v2') 59 | parser.add_argument('--pooling', type=str, default='netvlad', help='type of pooling to use', 60 | choices=['netvlad', 'max', 'avg']) 61 | parser.add_argument('--num_clusters', type=int, default=64, help='Number of NetVlad clusters. Default=64') 62 | parser.add_argument('--margin', type=float, default=0.1, help='Margin for triplet loss. Default=0.1') 63 | parser.add_argument('--split', type=str, default='val', help='Data split to use for testing. Default is val', 64 | choices=['test', 'test250k', 'train', 'val']) 65 | parser.add_argument('--fromscratch', action='store_true', help='Train from scratch rather than using pretrained models') 66 | 67 | def train(epoch): 68 | epoch_loss = 0 69 | startIter = 1 # keep track of batch iter across subsets for logging 70 | 71 | if opt.cacheRefreshRate > 0: 72 | subsetN = ceil(len(train_set) / opt.cacheRefreshRate) 73 | #TODO randomise the arange before splitting? 
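# A minimal sketch for the TODO above, assuming it is acceptable to shuffle the
# query order between cache refreshes; np.random.permutation(len(train_set))
# yields each index exactly once, so the subsets still cover the whole training set:
#   subsetIdx = np.array_split(np.random.permutation(len(train_set)), subsetN)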
74 | subsetIdx = np.array_split(np.arange(len(train_set)), subsetN) 75 | else: 76 | subsetN = 1 77 | subsetIdx = [np.arange(len(train_set))] 78 | 79 | nBatches = (len(train_set) + opt.batchSize - 1) // opt.batchSize 80 | 81 | for subIter in range(subsetN): 82 | print('====> Building Cache') 83 | model.eval() 84 | train_set.cache = join(opt.cachePath, train_set.whichSet + '_feat_cache.hdf5') 85 | with h5py.File(train_set.cache, mode='w') as h5: 86 | pool_size = encoder_dim 87 | if opt.pooling.lower() == 'netvlad': pool_size *= opt.num_clusters 88 | h5feat = h5.create_dataset("features", 89 | [len(whole_train_set), pool_size], 90 | dtype=np.float32) 91 | with torch.no_grad(): 92 | for iteration, (input, indices) in enumerate(whole_training_data_loader, 1): 93 | input = input.to(device) 94 | image_encoding = model.encoder(input) 95 | vlad_encoding = model.pool(image_encoding) 96 | h5feat[indices.detach().numpy(), :] = vlad_encoding.detach().cpu().numpy() 97 | del input, image_encoding, vlad_encoding 98 | 99 | sub_train_set = Subset(dataset=train_set, indices=subsetIdx[subIter]) 100 | 101 | training_data_loader = DataLoader(dataset=sub_train_set, num_workers=opt.threads, 102 | batch_size=opt.batchSize, shuffle=True, 103 | collate_fn=dataset.collate_fn, pin_memory=cuda) 104 | 105 | print('Allocated:', torch.cuda.memory_allocated()) 106 | print('Cached:', torch.cuda.memory_cached()) 107 | 108 | model.train() 109 | for iteration, (query, positives, negatives, 110 | negCounts, indices) in enumerate(training_data_loader, startIter): 111 | # some reshaping to put query, pos, negs in a single (N, 3, H, W) tensor 112 | # where N = batchSize * (nQuery + nPos + nNeg) 113 | if query is None: continue # in case we get an empty batch 114 | 115 | B, C, H, W = query.shape 116 | nNeg = torch.sum(negCounts) 117 | input = torch.cat([query, positives, negatives]) 118 | 119 | input = input.to(device) 120 | image_encoding = model.encoder(input) 121 | vlad_encoding = model.pool(image_encoding) 122 | 123 | vladQ, vladP, vladN = torch.split(vlad_encoding, [B, B, nNeg]) 124 | 125 | optimizer.zero_grad() 126 | 127 | # calculate loss for each Query, Positive, Negative triplet 128 | # due to potential difference in number of negatives have to 129 | # do it per query, per negative 130 | loss = 0 131 | for i, negCount in enumerate(negCounts): 132 | for n in range(negCount): 133 | negIx = (torch.sum(negCounts[:i]) + n).item() 134 | loss += criterion(vladQ[i:i+1], vladP[i:i+1], vladN[negIx:negIx+1]) 135 | 136 | loss /= nNeg.float().to(device) # normalise by actual number of negatives 137 | loss.backward() 138 | optimizer.step() 139 | del input, image_encoding, vlad_encoding, vladQ, vladP, vladN 140 | del query, positives, negatives 141 | 142 | batch_loss = loss.item() 143 | epoch_loss += batch_loss 144 | 145 | if iteration % 50 == 0 or nBatches <= 10: 146 | print("==> Epoch[{}]({}/{}): Loss: {:.4f}".format(epoch, iteration, 147 | nBatches, batch_loss), flush=True) 148 | writer.add_scalar('Train/Loss', batch_loss, 149 | ((epoch-1) * nBatches) + iteration) 150 | writer.add_scalar('Train/nNeg', nNeg, 151 | ((epoch-1) * nBatches) + iteration) 152 | print('Allocated:', torch.cuda.memory_allocated()) 153 | print('Cached:', torch.cuda.memory_cached()) 154 | 155 | startIter += len(training_data_loader) 156 | del training_data_loader, loss 157 | optimizer.zero_grad() 158 | torch.cuda.empty_cache() 159 | remove(train_set.cache) # delete HDF5 cache 160 | 161 | avg_loss = epoch_loss / nBatches 162 | 163 | print("===> Epoch {} Complete: 
Avg. Loss: {:.4f}".format(epoch, avg_loss), 164 | flush=True) 165 | writer.add_scalar('Train/AvgLoss', avg_loss, epoch) 166 | 167 | def test(eval_set, epoch=0, write_tboard=False): 168 | # TODO what if features dont fit in memory? 169 | test_data_loader = DataLoader(dataset=eval_set, 170 | num_workers=opt.threads, batch_size=opt.cacheBatchSize, shuffle=False, 171 | pin_memory=cuda) 172 | 173 | model.eval() 174 | with torch.no_grad(): 175 | print('====> Extracting Features') 176 | pool_size = encoder_dim 177 | if opt.pooling.lower() == 'netvlad': pool_size *= opt.num_clusters 178 | dbFeat = np.empty((len(eval_set), pool_size)) 179 | 180 | for iteration, (input, indices) in enumerate(test_data_loader, 1): 181 | input = input.to(device) 182 | image_encoding = model.encoder(input) 183 | vlad_encoding = model.pool(image_encoding) 184 | 185 | dbFeat[indices.detach().numpy(), :] = vlad_encoding.detach().cpu().numpy() 186 | if iteration % 50 == 0 or len(test_data_loader) <= 10: 187 | print("==> Batch ({}/{})".format(iteration, 188 | len(test_data_loader)), flush=True) 189 | 190 | del input, image_encoding, vlad_encoding 191 | del test_data_loader 192 | 193 | # extracted for both db and query, now split in own sets 194 | qFeat = dbFeat[eval_set.dbStruct.numDb:].astype('float32') 195 | dbFeat = dbFeat[:eval_set.dbStruct.numDb].astype('float32') 196 | 197 | print('====> Building faiss index') 198 | faiss_index = faiss.IndexFlatL2(pool_size) 199 | faiss_index.add(dbFeat) 200 | 201 | print('====> Calculating recall @ N') 202 | n_values = [1,5,10,20] 203 | 204 | _, predictions = faiss_index.search(qFeat, max(n_values)) 205 | 206 | # for each query get those within threshold distance 207 | gt = eval_set.getPositives() 208 | 209 | correct_at_n = np.zeros(len(n_values)) 210 | #TODO can we do this on the matrix in one go? 
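# A hedged sketch for the TODO above (not a single matrix op, since gt[qIx] has a
# different length per query): compute the rank of the first correct prediction for
# each query, then count how many queries have that rank below each N. This assumes
# `predictions` has max(n_values) columns, as returned by faiss_index.search above.
#   first_hit = np.array([
#       next((r for r, p in enumerate(pred) if p in gt[qIx]), max(n_values))
#       for qIx, pred in enumerate(predictions)])
#   correct_at_n = np.array([np.sum(first_hit < n) for n in n_values], dtype=float)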
211 | for qIx, pred in enumerate(predictions): 212 | for i,n in enumerate(n_values): 213 | # if in top N then also in top NN, where NN > N 214 | if np.any(np.in1d(pred[:n], gt[qIx])): 215 | correct_at_n[i:] += 1 216 | break 217 | recall_at_n = correct_at_n / eval_set.dbStruct.numQ 218 | 219 | recalls = {} #make dict for output 220 | for i,n in enumerate(n_values): 221 | recalls[n] = recall_at_n[i] 222 | print("====> Recall@{}: {:.4f}".format(n, recall_at_n[i])) 223 | if write_tboard: writer.add_scalar('Val/Recall@' + str(n), recall_at_n[i], epoch) 224 | 225 | return recalls 226 | 227 | def get_clusters(cluster_set): 228 | nDescriptors = 50000 229 | nPerImage = 100 230 | nIm = ceil(nDescriptors/nPerImage) 231 | 232 | sampler = SubsetRandomSampler(np.random.choice(len(cluster_set), nIm, replace=False)) 233 | data_loader = DataLoader(dataset=cluster_set, 234 | num_workers=opt.threads, batch_size=opt.cacheBatchSize, shuffle=False, 235 | pin_memory=cuda, 236 | sampler=sampler) 237 | 238 | if not exists(join(opt.dataPath, 'centroids')): 239 | makedirs(join(opt.dataPath, 'centroids')) 240 | 241 | initcache = join(opt.dataPath, 'centroids', opt.arch + '_' + cluster_set.dataset + '_' + str(opt.num_clusters) + '_desc_cen.hdf5') 242 | with h5py.File(initcache, mode='w') as h5: 243 | with torch.no_grad(): 244 | model.eval() 245 | print('====> Extracting Descriptors') 246 | dbFeat = h5.create_dataset("descriptors", 247 | [nDescriptors, encoder_dim], 248 | dtype=np.float32) 249 | 250 | for iteration, (input, indices) in enumerate(data_loader, 1): 251 | input = input.to(device) 252 | image_descriptors = model.encoder(input).view(input.size(0), encoder_dim, -1).permute(0, 2, 1) 253 | 254 | batchix = (iteration-1)*opt.cacheBatchSize*nPerImage 255 | for ix in range(image_descriptors.size(0)): 256 | # sample different location for each image in batch 257 | sample = np.random.choice(image_descriptors.size(1), nPerImage, replace=False) 258 | startix = batchix + ix*nPerImage 259 | dbFeat[startix:startix+nPerImage, :] = image_descriptors[ix, sample, :].detach().cpu().numpy() 260 | 261 | if iteration % 50 == 0 or len(data_loader) <= 10: 262 | print("==> Batch ({}/{})".format(iteration, 263 | ceil(nIm/opt.cacheBatchSize)), flush=True) 264 | del input, image_descriptors 265 | 266 | print('====> Clustering..') 267 | niter = 100 268 | kmeans = faiss.Kmeans(encoder_dim, opt.num_clusters, niter, verbose=False) 269 | kmeans.train(dbFeat[...]) 270 | 271 | print('====> Storing centroids', kmeans.centroids.shape) 272 | h5.create_dataset('centroids', data=kmeans.centroids) 273 | print('====> Done!') 274 | 275 | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): 276 | model_out_path = join(opt.savePath, filename) 277 | torch.save(state, model_out_path) 278 | if is_best: 279 | shutil.copyfile(model_out_path, join(opt.savePath, 'model_best.pth.tar')) 280 | 281 | class Flatten(nn.Module): 282 | def forward(self, input): 283 | return input.view(input.size(0), -1) 284 | 285 | class L2Norm(nn.Module): 286 | def __init__(self, dim=1): 287 | super().__init__() 288 | self.dim = dim 289 | 290 | def forward(self, input): 291 | return F.normalize(input, p=2, dim=self.dim) 292 | 293 | if __name__ == "__main__": 294 | opt = parser.parse_args() 295 | 296 | restore_var = ['lr', 'lrStep', 'lrGamma', 'weightDecay', 'momentum', 297 | 'runsPath', 'savePath', 'arch', 'num_clusters', 'pooling', 'optim', 298 | 'margin', 'seed', 'patience'] 299 | if opt.resume: 300 | flag_file = join(opt.resume, 'checkpoints', 'flags.json') 301 | 
if exists(flag_file): 302 | with open(flag_file, 'r') as f: 303 | stored_flags = {'--'+k : str(v) for k,v in json.load(f).items() if k in restore_var} 304 | to_del = [] 305 | for flag, val in stored_flags.items(): 306 | for act in parser._actions: 307 | if act.dest == flag[2:]: 308 | # store_true / store_false args don't accept arguments, filter these 309 | if type(act.const) == type(True): 310 | if val == str(act.default): 311 | to_del.append(flag) 312 | else: 313 | stored_flags[flag] = '' 314 | for flag in to_del: del stored_flags[flag] 315 | 316 | train_flags = [x for x in list(sum(stored_flags.items(), tuple())) if len(x) > 0] 317 | print('Restored flags:', train_flags) 318 | opt = parser.parse_args(train_flags, namespace=opt) 319 | 320 | print(opt) 321 | 322 | if opt.dataset.lower() == 'pittsburgh': 323 | import pittsburgh as dataset 324 | else: 325 | raise Exception('Unknown dataset') 326 | 327 | cuda = not opt.nocuda 328 | if cuda and not torch.cuda.is_available(): 329 | raise Exception("No GPU found, please run with --nocuda") 330 | 331 | device = torch.device("cuda" if cuda else "cpu") 332 | 333 | random.seed(opt.seed) 334 | np.random.seed(opt.seed) 335 | torch.manual_seed(opt.seed) 336 | if cuda: 337 | torch.cuda.manual_seed(opt.seed) 338 | 339 | print('===> Loading dataset(s)') 340 | if opt.mode.lower() == 'train': 341 | whole_train_set = dataset.get_whole_training_set() 342 | whole_training_data_loader = DataLoader(dataset=whole_train_set, 343 | num_workers=opt.threads, batch_size=opt.cacheBatchSize, shuffle=False, 344 | pin_memory=cuda) 345 | 346 | train_set = dataset.get_training_query_set(opt.margin) 347 | 348 | print('====> Training query set:', len(train_set)) 349 | whole_test_set = dataset.get_whole_val_set() 350 | print('===> Evaluating on val set, query count:', whole_test_set.dbStruct.numQ) 351 | elif opt.mode.lower() == 'test': 352 | if opt.split.lower() == 'test': 353 | whole_test_set = dataset.get_whole_test_set() 354 | print('===> Evaluating on test set') 355 | elif opt.split.lower() == 'test250k': 356 | whole_test_set = dataset.get_250k_test_set() 357 | print('===> Evaluating on test250k set') 358 | elif opt.split.lower() == 'train': 359 | whole_test_set = dataset.get_whole_training_set() 360 | print('===> Evaluating on train set') 361 | elif opt.split.lower() == 'val': 362 | whole_test_set = dataset.get_whole_val_set() 363 | print('===> Evaluating on val set') 364 | else: 365 | raise ValueError('Unknown dataset split: ' + opt.split) 366 | print('====> Query count:', whole_test_set.dbStruct.numQ) 367 | elif opt.mode.lower() == 'cluster': 368 | whole_train_set = dataset.get_whole_training_set(onlyDB=True) 369 | 370 | print('===> Building model') 371 | 372 | pretrained = not opt.fromscratch 373 | if opt.arch.lower() == 'alexnet': 374 | encoder_dim = 256 375 | encoder = models.alexnet(pretrained=pretrained) 376 | # capture only features and remove last relu and maxpool 377 | layers = list(encoder.features.children())[:-2] 378 | 379 | if pretrained: 380 | # if using pretrained only train conv5 381 | for l in layers[:-1]: 382 | for p in l.parameters(): 383 | p.requires_grad = False 384 | 385 | elif opt.arch.lower() == 'vgg16': 386 | encoder_dim = 512 387 | encoder = models.vgg16(pretrained=pretrained) 388 | # capture only feature part and remove last relu and maxpool 389 | layers = list(encoder.features.children())[:-2] 390 | 391 | if pretrained: 392 | # if using pretrained then only train conv5_1, conv5_2, and conv5_3 393 | for l in layers[:-5]: 394 | for p in 
l.parameters(): 395 | p.requires_grad = False 396 | 397 | if opt.mode.lower() == 'cluster' and not opt.vladv2: 398 | layers.append(L2Norm()) 399 | 400 | encoder = nn.Sequential(*layers) 401 | model = nn.Module() 402 | model.add_module('encoder', encoder) 403 | 404 | if opt.mode.lower() != 'cluster': 405 | if opt.pooling.lower() == 'netvlad': 406 | net_vlad = netvlad.NetVLAD(num_clusters=opt.num_clusters, dim=encoder_dim, vladv2=opt.vladv2) 407 | if not opt.resume: 408 | if opt.mode.lower() == 'train': 409 | initcache = join(opt.dataPath, 'centroids', opt.arch + '_' + train_set.dataset + '_' + str(opt.num_clusters) +'_desc_cen.hdf5') 410 | else: 411 | initcache = join(opt.dataPath, 'centroids', opt.arch + '_' + whole_test_set.dataset + '_' + str(opt.num_clusters) +'_desc_cen.hdf5') 412 | 413 | if not exists(initcache): 414 | raise FileNotFoundError('Could not find clusters, please run with --mode=cluster before proceeding') 415 | 416 | with h5py.File(initcache, mode='r') as h5: 417 | clsts = h5.get("centroids")[...] 418 | traindescs = h5.get("descriptors")[...] 419 | net_vlad.init_params(clsts, traindescs) 420 | del clsts, traindescs 421 | 422 | model.add_module('pool', net_vlad) 423 | elif opt.pooling.lower() == 'max': 424 | global_pool = nn.AdaptiveMaxPool2d((1,1)) 425 | model.add_module('pool', nn.Sequential(*[global_pool, Flatten(), L2Norm()])) 426 | elif opt.pooling.lower() == 'avg': 427 | global_pool = nn.AdaptiveAvgPool2d((1,1)) 428 | model.add_module('pool', nn.Sequential(*[global_pool, Flatten(), L2Norm()])) 429 | else: 430 | raise ValueError('Unknown pooling type: ' + opt.pooling) 431 | 432 | isParallel = False 433 | if opt.nGPU > 1 and torch.cuda.device_count() > 1: 434 | model.encoder = nn.DataParallel(model.encoder) 435 | if opt.mode.lower() != 'cluster': 436 | model.pool = nn.DataParallel(model.pool) 437 | isParallel = True 438 | 439 | if not opt.resume: 440 | model = model.to(device) 441 | 442 | if opt.mode.lower() == 'train': 443 | if opt.optim.upper() == 'ADAM': 444 | optimizer = optim.Adam(filter(lambda p: p.requires_grad, 445 | model.parameters()), lr=opt.lr)#, betas=(0,0.9)) 446 | elif opt.optim.upper() == 'SGD': 447 | optimizer = optim.SGD(filter(lambda p: p.requires_grad, 448 | model.parameters()), lr=opt.lr, 449 | momentum=opt.momentum, 450 | weight_decay=opt.weightDecay) 451 | 452 | scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=opt.lrStep, gamma=opt.lrGamma) 453 | else: 454 | raise ValueError('Unknown optimizer: ' + opt.optim) 455 | 456 | # original paper/code doesn't sqrt() the distances, we do, so sqrt() the margin, I think :D 457 | criterion = nn.TripletMarginLoss(margin=opt.margin**0.5, 458 | p=2, reduction='sum').to(device) 459 | 460 | if opt.resume: 461 | if opt.ckpt.lower() == 'latest': 462 | resume_ckpt = join(opt.resume, 'checkpoints', 'checkpoint.pth.tar') 463 | elif opt.ckpt.lower() == 'best': 464 | resume_ckpt = join(opt.resume, 'checkpoints', 'model_best.pth.tar') 465 | 466 | if isfile(resume_ckpt): 467 | print("=> loading checkpoint '{}'".format(resume_ckpt)) 468 | checkpoint = torch.load(resume_ckpt, map_location=lambda storage, loc: storage) 469 | opt.start_epoch = checkpoint['epoch'] 470 | best_metric = checkpoint['best_score'] 471 | model.load_state_dict(checkpoint['state_dict']) 472 | model = model.to(device) 473 | if opt.mode == 'train': 474 | optimizer.load_state_dict(checkpoint['optimizer']) 475 | print("=> loaded checkpoint '{}' (epoch {})" 476 | .format(resume_ckpt, checkpoint['epoch'])) 477 | else: 478 | print("=> no checkpoint 
found at '{}'".format(resume_ckpt)) 479 | 480 | if opt.mode.lower() == 'test': 481 | print('===> Running evaluation step') 482 | epoch = 1 483 | recalls = test(whole_test_set, epoch, write_tboard=False) 484 | elif opt.mode.lower() == 'cluster': 485 | print('===> Calculating descriptors and clusters') 486 | get_clusters(whole_train_set) 487 | elif opt.mode.lower() == 'train': 488 | print('===> Training model') 489 | writer = SummaryWriter(log_dir=join(opt.runsPath, datetime.now().strftime('%b%d_%H-%M-%S')+'_'+opt.arch+'_'+opt.pooling)) 490 | 491 | # write checkpoints in logdir 492 | logdir = writer.file_writer.get_logdir() 493 | opt.savePath = join(logdir, opt.savePath) 494 | if not opt.resume: 495 | makedirs(opt.savePath) 496 | 497 | with open(join(opt.savePath, 'flags.json'), 'w') as f: 498 | f.write(json.dumps( 499 | {k:v for k,v in vars(opt).items()} 500 | )) 501 | print('===> Saving state to:', logdir) 502 | 503 | not_improved = 0 504 | best_score = 0 505 | for epoch in range(opt.start_epoch+1, opt.nEpochs + 1): 506 | if opt.optim.upper() == 'SGD': 507 | scheduler.step(epoch) 508 | train(epoch) 509 | if (epoch % opt.evalEvery) == 0: 510 | recalls = test(whole_test_set, epoch, write_tboard=True) 511 | is_best = recalls[5] > best_score 512 | if is_best: 513 | not_improved = 0 514 | best_score = recalls[5] 515 | else: 516 | not_improved += 1 517 | 518 | save_checkpoint({ 519 | 'epoch': epoch, 520 | 'state_dict': model.state_dict(), 521 | 'recalls': recalls, 522 | 'best_score': best_score, 523 | 'optimizer' : optimizer.state_dict(), 524 | 'parallel' : isParallel, 525 | }, is_best) 526 | 527 | if opt.patience > 0 and not_improved > (opt.patience / opt.evalEvery): 528 | print('Performance did not improve for', opt.patience, 'epochs. Stopping.') 529 | break 530 | 531 | print("=> Best Recall@5: {:.4f}".format(best_score), flush=True) 532 | writer.close() 533 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Based on: https://github.com/linrongc/youtube-8m/blob/master/eval_util.py 3 | """ 4 | 5 | import numpy as np 6 | from sklearn.metrics import average_precision_score 7 | from sklearn.preprocessing import MultiLabelBinarizer 8 | 9 | import average_precision_calculator as ap_calculator 10 | 11 | 12 | def flatten(l): 13 | """ Merges a list of lists into a single list. """ 14 | return [item for sublist in l for item in sublist] 15 | 16 | 17 | def calculate_gap(predictions, actuals, top_k=20): 18 | """Performs a local (numpy) calculation of the global average precision. 19 | Only the top_k predictions are taken for each of the videos. 20 | Args: 21 | predictions: Matrix containing the outputs of the model. 22 | Dimensions are 'batch' x 'num_classes'. 23 | actuals: Matrix containing the ground truth labels. 24 | Dimensions are 'batch' x 'num_classes'. 25 | top_k: How many predictions to use per video. 26 | Returns: 27 | float: The global average precision. 28 | """ 29 | gap_calculator = ap_calculator.AveragePrecisionCalculator() 30 | sparse_predictions, sparse_labels, num_positives = top_k_by_class(predictions, actuals, top_k) 31 | gap_calculator.accumulate(flatten(sparse_predictions), flatten(sparse_labels), sum(num_positives)) 32 | return gap_calculator.peek_ap_at_n() 33 | 34 | 35 | def top_k_by_class(predictions, labels, k=20): 36 | """Extracts the top k predictions for each video, sorted by class. 
37 | Args: 38 | predictions: A numpy matrix containing the outputs of the model. 39 | Dimensions are 'batch' x 'num_classes'. 40 | k: the top k non-zero entries to preserve in each prediction. 41 | Returns: 42 | A tuple (predictions,labels, true_positives). 'predictions' and 'labels' 43 | are lists of lists of floats. 'true_positives' is a list of scalars. The 44 | length of the lists are equal to the number of classes. The entries in the 45 | predictions variable are probability predictions, and 46 | the corresponding entries in the labels variable are the ground truth for 47 | those predictions. The entries in 'true_positives' are the number of true 48 | positives for each class in the ground truth. 49 | Raises: 50 | ValueError: An error occurred when the k is not a positive integer. 51 | """ 52 | if k <= 0: 53 | raise ValueError("k must be a positive integer.") 54 | k = min(k, predictions.shape[1]) 55 | num_classes = predictions.shape[1] 56 | prediction_triplets = [] 57 | for video_index in range(predictions.shape[0]): 58 | prediction_triplets.extend(top_k_triplets(predictions[video_index], labels[video_index], k)) 59 | out_predictions = [[] for v in range(num_classes)] 60 | out_labels = [[] for v in range(num_classes)] 61 | for triplet in prediction_triplets: 62 | out_predictions[triplet[0]].append(triplet[1]) 63 | out_labels[triplet[0]].append(triplet[2]) 64 | out_true_positives = [np.sum(labels[:, i]) for i in range(num_classes)] 65 | 66 | return out_predictions, out_labels, out_true_positives 67 | 68 | 69 | def top_k_triplets(predictions, labels, k=20): 70 | """Get the top_k for a 1-d numpy array. Returns a sparse list of tuples in 71 | (prediction, class) format""" 72 | m = len(predictions) 73 | k = min(k, m) 74 | indices = np.argpartition(predictions, -k)[-k:] 75 | 76 | return [(index, predictions[index], labels[index]) for index in indices] 77 | -------------------------------------------------------------------------------- /models/NeXtVLAD.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class NeXtVLAD(nn.Module): 7 | """NeXtVLAD layer implementation""" 8 | 9 | def __init__(self, dim=1024, num_clusters=64, lamb=2, groups=8, max_frames=300): 10 | super(NeXtVLAD, self).__init__() 11 | self.num_clusters = num_clusters 12 | self.dim = dim 13 | self.alpha = 0 14 | self.K = num_clusters 15 | self.G = groups 16 | self.group_size = int((lamb * dim) // self.G) 17 | # expansion FC 18 | self.fc0 = nn.Linear(dim, lamb * dim) 19 | # soft assignment FC (the cluster weights) 20 | self.fc_gk = nn.Linear(lamb * dim, self.G * self.K) 21 | # attention over groups FC 22 | self.fc_g = nn.Linear(lamb * dim, self.G) 23 | self.cluster_weights2 = nn.Parameter(torch.rand(1, self.group_size, self.K)) 24 | 25 | self.bn0 = nn.BatchNorm1d(max_frames) 26 | self.bn1 = nn.BatchNorm1d(1) 27 | 28 | def forward(self, x, mask=None): 29 | # print(f"x: {x.shape}") 30 | 31 | _, M, N = x.shape 32 | # expansion FC: B x M x N -> B x M x λN 33 | x_dot = self.fc0(x) 34 | 35 | # reshape into groups: B x M x λN -> B x M x G x (λN/G) 36 | x_tilde = x_dot.reshape(-1, M, self.G, self.group_size) 37 | 38 | # residuals across groups and clusters: B x M x λN -> B x M x (G*K) 39 | WgkX = self.fc_gk(x_dot) 40 | WgkX = self.bn0(WgkX) 41 | 42 | # residuals reshape across clusters: B x M x (G*K) -> B x (M*G) x K 43 | WgkX = WgkX.reshape(-1, M * self.G, self.K) 44 | 45 | # softmax over assignment: B x (M*G) x K 
-> B x (M*G) x K 46 | alpha_gk = F.softmax(WgkX, dim=-1) 47 | 48 | # attention across groups: B x M x λN -> B x M x G 49 | alpha_g = torch.sigmoid(self.fc_g(x_dot)) 50 | if mask is not None: 51 | alpha_g = torch.mul(alpha_g, mask.unsqueeze(2)) 52 | 53 | # reshape across time: B x M x G -> B x (M*G) x 1 54 | alpha_g = alpha_g.reshape(-1, M * self.G, 1) 55 | 56 | # apply attention: B x (M*G) x K (X) B x (M*G) x 1 -> B x (M*G) x K 57 | activation = torch.mul(alpha_gk, alpha_g) 58 | 59 | # sum over time and group: B x (M*G) x K -> B x 1 x K 60 | a_sum = torch.sum(activation, -2, keepdim=True) 61 | 62 | # calculate group centers: B x 1 x K (X) 1 x (λN/G) x K -> B x (λN/G) x K 63 | a = torch.mul(a_sum, self.cluster_weights2) 64 | 65 | # permute: B x (M*G) x K -> B x K x (M*G) 66 | activation = activation.permute(0, 2, 1) 67 | 68 | # reshape: B x M x G x (λN/G) -> B x (M*G) x (λN/G) 69 | reshaped_x_tilde = x_tilde.reshape(-1, M * self.G, self.group_size) 70 | 71 | # cluster activation: B x K x (M*G) (X) B x (M*G) x (λN/G) -> B x K x (λN/G) 72 | vlad = torch.matmul(activation, reshaped_x_tilde) 73 | # print(f"vlad: {vlad.shape}") 74 | 75 | # permute: B x K x (λN/G) -> B x (λN/G) x K 76 | vlad = vlad.permute(0, 2, 1) 77 | # distance to centers: B x (λN/G) x K (-) B x (λN/G) x K 78 | vlad = torch.sub(vlad, a) 79 | # normalize: B x (λN/G) x K 80 | vlad = F.normalize(vlad, 1) 81 | # reshape: B x (λN/G) x K -> B x 1 x (K * (λN/G)) 82 | vlad = vlad.reshape(-1, 1, self.K * self.group_size) 83 | vlad = self.bn1(vlad) 84 | # reshape: B x 1 x (K * (λN/G)) -> B x (K * (λN/G)) 85 | vlad = vlad.reshape(-1, self.K * self.group_size) 86 | 87 | return vlad 88 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w-garcia/NeXtVLAD.pytorch/cc7235b578ad092cd180083397de7411c1fe4684/models/__init__.py -------------------------------------------------------------------------------- /models/video_classifiers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .NeXtVLAD import NeXtVLAD 6 | 7 | 8 | class NeXtVLADModel(nn.Module): 9 | def __init__(self, num_classes, num_clusters=64, dim=1024, lamb=2, hidden_size=1024, 10 | groups=8, max_frames=300, drop_rate=0.5, gating_reduction=8): 11 | super(NeXtVLADModel, self).__init__() 12 | self.drop_rate = drop_rate 13 | self.group_size = int((lamb * dim) // groups) 14 | self.fc0 = nn.Linear(num_clusters * self.group_size, hidden_size) 15 | self.bn0 = nn.BatchNorm1d(1) 16 | self.fc1 = nn.Linear(hidden_size, hidden_size // gating_reduction) 17 | self.bn1 = nn.BatchNorm1d(1) 18 | self.fc2 = nn.Linear(hidden_size // gating_reduction, hidden_size) 19 | self.logistic = nn.Linear(hidden_size, num_classes) 20 | 21 | self.video_nextvlad = NeXtVLAD(1024, max_frames=max_frames, lamb=lamb, 22 | num_clusters=num_clusters, groups=groups) 23 | 24 | def forward(self, x, mask=None): 25 | # B x M x N -> B x (K * (λN/G)) 26 | vlad = self.video_nextvlad(x, mask=mask) 27 | 28 | # B x (K * (λN/G)) 29 | if self.drop_rate > 0.: 30 | vlad = F.dropout(vlad, p=self.drop_rate, training=self.training) 31 | 32 | # B x (K * (λN/G)) -> B x H0 33 | activation = self.fc0(vlad) 34 | activation = self.bn0(activation.unsqueeze(1)).squeeze() 35 | activation = F.relu(activation) 36 | # B x H0 -> B x Gr 37 | gates = self.fc1(activation) 38 | gates = 
self.bn1(gates.unsqueeze(1)).squeeze() 39 | # B x Gr -> B x H0 40 | gates = self.fc2(gates) 41 | gates = torch.sigmoid(gates) 42 | # B x H0 -> B x H0 43 | activation = torch.mul(activation, gates) 44 | # B x H0 -> B x k 45 | out = self.logistic(activation) 46 | out = torch.sigmoid(out) 47 | 48 | return out 49 | 50 | 51 | class ConvNeXtVLADModel(nn.Module): 52 | """ 53 | A full Conv + neXtVLAD video classifier pipeline 54 | """ 55 | 56 | def __init__(self, nextvlad_model, eigenvecs, eigenvals, center, device, opt): 57 | super(ConvNeXtVLADModel, self).__init__() 58 | import pretrainedmodels 59 | self.ftype = opt['type'] 60 | self.conv = pretrainedmodels.__dict__[opt['type']](num_classes=1000, pretrained='imagenet') 61 | self.device = device 62 | self.eigenvecs = torch.from_numpy(eigenvecs).type(torch.FloatTensor).to(device) 63 | # self.eigenvals = torch.from_numpy(eigenvals).type(torch.FloatTensor) 64 | self.center = torch.from_numpy(center).type(torch.FloatTensor).to(device) 65 | self.video_classifier = nextvlad_model 66 | 67 | def _process_batch(self, batch): 68 | output_features = self.conv.features(batch) 69 | # output_features = output_features.data.cpu() 70 | 71 | conv_size = output_features.shape[-1] 72 | 73 | if self.ftype == 'nasnetalarge' or self.ftype == 'pnasnet5large': 74 | relu = nn.ReLU() 75 | rf = relu(output_features) 76 | avg_pool = nn.AvgPool2d(conv_size, stride=1, padding=0) 77 | out_feats = avg_pool(rf) 78 | else: 79 | avg_pool = nn.AdaptiveAvgPool2d((1, 1)) 80 | # B x H0 x 1 x 1 81 | out_feats = avg_pool(output_features) 82 | # B x H0 83 | out_feats = out_feats.view(out_feats.size(0), -1) 84 | 85 | # PCA (no whiten): 86 | # B x H0 (-) B x H0 87 | out_feats = out_feats - self.center 88 | # B x H0 -> B x 1 x (H0/2) 89 | out_feats = out_feats.unsqueeze(1).matmul(torch.t(self.eigenvecs)) 90 | # verification: 91 | # (np) out_feats[0].detach().cpu().numpy().reshape(1, 2048).dot(self.eigenvecs.detach().cpu().numpy().T) 92 | # == 93 | # (torch) out_feats.unsqueeze(1).matmul(torch.t(self.eigenvecs))[0] 94 | 95 | # B x (H0/2) 96 | return out_feats.squeeze(1) 97 | 98 | def conv_forward(self, frame_batch): 99 | return self._process_batch(frame_batch) 100 | 101 | def nextvlad_model_forward(self, vid_feats, mask): 102 | return self.video_classifier.forward(vid_feats, mask) 103 | -------------------------------------------------------------------------------- /notebooks/arch_debug.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from importlib import reload" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import dataloader\n", 19 | "reload(dataloader)\n", 20 | "from dataloader import VideoClassificationDataset\n", 21 | "import argparse" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "opt = {\n", 31 | " 'feats_dir': \"/home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/train_PCA-1024\",\n", 32 | " 'max_frames': 50\n", 33 | "}" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "load feats from /home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/train_PCA-1024\n", 46 | "Pre-cache 309 features in 
memory.\n", 47 | "Finished initializing dataloader.\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "train_dataset = VideoClassificationDataset(opt, 'train')" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import torch\n", 62 | "import torch.nn as nn\n", 63 | "import torch.nn.functional as F\n", 64 | "import torch.optim as optim\n", 65 | "\n", 66 | "device = torch.device(\"cuda:0\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 6, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "from torch.utils.data import DataLoader" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "train_loader = DataLoader(train_dataset,\n", 85 | " batch_size=8,\n", 86 | " num_workers=4,\n", 87 | " shuffle=True)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 11, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "class NeXtVLAD(nn.Module):\n", 97 | " \"\"\"NeXtVLAD layer implementation\"\"\"\n", 98 | "\n", 99 | " def __init__(self, dim=1024, num_clusters=64, lamb=2, groups=8, max_frames=300):\n", 100 | " super(NeXtVLAD, self).__init__()\n", 101 | " self.num_clusters = num_clusters\n", 102 | " self.dim = dim\n", 103 | " self.alpha = 0\n", 104 | " self.K = num_clusters\n", 105 | " self.G = groups\n", 106 | " self.group_size = int((lamb*dim) // self.G)\n", 107 | " # expansion FC\n", 108 | " self.fc0 = nn.Linear(dim, lamb*dim)\n", 109 | " # soft assignment FC (the cluster weights)\n", 110 | " self.fc_gk = nn.Linear(lamb*dim, self.G * self.K)\n", 111 | " # attention over groups FC\n", 112 | " self.fc_g = nn.Linear(lamb*dim, self.G)\n", 113 | " self.cluster_weights2 = nn.Parameter(torch.rand(1, self.group_size, self.K))\n", 114 | " \n", 115 | " self.bn0 = nn.BatchNorm1d(max_frames)\n", 116 | " self.bn1 = nn.BatchNorm1d(1)\n", 117 | " \n", 118 | " \n", 119 | " def forward(self, x, mask=None):\n", 120 | "# print(f\"x: {x.shape}\")\n", 121 | " \n", 122 | " _, M, N = x.shape\n", 123 | " # expansion FC: B x M x N -> B x M x λN\n", 124 | " x_dot = self.fc0(x) \n", 125 | " \n", 126 | " # reshape into groups: B x M x λN -> B x M x G x (λN/G)\n", 127 | " x_tilde = x_dot.reshape(-1, M, self.G, self.group_size)\n", 128 | " \n", 129 | " # residuals across groups and clusters: B x M x λN -> B x M x (G*K) \n", 130 | " WgkX = self.fc_gk(x_dot)\n", 131 | " WgkX = self.bn0(WgkX)\n", 132 | " \n", 133 | " # residuals reshape across clusters: B x M x (G*K) -> B x (M*G) x K\n", 134 | " WgkX = WgkX.reshape(-1, M*self.G, self.K)\n", 135 | " \n", 136 | " # softmax over assignment: B x (M*G) x K -> B x (M*G) x K\n", 137 | " alpha_gk = F.softmax(WgkX, dim=-1)\n", 138 | " \n", 139 | " # attention across groups: B x M x λN -> B x M x G\n", 140 | " alpha_g = torch.sigmoid(self.fc_g(x_dot))\n", 141 | " if mask is not None:\n", 142 | " alpha_g = torch.mul(alpha_g, mask.unsqueeze(2))\n", 143 | " \n", 144 | " # reshape across time: B x M x G -> B x (M*G) x 1\n", 145 | " alpha_g = alpha_g.reshape(-1, M*self.G, 1)\n", 146 | " \n", 147 | " # apply attention: B x (M*G) x K (X) B x (M*G) x 1 -> B x (M*G) x K\n", 148 | " activation = torch.mul(alpha_gk, alpha_g)\n", 149 | " \n", 150 | " # sum over time and group: B x (M*G) x K -> B x 1 x K\n", 151 | " a_sum = torch.sum(activation, -2, keepdim=True)\n", 152 | " \n", 153 | " # calculate group centers: B x 1 x K (X) 1 x (λN/G) x K -> B x (λN/G) x K\n", 154 | " 
a = torch.mul(a_sum, self.cluster_weights2)\n", 155 | " \n", 156 | " # permute: B x (M*G) x K -> B x K x (M*G)\n", 157 | " activation = activation.permute(0, 2, 1)\n", 158 | " \n", 159 | " # reshape: B x M x G x (λN/G) -> B x (M*G) x (λN/G)\n", 160 | " reshaped_x_tilde = x_tilde.reshape(-1, M * self.G, self.group_size)\n", 161 | " \n", 162 | " # cluster activation: B x K x (M*G) (X) B x (M*G) x (λN/G) -> B x K x (λN/G)\n", 163 | " vlad = torch.matmul(activation, reshaped_x_tilde)\n", 164 | " # print(f\"vlad: {vlad.shape}\")\n", 165 | " \n", 166 | " # permute: B x K x (λN/G) (X) B x (λN/G) x K\n", 167 | " vlad = vlad.permute(0, 2, 1)\n", 168 | " # distance to centers: B x (λN/G) x K (-) B x (λN/G) x K\n", 169 | " vlad = torch.sub(vlad, a)\n", 170 | " # normalize: B x (λN/G) x K\n", 171 | " vlad = F.normalize(vlad, 1)\n", 172 | " # reshape: B x (λN/G) x K -> B x 1 x (K * (λN/G))\n", 173 | " vlad = vlad.reshape(-1, 1, self.K*self.group_size)\n", 174 | " vlad = self.bn1(vlad)\n", 175 | " # reshape: B x 1 x (K * (λN/G)) -> B x (K * (λN/G)) \n", 176 | " vlad = vlad.reshape(-1, self.K*self.group_size)\n", 177 | " \n", 178 | " return vlad" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 12, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "class NeXtVLADModel(nn.Module):\n", 188 | " def __init__(self, num_classes, num_clusters=64, dim=1024, lamb=2, hidden_size=1024, \n", 189 | " groups=8, max_frames=300, drop_rate=0.5, gating_reduction=8):\n", 190 | " super(NeXtVLADModel, self).__init__()\n", 191 | " self.drop_rate = drop_rate\n", 192 | " self.group_size = int((lamb*dim) // groups)\n", 193 | " self.fc0 = nn.Linear(num_clusters*self.group_size, hidden_size)\n", 194 | " self.bn0 = nn.BatchNorm1d(1)\n", 195 | " self.fc1 = nn.Linear(hidden_size, hidden_size // gating_reduction)\n", 196 | " self.bn1 = nn.BatchNorm1d(1)\n", 197 | " self.fc2 = nn.Linear(hidden_size // gating_reduction, hidden_size)\n", 198 | " self.logistic = nn.Linear(hidden_size, num_classes)\n", 199 | " \n", 200 | " self.video_nextvlad = NeXtVLAD(1024, max_frames=max_frames, lamb=lamb, \n", 201 | " num_clusters=num_clusters, groups=groups)\n", 202 | " \n", 203 | " def forward(self, x, mask=None):\n", 204 | " # B x M x N -> B x (K * (λN/G)) \n", 205 | " vlad = self.video_nextvlad(x, mask=mask)\n", 206 | " \n", 207 | " # B x (K * (λN/G)) \n", 208 | " if self.drop_rate > 0.:\n", 209 | " vlad = F.dropout(vlad, p=self.drop_rate)\n", 210 | " \n", 211 | " # B x (K * (λN/G)) -> B x H0\n", 212 | " activation = self.fc0(vlad)\n", 213 | " activation = self.bn0(activation.unsqueeze(1)).squeeze()\n", 214 | " activation = F.relu(activation)\n", 215 | " # B x H0 -> B x Gr\n", 216 | " gates = self.fc1(activation)\n", 217 | " gates = self.bn1(gates.unsqueeze(1)).squeeze()\n", 218 | " # B x Gr -> B x H0\n", 219 | " gates = self.fc2(gates)\n", 220 | " gates = torch.sigmoid(gates)\n", 221 | " # B x H0\n", 222 | " activation = torch.mul(activation, gates)\n", 223 | " out = self.logistic(activation)\n", 224 | " out = torch.sigmoid(out)\n", 225 | " \n", 226 | " return out\n", 227 | " \n", 228 | " " 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 22, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "model = NeXtVLADModel(train_dataset.num_classes, max_frames=opt['max_frames'])" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 23, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | 
"text": [ 249 | "epoch:\t0,\tloss:0.71150803565979\n", 250 | "epoch:\t0,\tloss:0.7146843075752258\n", 251 | "epoch:\t0,\tloss:0.5954272150993347\n", 252 | "epoch:\t0,\tloss:0.6422020196914673\n", 253 | "epoch:\t0,\tloss:0.5928784012794495\n", 254 | "epoch:\t0,\tloss:0.5122247338294983\n", 255 | "epoch:\t0,\tloss:0.6339268088340759\n", 256 | "epoch:\t0,\tloss:0.6461817026138306\n", 257 | "epoch:\t0,\tloss:0.5485246777534485\n", 258 | "epoch:\t0,\tloss:0.4685041010379791\n", 259 | "epoch:\t0,\tloss:0.47569671273231506\n", 260 | "epoch:\t0,\tloss:0.44309744238853455\n", 261 | "epoch:\t0,\tloss:0.4427070617675781\n", 262 | "epoch:\t0,\tloss:0.4889450967311859\n", 263 | "epoch:\t0,\tloss:0.4677741527557373\n", 264 | "epoch:\t0,\tloss:0.5337631106376648\n", 265 | "epoch:\t0,\tloss:0.26898637413978577\n", 266 | "epoch:\t0,\tloss:0.4407520294189453\n", 267 | "epoch:\t0,\tloss:0.3478405773639679\n", 268 | "epoch:\t0,\tloss:0.3842126131057739\n", 269 | "epoch:\t0,\tloss:0.4324081242084503\n", 270 | "epoch:\t0,\tloss:0.3600882589817047\n", 271 | "epoch:\t0,\tloss:0.23256240785121918\n", 272 | "epoch:\t0,\tloss:0.3121560513973236\n", 273 | "epoch:\t0,\tloss:0.30288824439048767\n", 274 | "epoch:\t0,\tloss:0.17022843658924103\n", 275 | "epoch:\t0,\tloss:0.2929340898990631\n", 276 | "epoch:\t0,\tloss:0.12832005321979523\n", 277 | "epoch:\t0,\tloss:0.27824535965919495\n", 278 | "epoch:\t0,\tloss:0.1415291577577591\n", 279 | "epoch:\t0,\tloss:0.10328144580125809\n", 280 | "epoch:\t0,\tloss:0.18037013709545135\n", 281 | "epoch:\t0,\tloss:0.09852627664804459\n", 282 | "epoch:\t0,\tloss:0.24034304916858673\n", 283 | "epoch:\t0,\tloss:0.19834274053573608\n", 284 | "epoch:\t0,\tloss:0.18170402944087982\n", 285 | "epoch:\t0,\tloss:0.09749788045883179\n", 286 | "epoch:\t0,\tloss:0.11607547849416733\n", 287 | "epoch:\t0,\tloss:0.0376109853386879\n", 288 | "epoch:\t1,\tloss:0.09225022047758102\n", 289 | "epoch:\t1,\tloss:0.02818330191075802\n", 290 | "epoch:\t1,\tloss:0.01991969160735607\n", 291 | "epoch:\t1,\tloss:0.027169905602931976\n", 292 | "epoch:\t1,\tloss:0.03678622841835022\n", 293 | "epoch:\t1,\tloss:0.025325099006295204\n", 294 | "epoch:\t1,\tloss:0.02467428892850876\n", 295 | "epoch:\t1,\tloss:0.02342064119875431\n", 296 | "epoch:\t1,\tloss:0.006782206241041422\n", 297 | "epoch:\t1,\tloss:0.01385602355003357\n", 298 | "epoch:\t1,\tloss:0.025996623560786247\n", 299 | "epoch:\t1,\tloss:0.05951960012316704\n", 300 | "epoch:\t1,\tloss:0.02473391406238079\n", 301 | "epoch:\t1,\tloss:0.009166402742266655\n", 302 | "epoch:\t1,\tloss:0.014516142196953297\n", 303 | "epoch:\t1,\tloss:0.008172743953764439\n", 304 | "epoch:\t1,\tloss:0.00869485829025507\n", 305 | "epoch:\t1,\tloss:0.007238415535539389\n", 306 | "epoch:\t1,\tloss:0.015679262578487396\n", 307 | "epoch:\t1,\tloss:0.005326947197318077\n", 308 | "epoch:\t1,\tloss:0.004706756677478552\n", 309 | "epoch:\t1,\tloss:0.0034857578575611115\n", 310 | "epoch:\t1,\tloss:0.004158268216997385\n", 311 | "epoch:\t1,\tloss:0.0059410869143903255\n", 312 | "epoch:\t1,\tloss:0.006137473974376917\n", 313 | "epoch:\t1,\tloss:0.027485201135277748\n", 314 | "epoch:\t1,\tloss:0.0030662603676319122\n", 315 | "epoch:\t1,\tloss:0.0027211287524551153\n", 316 | "epoch:\t1,\tloss:0.005441123154014349\n", 317 | "epoch:\t1,\tloss:0.002412184840068221\n", 318 | "epoch:\t1,\tloss:0.003415692364796996\n", 319 | "epoch:\t1,\tloss:0.002235227031633258\n", 320 | "epoch:\t1,\tloss:0.0033596300054341555\n", 321 | "epoch:\t1,\tloss:0.005315082613378763\n", 322 | 
"epoch:\t1,\tloss:0.002937655197456479\n", 323 | "epoch:\t1,\tloss:0.002290458185598254\n", 324 | "epoch:\t1,\tloss:0.007688215002417564\n", 325 | "epoch:\t1,\tloss:0.0027433286886662245\n", 326 | "epoch:\t1,\tloss:0.002007373608648777\n", 327 | "epoch:\t2,\tloss:0.0023477952927351\n", 328 | "epoch:\t2,\tloss:0.001861131633631885\n", 329 | "epoch:\t2,\tloss:0.0010836547007784247\n", 330 | "epoch:\t2,\tloss:0.0018228593980893493\n", 331 | "epoch:\t2,\tloss:0.002600134816020727\n", 332 | "epoch:\t2,\tloss:0.0016692288918420672\n", 333 | "epoch:\t2,\tloss:0.0022326342295855284\n", 334 | "epoch:\t2,\tloss:0.002286992035806179\n", 335 | "epoch:\t2,\tloss:0.0010390046518296003\n", 336 | "epoch:\t2,\tloss:0.0015219493070617318\n", 337 | "epoch:\t2,\tloss:0.0020285442005842924\n", 338 | "epoch:\t2,\tloss:0.001638404093682766\n", 339 | "epoch:\t2,\tloss:0.0016711241332814097\n", 340 | "epoch:\t2,\tloss:0.001292763277888298\n", 341 | "epoch:\t2,\tloss:0.0024440630804747343\n", 342 | "epoch:\t2,\tloss:0.0009746658033691347\n", 343 | "epoch:\t2,\tloss:0.0018626617966219783\n", 344 | "epoch:\t2,\tloss:0.0014337325701490045\n", 345 | "epoch:\t2,\tloss:0.0010064152302220464\n", 346 | "epoch:\t2,\tloss:0.0013640819815918803\n", 347 | "epoch:\t2,\tloss:0.0010625479044392705\n", 348 | "epoch:\t2,\tloss:0.0014420481165871024\n", 349 | "epoch:\t2,\tloss:0.0008912244811654091\n", 350 | "epoch:\t2,\tloss:0.001545345294289291\n", 351 | "epoch:\t2,\tloss:0.0010661480482667685\n", 352 | "epoch:\t2,\tloss:0.0008565050084143877\n", 353 | "epoch:\t2,\tloss:0.0006735824863426387\n", 354 | "epoch:\t2,\tloss:0.0008698648889549077\n", 355 | "epoch:\t2,\tloss:0.0017080713296309114\n", 356 | "epoch:\t2,\tloss:0.0010185097344219685\n", 357 | "epoch:\t2,\tloss:0.0010622901609167457\n", 358 | "epoch:\t2,\tloss:0.0012533634435385466\n", 359 | "epoch:\t2,\tloss:0.0009148705867119133\n", 360 | "epoch:\t2,\tloss:0.0006462182500399649\n", 361 | "epoch:\t2,\tloss:0.0005030953325331211\n", 362 | "epoch:\t2,\tloss:0.0015385170700028539\n", 363 | "epoch:\t2,\tloss:0.0006630505085922778\n", 364 | "epoch:\t2,\tloss:0.0007098554633557796\n", 365 | "epoch:\t2,\tloss:0.0009038225398398936\n", 366 | "epoch:\t3,\tloss:0.00042479473631829023\n", 367 | "epoch:\t3,\tloss:0.0005855750059708953\n", 368 | "epoch:\t3,\tloss:0.0007057485054247081\n", 369 | "epoch:\t3,\tloss:0.0008201799355447292\n", 370 | "epoch:\t3,\tloss:0.0005157763953320682\n", 371 | "epoch:\t3,\tloss:0.0008646969799883664\n", 372 | "epoch:\t3,\tloss:0.0009159119799733162\n", 373 | "epoch:\t3,\tloss:0.000726655765902251\n", 374 | "epoch:\t3,\tloss:0.0007182210683822632\n", 375 | "epoch:\t3,\tloss:0.0008668283117003739\n", 376 | "epoch:\t3,\tloss:0.0007231898489408195\n", 377 | "epoch:\t3,\tloss:0.0008738313335925341\n", 378 | "epoch:\t3,\tloss:0.0006608644616790116\n", 379 | "epoch:\t3,\tloss:0.0006918812287040055\n", 380 | "epoch:\t3,\tloss:0.00042746460530906916\n", 381 | "epoch:\t3,\tloss:0.0004623888526111841\n", 382 | "epoch:\t3,\tloss:0.00040710484609007835\n", 383 | "epoch:\t3,\tloss:0.0006149121909402311\n", 384 | "epoch:\t3,\tloss:0.0006414263043552637\n", 385 | "epoch:\t3,\tloss:0.0005345437093637884\n", 386 | "epoch:\t3,\tloss:0.0007223087013699114\n", 387 | "epoch:\t3,\tloss:0.0008337963372468948\n", 388 | "epoch:\t3,\tloss:0.0016031116247177124\n", 389 | "epoch:\t3,\tloss:0.0008548393961973488\n", 390 | "epoch:\t3,\tloss:0.0007479392806999385\n", 391 | "epoch:\t3,\tloss:0.0006933917757123709\n", 392 | "epoch:\t3,\tloss:0.0005947808967903256\n", 393 | 
"epoch:\t3,\tloss:0.00040444658952765167\n", 394 | "epoch:\t3,\tloss:0.0005790918366983533\n", 395 | "epoch:\t3,\tloss:0.0009108057129196823\n", 396 | "epoch:\t3,\tloss:0.0008470119792036712\n", 397 | "epoch:\t3,\tloss:0.0009477338171564043\n", 398 | "epoch:\t3,\tloss:0.00045438858796842396\n", 399 | "epoch:\t3,\tloss:0.0008903048583306372\n", 400 | "epoch:\t3,\tloss:0.0007609418244101107\n", 401 | "epoch:\t3,\tloss:0.001175822108052671\n", 402 | "epoch:\t3,\tloss:0.0005316018941812217\n", 403 | "epoch:\t3,\tloss:0.0006653064046986401\n", 404 | "epoch:\t3,\tloss:0.00032494479091838\n", 405 | "epoch:\t4,\tloss:0.0005201410385780036\n", 406 | "epoch:\t4,\tloss:0.0007186997099779546\n", 407 | "epoch:\t4,\tloss:0.00048609552322886884\n", 408 | "epoch:\t4,\tloss:0.0008609591168351471\n", 409 | "epoch:\t4,\tloss:0.0006337621598504484\n", 410 | "epoch:\t4,\tloss:0.00048226353828795254\n", 411 | "epoch:\t4,\tloss:0.0005028933519497514\n", 412 | "epoch:\t4,\tloss:0.00029791248380206525\n", 413 | "epoch:\t4,\tloss:0.0005183366592973471\n", 414 | "epoch:\t4,\tloss:0.00031539611518383026\n", 415 | "epoch:\t4,\tloss:0.00048409486771561205\n", 416 | "epoch:\t4,\tloss:0.00035559770185500383\n", 417 | "epoch:\t4,\tloss:0.0006230109720490873\n", 418 | "epoch:\t4,\tloss:0.0006612534634768963\n", 419 | "epoch:\t4,\tloss:0.00029597230604849756\n", 420 | "epoch:\t4,\tloss:0.0006362967542372644\n", 421 | "epoch:\t4,\tloss:0.00038377134478650987\n", 422 | "epoch:\t4,\tloss:0.0007281986181624234\n", 423 | "epoch:\t4,\tloss:0.0004282900772523135\n", 424 | "epoch:\t4,\tloss:0.00039028548053465784\n", 425 | "epoch:\t4,\tloss:0.0003747685404960066\n", 426 | "epoch:\t4,\tloss:0.0005309387925080955\n", 427 | "epoch:\t4,\tloss:0.000556213257368654\n", 428 | "epoch:\t4,\tloss:0.0005487402086146176\n", 429 | "epoch:\t4,\tloss:0.0003494209668133408\n", 430 | "epoch:\t4,\tloss:0.0006299832020886242\n", 431 | "epoch:\t4,\tloss:0.0004588236042764038\n", 432 | "epoch:\t4,\tloss:0.0005549622583203018\n", 433 | "epoch:\t4,\tloss:0.00018302483658771962\n", 434 | "epoch:\t4,\tloss:0.00024095167464111\n", 435 | "epoch:\t4,\tloss:0.0005101535934954882\n", 436 | "epoch:\t4,\tloss:0.00034454729757271707\n", 437 | "epoch:\t4,\tloss:0.00025429722154513\n", 438 | "epoch:\t4,\tloss:0.0002479896356817335\n", 439 | "epoch:\t4,\tloss:0.0007369245286099613\n", 440 | "epoch:\t4,\tloss:0.00034910847898572683\n", 441 | "epoch:\t4,\tloss:0.0005303460056893528\n", 442 | "epoch:\t4,\tloss:0.0005001642857678235\n", 443 | "epoch:\t4,\tloss:0.0002676190924830735\n" 444 | ] 445 | } 446 | ], 447 | "source": [ 448 | "optimizer = optim.Adam(model.parameters(), lr=0.001)\n", 449 | "exp_lr_schedulr = optim.lr_scheduler.StepLR(optimizer, step_size=25)\n", 450 | "\n", 451 | "model.train()\n", 452 | "model.to(device)\n", 453 | "\n", 454 | "for epoch in range(5):\n", 455 | " for data in train_loader:\n", 456 | " fc_feats = data['fc_feats'].to(device)\n", 457 | " labels = data['ground_truth'].to(device)\n", 458 | " masks = data['mask'].to(device)\n", 459 | "\n", 460 | " out = model(fc_feats, mask=masks)\n", 461 | " # print(f\"out: {out.shape}\")\n", 462 | " # print(f\"labels: {labels.shape}\")\n", 463 | " loss = F.binary_cross_entropy(out, labels)\n", 464 | "\n", 465 | " optimizer.zero_grad()\n", 466 | " loss.backward()\n", 467 | " optimizer.step()\n", 468 | " print(f\"epoch:\\t{epoch},\\tloss:{loss.cpu().data.numpy()}\")" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 65, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | 
"name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "torch.Size([16, 1024])\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "data = train_dataset.__getitem__(5)\n", 486 | "fc_feats = data['fc_feats']\n", 487 | "print(fc_feats.shape)" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": 36, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "import numpy as np\n", 497 | "import metrics\n", 498 | "reload(metrics)\n", 499 | "from metrics import calculate_gap" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 34, 505 | "metadata": {}, 506 | "outputs": [ 507 | { 508 | "name": "stdout", 509 | "output_type": "stream", 510 | "text": [ 511 | "load feats from /home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/test_PCA-1024\n", 512 | "Pre-cache 75 features in memory.\n", 513 | "Finished initializing dataloader.\n" 514 | ] 515 | } 516 | ], 517 | "source": [ 518 | "opt = {\n", 519 | " 'feats_dir': \"/home/wgar/NeXtVLAD.pytorch/data/UCF101_debug/test_PCA-1024\",\n", 520 | " 'max_frames': 50\n", 521 | "}\n", 522 | "test_dataset = VideoClassificationDataset(opt, 'test')\n", 523 | "test_loader = DataLoader(test_dataset,\n", 524 | " batch_size=8,\n", 525 | " num_workers=4,\n", 526 | " shuffle=True)" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 43, 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "name": "stdout", 536 | "output_type": "stream", 537 | "text": [ 538 | "GAP(20): 0.9933333333333333\n" 539 | ] 540 | } 541 | ], 542 | "source": [ 543 | "preds = []\n", 544 | "actuals = []\n", 545 | "\n", 546 | "for data in test_loader:\n", 547 | " fc_feats = data['fc_feats'].to(device)\n", 548 | " labels = data['ground_truth']\n", 549 | " masks = data['mask'].to(device)\n", 550 | "\n", 551 | " out = model(fc_feats, mask=masks)\n", 552 | " out = out.cpu().data.numpy()\n", 553 | " labels = labels.cpu().data.numpy()\n", 554 | "# print(out.shape)\n", 555 | "# print(labels.shape)\n", 556 | " preds.extend(out)\n", 557 | " actuals.extend(labels)\n", 558 | " \n", 559 | "print(f\"GAP(20): {calculate_gap(np.asarray(preds), np.asarray(actuals), top_k=20)}\")" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [] 568 | } 569 | ], 570 | "metadata": { 571 | "kernelspec": { 572 | "display_name": "Python 3", 573 | "language": "python", 574 | "name": "python3" 575 | }, 576 | "language_info": { 577 | "codemirror_mode": { 578 | "name": "ipython", 579 | "version": 3 580 | }, 581 | "file_extension": ".py", 582 | "mimetype": "text/x-python", 583 | "name": "python", 584 | "nbconvert_exporter": "python", 585 | "pygments_lexer": "ipython3", 586 | "version": "3.6.10" 587 | } 588 | }, 589 | "nbformat": 4, 590 | "nbformat_minor": 4 591 | } 592 | -------------------------------------------------------------------------------- /sample.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | import numpy as np 5 | import logging 6 | import ffmpeg 7 | 8 | from models.video_classifiers import NeXtVLADModel 9 | from util import feature_pca, create_batches 10 | from torch.autograd import Variable 11 | 12 | from util import init_model as init_convnet 13 | from util import process_batches 14 | 15 | logging.basicConfig() 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.DEBUG) 18 | device = torch.device("cuda:0") 19 | 20 | available_features 
= ['nasnetalarge', 'resnet152', 'pnasnet5large', 'densenet121', 'senet154', 'polynet', 'vgg16'] 21 | 22 | 23 | if __name__ == '__main__': 24 | opt = argparse.ArgumentParser() 25 | opt.add_argument('ckpt_file', help='Path to the NeXtVLAD checkpoint file.') 26 | opt.add_argument('pca_dir', help='Directory containing PCA data.') 27 | opt.add_argument('files', nargs='+', help='List of files to process.') 28 | opt.add_argument('-gl', '--gpu_list', 29 | required=True, nargs='+', type=int, 30 | help="Space delimited list of GPU indices to use. Example for 4 GPUs: -gl 0 1 2 3") 31 | opt.add_argument('-bs', '--batch_size', type=int, 32 | help="Batch size to use during feature extraction. Larger batch size = more VRAM usage", 33 | default=8) 34 | opt.add_argument('--type', required=True, 35 | help='ConvNet to use for processing features.', 36 | choices=available_features) 37 | opt.add_argument('--max_frames', help="Max frames length of dataset.", default=50, type=int) 38 | opt.add_argument('--num_classes', help="Number of classes that was in train dataset.", default=5, type=int) 39 | 40 | opt = vars(opt.parse_args()) 41 | 42 | logger.info("Found {} GPUs, using {}.".format(torch.cuda.device_count(), len(opt['gpu_list']))) 43 | 44 | # Convnet 45 | tf_img, convnet = init_convnet(opt['gpu_list'], opt['type']) 46 | # PCA 47 | eigenvecs = np.load(os.path.join(opt['pca_dir'], 'eigenvecss.npy')) 48 | eigenvals = np.load(os.path.join(opt['pca_dir'], 'eigenvals.npy')) 49 | center = np.load(os.path.join(opt['pca_dir'], 'mean.npy')) 50 | # neXtVLAD 51 | model = NeXtVLADModel(opt['num_classes'], max_frames=opt['max_frames']) 52 | model.load_state_dict(torch.load(opt['ckpt_file'])) 53 | model.to(device) 54 | model.eval() 55 | 56 | for video in opt['files']: 57 | probe = ffmpeg.probe(video) 58 | video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None) 59 | width = int(video_stream['width']) 60 | height = int(video_stream['height']) 61 | 62 | out, _ = ( 63 | ffmpeg 64 | .input(video) 65 | # .output('pipe:', format='rawvideo', pix_fmt='rgb24') 66 | .output('pipe:', format='rawvideo', pix_fmt='rgb24', r=1) 67 | .run(capture_stdout=True) 68 | ) 69 | video_np = ( 70 | np 71 | .frombuffer(out, np.uint8) 72 | .reshape([-1, height, width, 3]) 73 | ) 74 | batches = create_batches(video_np, tf_img, batch_size=opt['batch_size']) 75 | feats = process_batches(batches, opt['type'], opt['gpu_list'], convnet) 76 | 77 | fpca = np.zeros((len(feats), eigenvecs.T.shape[1])) 78 | for i, feat in enumerate(feats): 79 | fpca[i] = feature_pca(feat, center, eigenvals, eigenvecs) 80 | 81 | n = min(opt['max_frames'], len(fpca)) 82 | padded = np.zeros((opt['max_frames'], fpca.shape[1])) 83 | padded[:n, :] = fpca[:n, :] 84 | mask = np.zeros((opt['max_frames'],)) 85 | mask[:n] = 1 86 | 87 | fc_feats = Variable(torch.from_numpy(padded).type(torch.FloatTensor)).to(device) 88 | mask = Variable(torch.from_numpy(mask).type(torch.FloatTensor)).to(device) 89 | 90 | out = model(fc_feats.unsqueeze(0), mask=mask.unsqueeze(0)) 91 | print(f"{video}: {out.argmax().detach().cpu().numpy()}") 92 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | from torch.utils.data import DataLoader 8 | 9 | from dataloader import VideoClassificationDataset 10 | from 
models.video_classifiers import NeXtVLADModel 11 | from metrics import calculate_gap 12 | from tqdm import tqdm 13 | 14 | device = torch.device("cuda:0") 15 | 16 | 17 | def train(opt, model, optimizer, scheduler, train_loader): 18 | with tqdm(total=len(train_loader)) as pb: 19 | for data in train_loader: 20 | fc_feats = data['fc_feats'].to(device) 21 | labels = data['ground_truth'].to(device) 22 | masks = data['mask'].to(device) 23 | 24 | out = model(fc_feats, mask=masks) 25 | loss = F.binary_cross_entropy(out, labels) 26 | 27 | optimizer.zero_grad() 28 | loss.backward() 29 | optimizer.step() 30 | str_loss = f"{loss.cpu().data.numpy():.4f}" 31 | pb.update(1) 32 | pb.set_postfix(epoch=epoch, loss=str_loss) 33 | 34 | 35 | def eval(opt, model, test_loader): 36 | preds = [] 37 | actuals = [] 38 | 39 | for data in test_loader: 40 | fc_feats = data['fc_feats'].to(device) 41 | labels = data['ground_truth'] 42 | masks = data['mask'].to(device) 43 | 44 | out = model(fc_feats, mask=masks) 45 | out = out.cpu().data.numpy() 46 | labels = labels.cpu().data.numpy() 47 | preds.extend(out) 48 | actuals.extend(labels) 49 | 50 | gap_score = calculate_gap(np.asarray(preds), np.asarray(actuals), top_k=opt['gapk']) 51 | return gap_score 52 | 53 | 54 | if __name__ == '__main__': 55 | opt = argparse.ArgumentParser() 56 | opt.add_argument('train_feats_dir', help="Directory where train features are stored.") 57 | opt.add_argument('test_feats_dir', help="Directory where test features are stored.") 58 | opt.add_argument('--max_frames', help="Max frames length of dataset.", default=50, type=int) 59 | opt.add_argument('--gapk', help="Value of K for computing GAP score.", default=20, type=int) 60 | opt.add_argument('--num_epochs', help="Number of epochs.", default=5, type=int) 61 | opt.add_argument('--ckpt_dir', help="Where to save checkpoints.", default='ckpt/') 62 | 63 | opt = vars(opt.parse_args()) 64 | 65 | if not os.path.isdir(opt['ckpt_dir']): 66 | os.mkdir(opt['ckpt_dir']) 67 | 68 | train_opts = { 69 | 'feats_dir': opt['train_feats_dir'], 70 | 'max_frames': opt['max_frames'] 71 | } 72 | train_dataset = VideoClassificationDataset(train_opts, 'train') 73 | train_loader = DataLoader(train_dataset, 74 | batch_size=8, 75 | num_workers=4, 76 | shuffle=True) 77 | 78 | test_opts = { 79 | 'feats_dir': opt['test_feats_dir'], 80 | 'max_frames': opt['max_frames'] 81 | } 82 | test_dataset = VideoClassificationDataset(test_opts, 'test') 83 | test_loader = DataLoader(test_dataset, 84 | batch_size=8, 85 | num_workers=4, 86 | shuffle=True) 87 | 88 | model = NeXtVLADModel(train_dataset.num_classes, max_frames=opt['max_frames']) 89 | optimizer = optim.Adam(model.parameters(), lr=0.001) 90 | exp_lr_schedulr = optim.lr_scheduler.StepLR(optimizer, step_size=25) 91 | 92 | model.to(device) 93 | 94 | for epoch in range(opt['num_epochs']): 95 | model.train() 96 | train(opt, model, optimizer, exp_lr_schedulr, train_loader) 97 | 98 | model.eval() 99 | gap_score = eval(opt, model, test_loader) 100 | print(f"GAP({opt['gapk']}): {gap_score:.3f}") 101 | 102 | model_path = os.path.join(opt['ckpt_dir'], f"model_e{epoch}_gap{opt['gapk']}-{gap_score:.3f}.pth") 103 | torch.save(model.state_dict(), model_path) 104 | print(f"Model saved to {model_path}") -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import math 5 | import torch.nn as nn 6 | import pretrainedmodels 7 | 8 
| from PIL import Image 9 | from pretrainedmodels.utils import ToRange255, ToSpaceBGR, transforms, munchify 10 | 11 | 12 | def feature_pca_whiten(feat, center, eigenvals, eigenvecs): 13 | epsilon = 1e-4 14 | d = feat.shape[0] 15 | 16 | # subtract mean 17 | fcen = feat - center 18 | # principal components 19 | fpca = fcen.reshape((1, d)).dot(eigenvecs.T).squeeze(0) 20 | # whiten 21 | pcaw = fpca / np.sqrt(eigenvals + epsilon) 22 | 23 | return pcaw 24 | 25 | 26 | def feature_pca(feat, center, eigenvals, eigenvecs): 27 | """ 28 | Skip whitening, as done by Lin et al. 29 | :param feat: 30 | :param center: 31 | :param eigenvals: 32 | :param eigenvecs: 33 | :return: 34 | """ 35 | d = feat.shape[0] 36 | 37 | # subtract mean 38 | fcen = feat - center 39 | # principal components 40 | fpca = fcen.reshape((1, d)).dot(eigenvecs.T).squeeze(0) 41 | 42 | return fpca 43 | 44 | 45 | def create_batches(frames_to_do, tf_img_fn, logger=None, batch_size=32): 46 | n = len(frames_to_do) 47 | if n < batch_size: 48 | if logger: logger.warning("Sample size less than batch size: Cutting batch size.") 49 | batch_size = n 50 | 51 | if logger: logger.info("Generating {} batches...".format(n // batch_size)) 52 | batches = [] 53 | frames_to_do = np.array(frames_to_do) 54 | 55 | for idx in range(0, n, batch_size): 56 | frames_idx = list(range(idx, min(idx+batch_size, n))) 57 | batch_frames = frames_to_do[frames_idx] 58 | 59 | batch_tensor = None 60 | for i, frame_ in enumerate(batch_frames): 61 | if type(frame_) is np.ndarray: 62 | input_frame = Image.fromarray(frame_).convert('RGB') 63 | else: # filename 64 | input_frame = Image.open(frame_).convert('RGB') 65 | input_tensor = tf_img_fn(input_frame) # 3x400x225 -> 3x299x299 size may differ 66 | # input_tensor = input_tensor.unsqueeze(0) # 3x299x299 -> 1x3x299x299 67 | if batch_tensor is None: 68 | batch_tensor = torch.zeros((len(batch_frames),) + input_tensor.shape) 69 | batch_tensor[i] = input_tensor 70 | 71 | batch_ag = torch.autograd.Variable(batch_tensor, requires_grad=False) 72 | batches.append(batch_ag) 73 | return batches 74 | 75 | 76 | class TransformImage(object): 77 | 78 | def __init__(self, opts, scale=0.875, random_crop=False, 79 | random_hflip=False, random_vflip=False, 80 | preserve_aspect_ratio=True): 81 | if type(opts) == dict: 82 | opts = munchify(opts) 83 | self.input_size = opts.input_size 84 | self.input_space = opts.input_space 85 | self.input_range = opts.input_range 86 | self.mean = opts.mean 87 | self.std = opts.std 88 | 89 | # https://github.com/tensorflow/models/blob/master/research/inception/inception/image_processing.py#L294 90 | self.scale = scale 91 | self.random_crop = random_crop 92 | self.random_hflip = random_hflip 93 | self.random_vflip = random_vflip 94 | 95 | tfs = [] 96 | if preserve_aspect_ratio: 97 | tfs.append(transforms.Resize(int(math.floor(max(self.input_size)/self.scale)))) 98 | else: 99 | height = int(self.input_size[1] / self.scale) 100 | width = int(self.input_size[2] / self.scale) 101 | tfs.append(transforms.Resize((height, width))) 102 | 103 | if random_crop: 104 | tfs.append(transforms.RandomCrop(max(self.input_size))) 105 | # else: 106 | # tfs.append(transforms.CenterCrop(max(self.input_size))) 107 | 108 | if random_hflip: 109 | tfs.append(transforms.RandomHorizontalFlip()) 110 | 111 | if random_vflip: 112 | tfs.append(transforms.RandomVerticalFlip()) 113 | 114 | tfs.append(transforms.ToTensor()) 115 | tfs.append(ToSpaceBGR(self.input_space=='BGR')) 116 | tfs.append(ToRange255(max(self.input_range)==255)) 117 | 
tfs.append(transforms.Normalize(mean=self.mean, std=self.std)) 118 | 119 | self.tf = transforms.Compose(tfs) 120 | 121 | def __call__(self, img): 122 | tensor = self.tf(img) 123 | return tensor 124 | 125 | 126 | def init_model(gpu_ids, model_name): 127 | 128 | # model_name = 'pnasnet5large' 129 | # could be fbresnet152 or inceptionresnetv2 130 | model = pretrainedmodels.__dict__[model_name](num_classes=1000, pretrained='imagenet') 131 | model.eval() 132 | 133 | # transformations depending on the model 134 | # rescale, center crop, normalize, and others (ex: ToBGR, ToRange255) 135 | tf_img = TransformImage(model) 136 | 137 | """ 138 | TODO(WG): Would be nice to use something like DataParallel, but that only does forward pass on given module. 139 | Need to stop before logits step. 140 | Should create wrapper for pretrainedmodels that does the MPI-like ops across GPUs on model.features modules: 141 | 1) replicated 142 | 2) scatter 143 | 3) parallel_apply 144 | 4) gather 145 | Would have to know what layers are being used on each model. 146 | """ 147 | if torch.cuda.is_available(): 148 | model = model.cuda(device=gpu_ids[0]) 149 | 150 | return tf_img, model 151 | 152 | 153 | def process_batches(batches, ftype, gpu_list, model, logger=None): 154 | done_batches = [] 155 | for i, batch in enumerate(batches): 156 | if torch.cuda.is_available(): 157 | batch = batch.cuda(device=gpu_list[0]) 158 | 159 | output_features = model.features(batch) 160 | output_features = output_features.data.cpu() 161 | 162 | conv_size = output_features.shape[-1] 163 | 164 | if ftype == 'nasnetalarge' or ftype == 'pnasnet5large': 165 | relu = nn.ReLU() 166 | rf = relu(output_features) 167 | avg_pool = nn.AvgPool2d(conv_size, stride=1, padding=0) 168 | out_feats = avg_pool(rf) 169 | else: 170 | avg_pool = nn.AdaptiveAvgPool2d((1, 1)) 171 | out_feats = avg_pool(output_features) 172 | 173 | out_feats = out_feats.view(out_feats.size(0), -1) 174 | if logger: logger.info('Processed {}/{} batches.\r'.format(i + 1, len(batches))) 175 | 176 | done_batches.append(out_feats) 177 | feats = np.concatenate(done_batches, axis=0) 178 | return feats --------------------------------------------------------------------------------
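A quick smoke test can tie the pieces above together. The sketch below is not part of the repository: it assumes the repo root is on PYTHONPATH so that `models.video_classifiers` imports the same way it does in train.py and sample.py, and the batch size, real-frame count, and num_classes=5 are illustrative values only (the 1024-dim input feature size is fixed by the NeXtVLAD(1024, ...) call inside NeXtVLADModel).

```
# Smoke-test sketch (not part of the repo): run NeXtVLADModel on random PCA-1024
# frame features with a padding mask, mirroring how train.py feeds the model.
import torch
from models.video_classifiers import NeXtVLADModel  # assumes repo root on PYTHONPATH

B, M, N = 4, 50, 1024                # batch, max_frames, feature dim (N=1024 is hard-coded in NeXtVLADModel)
model = NeXtVLADModel(num_classes=5, max_frames=M)
model.eval()

feats = torch.randn(B, M, N)         # padded frame features, shape B x M x N
mask = torch.ones(B, M)              # 1 for real frames, 0 for padding
mask[:, 40:] = 0                     # e.g. only 40 of the 50 slots hold real frames

with torch.no_grad():
    out = model(feats, mask=mask)
print(out.shape)                     # expected: torch.Size([4, 5]) of per-class sigmoid scores
```

With the default num_clusters=64, lamb=2, and groups=8, the pooled descriptor fed into fc0 is K * (λN/G) = 64 * 256 = 16384-dimensional, which matches nn.Linear(num_clusters * self.group_size, hidden_size) in NeXtVLADModel.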