├── .idea
│   ├── deployment.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── pytorch-act-detector.iml
│   └── vcs.xml
├── README.md
├── config.py
├── data
│   ├── __init__.py
│   ├── dataset.py
│   └── transforms.py
├── error_handle.py
├── layers
│   ├── __init__.py
│   ├── act_cuboid_loss.py
│   └── ssd.py
├── test.py
├── train.py
├── utils
│   ├── __init__.py
│   ├── act_tubes.py
│   ├── box.py
│   ├── map_eval.py
│   └── prior_tubes.py
└── visual_featuremaps.py
/README.md: --------------------------------------------------------------------------------
1 | # pytorch-act-detector 2 | paper: https://arxiv.org/abs/1705.01861 3 | 4 | |dataset |modality |mAP | 5 | |:----:|:--------:|:---:| 6 | |UCFSports|rgb|0.8259| 7 | 8 |
-------------------------------------------------------------------------------- /config.py: --------------------------------------------------------------------------------
1 | import os 2 | import os.path as osp 3 | import sys 4 | 5 | 6 | def add_path(path): 7 | if path not in sys.path: 8 | sys.path.insert(0, path) 9 | 10 | 11 | root_dir = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) 12 | 13 | 14 | class Config: 15 | # -------------------------data config ------------------------------# 16 | dataset = 'UCF101v2' 17 | # dataset = 'UCFSports' 18 | if dataset == 'UCF101v2': 19 | data_path = "/mnt/data/qzw/data/UCF101/" 20 | elif dataset == 'UCFSports': 21 | data_path = "/mnt/data/qzw/data/UCFSports/" 22 | else: 23 | data_path = " " 24 | print("dataset not found!!") 25 | exit(-1) 26 | 27 | modality = 'rgb' 28 | init_model = "/mnt/data/qzw/model/pytorch-act-detector/{}/{}-init-model-{}-pytorch-single.pkl".format(dataset, dataset, modality) 29 | trained_model = "/mnt/data/qzw/model/pytorch-act-detector/{}/my_trained_pytorch_model_{}-{}.pkl".format(dataset, dataset, modality) 30 | new_trained_model = "/mnt/data/qzw/model/pytorch-act-detector/{}/my_new_trained_pytorch_model_{}-{}.pkl".format(dataset, dataset, modality) 31 | best_trained_model = "/mnt/data/qzw/model/pytorch-act-detector/{}/best_trained_pytorch_model_{}-{}-%.4f.pkl".format(dataset, dataset, modality) 32 | all_frame_boxes_list_result
= "/mnt/data/qzw/result/pytorch-act-detector/{}/all_frame_boxes_list-{}-{}.pkl".format(dataset, dataset, modality) 33 | 34 | variance = [0.1, 0.1, 0.2, 0.2] 35 | sequence_length = 6 36 | 37 | # -------------------------model config ------------------------------# 38 | 39 | base_model_name = 'vgg16' 40 | freeze_init = True 41 | 42 | # -------------------------train config ------------------------------# 43 | reinit_all = False 44 | use_gpu = True 45 | warm_up_epoch = 1 46 | warm_up_ratio = 1 / 100 47 | epochs = 200 48 | train_batch_size = 192 49 | valid_batch_size = 1 50 | workers = 16 51 | 52 | lr = 0.001 53 | momentum = 0.9 54 | weight_decay = 5e-4 55 | 56 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 57 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" 58 | 59 | 60 | # config = Config() 61 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import dataset 2 | from . import transforms 3 | -------------------------------------------------------------------------------- /data/dataset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import torch 4 | import torch.utils.data as data 5 | import numpy as np 6 | import cv2 7 | from . import transforms 8 | from torchvision.transforms import functional as F 9 | 10 | 11 | class TubeDataset(data.Dataset): 12 | def __init__(self, DNAME, data_path, phase, modality, sequence_length): 13 | ground_truth_file = os.path.join(data_path, './cache/{}-GT.pkl'.format(DNAME)) 14 | with open(ground_truth_file, 'rb') as fid: 15 | cache = pickle.load(fid, encoding='iso-8859-1') 16 | for k in cache: 17 | setattr(self, k, cache[k]) 18 | self.MEAN = np.array([[[104, 117, 123]]], dtype=np.float32) 19 | self.DNAME = DNAME 20 | if DNAME == 'UCF101v2': 21 | self.image_format = '%05d.jpg' 22 | elif DNAME == 'UCFSports': 23 | self.image_format = '%06d.jpg' 24 | else: 25 | print("TubeDataset.DNAME value error!!") 26 | exit(-1) 27 | self.color_jitter = transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3) 28 | self.expand = transforms.Expand(self.MEAN) 29 | self.modality = modality 30 | self.data_path = data_path 31 | self.sequence_length = sequence_length 32 | self.image_shape = cache['resolution'] 33 | if modality == 'rgb': 34 | self.rgb = True 35 | elif modality == 'flow': 36 | self.rgb = False 37 | else: 38 | print("dataset mode value error!") 39 | exit(-1) 40 | self.ground_tube_list = [] 41 | self.videos_list = [] 42 | self.label_list = [] 43 | video_cnt = 0 44 | if phase == 'train': 45 | self.train = True 46 | for video in self.train_videos[0]: 47 | gttube = self.gttubes[video] 48 | if gttube.__len__() > 1: 49 | print("tube_dataset.py: warning! gttube_list length > 1!! 
maybe multi-labels in one video") 50 | print("video:{}".format(video)) 51 | exit(-1) 52 | for key in gttube: 53 | ts = gttube[key] 54 | for t in ts: 55 | for stf in range(t.shape[0]-self.sequence_length+1): 56 | self.label_list += [key + 1] # background label is 0 57 | self.ground_tube_list += [t[stf:stf+self.sequence_length, :]] 58 | self.videos_list += [video] 59 | video_cnt += 1 60 | # if video_cnt >= 8: 61 | # return 62 | elif phase == 'eval': 63 | self.train = False 64 | self.videos_list = self.test_videos[0] 65 | # self.videos_list = self.train_videos[0] 66 | else: 67 | print("dataset phase value error!") 68 | exit(-1) 69 | 70 | def __getitem__(self, index): 71 | # index = 10 72 | if self.rgb: 73 | root_path = os.path.join(self.data_path, 'Frames', self.videos_list[index]) 74 | else: 75 | root_path = os.path.join(self.data_path, 'FlowBrox04', self.videos_list[index]) 76 | image_list = [] 77 | all_frames = os.listdir(root_path) 78 | nframes = all_frames.__len__() 79 | if self.train: 80 | gttube = self.ground_tube_list[index] 81 | ground_truth = np.zeros((1 + self.sequence_length * 4), dtype='float32') 82 | ground_truth[0] = self.label_list[index] 83 | for i in range(self.sequence_length): 84 | ground_truth[4*i+1:4*i+5] = gttube[i, 1:] 85 | image_path = os.path.join(root_path, self.image_format % min(int(gttube[i, 0]), nframes)) 86 | im = cv2.imread(image_path) 87 | if im is None: 88 | print("{}not found!!".format(image_path)) 89 | exit(-1) 90 | image_list += [im] 91 | 92 | image_list, ground_truth, _random_crop_data = transforms.random_crop(image_list, ground_truth) 93 | if (ground_truth[1::4] < ground_truth[3::4]).sum() != 6 or (ground_truth[2::4] < ground_truth[4::4]).sum() != 6: 94 | print("_random_crop_data saved!!") 95 | with open('./error_random_crop_data.pkl', 'wb') as f: 96 | pickle.dump(index, f) 97 | pickle.dump(image_list, f) 98 | pickle.dump(ground_truth, f) 99 | pickle.dump(_random_crop_data, f) 100 | exit(-1) 101 | 102 | image_list, ground_truth, _random_flip_data = transforms.random_flip(image_list, ground_truth) 103 | if (ground_truth[1::4] < ground_truth[3::4]).sum() != 6 or (ground_truth[2::4] < ground_truth[4::4]).sum() != 6: 104 | print("_random_crop_data saved!!") 105 | with open('./error_random_flip_data.pkl', 'wb') as f: 106 | pickle.dump(index, f) 107 | pickle.dump(image_list, f) 108 | pickle.dump(ground_truth, f) 109 | pickle.dump(_random_crop_data, f) 110 | pickle.dump(_random_flip_data, f) 111 | exit(-1) 112 | 113 | image_list = self.color_jitter(image_list) 114 | image_list, ground_truth, _random_expand_data = self.expand(image_list, ground_truth) 115 | if (ground_truth[1::4] < ground_truth[3::4]).sum() != 6 or ( 116 | ground_truth[2::4] < ground_truth[4::4]).sum() != 6: 117 | print("_random_expand_data saved!!") 118 | with open('./error_random_expand_data.pkl', 'wb') as f: 119 | pickle.dump(index, f) 120 | pickle.dump(image_list, f) 121 | pickle.dump(ground_truth, f) 122 | pickle.dump(_random_crop_data, f) 123 | pickle.dump(_random_flip_data, f) 124 | pickle.dump(_random_expand_data, f) 125 | exit(-1) 126 | 127 | height_new, width_new, _ = image_list[0].shape 128 | ground_truth[1::4] = ground_truth[1::4] / width_new 129 | ground_truth[2::4] = ground_truth[2::4] / height_new 130 | ground_truth[3::4] = ground_truth[3::4] / width_new 131 | ground_truth[4::4] = ground_truth[4::4] / height_new 132 | for i in range(image_list.__len__()): 133 | image_list[i] = image_list[i] - self.MEAN 134 | image_data = np.concatenate(image_list, axis=2).astype('float32') 135 | 
image_data = cv2.resize(image_data, (300, 300), interpolation=cv2.INTER_LINEAR) 136 | image_data = np.transpose(image_data, (2, 0, 1)) 137 | image_data = torch.from_numpy(image_data) 138 | ground_truth = torch.from_numpy(ground_truth) 139 | else: 140 | if self.rgb: 141 | for i in range(1, nframes+1): 142 | image_path = os.path.join(root_path, self.image_format % i) 143 | im = cv2.imread(image_path) 144 | if im is None: 145 | print("{} not found!!".format(image_path)) 146 | exit(-1) 147 | im = cv2.resize(im, (300, 300), interpolation=cv2.INTER_LINEAR) 148 | im = np.transpose(im - self.MEAN, (2, 0, 1)) 149 | image_list += [im] 150 | image_data = torch.from_numpy(np.vstack(image_list).astype('float32')) 151 | ground_truth = torch.Tensor([index]) 152 | else: 153 | for i in range(1, nframes+1): 154 | flow_path = os.path.join(root_path, self.image_format % i) 155 | im = cv2.imread(flow_path) 156 | im = cv2.resize(im, (300, 300), interpolation=cv2.INTER_LINEAR) 157 | im = np.transpose(im - self.MEAN, (2, 0, 1)) 158 | # im = np.transpose(im, (2, 0, 1)) 159 | image_list += [im] 160 | image_data = torch.from_numpy(np.vstack(image_list).astype('float32')) 161 | ground_truth = torch.Tensor([index]) 162 | return image_data, ground_truth 163 | 164 | def __len__(self): 165 | return self.videos_list.__len__() 166 | # return 48 167 | 168 | def get_test_videos(self): 169 | return self.test_videos 170 | 171 | def get_nframes(self): 172 | return self.nframes 173 | 174 | def get_resolution(self): 175 | return self.resolution 176 | 177 | def get_labels(self): 178 | return self.labels 179 | 180 | def get_gttubes(self): 181 | return self.gttubes 182 | 183 | 184 | if __name__ == '__main__': 185 | import config 186 | args = config.Config() 187 | with open('./error_random_flip_data.pkl', 'rb') as f:  # 'rb': this debug block reads back the pickle dumped in __getitem__ 188 | index = pickle.load(f) 189 | image_list = pickle.load(f) 190 | ground_truth = pickle.load(f) 191 | _random_crop_data = pickle.load(f) 192 | _random_flip_data = pickle.load(f) 193 | train_dataset = TubeDataset(args.dataset, data_path=args.data_path, phase='train', 194 | modality=args.modality, 195 | sequence_length=6) 196 | a, b = train_dataset[index] 197 |
-------------------------------------------------------------------------------- /data/transforms.py: --------------------------------------------------------------------------------
1 | import numpy as np 2 | import random 3 | import cv2 4 | from torchvision.transforms import functional as F 5 | from PIL import Image 6 | 7 | 8 | def random_flip(image_list, target): 9 | pro = np.random.random() 10 | if pro < 0.5: 11 | height, width, _ = image_list[0].shape 12 | for i in range(image_list.__len__()): 13 | image_list[i] = np.flip(image_list[i], axis=1) 14 | xmin_new = width - target[3::4] 15 | target[3::4] = width - target[1::4] 16 | target[1::4] = xmin_new 17 | return image_list, target, pro 18 | 19 | 20 | def random_crop(image_list, target): 21 | target = target.copy() 22 | scale = 0.5 23 | height, width, _ = image_list[0].shape 24 | gt_w = target[3::4] - target[1::4] 25 | gt_h = target[4::4] - target[2::4] 26 | gt_area = gt_w * gt_h 27 | gt_area = gt_area.sum() 28 | while True: 29 | xmin_crop_ratio = (1 - scale) * np.random.random() 30 | ymin_crop_ratio = xmin_crop_ratio 31 | xmax_crop_ratio = xmin_crop_ratio + (1 - xmin_crop_ratio - scale) * np.random.random() + scale 32 | ymax_crop_ratio = xmax_crop_ratio 33 | xmin_crop, ymin_crop, xmax_crop, ymax_crop = int(width * xmin_crop_ratio), int(height * ymin_crop_ratio), int( 34 | width * xmax_crop_ratio), int(height *
ymax_crop_ratio) 35 | 36 | xmin_cross = np.maximum(target[1::4], xmin_crop) 37 | ymin_cross = np.maximum(target[2::4], ymin_crop) 38 | xmax_cross = np.minimum(target[3::4], xmax_crop) 39 | ymax_cross = np.minimum(target[4::4], ymax_crop) 40 | 41 | cross_w = xmax_cross - xmin_cross 42 | cross_h = ymax_cross - ymin_cross 43 | if (cross_w < 0.01).sum() > 0: 44 | continue 45 | if (cross_h < 0.01).sum() > 0: 46 | continue 47 | cross_area = cross_w * cross_h 48 | cross_area = cross_area.sum() 49 | if cross_area / gt_area < 0.8: 50 | continue 51 | target[1::4] = xmin_cross - xmin_crop 52 | target[2::4] = ymin_cross - ymin_crop 53 | target[3::4] = xmax_cross - xmin_crop 54 | target[4::4] = ymax_cross - ymin_crop 55 | break 56 | 57 | image_list_new = [image_list[i][ymin_crop:ymax_crop+1, xmin_crop:xmax_crop+1, :] for i in range(len(image_list))] 58 | 59 | return image_list_new, target, (xmin_crop_ratio, xmax_crop_ratio) 60 | 61 | 62 | class ColorJitter(object): 63 | def __init__(self, brightness=0.0, contrast=0.0, saturation=0.0, hue=0.0): 64 | self.brightness = brightness 65 | self.contrast = contrast 66 | self.saturation = saturation 67 | self.hue = hue 68 | 69 | @staticmethod 70 | def get_params(brightness, contrast, saturation, hue): 71 | transforms = [] 72 | if brightness > 0: 73 | brightness_factor = np.random.uniform(max(0, 1 - brightness), 1 + brightness) 74 | transforms.append(lambda img: F.adjust_brightness(img, brightness_factor)) 75 | 76 | if contrast > 0: 77 | contrast_factor = np.random.uniform(max(0, 1 - contrast), 1 + contrast) 78 | transforms.append(lambda img: F.adjust_contrast(img, contrast_factor)) 79 | 80 | if saturation > 0: 81 | saturation_factor = np.random.uniform(max(0, 1 - saturation), 1 + saturation) 82 | transforms.append(lambda img: F.adjust_saturation(img, saturation_factor)) 83 | 84 | if hue > 0: 85 | hue_factor = np.random.uniform(-hue, hue) 86 | transforms.append(lambda img: F.adjust_hue(img, hue_factor)) 87 | np.random.shuffle(transforms) 88 | return transforms 89 | 90 | def __call__(self, img_list): 91 | transforms = self.get_params(self.brightness, self.contrast, 92 | self.saturation, self.hue) 93 | for i in range(img_list.__len__()): 94 | img = img_list[i][..., -1::-1] # bgr2rgb 95 | img = Image.fromarray(np.uint8(img)) 96 | for t in transforms: 97 | img = t(img) 98 | img = np.asarray(img) 99 | img_list[i] = img[..., -1::-1] # rgb2bgr 100 | return img_list 101 | 102 | 103 | class Normalize(object): 104 | def __init__(self, mean, std): 105 | self.mean = mean 106 | self.std = std 107 | 108 | def __call__(self, tensor): 109 | tensor = F.normalize(tensor, self.mean, self.std) 110 | return tensor 111 | 112 | 113 | class Expand(object): 114 | def __init__(self, mean): 115 | self.expand_prob = 0.5 116 | self.max_expand_ratio = 4.0 117 | self.mean = mean 118 | 119 | def __call__(self, image_list, ground_truth): 120 | out_image_list = image_list 121 | pro = np.random.random() 122 | if pro < self.expand_prob: 123 | expand_ratio = np.random.uniform(1, self.max_expand_ratio) 124 | ori_h, ori_w, _ = image_list[0].shape 125 | new_h, new_w = int(ori_h * expand_ratio), int(ori_w * expand_ratio) 126 | out_image_list = [(np.zeros((new_h, new_w, 3), dtype=np.float32) + self.mean) for i in 127 | range(len(image_list))] 128 | h_off, w_off = int(np.floor(new_h - ori_h)), int(np.floor(new_w - ori_w)) 129 | for i in range(len(image_list)): 130 | out_image_list[i][h_off:h_off + ori_h, w_off:w_off + ori_w] = image_list[i] 131 | ground_truth[1:] += np.array([w_off, h_off, w_off, h_off] * 
len(image_list), dtype=np.float32) 132 | else: 133 | expand_ratio = 1.0 134 | return out_image_list, ground_truth, (pro, expand_ratio) 135 | 136 | 137 | def PCA_Jittering(img): 138 | img_size = img.size / 3 139 | # print(img.size, img_size) 140 | img1 = img.reshape(int(img_size), 3) 141 | img1 = np.transpose(img1) 142 | img_cov = np.cov([img1[0], img1[1], img1[2]]) 143 | # compute the eigenvalues and eigenvectors of the channel covariance matrix 144 | lamda, p = np.linalg.eig(img_cov) 145 | 146 | p = np.transpose(p) 147 | # draw normally distributed random numbers 148 | alpha1 = random.normalvariate(0, 0.05) 149 | alpha2 = random.normalvariate(0, 0.05) 150 | alpha3 = random.normalvariate(0, 0.05) 151 | v = np.transpose((alpha1 * lamda[0], alpha2 * lamda[1], alpha3 * lamda[2])) # add the perturbation 152 | add_num = np.dot(p, v) 153 | img2 = np.array([img[:, :, 0] + add_num[0], img[:, :, 1] + add_num[1], img[:, :, 2] + add_num[2]]) 154 | img2 = np.swapaxes(img2, 0, 2) 155 | img2 = np.swapaxes(img2, 0, 1) 156 | img2[img2 < 0] = 0 157 | img2[img2 > 255] = 255 158 | # max_t = np.max(img2[:, :, 0]) 159 | # min_t = np.min(img2[:, :, 0]) 160 | # img2[:, :, 0] = 255 / (max_t - min_t) * (img2[:, :, 0] - min_t) 161 | # 162 | # max_t = np.max(img2[:, :, 1]) 163 | # min_t = np.min(img2[:, :, 1]) 164 | # img2[:, :, 1] = 255 / (max_t - min_t) * (img2[:, :, 1] - min_t) 165 | # 166 | # max_t = np.max(img2[:, :, 2]) 167 | # min_t = np.min(img2[:, :, 2]) 168 | # img2[:, :, 2] = 255 / (max_t - min_t) * (img2[:, :, 2] - min_t) 169 | return img2 170 | 171 | if __name__ == '__main__': 172 | import os; from data import dataset as tube_dataset  # assumed import: the original never defines tube_dataset (TubeDataset lives in data/dataset.py) 173 | video = '002' 174 | color = ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3) 175 | root_path = os.path.join("/mnt/data/qzw/data/UCFSports/", 'Frames', video) 176 | image_list = [] 177 | data_path = "/mnt/data/qzw/data/UCFSports/" 178 | dataset = "UCFSports" 179 | train_dataset = tube_dataset.TubeDataset(dataset, data_path=data_path, phase='eval', 180 | modality='rgb', 181 | sequence_length=6) 182 | height, width = train_dataset.resolution[video] 183 | s = 10 184 | target = np.zeros(25) 185 | target[1:] = train_dataset.gttubes[video][0][0][s: s+6, 1: 5].reshape(-1) 186 | # keep target in pixel coordinates: random_crop below compares it against pixel crop bounds, 187 | # so the boxes are only rescaled when they are drawn further down 188 | for i in range(s, s+6): 189 | path = os.path.join(root_path, '%06d.jpg' % (i+1)) 190 | image = cv2.imread(path) 191 | # image = PCA_Jittering(image) 192 | image = color([image])[0]  # ColorJitter expects a list of images 193 | image_list += [image] 194 | image_list, target_new, _ = random_crop(image_list, target); crop_h, crop_w, _ = image_list[0].shape; image = np.concatenate(image_list, axis=2)  # random_crop takes the image list and returns three values 195 | image = cv2.resize(image, (300, 300), interpolation=cv2.INTER_LINEAR) 196 | height_new, width_new, _ = image.shape 197 | for i in range(6): 198 | p1 = (int(target_new[i * 4 + 1] / crop_w * width_new), int(target_new[i * 4 + 2] / crop_h * height_new)) 199 | p2 = (int(target_new[i * 4 + 3] / crop_w * width_new), int(target_new[i * 4 + 4] / crop_h * height_new)) 200 | im1 = (image[:, :, 3*i:3*(i+1)]).astype('uint8') 201 | cv2.rectangle(im1, p1, p2, (255, 0, 0)) 202 | cv2.imwrite('./image/test{}.jpg'.format(i), im1) 203 | # image_list += [image] 204 | 205 | # img_new = cv2.imread('./test_img.jpg') 206 | # print(img_new) 207 | ss = 0 208 |
-------------------------------------------------------------------------------- /error_handle.py: --------------------------------------------------------------------------------
1 | import numpy as np 2 | import pickle 3 | import os 4 | from utils import map_eval 5 | from utils import box 6 | import cv2 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | def error_analyse(dataset): 11 | error_images_path = "/mnt/data/qzw/result/pytorch-act-detector/{}/error_images/".format(dataset.DNAME)
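    # A note on the pickled result file loaded below (inferred from the indexing in
    # this function, not stated anywhere in the original): every row of
    # all_frame_boxes is expected to look like
    #   (video_index, frame_index, class_label, confidence, xmin, ymin, xmax, ymax),
    # where video_index is 1-based into dataset.videos_list and the box is in pixel
    # coordinates.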
12 | result_file = "/mnt/data/qzw/result/pytorch-act-detector/{}/all_frame_boxes_list-{}-{}-0.8601.pkl".format(dataset.DNAME, dataset.DNAME, dataset.modality) 13 | if not os.path.isfile(result_file): 14 | raise ValueError("file:{} not found".format(result_file)) 15 | with open(result_file, "rb") as file: 16 | all_frame_boxes_list = pickle.load(file) 17 | labels = dataset.get_labels() 18 | gt_tubes = dataset.get_gttubes() 19 | gt_dict, gt_label_num = map_eval.get_ground_truth(dataset.videos_list, labels, gt_tubes) 20 | all_frame_boxes = np.vstack(all_frame_boxes_list) 21 | frame_gt_box_dict = {} 22 | frame_error_box_dict = {} 23 | frame_correct_box_dict = {} 24 | label_pr_dict = {} 25 | for label in range(labels.__len__()): 26 | pre_idx = np.where(all_frame_boxes[:, 2] == label)[0] 27 | label_pre_box = all_frame_boxes[pre_idx] 28 | pre_idx = np.argsort(-label_pre_box[:, 3]) 29 | pr = np.empty((pre_idx.shape[0]+1, 2)) 30 | pr[0, 0] = 1.0 # precision 31 | pr[0, 1] = 0.0 # recall 32 | pr_cnt = 1 33 | fn = gt_label_num[label] 34 | fp = 0 35 | tp = 0 36 | video_name_list = [] 37 | for cnt, id in enumerate(pre_idx): 38 | pre_box = label_pre_box[id, :] 39 | video_name_list += [int(pre_box[0])-1] 40 | # video_label = list(dataset.gttubes[dataset.videos_list[int(pre_box[0])-1]].keys())[0] 41 | # if labels[label] == 'Run' and dataset.labels[video_label] == 'SkateBoarding': 42 | # continue 43 | positive = False 44 | if (int(pre_box[0]), int(pre_box[1]), int(pre_box[2])) in gt_dict: 45 | _gt = gt_dict[(int(pre_box[0]), int(pre_box[1]), int(pre_box[2]))] 46 | 47 | if (int(pre_box[0]), int(pre_box[1])) not in frame_gt_box_dict: 48 | frame_gt_box_dict[(int(pre_box[0]), int(pre_box[1]))] = [] 49 | frame_gt_box_dict[(int(pre_box[0]), int(pre_box[1]))] += _gt.copy() 50 | 51 | ious = np.zeros(_gt.__len__()) 52 | for i, g in enumerate(_gt): 53 | ious[i] = box.jaccard_overlap_boxes(pre_box[4:], g) 54 | i_max = np.argmax(ious) 55 | if ious[i_max] > 0.5: 56 | positive = True 57 | del _gt[i_max] 58 | if _gt.__len__() == 0: 59 | del gt_dict[(int(pre_box[0]), int(pre_box[1]), int(pre_box[2]))] 60 | if positive: 61 | tp += 1 62 | fn -= 1 63 | if (int(pre_box[0]), int(pre_box[1])) not in frame_correct_box_dict: 64 | frame_correct_box_dict[(int(pre_box[0]), int(pre_box[1]))] = [] 65 | frame_correct_box_dict[(int(pre_box[0]), int(pre_box[1]))] += [pre_box] 66 | else: 67 | fp += 1 68 | if (int(pre_box[0]), int(pre_box[1])) not in frame_error_box_dict: 69 | frame_error_box_dict[(int(pre_box[0]), int(pre_box[1]))] = [] 70 | frame_error_box_dict[(int(pre_box[0]), int(pre_box[1]))] += [pre_box] 71 | # video_name = dataset.videos_list[int(pre_box[0])-1] 72 | # video_label = list(dataset.gttubes[video_name].keys())[0] 73 | # err_image_root = os.path.join(error_images_path, video_name+"-"+dataset.labels[video_label]) 74 | # if os.path.exists(err_image_root) is not True: 75 | # os.mkdir(err_image_root) 76 | # image = cv2.imread(os.path.join(dataset.data_path, 'Frames', video_name, dataset.image_format % int(pre_box[1]))) 77 | # draw_rec_and_save_image(_gt, pre_box, dataset.labels[int(pre_box[2])], image, err_image_root) 78 | pr[pr_cnt, 0] = tp / (fp + tp) 79 | pr[pr_cnt, 1] = tp / (tp + fn) 80 | if labels[label] == 'SkateBoarding' and cnt < 1000: 81 | pause = 0 82 | image = cv2.imread(os.path.join(dataset.data_path, 'Frames', dataset.videos_list[int(pre_box[0]) - 1], 83 | dataset.image_format % int(pre_box[1]))) 84 | err_image_root = os.path.join(error_images_path, labels[label]) 85 | if os.path.exists(err_image_root) is not 
True: 86 | os.mkdir(err_image_root) 87 | if positive: 88 | draw_rec_and_save_image([], [pre_box], [], labels, image, err_image_root, cnt) 89 | else: 90 | draw_rec_and_save_image([], [], [pre_box], labels, image, err_image_root, cnt) 91 | pr_cnt += 1 92 | video_name_list = np.array(video_name_list).reshape(-1, 1) 93 | label_pr_dict[label] = pr 94 | # plt.cla() 95 | # plt.plot(pr[:, 1], pr[:, 0], color='blue') 96 | # plt.xlabel('recall') 97 | # plt.ylabel('precision') 98 | # plt.savefig('./{}.jpg'.format(labels[label])) 99 | # exit(0) 100 | # for i, video in enumerate(dataset.videos_list): 101 | # video_label = list(dataset.gttubes[video].keys())[0] 102 | # if dataset.labels[video_label] != 'Walk': 103 | # continue 104 | # nframes = os.listdir(os.path.join(dataset.data_path, 'Frames', video)).__len__() 105 | # print("video index:", i) 106 | # for j in range(nframes): 107 | # image = cv2.imread(os.path.join(dataset.data_path, 'Frames', video, dataset.image_format % int(j+1))) 108 | # video_label = list(dataset.gttubes[video].keys())[0] 109 | # err_image_root = os.path.join(error_images_path, video + "-" + dataset.labels[video_label]) 110 | # if os.path.exists(err_image_root) is not True: 111 | # os.mkdir(err_image_root) 112 | # if (i+1, j+1) in frame_gt_box_dict: 113 | # gt = frame_gt_box_dict[(i+1, j+1)] 114 | # else: 115 | # gt = [] 116 | # if (i+1, j+1) in frame_correct_box_dict: 117 | # cpb = frame_correct_box_dict[(i+1, j+1)] 118 | # else: 119 | # cpb = [] 120 | # if (i+1, j+1) in frame_error_box_dict: 121 | # epb = frame_error_box_dict[(i+1, j+1)] 122 | # else: 123 | # epb = [] 124 | # draw_rec_and_save_image(gt, cpb, epb, labels, image, err_image_root, j+1) 125 | 126 | ap = np.empty(labels.__len__()) 127 | for label in label_pr_dict: 128 | prdif = label_pr_dict[label][1:, 1] - label_pr_dict[label][:-1, 1] 129 | prsum = label_pr_dict[label][1:, 0] + label_pr_dict[label][:-1, 0] 130 | ap[label] = np.sum(prdif * prsum * 0.5) 131 | mmap = np.mean(ap) 132 | print("map:", mmap) 133 | return mmap 134 | 135 | 136 | def draw_rec_and_save_image(ground_truths, correct_pre_boxes, error_pre_boxes, labels, image, image_save_path, frame_index): 137 | for gt in ground_truths: 138 | p1 = (int(gt[0]), int(gt[1])) 139 | p2 = (int(gt[2]), int(gt[3])) 140 | cv2.rectangle(image, p1, p2, (0, 255, 0)) 141 | for pb in error_pre_boxes: 142 | if pb[3] < 0.1: 143 | continue 144 | p1 = (int(pb[4]), int(pb[5])) 145 | p2 = (int(pb[6]), int(pb[7])) 146 | pt = (int(pb[4]), int(pb[7])) 147 | cv2.rectangle(image, p1, p2, (0, 0, 255)) 148 | cv2.putText(image, "conf:%.3f " % pb[3] + labels[int(pb[2])], pt, cv2.FONT_HERSHEY_SIMPLEX, 0.4, (32, 32, 32)) 149 | for pb in correct_pre_boxes: 150 | p1 = (int(pb[4]), int(pb[5])) 151 | p2 = (int(pb[6]), int(pb[7])) 152 | pt = (int(pb[4]), int(pb[5]+10)) 153 | cv2.rectangle(image, p1, p2, (255, 0, 0)) 154 | cv2.putText(image, "conf:%.3f " % pb[3] + labels[int(pb[2])], pt, cv2.FONT_HERSHEY_SIMPLEX, 0.4, (132, 132, 32)) 155 | cv2.imwrite(os.path.join(image_save_path, "frame-{}.jpg").format(int(frame_index)), image) 156 | 157 | 158 | if __name__ == '__main__': 159 | # from data import dataset 160 | # dataset_name = 'UCFSports' 161 | # modality = 'rgb' 162 | # data_path = "/mnt/data/qzw/data/{}/".format(dataset_name) 163 | # analyse_data_set = dataset.TubeDataset(dataset_name, data_path=data_path, phase='eval', 164 | # modality=modality, 165 | # sequence_length=6) 166 | # error_analyse(analyse_data_set) 167 | import config 168 | from data import dataset 169 | args = config.Config() 170 
| with open('./error_random_crop_data.pkl', 'rb') as f: 171 | index = pickle.load(f) 172 | image_list = pickle.load(f) 173 | ground_truth = pickle.load(f) 174 | _random_crop_data = pickle.load(f) 175 | # _random_flip_data = pickle.load(f) 176 | train_dataset = dataset.TubeDataset(args.dataset, data_path=args.data_path, phase='train', 177 | modality=args.modality, 178 | sequence_length=6) 179 | a, b = train_dataset[index] 180 | 181 | 182 |
-------------------------------------------------------------------------------- /layers/__init__.py: --------------------------------------------------------------------------------
1 | from . import act_cuboid_loss 2 | from . import ssd 3 |
-------------------------------------------------------------------------------- /layers/act_cuboid_loss.py: --------------------------------------------------------------------------------
1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from utils import prior_tubes 6 | import pickle  # needed by forward() below for the inf/nan debug dump 7 | 8 | def get_tube_overlap(tube1, tube2, ioutable):  # fills ioutable in place with the mean per-frame IoU between each prior tube in tube1 and the ground-truth tube tube2 9 | ground_truth = tube2.expand(tube1.shape) 10 | total_tube = torch.cat([tube1.unsqueeze(0), ground_truth.unsqueeze(0)], dim=0) 11 | 12 | xmin = torch.max(total_tube[:, :, 0::4], dim=0)[0] 13 | ymin = torch.max(total_tube[:, :, 1::4], dim=0)[0] 14 | xmax = torch.min(total_tube[:, :, 2::4], dim=0)[0] 15 | ymax = torch.min(total_tube[:, :, 3::4], dim=0)[0] 16 | 17 | cross_area = torch.clamp(xmax - xmin, min=0)*torch.clamp(ymax - ymin, min=0) 18 | 19 | valid_area = cross_area.sum(dim=1) > 0 20 | valid = valid_area.unsqueeze(1).expand(tube1.shape) 21 | valid_priortubes = tube1[valid].view((-1, tube1.shape[1])) 22 | 23 | prior_area = (valid_priortubes[:, 2::4] - valid_priortubes[:, 0::4])*(valid_priortubes[:, 3::4] - valid_priortubes[:, 1::4]) 24 | valid = valid_area.unsqueeze(1).expand(-1, prior_area.shape[1]) 25 | valid_cross_area = cross_area[valid].view((-1, prior_area.shape[1])) 26 | 27 | gt_area = (tube2[2::4] - tube2[0::4])*(tube2[3::4] - tube2[1::4]) 28 | 29 | ratio = valid_cross_area/(gt_area + prior_area - valid_cross_area) 30 | ratio = ratio.sum(dim=1) 31 | ioutable[valid_area] = ratio/prior_area.shape[1] 32 | 33 | 34 | class CuboidLoss(nn.Module): 35 | def __init__(self, use_gpu, variance, num_class, k_frames): 36 | super(CuboidLoss, self).__init__() 37 | self.use_gpu = use_gpu 38 | self.variance = variance 39 | self.k_frames = k_frames 40 | self.num_class = num_class 41 | self.tubes_init = prior_tubes.RGB_TUBES(phase='train', use_gpu=use_gpu) 42 | 43 | 44 | def ACTMatchTube(self, prior_tubes, ground_truth): 45 | # prior_tubes.shape = (8396, 24); the same set of prior tubes is shared by every sample in the batch 46 | # ground_truth is a tensor; each sample carries exactly one ground-truth tube, so ground_truth.shape = (batch_num, 1 + sequence_length * 4) 47 | # if self.use_gpu: 48 | # _ground_truth = ground_truth.cpu().numpy() 49 | # else: 50 | # _ground_truth = ground_truth.numpy() 51 | batch_num = ground_truth.shape[0] 52 | prior_tubes_num = prior_tubes.shape[0] 53 | tubes_label = torch.zeros((batch_num, prior_tubes_num, self.num_class), dtype=torch.uint8) 54 | tubes_label_index = torch.zeros((batch_num, prior_tubes_num), dtype=torch.int64) 55 | tubes_label[:, :, 0] = 1 56 | positive_samples_index_list = [] 57 | N = 0 58 | if self.use_gpu: 59 | iou_table = torch.zeros(prior_tubes_num, dtype=torch.float32).cuda() 60 | else: 61 | iou_table = torch.zeros(prior_tubes_num, dtype=torch.float32) 62 | for i in range(batch_num): 63 | iou_table.fill_(0) 64 | # for prior in range(prior_tubes_num): 65 | # 
iou_table[prior] = get_tube_overlap(prior_tubes[prior, :], _ground_truth[i, 1:], self.k_frames) 66 | get_tube_overlap(prior_tubes, ground_truth[i, 1:], iou_table) 67 | positive_sample_index = [] 68 | max_prior_index = torch.argmax(iou_table, 0) 69 | positive_sample_index += [(max_prior_index, i)] 70 | tubes_label_index[i, max_prior_index] = int(ground_truth[i, 0]) 71 | tubes_label[i, max_prior_index, int(ground_truth[i, 0])] = 1 72 | tubes_label[i, max_prior_index, 0] = 0 73 | pp = torch.argsort(-iou_table) 74 | for tt in pp: 75 | if iou_table[tt] < 0.5: 76 | break 77 | if tubes_label[i, tt, 0] == 1: 78 | positive_sample_index += [(tt, i)] 79 | tubes_label_index[i, tt] = int(ground_truth[i, 0]) 80 | tubes_label[i, tt, int(ground_truth[i, 0])] = 1 81 | tubes_label[i, tt, 0] = 0 82 | N += positive_sample_index.__len__() 83 | positive_samples_index_list += [torch.tensor(positive_sample_index)] 84 | if N == 0: 85 | print("no positive samples!") 86 | exit(-1) 87 | return positive_samples_index_list, tubes_label, tubes_label_index, N 88 | 89 | def ACTComputeConfLoss(self, conf_preds, tubes_label): 90 | conf_preds_max = torch.max(conf_preds, dim=-1)[0].unsqueeze(-1) 91 | my_conf_preds = F.softmax(conf_preds - conf_preds_max, dim=-1) 92 | aa = my_conf_preds[tubes_label] 93 | aa = aa.view(tubes_label.shape[0], tubes_label.shape[1]) 94 | tubes_loss = -torch.log(aa + 0.000001) 95 | return tubes_loss 96 | 97 | def ACTMineHardExamples(self, tubes_loss, positive_samples_index_list): 98 | negtive_samples_index_list = [] 99 | if self.use_gpu: 100 | tubes_loss = tubes_loss.cpu().detach().numpy() 101 | else: 102 | tubes_loss = tubes_loss.detach().numpy() 103 | for i in range(tubes_loss.shape[0]): 104 | positive_sample_index = positive_samples_index_list[i] 105 | positive_num = positive_sample_index.shape[0] 106 | negtive_num = 3 * positive_num 107 | hard_examples_index = [] 108 | tube_loss = tubes_loss[i, :] 109 | max_index = np.argsort(-tube_loss) 110 | negtive_count = 0 111 | for index in max_index: 112 | if index not in positive_sample_index[:, 0]: 113 | hard_examples_index += [index] 114 | negtive_count += 1 115 | if negtive_count >= negtive_num: 116 | break 117 | negtive_samples_index_list += [np.array(hard_examples_index)] 118 | return negtive_samples_index_list 119 | 120 | def ACTGetLocLoss(self, loc_preds, positive_samples_index_list, prior_tubes, ground_truth): 121 | # ground_truth is a list ,,its len is batch_num, its element.shape = gt_num*(1+24), the first is label 122 | batch_num = loc_preds.shape[0] 123 | if self.use_gpu: 124 | # _prior_tubes = torch.from_numpy(prior_tubes).cuda() 125 | encode_loc = torch.zeros(loc_preds.shape, requires_grad=False).cuda() 126 | pos_index = torch.zeros(loc_preds.shape, dtype=torch.uint8).cuda() 127 | else: 128 | # _prior_tubes = torch.from_numpy(prior_tubes) 129 | encode_loc = torch.zeros(loc_preds.shape, requires_grad=False) 130 | pos_index = torch.zeros(loc_preds.shape, dtype=torch.uint8) 131 | for i in range(batch_num): 132 | positive_samples_index = positive_samples_index_list[i] 133 | for j in range(positive_samples_index.shape[0]): 134 | pos_index[i, positive_samples_index[j, 0], :] = 1 135 | self.EncodeTube(prior_tubes[positive_samples_index[j, 0], :], ground_truth[i, 1:], 136 | encode_loc[i, positive_samples_index[j, 0], :]) 137 | loc_p = loc_preds[pos_index].view(-1, self.k_frames * 4) 138 | loc_t = encode_loc[pos_index].view(-1, self.k_frames * 4) 139 | loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum') / self.k_frames 140 | return loss_l 141 | 
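    # Summary of the matching pipeline implemented above, which follows the usual
    # SSD hard-negative-mining recipe: ACTMatchTube marks as positives the single
    # best-overlapping prior tube plus every prior whose tube IoU is at least 0.5;
    # ACTMineHardExamples keeps, per sample, the 3x-positives unmatched priors with
    # the largest confidence loss as hard negatives; ACTGetConfLoss below then
    # evaluates the cross-entropy only over that positive + hard-negative subset,
    # while the localization loss above uses the positives alone.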
142 | def ACTGetConfLoss(self, conf_preds, positive_samples_index_list, negtive_samples_index_list, tubes_label): 143 | ''' 144 | :param conf_preds: 145 | :param positive_samples_index_list: 146 | :param negtive_samples_index_list: 147 | :param tubes_label: (batch_num * 8396) 148 | :return: 149 | ''' 150 | batch_num = conf_preds.shape[0] 151 | prior_num = conf_preds.shape[1] 152 | conf_pos_index = torch.zeros(conf_preds.shape, dtype=torch.uint8) 153 | target_pos_index = torch.zeros((batch_num, prior_num), dtype=torch.uint8) 154 | for i in range(batch_num): 155 | positive_samples_index = positive_samples_index_list[i] 156 | negtive_samples_index = negtive_samples_index_list[i] 157 | for j in range(positive_samples_index.shape[0]): 158 | conf_pos_index[i, positive_samples_index[j, 0], :] = 1 159 | target_pos_index[i, positive_samples_index[j, 0]] = 1 160 | for j in range(negtive_samples_index.shape[0]): 161 | conf_pos_index[i, negtive_samples_index[j], :] = 1 162 | target_pos_index[i, negtive_samples_index[j]] = 1 163 | tubes_label.requires_grad = False 164 | if self.use_gpu: 165 | tubes_label = tubes_label.cuda() 166 | conf_p = conf_preds[conf_pos_index].view(-1, self.num_class) 167 | target_weights = tubes_label[target_pos_index] 168 | loss_c = F.cross_entropy(conf_p, target_weights, reduction='mean') 169 | return loss_c 170 | 171 | def EncodeTube(self, prior_tube, gt_tube, encode): 172 | ''' 173 | prior_tube=(xmin, ymin, xmax, ymax)*sequence_length 174 | gt_tube=(xmin, ymin, xmax, ymax)*sequence_length 175 | ''' 176 | # encode = torch.zeros_like(prior_tube) 177 | p_x_min = prior_tube[0::4] 178 | p_y_min = prior_tube[1::4] 179 | p_x_max = prior_tube[2::4] 180 | p_y_max = prior_tube[3::4] 181 | prior_center_x = (p_x_min + p_x_max) / 2 182 | prior_center_y = (p_y_max + p_y_min) / 2 183 | prior_w = p_x_max - p_x_min 184 | prior_h = p_y_max - p_y_min 185 | 186 | g_x_min = gt_tube[0::4] 187 | g_y_min = gt_tube[1::4] 188 | g_x_max = gt_tube[2::4] 189 | g_y_max = gt_tube[3::4] 190 | gt_center_x = (g_x_min + g_x_max) / 2 191 | gt_center_y = (g_y_min + g_y_max) / 2 192 | gt_w = g_x_max - g_x_min 193 | gt_h = g_y_max - g_y_min 194 | 195 | encode[0::4] = (gt_center_x - prior_center_x) / prior_w / self.variance[0] 196 | encode[1::4] = (gt_center_y - prior_center_y) / prior_h / self.variance[1] 197 | encode[2::4] = torch.log(gt_w / prior_w) / self.variance[2] 198 | encode[3::4] = torch.log(gt_h / prior_h) / self.variance[3] 199 | 200 | def forward(self, output, ground_truth): 201 | loc_preds, conf_preds = output 202 | positive_samples_index_list, tubes_label, tubes_label_index, N = self.ACTMatchTube(self.tubes_init.all_tubes, ground_truth) 203 | loss_l = self.ACTGetLocLoss(loc_preds, positive_samples_index_list, self.tubes_init.all_tubes, ground_truth) 204 | tubes_loss = self.ACTComputeConfLoss(conf_preds, tubes_label) 205 | negtive_samples_index_list = self.ACTMineHardExamples(tubes_loss, positive_samples_index_list) 206 | loss_c = self.ACTGetConfLoss(conf_preds, positive_samples_index_list, negtive_samples_index_list, tubes_label_index) 207 | loss_l /= N 208 | # loss_c /= N 209 | if torch.isinf(loss_l) or torch.isnan(loss_l) or torch.isinf(loss_c) or torch.isnan(loss_c): 210 | with open("./inf_temp_data.pkl", 'wb') as f: 211 | pickle.dump(loc_preds.cpu().detach(), f) 212 | pickle.dump(conf_preds.cpu().detach(), f) 213 | pickle.dump(ground_truth.cpu().detach(), f) 214 | print("get inf or nan data!!") 215 | exit(-1) 216 | return loss_l, loss_c 217 | 218 | 219 | if __name__ == '__main__': 220 | import 
pickle 221 | 222 | with open("./debugfile.pkl", 'rb') as f: 223 | tube1 = pickle.load(f) 224 | tube2 = pickle.load(f) 225 | iou_table = torch.zeros(tube1.shape[0], dtype=torch.float32); get_tube_overlap(tube1, tube2, iou_table)  # the third argument is the output buffer, not the frame count 226 |
-------------------------------------------------------------------------------- /layers/ssd.py: --------------------------------------------------------------------------------
1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import pickle 5 | import numpy as np 6 | 7 | 8 | class scale_norm(nn.Module):  # SSD-style L2Norm layer: channel-wise L2 normalisation with a learned per-channel scale (initialised to 20) 9 | def __init__(self, channels): 10 | super(scale_norm, self).__init__() 11 | temp = torch.zeros(channels) 12 | temp.fill_(20) 13 | self.scale = nn.Parameter(temp.reshape(1, channels, 1, 1)) 14 | 15 | def forward(self, input): 16 | output = F.normalize(input, p=2, dim=1) 17 | return output * self.scale 18 | 19 | 20 | class SSD_NET(nn.Module): 21 | def __init__(self, dataset, frezze_init, num_classes=11, modality='rgb', k=6): 22 | super(SSD_NET, self).__init__() 23 | self.frezze_init = frezze_init 24 | self.k_frames = k 25 | self.dataset = dataset 26 | self.modality = modality 27 | if modality == 'rgb': 28 | self.rgb = True 29 | elif modality == 'flow': 30 | self.rgb = False 31 | else: 32 | print("modality value error!!") 33 | exit(-1) 34 | self.num_classes = num_classes 35 | if self.rgb: 36 | self.in_channels = 3 37 | self.layer_name = 'frame' 38 | else: 39 | self.in_channels = 15 40 | self.layer_name = 'flow' 41 | 42 | self.__setattr__('conv1_1_{}'.format(self.layer_name), 43 | nn.Conv2d(in_channels=self.in_channels, out_channels=64, kernel_size=3, stride=1, padding=1)) 44 | self.__setattr__('relu1_1_{}'.format(self.layer_name), nn.ReLU()) 45 | self.__setattr__('conv1_2_{}'.format(self.layer_name), 46 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)) 47 | self.__setattr__('relu1_2_{}'.format(self.layer_name), nn.ReLU()) 48 | self.__setattr__('pool1_{}'.format(self.layer_name), nn.MaxPool2d(kernel_size=2, stride=2)) 49 | ##################################### 50 | 51 | self.__setattr__('conv2_1_{}'.format(self.layer_name), 52 | nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)) 53 | self.__setattr__('relu2_1_{}'.format(self.layer_name), nn.ReLU()) 54 | self.__setattr__('conv2_2_{}'.format(self.layer_name), 55 | nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)) 56 | self.__setattr__('relu2_2_{}'.format(self.layer_name), nn.ReLU()) 57 | self.__setattr__('pool2_{}'.format(self.layer_name), nn.MaxPool2d(kernel_size=2, stride=2)) 58 | ##################################### 59 | 60 | self.__setattr__('conv3_1_{}'.format(self.layer_name), 61 | nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)) 62 | self.__setattr__('relu3_1_{}'.format(self.layer_name), nn.ReLU()) 63 | self.__setattr__('conv3_2_{}'.format(self.layer_name), 64 | nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)) 65 | self.__setattr__('relu3_2_{}'.format(self.layer_name), nn.ReLU()) 66 | self.__setattr__('conv3_3_{}'.format(self.layer_name), 67 | nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)) 68 | self.__setattr__('relu3_3_{}'.format(self.layer_name), nn.ReLU()) 69 | self.__setattr__('pool3_{}'.format(self.layer_name), nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)) 70 | ##################################### 71 | 72 | self.__setattr__('conv4_1_{}'.format(self.layer_name), 73 | nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1,
padding=1)) 74 | self.__setattr__('relu4_1_{}'.format(self.layer_name), nn.ReLU()) 75 | self.__setattr__('conv4_2_{}'.format(self.layer_name), 76 | nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)) 77 | self.__setattr__('relu4_2_{}'.format(self.layer_name), nn.ReLU()) 78 | self.__setattr__('conv4_3_{}'.format(self.layer_name), 79 | nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)) 80 | self.__setattr__('relu4_3_{}'.format(self.layer_name), nn.ReLU()) 81 | self.__setattr__('conv4_3_norm_{}'.format(self.layer_name), scale_norm(512)) 82 | self.__setattr__('pool4_{}'.format(self.layer_name), nn.MaxPool2d(kernel_size=2, stride=2)) 83 | ##################################### 84 | 85 | self.__setattr__('conv5_1_{}'.format(self.layer_name), 86 | nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)) 87 | self.__setattr__('relu5_1_{}'.format(self.layer_name), nn.ReLU()) 88 | self.__setattr__('conv5_2_{}'.format(self.layer_name), 89 | nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)) 90 | self.__setattr__('relu5_2_{}'.format(self.layer_name), nn.ReLU()) 91 | self.__setattr__('conv5_3_{}'.format(self.layer_name), 92 | nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)) 93 | self.__setattr__('relu5_3_{}'.format(self.layer_name), nn.ReLU()) 94 | self.__setattr__('pool5_{}'.format(self.layer_name), nn.MaxPool2d(kernel_size=2, stride=1)) 95 | ##################################### 96 | 97 | self.__setattr__('fc_conv6_{}'.format(self.layer_name), 98 | nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, 99 | dilation=6, 100 | padding=6)) 101 | self.__setattr__('fc_relu6_{}'.format(self.layer_name), nn.ReLU()) 102 | self.__setattr__('fc_conv7_{}'.format(self.layer_name), 103 | nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1, stride=1, dilation=1, 104 | padding=0)) 105 | self.__setattr__('fc_relu7_{}'.format(self.layer_name), nn.ReLU()) 106 | ##################################### 107 | 108 | self.__setattr__('conv6_1_{}'.format(self.layer_name), 109 | nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=1, stride=1, padding=0)) 110 | self.__setattr__('relu6_1_{}'.format(self.layer_name), nn.ReLU()) 111 | self.__setattr__('conv6_2_{}'.format(self.layer_name), 112 | nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1)) 113 | self.__setattr__('relu6_2_{}'.format(self.layer_name), nn.ReLU()) 114 | 115 | self.__setattr__('conv7_1_{}'.format(self.layer_name), 116 | nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1, stride=1, padding=0)) 117 | self.__setattr__('relu7_1_{}'.format(self.layer_name), nn.ReLU()) 118 | self.__setattr__('conv7_2_{}'.format(self.layer_name), 119 | nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1)) 120 | self.__setattr__('relu7_2_{}'.format(self.layer_name), nn.ReLU()) 121 | 122 | self.__setattr__('conv8_1_{}'.format(self.layer_name), 123 | nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1, stride=1, padding=0)) 124 | self.__setattr__('relu8_1_{}'.format(self.layer_name), nn.ReLU()) 125 | self.__setattr__('conv8_2_{}'.format(self.layer_name), 126 | nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=0)) 127 | self.__setattr__('relu8_2_{}'.format(self.layer_name), nn.ReLU()) 128 | 129 | self.__setattr__('conv9_1_{}'.format(self.layer_name), 130 | nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1, 
stride=1, padding=0)) 131 | self.__setattr__('relu9_1_{}'.format(self.layer_name), nn.ReLU()) 132 | self.__setattr__('conv9_2_{}'.format(self.layer_name), 133 | nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=0)) 134 | self.__setattr__('relu9_2_{}'.format(self.layer_name), nn.ReLU()) 135 | ##################################### 136 | self.conv4_3_norm_loc_conv = nn.Conv2d(in_channels=3072, out_channels=96, kernel_size=3, stride=1, padding=1) 137 | self.conv4_3_norm_conf_conv = nn.Conv2d(in_channels=3072, out_channels=self.num_classes*4, kernel_size=3, stride=1, padding=1) 138 | 139 | self.fc_conv7_loc_conv = nn.Conv2d(in_channels=6144, out_channels=144, kernel_size=3, stride=1, padding=1) 140 | self.fc_conv7_conf_conv = nn.Conv2d(in_channels=6144, out_channels=self.num_classes*6, kernel_size=3, stride=1, padding=1) 141 | 142 | self.conv6_loc_conv = nn.Conv2d(in_channels=3072, out_channels=144, kernel_size=3, stride=1, padding=1) 143 | self.conv6_conf_conv = nn.Conv2d(in_channels=3072, out_channels=self.num_classes*6, kernel_size=3, stride=1, padding=1) 144 | 145 | self.conv7_loc_conv = nn.Conv2d(in_channels=1536, out_channels=144, kernel_size=3, stride=1, padding=1) 146 | self.conv7_conf_conv = nn.Conv2d(in_channels=1536, out_channels=self.num_classes*6, kernel_size=3, stride=1, padding=1) 147 | 148 | self.conv8_loc_conv = nn.Conv2d(in_channels=1536, out_channels=96, kernel_size=3, stride=1, padding=1) 149 | self.conv8_conf_conv = nn.Conv2d(in_channels=1536, out_channels=self.num_classes*4, kernel_size=3, stride=1, padding=1) 150 | 151 | self.conv9_loc_conv = nn.Conv2d(in_channels=1536, out_channels=96, kernel_size=3, stride=1, padding=1) 152 | self.conv9_conf_conv = nn.Conv2d(in_channels=1536, out_channels=self.num_classes*4, kernel_size=3, stride=1, padding=1) 153 | 154 | def copy_weights(self, conv_name_caffe, conv_name_pytorch, init_dict): 155 | my_conv = self.__getattr__(conv_name_pytorch) 156 | conv_name_caffe_list = init_dict[conv_name_caffe] 157 | caffe_weight = conv_name_caffe_list[0] 158 | my_conv.weight.data.copy_(torch.from_numpy(caffe_weight)) 159 | if conv_name_caffe_list.__len__() == 2: 160 | caffe_bias = init_dict[conv_name_caffe][1] 161 | else: 162 | caffe_bias = np.zeros_like(my_conv.bias.data.numpy()) 163 | my_conv.bias.data.copy_(torch.from_numpy(caffe_bias)) 164 | 165 | def load_trained_weights(self, pkl_file): 166 | print('load trained weights^^^^^^') 167 | f = open(pkl_file, 'rb') 168 | init_dict = pickle.load(f, encoding='iso-8859-1') 169 | f.close() 170 | if self.rgb is True: 171 | cn = '' 172 | else: 173 | cn = 'flow' 174 | for j in range(1, 3): 175 | self.copy_weights('conv{}_1_stream{}{}'.format(j, 0, cn), 'conv{}_1_{}'.format(j, self.layer_name), init_dict) 176 | self.copy_weights('conv{}_2_stream{}{}'.format(j, 0, cn), 'conv{}_2_{}'.format(j, self.layer_name), init_dict) 177 | 178 | for j in range(3, 6): 179 | self.copy_weights('conv{}_1_stream{}{}'.format(j, 0, cn), 'conv{}_1_{}'.format(j, self.layer_name), init_dict) 180 | self.copy_weights('conv{}_2_stream{}{}'.format(j, 0, cn), 'conv{}_2_{}'.format(j, self.layer_name), init_dict) 181 | self.copy_weights('conv{}_3_stream{}{}'.format(j, 0, cn), 'conv{}_3_{}'.format(j, self.layer_name), init_dict) 182 | 183 | self.copy_weights('fc6_stream{}{}'.format(0, cn), 'fc_conv6_{}'.format(self.layer_name), init_dict) 184 | self.copy_weights('fc7_stream{}{}'.format(0, cn), 'fc_conv7_{}'.format(self.layer_name), init_dict) 185 | 186 | for j in range(6, 10): 187 | 
self.copy_weights('conv{}_1_stream{}{}'.format(j, 0, cn), 'conv{}_1_{}'.format(j, self.layer_name), init_dict) 188 | self.copy_weights('conv{}_2_stream{}{}'.format(j, 0, cn), 'conv{}_2_{}'.format(j, self.layer_name), init_dict) 189 | self.copy_weights('conv4_3_norm_concat_mbox_conf', 'conv4_3_norm_conf_conv', init_dict) 190 | self.copy_weights('conv4_3_norm_concat_mbox_loc', 'conv4_3_norm_loc_conv', init_dict) 191 | self.copy_weights('fc7_concat_mbox_loc', 'fc_conv7_loc_conv', init_dict) 192 | self.copy_weights('fc7_concat_mbox_conf', 'fc_conv7_conf_conv', init_dict) 193 | for j in range(6, 10): 194 | self.copy_weights('conv{}_2_concat_mbox_conf'.format(j), 'conv{}_conf_conv'.format(j), init_dict) 195 | self.copy_weights('conv{}_2_concat_mbox_loc'.format(j), 'conv{}_loc_conv'.format(j), init_dict) 196 | for j in range(0, 6): 197 | my_norm = self.__getattr__('conv4_3_norm_{}'.format(self.layer_name)) 198 | caffe_weight = init_dict['conv4_3_norm_stream{}{}'.format(j, cn)][0] 199 | my_norm.scale.data.copy_(torch.from_numpy(caffe_weight).reshape(1, 512, 1, 1)) 200 | torch.save(self.state_dict(), './pytorch-models/{}/{}-trained-model-{}-pytorch-single.pkl'.format(self.dataset, self.dataset, self.modality)) 201 | print("pytorch model saved!!!") 202 | exit(0) 203 | 204 | def load_init_weights(self, pkl_file): 205 | print("load_init_weights") 206 | init_weights = torch.load(pkl_file) 207 | if self.rgb is True: 208 | cn = '' 209 | else: 210 | cn = 'flow' 211 | for j in range(1, 3): 212 | for k in range(1, 3): 213 | conv = self.__getattr__('conv{}_{}_{}'.format(j, k, self.layer_name)) 214 | conv.weight.data.copy_(init_weights['conv{}_{}_stream{}{}.weight'.format(j, k, 0, cn)]) 215 | conv.bias.data.copy_(init_weights['conv{}_{}_stream{}{}.bias'.format(j, k, 0, cn)]) 216 | for j in range(3, 6): 217 | for k in range(1, 4): 218 | conv = self.__getattr__('conv{}_{}_{}'.format(j, k, self.layer_name)) 219 | conv.weight.data.copy_(init_weights['conv{}_{}_stream{}{}.weight'.format(j, k, 0, cn)]) 220 | conv.bias.data.copy_(init_weights['conv{}_{}_stream{}{}.bias'.format(j, k, 0, cn)]) 221 | for j in range(6, 8): 222 | conv = self.__getattr__('fc_conv{}_{}'.format(j, self.layer_name)) 223 | conv.weight.data.copy_(init_weights['fc{}_stream{}.weight'.format(j, 0)]) 224 | conv.bias.data.copy_(init_weights['fc{}_stream{}.bias'.format(j, 0)]) 225 | print("load ok!, save it!") 226 | torch.save(self.state_dict(), '/home/qzw/code/my-act-detector/pytorch-models/{}/{}-init-model-{}-pytorch-single.pkl'.format(self.dataset, self.dataset, self.modality)) 227 | print("pytorch model saved!!!") 228 | exit(0) 229 | 230 | def forward(self, input): 231 | conv4_3_list = [] 232 | fc_conv7_list = [] 233 | conv6_list = [] 234 | conv7_list = [] 235 | conv8_list = [] 236 | conv9_list = [] 237 | 238 | for i in range(self.k_frames): 239 | output = input[:, self.in_channels * i:self.in_channels * (i + 1), :, :] 240 | output = self.__getattr__('conv1_1_{}'.format(self.layer_name))(output) 241 | output = self.__getattr__('relu1_1_{}'.format(self.layer_name))(output) 242 | output = self.__getattr__('conv1_2_{}'.format(self.layer_name))(output) 243 | output = self.__getattr__('relu1_2_{}'.format(self.layer_name))(output) 244 | output = self.__getattr__('pool1_{}'.format(self.layer_name))(output) 245 | 246 | output = self.__getattr__('conv2_1_{}'.format(self.layer_name))(output) 247 | output = self.__getattr__('relu2_1_{}'.format(self.layer_name))(output) 248 | output = self.__getattr__('conv2_2_{}'.format(self.layer_name))(output) 249 | 
output = self.__getattr__('relu2_2_{}'.format(self.layer_name))(output) 250 | output = self.__getattr__('pool2_{}'.format(self.layer_name))(output) 251 | 252 | output = self.__getattr__('conv3_1_{}'.format(self.layer_name))(output) 253 | output = self.__getattr__('relu3_1_{}'.format(self.layer_name))(output) 254 | output = self.__getattr__('conv3_2_{}'.format(self.layer_name))(output) 255 | output = self.__getattr__('relu3_2_{}'.format(self.layer_name))(output) 256 | output = self.__getattr__('conv3_3_{}'.format(self.layer_name))(output) 257 | output = self.__getattr__('relu3_3_{}'.format(self.layer_name))(output) 258 | output = self.__getattr__('pool3_{}'.format(self.layer_name))(output) 259 | 260 | output = self.__getattr__('conv4_1_{}'.format(self.layer_name))(output) 261 | output = self.__getattr__('relu4_1_{}'.format(self.layer_name))(output) 262 | output = self.__getattr__('conv4_2_{}'.format(self.layer_name))(output) 263 | output = self.__getattr__('relu4_2_{}'.format(self.layer_name))(output) 264 | output = self.__getattr__('conv4_3_{}'.format(self.layer_name))(output) 265 | output = self.__getattr__('relu4_3_{}'.format(self.layer_name))(output) 266 | conv4_3_list.append(self.__getattr__('conv4_3_norm_{}'.format(self.layer_name))(output)) 267 | 268 | output = self.__getattr__('pool4_{}'.format(self.layer_name))(output) 269 | 270 | output = self.__getattr__('conv5_1_{}'.format(self.layer_name))(output) 271 | output = self.__getattr__('relu5_1_{}'.format(self.layer_name))(output) 272 | output = self.__getattr__('conv5_2_{}'.format(self.layer_name))(output) 273 | output = self.__getattr__('relu5_2_{}'.format(self.layer_name))(output) 274 | output = self.__getattr__('conv5_3_{}'.format(self.layer_name))(output) 275 | output = self.__getattr__('relu5_3_{}'.format(self.layer_name))(output) 276 | output = self.__getattr__('pool5_{}'.format(self.layer_name))(output) 277 | 278 | conv6 = self.__getattr__('fc_conv6_{}'.format(self.layer_name)) 279 | conv6.dilation = (6 ** (i + 1), 6 ** (i + 1)) 280 | conv6.padding = (6 ** (i + 1), 6 ** (i + 1)) 281 | output = conv6(output) 282 | output = self.__getattr__('fc_relu6_{}'.format(self.layer_name))(output) 283 | output = self.__getattr__('fc_conv7_{}'.format(self.layer_name))(output) 284 | output = self.__getattr__('fc_relu7_{}'.format(self.layer_name))(output) 285 | fc_conv7_list.append(output) 286 | 287 | output = self.__getattr__('conv6_1_{}'.format(self.layer_name))(output) 288 | output = self.__getattr__('relu6_1_{}'.format(self.layer_name))(output) 289 | output = self.__getattr__('conv6_2_{}'.format(self.layer_name))(output) 290 | output = self.__getattr__('relu6_2_{}'.format(self.layer_name))(output) 291 | conv6_list.append(output) 292 | 293 | output = self.__getattr__('conv7_1_{}'.format(self.layer_name))(output) 294 | output = self.__getattr__('relu7_1_{}'.format(self.layer_name))(output) 295 | output = self.__getattr__('conv7_2_{}'.format(self.layer_name))(output) 296 | output = self.__getattr__('relu7_2_{}'.format(self.layer_name))(output) 297 | conv7_list.append(output) 298 | 299 | output = self.__getattr__('conv8_1_{}'.format(self.layer_name))(output) 300 | output = self.__getattr__('relu8_1_{}'.format(self.layer_name))(output) 301 | output = self.__getattr__('conv8_2_{}'.format(self.layer_name))(output) 302 | output = self.__getattr__('relu8_2_{}'.format(self.layer_name))(output) 303 | conv8_list.append(output) 304 | 305 | output = self.__getattr__('conv9_1_{}'.format(self.layer_name))(output) 306 | output = 
self.__getattr__('relu9_1_{}'.format(self.layer_name))(output) 307 | output = self.__getattr__('conv9_2_{}'.format(self.layer_name))(output) 308 | output = self.__getattr__('relu9_2_{}'.format(self.layer_name))(output) 309 | conv9_list.append(output) 310 | conv4_3_fm = torch.cat(conv4_3_list, dim=1).contiguous() 311 | fc_conv7_fm = torch.cat(fc_conv7_list, dim=1).contiguous() 312 | conv6_fm = torch.cat(conv6_list, dim=1).contiguous() 313 | conv7_fm = torch.cat(conv7_list, dim=1).contiguous() 314 | conv8_fm = torch.cat(conv8_list, dim=1).contiguous() 315 | conv9_fm = torch.cat(conv9_list, dim=1).contiguous() 316 | 317 | conv4_3_norm_localization = self.conv4_3_norm_loc_conv(conv4_3_fm) 318 | conv4_3_norm_localization = conv4_3_norm_localization.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.k_frames*4) 319 | conv4_3_norm_confidence = self.conv4_3_norm_conf_conv(conv4_3_fm) 320 | conv4_3_norm_confidence = conv4_3_norm_confidence.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.num_classes) 321 | # conv4_3_norm_confidence = F.softmax(conv4_3_norm_confidence, dim=-1) 322 | 323 | fc_conv7_localization = self.fc_conv7_loc_conv(fc_conv7_fm) 324 | fc_conv7_localization = fc_conv7_localization.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.k_frames*4) 325 | fc_conv7_confidence = self.fc_conv7_conf_conv(fc_conv7_fm) 326 | fc_conv7_confidence = fc_conv7_confidence.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.num_classes) 327 | # fc_conv7_confidence = F.softmax(fc_conv7_confidence, dim=-1) 328 | 329 | conv6_localization = self.conv6_loc_conv(conv6_fm) 330 | conv6_localization = conv6_localization.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.k_frames*4) 331 | conv6_confidence = self.conv6_conf_conv(conv6_fm) 332 | conv6_confidence = conv6_confidence.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.num_classes) 333 | # conv6_confidence = F.softmax(conv6_confidence, dim=-1) 334 | 335 | conv7_localization = self.conv7_loc_conv(conv7_fm) 336 | conv7_localization = conv7_localization.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.k_frames*4) 337 | conv7_confidence = self.conv7_conf_conv(conv7_fm) 338 | conv7_confidence = conv7_confidence.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.num_classes) 339 | # conv7_confidence = F.softmax(conv7_confidence, dim=-1) 340 | 341 | conv8_localization = self.conv8_loc_conv(conv8_fm) 342 | conv8_localization = conv8_localization.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.k_frames*4) 343 | conv8_confidence = self.conv8_conf_conv(conv8_fm) 344 | conv8_confidence = conv8_confidence.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.num_classes) 345 | # conv8_confidence = F.softmax(conv8_confidence, dim=-1) 346 | 347 | conv9_localization = self.conv9_loc_conv(conv9_fm) 348 | conv9_localization = conv9_localization.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.k_frames*4) 349 | conv9_confidence = self.conv9_conf_conv(conv9_fm) 350 | conv9_confidence = conv9_confidence.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.num_classes) 351 | # conv9_confidence = F.softmax(conv9_confidence, dim=-1) 352 | 353 | loc_preds = torch.cat([conv4_3_norm_localization, fc_conv7_localization, conv6_localization, conv7_localization, 354 | conv8_localization, conv9_localization], dim=1) 355 | conf_preds = torch.cat([conv4_3_norm_confidence, fc_conv7_confidence, conv6_confidence, conv7_confidence, 356 | 
conv8_confidence, conv9_confidence], dim=1) 357 | return loc_preds, conf_preds 358 | 359 | def get_feature_map(self, input, conv6_dilation): 360 | conv6 = self.__getattr__('fc_conv6_{}'.format(self.layer_name)) 361 | conv6.dilation = conv6_dilation 362 | conv6.padding = conv6_dilation 363 | output = self.__getattr__('conv1_1_{}'.format(self.layer_name))(input) 364 | output = self.__getattr__('relu1_1_{}'.format(self.layer_name))(output) 365 | output = self.__getattr__('conv1_2_{}'.format(self.layer_name))(output) 366 | output = self.__getattr__('relu1_2_{}'.format(self.layer_name))(output) 367 | output = self.__getattr__('pool1_{}'.format(self.layer_name))(output) 368 | 369 | output = self.__getattr__('conv2_1_{}'.format(self.layer_name))(output) 370 | output = self.__getattr__('relu2_1_{}'.format(self.layer_name))(output) 371 | output = self.__getattr__('conv2_2_{}'.format(self.layer_name))(output) 372 | output = self.__getattr__('relu2_2_{}'.format(self.layer_name))(output) 373 | output = self.__getattr__('pool2_{}'.format(self.layer_name))(output) 374 | 375 | output = self.__getattr__('conv3_1_{}'.format(self.layer_name))(output) 376 | output = self.__getattr__('relu3_1_{}'.format(self.layer_name))(output) 377 | output = self.__getattr__('conv3_2_{}'.format(self.layer_name))(output) 378 | output = self.__getattr__('relu3_2_{}'.format(self.layer_name))(output) 379 | output = self.__getattr__('conv3_3_{}'.format(self.layer_name))(output) 380 | output = self.__getattr__('relu3_3_{}'.format(self.layer_name))(output) 381 | output = self.__getattr__('pool3_{}'.format(self.layer_name))(output) 382 | 383 | output = self.__getattr__('conv4_1_{}'.format(self.layer_name))(output) 384 | output = self.__getattr__('relu4_1_{}'.format(self.layer_name))(output) 385 | output = self.__getattr__('conv4_2_{}'.format(self.layer_name))(output) 386 | output = self.__getattr__('relu4_2_{}'.format(self.layer_name))(output) 387 | output = self.__getattr__('conv4_3_{}'.format(self.layer_name))(output) 388 | output = self.__getattr__('relu4_3_{}'.format(self.layer_name))(output) 389 | conv4_3 = self.__getattr__('conv4_3_norm_{}'.format(self.layer_name))(output) 390 | 391 | output = self.__getattr__('pool4_{}'.format(self.layer_name))(output) 392 | 393 | output = self.__getattr__('conv5_1_{}'.format(self.layer_name))(output) 394 | output = self.__getattr__('relu5_1_{}'.format(self.layer_name))(output) 395 | output = self.__getattr__('conv5_2_{}'.format(self.layer_name))(output) 396 | output = self.__getattr__('relu5_2_{}'.format(self.layer_name))(output) 397 | output = self.__getattr__('conv5_3_{}'.format(self.layer_name))(output) 398 | output = self.__getattr__('relu5_3_{}'.format(self.layer_name))(output) 399 | output = self.__getattr__('pool5_{}'.format(self.layer_name))(output) 400 | 401 | output = self.__getattr__('fc_conv6_{}'.format(self.layer_name))(output) 402 | output = self.__getattr__('fc_relu6_{}'.format(self.layer_name))(output) 403 | output = self.__getattr__('fc_conv7_{}'.format(self.layer_name))(output) 404 | output = self.__getattr__('fc_relu7_{}'.format(self.layer_name))(output) 405 | fc_conv7 = output 406 | 407 | output = self.__getattr__('conv6_1_{}'.format(self.layer_name))(output) 408 | output = self.__getattr__('relu6_1_{}'.format(self.layer_name))(output) 409 | output = self.__getattr__('conv6_2_{}'.format(self.layer_name))(output) 410 | output = self.__getattr__('relu6_2_{}'.format(self.layer_name))(output) 411 | conv6 = output 412 | 413 | output = 
self.__getattr__('conv7_1_{}'.format(self.layer_name))(output) 414 | output = self.__getattr__('relu7_1_{}'.format(self.layer_name))(output) 415 | output = self.__getattr__('conv7_2_{}'.format(self.layer_name))(output) 416 | output = self.__getattr__('relu7_2_{}'.format(self.layer_name))(output) 417 | conv7 = output 418 | 419 | output = self.__getattr__('conv8_1_{}'.format(self.layer_name))(output) 420 | output = self.__getattr__('relu8_1_{}'.format(self.layer_name))(output) 421 | output = self.__getattr__('conv8_2_{}'.format(self.layer_name))(output) 422 | output = self.__getattr__('relu8_2_{}'.format(self.layer_name))(output) 423 | conv8 = output 424 | 425 | output = self.__getattr__('conv9_1_{}'.format(self.layer_name))(output) 426 | output = self.__getattr__('relu9_1_{}'.format(self.layer_name))(output) 427 | output = self.__getattr__('conv9_2_{}'.format(self.layer_name))(output) 428 | output = self.__getattr__('relu9_2_{}'.format(self.layer_name))(output) 429 | conv9 = output 430 | return conv4_3, fc_conv7, conv6, conv7, conv8, conv9 431 | 432 | def get_loc_conf(self, conv4_3_data, fc_conv7_data, conv6_data, conv7_data, conv8_data, conv9_data): 433 | conv4_3_norm_localization = self.conv4_3_norm_loc_conv(conv4_3_data.cuda()) 434 | conv4_3_norm_localization = conv4_3_norm_localization.permute(0, 2, 3, 1).contiguous().view( 435 | conv4_3_data.shape[0], 436 | -1, 437 | self.k_frames * 4) 438 | conv4_3_norm_confidence = self.conv4_3_norm_conf_conv(conv4_3_data.cuda()) 439 | conv4_3_norm_confidence = conv4_3_norm_confidence.permute(0, 2, 3, 1).contiguous().view(conv4_3_data.shape[0], 440 | -1, 441 | self.num_classes) 442 | fc_conv7_localization = self.fc_conv7_loc_conv(fc_conv7_data.cuda()) 443 | fc_conv7_localization = fc_conv7_localization.permute(0, 2, 3, 1).contiguous().view(fc_conv7_data.shape[0], -1, 444 | self.k_frames * 4) 445 | 446 | fc_conv7_confidence = self.fc_conv7_conf_conv(fc_conv7_data.cuda()) 447 | fc_conv7_confidence = fc_conv7_confidence.permute(0, 2, 3, 1).contiguous().view(fc_conv7_data.shape[0], -1, 448 | self.num_classes) 449 | 450 | conv6_localization = self.conv6_loc_conv(conv6_data.cuda()) 451 | conv6_localization = conv6_localization.permute(0, 2, 3, 1).contiguous().view(conv6_data.shape[0], -1, 452 | self.k_frames * 4) 453 | conv6_confidence = self.conv6_conf_conv(conv6_data.cuda()) 454 | conv6_confidence = conv6_confidence.permute(0, 2, 3, 1).contiguous().view(conv6_data.shape[0], -1, 455 | self.num_classes) 456 | 457 | conv7_localization = self.conv7_loc_conv(conv7_data.cuda()) 458 | conv7_localization = conv7_localization.permute(0, 2, 3, 1).contiguous().view(conv7_data.shape[0], -1, 459 | self.k_frames * 4) 460 | conv7_confidence = self.conv7_conf_conv(conv7_data.cuda()) 461 | conv7_confidence = conv7_confidence.permute(0, 2, 3, 1).contiguous().view(conv7_data.shape[0], -1, 462 | self.num_classes) 463 | 464 | conv8_localization = self.conv8_loc_conv(conv8_data.cuda()) 465 | conv8_localization = conv8_localization.permute(0, 2, 3, 1).contiguous().view(conv8_data.shape[0], -1, 466 | self.k_frames * 4) 467 | conv8_confidence = self.conv8_conf_conv(conv8_data.cuda()) 468 | conv8_confidence = conv8_confidence.permute(0, 2, 3, 1).contiguous().view(conv8_data.shape[0], -1, 469 | self.num_classes) 470 | 471 | conv9_localization = self.conv9_loc_conv(conv9_data.cuda()) 472 | conv9_localization = conv9_localization.permute(0, 2, 3, 1).contiguous().view(conv9_data.shape[0], -1, 473 | self.k_frames * 4) 474 | conv9_confidence = self.conv9_conf_conv(conv9_data.cuda()) 
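# NOTE: each localization/confidence head emits a (N, C, H, W) map; the
# permute to (N, H, W, C) followed by the view flattens it to one row per
# prior box -- k_frames*4 regression offsets for the loc heads and
# num_classes scores for the conf heads -- in the same order as the prior
# tubes generated in utils/prior_tubes.py.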
475 | conv9_confidence = conv9_confidence.permute(0, 2, 3, 1).contiguous().view(conv9_data.shape[0], -1, 476 | self.num_classes) 477 | loc_preds = torch.cat( 478 | [conv4_3_norm_localization, fc_conv7_localization, conv6_localization, conv7_localization, 479 | conv8_localization, conv9_localization], dim=1) 480 | conf_preds = torch.cat([conv4_3_norm_confidence, fc_conv7_confidence, conv6_confidence, conv7_confidence, 481 | conv8_confidence, conv9_confidence], dim=1) 482 | return loc_preds, conf_preds 483 | 484 | def train(self, mode=True): 485 | super(SSD_NET, self).train(mode) 486 | for m in self.modules(): 487 | ps = list(m.parameters()) 488 | for p in ps: 489 | p.requires_grad = True 490 | self.conv4_3_norm_conf_conv.bias.data.fill_(0) 491 | self.conv4_3_norm_conf_conv.bias.requires_grad = False 492 | self.conv4_3_norm_loc_conv.bias.data.fill_(0) 493 | self.conv4_3_norm_loc_conv.bias.requires_grad = False 494 | if self.frezze_init: 495 | self.frezze_init_func(freeze_norm_layer=False) 496 | 497 | def eval(self): 498 | super(SSD_NET, self).eval() 499 | for m in self.modules(): 500 | ps = list(m.parameters()) 501 | for p in ps: 502 | p.requires_grad = False 503 | 504 | def get_optim_policies(self): 505 | parameters_list = [] 506 | for m in self.modules(): 507 | if isinstance(m, torch.nn.Conv2d): 508 | ps = list(m.parameters()) 509 | if ps[0].requires_grad: 510 | parameters_list.append(ps[0]) 511 | if ps[1].requires_grad: 512 | parameters_list.append(ps[1]) 513 | elif isinstance(m, scale_norm): 514 | ps = list(m.parameters()) 515 | if ps[0].requires_grad: 516 | parameters_list.append(ps[0]) 517 | return parameters_list 518 | 519 | def get_loc_conf_optim_policies(self): 520 | parameters_list = [] 521 | parameters_list.append(self.conv4_3_norm_loc_conv.weight) 522 | parameters_list.append(self.conv4_3_norm_conf_conv.weight) 523 | 524 | parameters_list.append(self.fc_conv7_loc_conv.weight) 525 | parameters_list.append(self.fc_conv7_loc_conv.bias) 526 | parameters_list.append(self.fc_conv7_conf_conv.weight) 527 | parameters_list.append(self.fc_conv7_conf_conv.bias) 528 | 529 | parameters_list.append(self.conv6_loc_conv.weight) 530 | parameters_list.append(self.conv6_loc_conv.bias) 531 | parameters_list.append(self.conv6_conf_conv.weight) 532 | parameters_list.append(self.conv6_conf_conv.bias) 533 | 534 | parameters_list.append(self.conv7_loc_conv.weight) 535 | parameters_list.append(self.conv7_loc_conv.bias) 536 | parameters_list.append(self.conv7_conf_conv.weight) 537 | parameters_list.append(self.conv7_conf_conv.bias) 538 | 539 | parameters_list.append(self.conv8_loc_conv.weight) 540 | parameters_list.append(self.conv8_loc_conv.bias) 541 | parameters_list.append(self.conv8_conf_conv.weight) 542 | parameters_list.append(self.conv8_conf_conv.bias) 543 | 544 | parameters_list.append(self.conv9_loc_conv.weight) 545 | parameters_list.append(self.conv9_loc_conv.bias) 546 | parameters_list.append(self.conv9_conf_conv.weight) 547 | parameters_list.append(self.conv9_conf_conv.bias) 548 | return parameters_list 549 | 550 | def get_vgg_optim_policies(self): 551 | parameters_list = [] 552 | conv = self.__getattr__('conv1_1_{}'.format(self.layer_name)) 553 | parameters_list.append(conv.weight) 554 | parameters_list.append(conv.bias) 555 | conv = self.__getattr__('conv1_2_{}'.format(self.layer_name)) 556 | parameters_list.append(conv.weight) 557 | parameters_list.append(conv.bias) 558 | conv = self.__getattr__('conv2_1_{}'.format(self.layer_name)) 559 | parameters_list.append(conv.weight) 560 | 
parameters_list.append(conv.bias) 561 | conv = self.__getattr__('conv2_2_{}'.format(self.layer_name)) 562 | parameters_list.append(conv.weight) 563 | parameters_list.append(conv.bias) 564 | conv = self.__getattr__('conv3_1_{}'.format(self.layer_name)) 565 | parameters_list.append(conv.weight) 566 | parameters_list.append(conv.bias) 567 | conv = self.__getattr__('conv3_2_{}'.format(self.layer_name)) 568 | parameters_list.append(conv.weight) 569 | parameters_list.append(conv.bias) 570 | conv = self.__getattr__('conv3_3_{}'.format(self.layer_name)) 571 | parameters_list.append(conv.weight) 572 | parameters_list.append(conv.bias) 573 | conv = self.__getattr__('conv4_1_{}'.format(self.layer_name)) 574 | parameters_list.append(conv.weight) 575 | parameters_list.append(conv.bias) 576 | conv = self.__getattr__('conv4_2_{}'.format(self.layer_name)) 577 | parameters_list.append(conv.weight) 578 | parameters_list.append(conv.bias) 579 | conv = self.__getattr__('conv4_3_{}'.format(self.layer_name)) 580 | parameters_list.append(conv.weight) 581 | parameters_list.append(conv.bias) 582 | conv = self.__getattr__('conv4_3_norm_{}'.format(self.layer_name)) 583 | parameters_list.append(conv.weight) 584 | parameters_list.append(conv.bias) 585 | conv = self.__getattr__('conv5_1_{}'.format(self.layer_name)) 586 | parameters_list.append(conv.weight) 587 | parameters_list.append(conv.bias) 588 | conv = self.__getattr__('conv5_2_{}'.format(self.layer_name)) 589 | parameters_list.append(conv.weight) 590 | parameters_list.append(conv.bias) 591 | conv = self.__getattr__('conv5_3_{}'.format(self.layer_name)) 592 | parameters_list.append(conv.weight) 593 | parameters_list.append(conv.bias) 594 | return parameters_list 595 | 596 | def get_ssd_optim_policies(self): 597 | parameters_list = [] 598 | conv = self.__getattr__('fc_conv6_{}'.format(self.layer_name)) 599 | parameters_list.append(conv.weight) 600 | parameters_list.append(conv.bias) 601 | conv = self.__getattr__('fc_conv7_{}'.format(self.layer_name)) 602 | parameters_list.append(conv.weight) 603 | parameters_list.append(conv.bias) 604 | conv = self.__getattr__('conv6_1_{}'.format(self.layer_name)) 605 | parameters_list.append(conv.weight) 606 | parameters_list.append(conv.bias) 607 | conv = self.__getattr__('conv6_2_{}'.format(self.layer_name)) 608 | parameters_list.append(conv.weight) 609 | parameters_list.append(conv.bias) 610 | conv = self.__getattr__('conv7_1_{}'.format(self.layer_name)) 611 | parameters_list.append(conv.weight) 612 | parameters_list.append(conv.bias) 613 | conv = self.__getattr__('conv7_2_{}'.format(self.layer_name)) 614 | parameters_list.append(conv.weight) 615 | parameters_list.append(conv.bias) 616 | conv = self.__getattr__('conv8_1_{}'.format(self.layer_name)) 617 | parameters_list.append(conv.weight) 618 | parameters_list.append(conv.bias) 619 | conv = self.__getattr__('conv8_2_{}'.format(self.layer_name)) 620 | parameters_list.append(conv.weight) 621 | parameters_list.append(conv.bias) 622 | conv = self.__getattr__('conv9_1_{}'.format(self.layer_name)) 623 | parameters_list.append(conv.weight) 624 | parameters_list.append(conv.bias) 625 | conv = self.__getattr__('conv9_2_{}'.format(self.layer_name)) 626 | parameters_list.append(conv.weight) 627 | parameters_list.append(conv.bias) 628 | return parameters_list 629 | 630 | def frezze_init_func(self, freeze_norm_layer=False): 631 | m = self.__getattr__('conv1_1_{}'.format(self.layer_name)) 632 | m.eval() 633 | m.weight.requires_grad = False 634 | m.bias.requires_grad = False 635 | m = 
self.__getattr__('conv1_2_{}'.format(self.layer_name)) 636 | m.eval() 637 | m.weight.requires_grad = False 638 | m.bias.requires_grad = False 639 | m = self.__getattr__('conv2_1_{}'.format(self.layer_name)) 640 | m.eval() 641 | m.weight.requires_grad = False 642 | m.bias.requires_grad = False 643 | m = self.__getattr__('conv2_2_{}'.format(self.layer_name)) 644 | m.eval() 645 | m.weight.requires_grad = False 646 | m.bias.requires_grad = False 647 | m = self.__getattr__('conv3_1_{}'.format(self.layer_name)) 648 | m.eval() 649 | m.weight.requires_grad = False 650 | m.bias.requires_grad = False 651 | m = self.__getattr__('conv3_2_{}'.format(self.layer_name)) 652 | m.eval() 653 | m.weight.requires_grad = False 654 | m.bias.requires_grad = False 655 | m = self.__getattr__('conv3_3_{}'.format(self.layer_name)) 656 | m.eval() 657 | m.weight.requires_grad = False 658 | m.bias.requires_grad = False 659 | m = self.__getattr__('conv4_1_{}'.format(self.layer_name)) 660 | m.eval() 661 | m.weight.requires_grad = False 662 | m.bias.requires_grad = False 663 | m = self.__getattr__('conv4_2_{}'.format(self.layer_name)) 664 | m.eval() 665 | m.weight.requires_grad = False 666 | m.bias.requires_grad = False 667 | m = self.__getattr__('conv4_3_{}'.format(self.layer_name)) 668 | m.eval() 669 | m.weight.requires_grad = False 670 | m.bias.requires_grad = False 671 | 672 | m = self.__getattr__('conv5_1_{}'.format(self.layer_name)) 673 | m.eval() 674 | m.weight.requires_grad = False 675 | m.bias.requires_grad = False 676 | m = self.__getattr__('conv5_2_{}'.format(self.layer_name)) 677 | m.eval() 678 | m.weight.requires_grad = False 679 | m.bias.requires_grad = False 680 | m = self.__getattr__('conv5_3_{}'.format(self.layer_name)) 681 | m.eval() 682 | m.weight.requires_grad = False 683 | m.bias.requires_grad = False 684 | m = self.__getattr__('fc_conv6_{}'.format(self.layer_name)) 685 | m.eval() 686 | m.weight.requires_grad = False 687 | m.bias.requires_grad = False 688 | m = self.__getattr__('fc_conv7_{}'.format(self.layer_name)) 689 | m.eval() 690 | m.weight.requires_grad = False 691 | m.bias.requires_grad = False 692 | if freeze_norm_layer: 693 | m = self.__getattr__('conv4_3_norm_{}'.format(self.layer_name)) 694 | m.eval() 695 | m.scale.requires_grad = False 696 | 697 | def freeze_ssd(self, freeze_norm_layer): 698 | self.frezze_vgg(freeze_norm_layer) 699 | m = self.__getattr__('fc_conv6_{}'.format(self.layer_name)) 700 | m.eval() 701 | m.weight.requires_grad = False 702 | m.bias.requires_grad = False 703 | m = self.__getattr__('fc_conv7_{}'.format(self.layer_name)) 704 | m.eval() 705 | m.weight.requires_grad = False 706 | m.bias.requires_grad = False 707 | m = self.__getattr__('conv6_1_{}'.format(self.layer_name)) 708 | m.eval() 709 | m.weight.requires_grad = False 710 | m.bias.requires_grad = False 711 | m = self.__getattr__('conv6_2_{}'.format(self.layer_name)) 712 | m.eval() 713 | m.weight.requires_grad = False 714 | m.bias.requires_grad = False 715 | 716 | m = self.__getattr__('conv7_1_{}'.format(self.layer_name)) 717 | m.eval() 718 | m.weight.requires_grad = False 719 | m.bias.requires_grad = False 720 | m = self.__getattr__('conv7_2_{}'.format(self.layer_name)) 721 | m.eval() 722 | m.weight.requires_grad = False 723 | m.bias.requires_grad = False 724 | 725 | m = self.__getattr__('conv8_1_{}'.format(self.layer_name)) 726 | m.eval() 727 | m.weight.requires_grad = False 728 | m.bias.requires_grad = False 729 | m = self.__getattr__('conv8_2_{}'.format(self.layer_name)) 730 | m.eval() 731 | 
m.weight.requires_grad = False 732 | m.bias.requires_grad = False 733 | 734 | m = self.__getattr__('conv9_1_{}'.format(self.layer_name)) 735 | m.eval() 736 | m.weight.requires_grad = False 737 | m.bias.requires_grad = False 738 | m = self.__getattr__('conv9_2_{}'.format(self.layer_name)) 739 | m.eval() 740 | m.weight.requires_grad = False 741 | m.bias.requires_grad = False 742 | 743 | 744 | def vgg(cfg, i, batch_norm=False): 745 | layers = [] 746 | in_channels = i 747 | for v in cfg: 748 | if v == 'M': 749 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 750 | elif v == 'C': 751 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 752 | else: 753 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 754 | if batch_norm: 755 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 756 | else: 757 | layers += [conv2d, nn.ReLU(inplace=True)] 758 | in_channels = v 759 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 760 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 761 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 762 | layers += [pool5, conv6, 763 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 764 | return layers 765 | 766 | if __name__ == '__main__': 767 | # with open('/media/main/_sdc/qzw/ACT-Detector/my-act-detector/caffe-models/initialization_VGG_ILSVRC16_K6_RGB.pkl', 'rb') as f: 768 | # initialization_dict = pickle.load(f) 769 | # f.close() 770 | # rgb_net = SSD_NET(num_classes=25, rgb=True) 771 | 772 | # rgb_net.load_init_weights( 773 | # './caffe-models/UCF101v2/RGB-UCF101v2-numpy.pkl') 774 | # torch.save(rgb_net.state_dict(), 'RGB-UCF101v2-pytorch.pkl') 775 | # print("RGB OK!!!") 776 | 777 | flow_net = SSD_NET(num_classes=25, rgb=False) 778 | flow_net.load_init_weights( 779 | './caffe-models/UCF101v2/FLOW5-UCF101v2-numpy.pkl') 780 | torch.save(flow_net.state_dict(), 'FLOW5-UCF101v2-pytorch.pkl') 781 | print("FLOW5 OK!!!") 782 | 783 | 784 | 785 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from utils import prior_tubes 4 | from utils import map_eval 5 | from data import dataset 6 | import multiprocessing 7 | import time 8 | import torch.nn.functional as F 9 | import pickle 10 | 11 | 12 | def data_handle_and_save_process(all_frame_boxes_dict, video_index, conf_preds, decode_video_tubes, num_class, 13 | sequence_length, height, width): 14 | all_frame_boxes_list = [] 15 | start_frame = 0 16 | frame_boxes = {} 17 | for batch in range(conf_preds.shape[0]): 18 | start_frame += 1 19 | nms_tubes_blist, nms_scores, nms_label_list = prior_tubes.apply_nms(conf_preds[batch, :], 20 | decode_video_tubes[batch, :], 21 | nms_threshold=0.45, 22 | num_class=num_class) 23 | if nms_scores.__len__() > 0: 24 | tt1 = (torch.Tensor(nms_label_list) - 1).view(-1, 1).numpy() 25 | tt2 = torch.Tensor(nms_scores).view(-1, 1).numpy() 26 | tt3 = np.vstack([tt.view(1, -1).cpu().numpy() for tt in nms_tubes_blist]) 27 | best_tube = np.hstack([tt1, tt2, tt3]) 28 | else: 29 | best_tube = np.array([]) 30 | for m in range(best_tube.shape[0]): 31 | for n in range(sequence_length): 32 | if (n + start_frame) not in frame_boxes: 33 | frame_boxes[n + start_frame] = [] 34 | frame_boxes[n + start_frame] += [ 35 | best_tube[m, np.array([0, 1, 2 + 4 * n, 3 + 4 * n, 4 + 4 * n, 5 + 4 * n])]] 36 | # print("video:{}/{}ok!".format(video_index, eval_dataset.__len__()), "\ttime:", time.time() - 
time_start, 37 | # "frame:{}".format(nframes))
38 | for frame_index in frame_boxes:
39 | frame_label = {} # all boxes on the current frame, grouped by label
40 | for bb in frame_boxes[frame_index]:
41 | if bb[0] not in frame_label:
42 | frame_label[bb[0]] = []
43 | frame_label[bb[0]] += [bb[1:]]
44 | for tt in frame_label:
45 | idx = map_eval.nms_class(frame_label[tt], nms_threshold=0.3)
46 | for id in idx:
47 | all_frame_boxes_list += [np.hstack([np.array([video_index, frame_index, tt]),
48 | frame_label[tt][id] * np.array([1, width, height, width, height])])]
49 | all_frame_boxes_dict[video_index] = all_frame_boxes_list
50 | print("video_index:{} OK!!".format(video_index))
51 | 
52 | 
53 | def eval_rgb_or_flow(model, eval_dataset, eval_dataloader, args, GEN_NUM):
54 | if args.dataset == 'UCF101v2':
55 | num_class = 25
56 | elif args.dataset == 'UCFSports':
57 | num_class = 11
58 | else:
59 | num_class = 0
60 | print("Unknown dataset: {}".format(args.dataset))
61 | exit(0)
62 | rgb = args.modality == 'rgb'
63 | use_gpu = args.use_gpu
64 | variance = args.variance
65 | # model = ssd_net_ucf101.SSD_NET(dataset=args.dataset, num_classes=num_class, modality=args.modality)
66 | # if args.reinit_all:
67 | # print("reinit all data!!!")
68 | # # model.load_trained_weights('/home/qzw/code/my-act-detector/caffe-models/UCFSports/FLOW5-UCFSports.pkl')
69 | # # pytorch_model = '/home/qzw/code/my-act-detector/pytorch-models/{}/{}-trained-model-{}-pytorch-single.pkl'.format(args.dataset, args.dataset, args.modality)
70 | # # model.load_state_dict(torch.load(pytorch_model))
71 | # # GEN_NUM = 0
72 | # pytorch_model = '/home/qzw/code/my-act-detector-12-13/my_trained_pytorch_model_{}-{}.pkl'.format(args.dataset, args.modality)
73 | # data_dict = torch.load(pytorch_model)
74 | # GEN_NUM = data_dict['gen_num']
75 | # net_state_dict = {}
76 | # for key in data_dict['net_state_dict']:
77 | # if 'module.' in key:
78 | # new_key = key.replace('module.', '')
79 | # else:
80 | # new_key = key
81 | # net_state_dict[new_key] = data_dict['net_state_dict'][key]
82 | # model.load_state_dict(net_state_dict)
83 | # if use_gpu:
84 | # model.cuda()
85 | # # model = torch.nn.DataParallel(model).cuda()
86 | model.eval()
87 | eval_dataset = dataset.TubeDataset(args.dataset, data_path=args.data_path, phase='eval',
88 | modality=args.modality, sequence_length=6)
89 | eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=1, shuffle=False,
90 | num_workers=8, pin_memory=True)
91 | tubes_init = prior_tubes.RGB_TUBES(phase='eval', use_gpu=use_gpu, variance=variance, sequence_length=6)
92 | manager = multiprocessing.Manager()
93 | all_frame_boxes_dict = manager.dict()
94 | pool = multiprocessing.Pool(processes=16)
95 | resolution = eval_dataset.get_resolution()
96 | start_time = time.time()
97 | nframes_sum = 0
98 | for i, (input, target) in enumerate(eval_dataloader):
99 | video_index = i + 1
100 | nframes = int(input.shape[1] / 3)
101 | print("GEN_NUM:{} video_index:{}/{} start!! 
frame num:{} nframes_sum:{} fps:{}".format(GEN_NUM, video_index, 102 | eval_dataset.__len__(), 103 | nframes, nframes_sum, 104 | nframes_sum / ( 105 | time.time() - start_time))) 106 | nframes_sum += nframes 107 | height, width = resolution[eval_dataset.videos_list[int(target[0, 0])]] 108 | if use_gpu: 109 | input = input.cuda() 110 | d36_dict = {} 111 | d36_dict['conv4_3'] = [0] 112 | d36_dict['fc_conv7'] = [0] 113 | d36_dict['conv6'] = [0] 114 | d36_dict['conv7'] = [0] 115 | d36_dict['conv8'] = [0] 116 | d36_dict['conv9'] = [0] 117 | conf_preds_list = [] 118 | decode_video_tubes_list = [] 119 | for d in range(1, args.sequence_length - 1): 120 | conv4_3_d36, fc_conv7_d36, conv6_d36, conv7_d36, conv8_d36, conv9_d36 = model.get_feature_map( 121 | input[0, 3 * d:3 * (1 + d), :, :].unsqueeze(0), (36, 36)) 122 | d36_dict['conv4_3'] += [conv4_3_d36] 123 | d36_dict['fc_conv7'] += [fc_conv7_d36] 124 | d36_dict['conv6'] += [conv6_d36] 125 | d36_dict['conv7'] += [conv7_d36] 126 | d36_dict['conv8'] += [conv8_d36] 127 | d36_dict['conv9'] += [conv9_d36] 128 | 129 | for frame_index in range(nframes - args.sequence_length + 1): 130 | if rgb: 131 | conv4_3_d6, fc_conv7_d6, conv6_d6, conv7_d6, conv8_d6, conv9_d6 = model.get_feature_map( 132 | input[0, 3 * frame_index:3 * (frame_index + 1), :, :].unsqueeze(0), (6, 6)) 133 | conv4_3_d36, fc_conv7_d36, conv6_d36, conv7_d36, conv8_d36, conv9_d36 = model.get_feature_map( 134 | input[0, 3 * (frame_index + args.sequence_length - 1):3 * (frame_index + args.sequence_length), :, 135 | :].unsqueeze(0), (36, 36)) 136 | else: 137 | conv4_3_d6, fc_conv7_d6, conv6_d6, conv7_d6, conv8_d6, conv9_d6 = model.get_feature_map( 138 | input[0, 3 * frame_index:3 * (frame_index + args.sequence_length - 1), :, :].unsqueeze(0), (6, 6)) 139 | conv4_3_d36, fc_conv7_d36, conv6_d36, conv7_d36, conv8_d36, conv9_d36 = model.get_feature_map( 140 | torch.cat([input[0, 3 * min(frame_index + args.sequence_length - 1 + ff, nframes - 1):3 * ( 141 | min(frame_index + args.sequence_length - 1 + ff, nframes - 1) + 1), :, :].unsqueeze(0) for 142 | ff in range(args.sequence_length - 1)], dim=1), (36, 36)) 143 | d36_dict['conv4_3'] += [conv4_3_d36] 144 | d36_dict['fc_conv7'] += [fc_conv7_d36] 145 | d36_dict['conv6'] += [conv6_d36] 146 | d36_dict['conv7'] += [conv7_d36] 147 | d36_dict['conv8'] += [conv8_d36] 148 | d36_dict['conv9'] += [conv9_d36] 149 | d36_dict['conv4_3'][frame_index] = 0 150 | d36_dict['fc_conv7'][frame_index] = 0 151 | d36_dict['conv6'][frame_index] = 0 152 | d36_dict['conv7'][frame_index] = 0 153 | d36_dict['conv8'][frame_index] = 0 154 | d36_dict['conv9'][frame_index] = 0 155 | conv4_3_data = torch.cat([conv4_3_d6] + [d36_dict['conv4_3'][ff] for ff in 156 | range(frame_index + 1, frame_index + args.sequence_length)], 157 | dim=1) 158 | fc_conv7_data = torch.cat([fc_conv7_d6] + [d36_dict['fc_conv7'][ff] for ff in 159 | range(frame_index + 1, frame_index + args.sequence_length)], 160 | dim=1) 161 | conv6_data = torch.cat([conv6_d6] + [d36_dict['conv6'][ff] for ff in 162 | range(frame_index + 1, frame_index + args.sequence_length)], 163 | dim=1) 164 | conv7_data = torch.cat([conv7_d6] + [d36_dict['conv7'][ff] for ff in 165 | range(frame_index + 1, frame_index + args.sequence_length)], 166 | dim=1) 167 | conv8_data = torch.cat([conv8_d6] + [d36_dict['conv8'][ff] for ff in 168 | range(frame_index + 1, frame_index + args.sequence_length)], 169 | dim=1) 170 | conv9_data = torch.cat([conv9_d6] + [d36_dict['conv9'][ff] for ff in 171 | range(frame_index + 1, frame_index + 
args.sequence_length)], 172 | dim=1) 173 | loc_preds, conf_preds = model.get_loc_conf(conv4_3_data, fc_conv7_data, conv6_data, conv7_data, 174 | conv8_data, conv9_data) 175 | conf_preds = F.softmax(conf_preds, dim=-1) 176 | decode_video_tubes = prior_tubes.decode_tubes(tubes_init, loc_preds) 177 | conf_preds_list += [conf_preds.cpu()] 178 | decode_video_tubes_list += [decode_video_tubes.cpu()] 179 | conf_preds = torch.cat(conf_preds_list, dim=0) 180 | decode_video_tubes = torch.cat(decode_video_tubes_list, dim=0) 181 | # data_handle_and_save_process(all_frame_boxes_dict, video_index, conf_preds, decode_video_tubes, num_class, 182 | # args.sequence_length, height, width) 183 | pool.apply_async(data_handle_and_save_process, (all_frame_boxes_dict, video_index, conf_preds, 184 | decode_video_tubes, num_class, args.sequence_length, 185 | height, width, )) 186 | print("waiting calc!!") 187 | pool.close() 188 | pool.join() 189 | print("all ok!!") 190 | all_frame_boxes_list = [] 191 | for key in all_frame_boxes_dict: 192 | all_frame_boxes_list += all_frame_boxes_dict[key] 193 | with open(args.all_frame_boxes_list_result, "wb") as file: 194 | pickle.dump(all_frame_boxes_list, file) 195 | return map_eval.calc_pr(all_frame_boxes_list, eval_dataset) 196 | 197 | 198 | if __name__ == '__main__': 199 | import config 200 | from layers import ssd 201 | args = config.Config() 202 | if args.dataset == 'UCF101v2': 203 | num_class = 25 204 | elif args.dataset == 'UCFSports': 205 | num_class = 11 206 | else: 207 | num_class = 0 208 | print("No dataset name {}".format(args.dataset)) 209 | exit(0) 210 | eval_net = ssd.SSD_NET(dataset=args.dataset, frezze_init=args.freeze_init, num_classes=num_class, 211 | modality=args.modality) 212 | data_dict = torch.load("/mnt/data/qzw/model/pytorch-act-detector/{}/best-rgb-0.8601.pkl".format(args.dataset)) 213 | net_state_dict = {} 214 | for key in data_dict['net_state_dict']: 215 | if 'module.' 
in key: 216 | new_key = key.replace('module.', '') 217 | else: 218 | new_key = key 219 | net_state_dict[new_key] = data_dict['net_state_dict'][key] 220 | eval_net.load_state_dict(net_state_dict) 221 | if args.use_gpu: 222 | eval_net = eval_net.cuda() 223 | mmap = eval_rgb_or_flow(model=eval_net, eval_dataset=None, eval_dataloader=None, args=args, 224 | GEN_NUM=data_dict['gen_num']) 225 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import config 2 | from layers import act_cuboid_loss 3 | from data import dataset 4 | from layers import ssd 5 | import torch 6 | import time 7 | import torch.optim.lr_scheduler as lr_scheduler 8 | import test 9 | 10 | 11 | def main(): 12 | args = config.Config() 13 | train(args) 14 | exit(0) 15 | 16 | 17 | def train(args): 18 | use_gpu = args.use_gpu 19 | if args.dataset == 'UCF101v2': 20 | num_class = 25 21 | elif args.dataset == 'UCFSports': 22 | num_class = 11 23 | else: 24 | num_class = 0 25 | print("No dataset name {}".format(args.dataset)) 26 | exit(0) 27 | variance = args.variance 28 | MAX_GEN = args.epochs 29 | k_frames = args.sequence_length 30 | print("train batch size:", args.train_batch_size, 'lr', args.lr) 31 | train_net = ssd.SSD_NET(dataset=args.dataset, frezze_init=args.freeze_init, num_classes=num_class, modality=args.modality) 32 | if args.reinit_all: 33 | print("reinit all data!!!") 34 | start_gen = 0 35 | train_net.load_state_dict( 36 | torch.load(args.init_model)) 37 | train_net.train(True) 38 | optimizer = torch.optim.SGD(train_net.get_optim_policies(), 39 | lr=args.lr, 40 | momentum=args.momentum, 41 | weight_decay=args.weight_decay) 42 | scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.94) 43 | loss_class_list = [] 44 | loss_loc_list = [] 45 | loss_list = [] 46 | else: 47 | print("load last train data!!!") 48 | data_dict = torch.load(args.new_trained_model) 49 | start_gen = data_dict['gen_num'] 50 | # start_gen = 0 51 | net_state_dict = {} 52 | for key in data_dict['net_state_dict']: 53 | if 'module.' 
in key: 54 | new_key = key.replace('module.', '') 55 | else: 56 | new_key = key 57 | net_state_dict[new_key] = data_dict['net_state_dict'][key] 58 | train_net.load_state_dict(net_state_dict) 59 | train_net.train(True) 60 | optimizer = torch.optim.SGD(train_net.get_optim_policies(), 61 | lr=args.lr, 62 | momentum=args.momentum, 63 | weight_decay=args.weight_decay) 64 | # optimizer.load_state_dict(data_dict['optimizer']) 65 | for group in optimizer.param_groups: 66 | if 'initial_lr' not in group: 67 | group['initial_lr'] = args.lr 68 | # optimizer.defaults['lr'] = args.lr 69 | scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.94, last_epoch=start_gen - 1) 70 | loss_class_list = data_dict['loss_class_list'] 71 | loss_loc_list = data_dict['loss_loc_list'] 72 | loss_list = data_dict['loss_list'] 73 | print("last data: GEN:", start_gen, "\tloss loc:", loss_loc_list[-1], "\tloss conf:", loss_class_list[-1], 74 | "\tloss:", loss_list[-1], 75 | "\tlr:", scheduler.get_lr()) 76 | 77 | if use_gpu: 78 | train_net = torch.nn.DataParallel(train_net).cuda() 79 | print('all net loaded ok!!!') 80 | train_dataset = dataset.TubeDataset(args.dataset, data_path=args.data_path, phase='train', 81 | modality=args.modality, 82 | sequence_length=6) 83 | dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True, 84 | num_workers=args.workers, pin_memory=True) 85 | criterion = act_cuboid_loss.CuboidLoss(use_gpu, variance, num_class, k_frames) 86 | mmap_best = 0 87 | mmap_list = [] 88 | if args.reinit_all: 89 | warm_up(train_net, dataloader, train_dataset, criterion, optimizer, scheduler, use_gpu, args, loss_loc_list, 90 | loss_class_list, loss_list) 91 | for gen in range(start_gen, MAX_GEN): 92 | train_epoch(train_net, dataloader, train_dataset, criterion, optimizer, scheduler, use_gpu, args, gen, 93 | loss_loc_list, loss_class_list, loss_list) 94 | if (gen + 1) % 1 == 0: 95 | temp_dict = {} 96 | temp_dict['net_state_dict'] = train_net.module.state_dict() 97 | temp_dict['gen_num'] = gen + 1 98 | temp_dict['optimizer'] = optimizer.state_dict() 99 | temp_dict['loss_loc_list'] = loss_loc_list 100 | temp_dict['loss_class_list'] = loss_class_list 101 | temp_dict['loss_list'] = loss_list 102 | temp_dict['mmap_list'] = mmap_list 103 | torch.save(temp_dict, args.new_trained_model) 104 | print("net save ok!!") 105 | if loss_list[-1] < 1.0: 106 | mmap = test.eval_rgb_or_flow(model=train_net.module, eval_dataset=None, eval_dataloader=None, args=args, 107 | GEN_NUM=gen + 1) 108 | with open('./train_log_{}.txt'.format(args.dataset), 'a') as train_log: 109 | log = "GEN:{}".format(gen) + "\tmap:{}".format(mmap) + "\tbest map:{}\n".format(mmap_best) 110 | train_log.write(log) 111 | train_net.module.train(True) 112 | mmap_list += [mmap] 113 | if mmap > mmap_best: 114 | mmap_best = mmap 115 | temp_dict['mmap_best'] = mmap_best 116 | torch.save(temp_dict, args.best_trained_model % mmap_best) 117 | print("current map:{} best map:{}, best model saved ok!".format(mmap, mmap_best)) 118 | 119 | 120 | def train_epoch(train_net, dataloader, train_dataset, criterion, optimizer, scheduler, use_gpu, args, gen, loss_loc_list, 121 | loss_class_list, loss_list, warm_up_lr_inc=None): 122 | total_loss = AverageMeter() 123 | loss_ls = AverageMeter() 124 | loss_cs = AverageMeter() 125 | if warm_up_lr_inc is None: 126 | scheduler.step() 127 | total_loss.reset() 128 | loss_ls.reset() 129 | loss_cs.reset() 130 | for i, (input, target) in enumerate(dataloader): 131 | # st = time.time() 132 | if 
warm_up_lr_inc is not None: 133 | for lr in range(len(optimizer.param_groups)): 134 | optimizer.param_groups[lr]['lr'] += warm_up_lr_inc[lr] 135 | if use_gpu: 136 | input = input.cuda() 137 | target = target.cuda() 138 | loc_preds, conf_preds = train_net(input) 139 | loss_l, loss_c = criterion((loc_preds, conf_preds), target) 140 | loss = loss_l + loss_c 141 | optimizer.zero_grad() 142 | loss.backward() 143 | optimizer.step() 144 | if use_gpu: 145 | total_loss.update(loss.cpu().detach().numpy()) 146 | loss_ls.update(loss_l.cpu().detach().numpy()) 147 | loss_cs.update(loss_c.cpu().detach().numpy()) 148 | else: 149 | total_loss.update(loss.detach().numpy()) 150 | loss_ls.update(loss_l.detach().numpy()) 151 | loss_cs.update(loss_c.detach().numpy()) 152 | # print("{}s one batch".format(time.time() - st)) 153 | if (i+1) % 100 == 0: 154 | print("GEN:", gen, "\tnum:{}/{}".format((i + 1) * args.train_batch_size, train_dataset.__len__()), 155 | "\tloss loc:", loss_ls.avg, "\tloss conf:", loss_cs.avg, "\tloss:", total_loss.avg, 156 | "\tlr:", scheduler.get_lr(), time.strftime('\t%m/%d %H:%M:%S', time.localtime(time.time()))) 157 | print("\tloss loc:", loss_ls.avg, "\tloss conf:", loss_cs.avg, "\tloss:", total_loss.avg) 158 | with open('./train_log_{}.txt'.format(args.dataset), 'a') as train_log: 159 | log = "GEN:{}".format(gen) + "\tloss loc:{}".format(loss_ls.avg) + "\tloss conf:{}".format(loss_cs.avg) + \ 160 | "\tloss:{}".format(total_loss.avg) + "\tlr:{}".format(scheduler.get_lr()) + time.strftime( 161 | '\t%m/%d %H:%M:%S\n', time.localtime(time.time())) 162 | train_log.write(log) 163 | loss_loc_list += [loss_ls.avg] 164 | loss_class_list += [loss_cs.avg] 165 | loss_list += [total_loss.avg] 166 | 167 | 168 | def warm_up(train_net, dataloader, train_dataset, criterion, optimizer, scheduler, use_gpu, args, loss_loc_list, 169 | loss_class_list, loss_list): 170 | warm_up_ratio = args.warm_up_ratio 171 | warm_up_epoch = args.warm_up_epoch 172 | lr_inc = [] 173 | for i in range(len(optimizer.param_groups)): 174 | lr_inc.append(optimizer.param_groups[i]['lr'] * (1 - warm_up_ratio) 175 | / (len(dataloader) * warm_up_epoch)) 176 | optimizer.param_groups[i]['lr'] *= warm_up_ratio 177 | for warm_up_index in range(warm_up_epoch): 178 | train_epoch(train_net, dataloader, train_dataset, criterion, optimizer, scheduler, use_gpu, args, 179 | warm_up_index - warm_up_epoch, loss_loc_list, loss_class_list, loss_list, warm_up_lr_inc=lr_inc) 180 | 181 | 182 | class AverageMeter(object): 183 | """Computes and stores the average and current value""" 184 | 185 | def __init__(self): 186 | self.reset() 187 | 188 | def reset(self): 189 | self.val = 0 190 | self.avg = 0 191 | self.sum = 0 192 | self.count = 0 193 | 194 | def update(self, val, n=1): 195 | self.val = val 196 | self.sum += val * n 197 | self.count += n 198 | self.avg = self.sum / self.count 199 | 200 | 201 | if __name__ == '__main__': 202 | main() 203 | # image_test_from_file('./ucf101_test.pkl') 204 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import box 2 | from . import map_eval 3 | from . 
import prior_tubes 4 | 
-------------------------------------------------------------------------------- /utils/act_tubes.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import pickle
3 | import tube_dataset  # NOTE: kept from the original ACT-detector code; no tube_dataset module ships with this repo
4 | import ACT_utils  # NOTE: kept from the original ACT-detector code; not present in this repo
5 | from . import box
6 | from . import prior_tubes
7 | 
8 | 
9 | def nms_tublets(tubes_conf, decode_tubes, nms_threshold=0.45, top_k=400):
10 | cnt = 0
11 | # tubes_conf = tubes_conf.squeeze().detach().numpy()
12 | # decode_tubes = decode_tubes.reshape(-1, 6, 4)
13 | # max_class = np.max(tubes_conf[:, 1:], axis=1)
14 | class_index = np.argsort(-tubes_conf)
15 | class_nms_index_list = [class_index[0]]
16 | for index in class_index[1:]:
17 | keep = True
18 | for _max_index in class_nms_index_list:
19 | if prior_tubes.jaccard_overlap_tubes(decode_tubes[index, :], decode_tubes[_max_index, :]) > nms_threshold:
20 | keep = False
21 | break
22 | if keep:
23 | class_nms_index_list += [index]
24 | cnt += 1
25 | if cnt >= top_k:
26 | break
27 | return np.array(class_nms_index_list)[:top_k]
28 | 
29 | 
30 | def nms_tublets_caffe(tubes_conf, decode_tubes, nms_threshold=0.45, top_k=400, K=6):
31 | counter = 0
32 | x1 = [decode_tubes[:, i, 0] for i in range(K)]
33 | y1 = [decode_tubes[:, i, 1] for i in range(K)]
34 | x2 = [decode_tubes[:, i, 2] for i in range(K)]
35 | y2 = [decode_tubes[:, i, 3] for i in range(K)]
36 | dets = tubes_conf
37 | area = [(x2[k] - x1[k]) * (y2[k] - y1[k]) for k in range(K)]
38 | I = np.argsort(dets)
39 | indices = np.empty(top_k, dtype=np.int32)
40 | 
41 | while I.size > 0:
42 | i = I[-1]
43 | indices[counter] = i
44 | counter += 1
45 | 
46 | # Compute overlap
47 | xx1 = [np.maximum(x1[k][i], x1[k][I[:-1]]) for k in range(K)]
48 | yy1 = [np.maximum(y1[k][i], y1[k][I[:-1]]) for k in range(K)]
49 | xx2 = [np.minimum(x2[k][i], x2[k][I[:-1]]) for k in range(K)]
50 | yy2 = [np.minimum(y2[k][i], y2[k][I[:-1]]) for k in range(K)]
51 | 
52 | w = [np.maximum(0, xx2[k] - xx1[k]) for k in range(K)]
53 | h = [np.maximum(0, yy2[k] - yy1[k]) for k in range(K)]
54 | 
55 | inter_area = [w[k] * h[k] for k in range(K)]
56 | ious = sum([inter_area[k] / (area[k][I[:-1]] + area[k][i] - inter_area[k]) for k in range(K)])
57 | 
58 | I = I[np.where(ious <= nms_threshold * K)[0]]
59 | 
60 | if counter == top_k: break
61 | 
62 | return indices[:counter]
63 | 
64 | 
65 | if __name__ == '__main__':
66 | # NOTE: build_tubes is not defined anywhere in this repository; this block is
67 | # kept from the original ACT-detector code and will not run as-is.
68 | data_cache = tube_dataset.TubeDataset('UCFSports')
69 | build_tubes(data_cache, K=6)
70 | 
-------------------------------------------------------------------------------- /utils/box.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def jaccard_overlap_boxes(box1, box2):
4 | # box.shape=[xmin,ymin,xmax,ymax]
5 | if box1[0] > box2[2] or box1[2] < box2[0] or box1[1] > box2[3] or box1[3] < box2[1]:
6 | return 0.0
7 | else:
8 | box = np.array([max(box1[0], box2[0]), max(box1[1], box2[1]), min(box1[2], box2[2]), min(box1[3], box2[3])])
9 | size = box_size(box)
10 | # IoU = intersection / (area1 + area2 - intersection)
11 | return size / (box_size(box1) + box_size(box2) - size)
12 | 
13 | 
14 | def box_size(box):
15 | # box.shape=[xmin,ymin,xmax,ymax]
16 | return (box[2] - box[0]) * (box[3] - box[1])
17 | 
-------------------------------------------------------------------------------- /utils/map_eval.py: --------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 | from . 
import box 4 | 5 | 6 | def nms_class(boxes_scores_list, nms_threshold): 7 | if boxes_scores_list.__len__() <= 1: 8 | return [0] 9 | boxes = np.vstack(boxes_scores_list) 10 | scores_index = np.argsort(-boxes[:, 0]) 11 | class_nms_index_list = [scores_index[0]] 12 | for index in scores_index[1:]: 13 | keep = True 14 | for _max_index in class_nms_index_list: 15 | if box.jaccard_overlap_boxes(boxes[index, 1:], boxes[_max_index, 1:]) > nms_threshold: 16 | keep = False 17 | break 18 | if keep: 19 | class_nms_index_list += [index] 20 | return class_nms_index_list 21 | 22 | 23 | def get_pr(data_cache, K=6): 24 | tubelet_file = './data/UCFSports/{}/{}.pkl' 25 | output_file = './data/UCFSports_build_tubes/pr_data.pkl' 26 | test_videos_list = data_cache.get_test_videos() 27 | nframes_dict = data_cache.get_nframes() 28 | labels = data_cache.get_labels() 29 | gt_tubes = data_cache.get_gttubes() 30 | resolution = data_cache._resolution 31 | gt_dict = {} 32 | gt_label_num = np.zeros(labels.__len__()) 33 | video_index = 0 34 | for video in test_videos_list[0]: 35 | video_index += 1 36 | for label in gt_tubes[video]: 37 | for tube in gt_tubes[video][label]: 38 | for i in range(tube.shape[0]): 39 | if (video_index, int(tube[i, 0]), label) not in gt_dict: 40 | gt_dict[(video_index, int(tube[i, 0]), label)] = [] 41 | gt_dict[(video_index, int(tube[i, 0]), label)] += [tube[i, 1:]] 42 | gt_label_num[label] += 1 43 | 44 | frame_boxes = {} 45 | all_frame_boxes_list = [] 46 | video_index = 0 47 | for videos in test_videos_list[0]: 48 | nframes = nframes_dict[videos] 49 | frame_boxes[videos] = {} 50 | video_index += 1 51 | height, width = resolution[videos] 52 | for start_frame in range(1, nframes - K + 2): 53 | file = open(tubelet_file.format(videos, start_frame), 'rb') 54 | _, __, best_tube = pickle.load(file) 55 | file.close() 56 | for i in range(best_tube.shape[0]): 57 | for j in range(K): 58 | if (j+start_frame) not in frame_boxes[videos]: 59 | frame_boxes[videos][j+start_frame] = [] 60 | frame_boxes[videos][j+start_frame] += [best_tube[i, np.array([0, 1, 2+4*j, 3+4*j, 4+4*j, 5+4*j])]] 61 | 62 | for frame_index in range(1, nframes+1): 63 | frame_label = {} 64 | for bb in frame_boxes[videos][frame_index]: 65 | if bb[0] not in frame_label: 66 | frame_label[bb[0]] = [] 67 | frame_label[bb[0]] += [bb[1:]] 68 | 69 | for tt in frame_label: 70 | idx = nms_class(frame_label[tt], nms_threshold=0.3) 71 | for id in idx: 72 | all_frame_boxes_list += [np.hstack([np.array([video_index, frame_index, tt]), frame_label[tt][id] * np.array([1, width, height, width, height])])] 73 | 74 | all_frame_boxes = np.vstack(all_frame_boxes_list) 75 | label_pr_dict = {} 76 | for label in range(labels.__len__()): 77 | print("label:", label) 78 | pre_idx = np.where(all_frame_boxes[:, 2] == label)[0] 79 | label_pre_box = all_frame_boxes[pre_idx] 80 | pre_idx = np.argsort(-label_pre_box[:, 3]) 81 | pr = np.empty((pre_idx.shape[0]+1, 2)) 82 | pr[0, 0] = 1.0 # precision 83 | pr[0, 1] = 0.0 # recall 84 | pr_cnt = 1 85 | fn = gt_label_num[label] 86 | fp = 0 87 | tp = 0 88 | for id in pre_idx: 89 | pre_box = label_pre_box[id, :] 90 | positive = False 91 | if (int(pre_box[0]), int(pre_box[1]), int(pre_box[2])) in gt_dict: 92 | _gt = gt_dict[(int(pre_box[0]), int(pre_box[1]), int(pre_box[2]))] 93 | ious = np.zeros(_gt.__len__()) 94 | for i, g in enumerate(_gt): 95 | ious[i] = box.jaccard_overlap_boxes(pre_box[4:], g) 96 | i_max = np.argmax(ious) 97 | if ious[i_max] > 0.5: 98 | positive = True 99 | del _gt[i_max] 100 | if _gt.__len__() == 0: 101 | del 
gt_dict[(int(pre_box[0]), int(pre_box[1]), int(pre_box[2]))] 102 | if positive: 103 | tp += 1 104 | fn -= 1 105 | else: 106 | fp += 1 107 | pr[pr_cnt, 0] = tp / (fp + tp) 108 | pr[pr_cnt, 1] = tp / (tp + fn) 109 | pr_cnt += 1 110 | label_pr_dict[label] = pr 111 | with open(output_file, 'wb') as f: 112 | pickle.dump(label_pr_dict, f) 113 | ap = np.empty(labels.__len__()) 114 | for label in label_pr_dict: 115 | prdif = label_pr_dict[label][1:, 1] - label_pr_dict[label][:-1, 1] 116 | prsum = label_pr_dict[label][1:, 0] + label_pr_dict[label][:-1, 0] 117 | ap[label] = np.sum(prdif * prsum * 0.5) 118 | print("map:", np.mean(ap)) 119 | 120 | 121 | def get_ground_truth(test_videos_list, labels, gt_tubes): 122 | gt_dict = {} 123 | gt_label_num = np.zeros(labels.__len__()) 124 | video_index = 0 125 | for video in test_videos_list: 126 | video_index += 1 127 | for label in gt_tubes[video]: 128 | for tube in gt_tubes[video][label]: 129 | for i in range(tube.shape[0]): 130 | if (video_index, int(tube[i, 0]), label) not in gt_dict: 131 | gt_dict[(video_index, int(tube[i, 0]), label)] = [] 132 | gt_dict[(video_index, int(tube[i, 0]), label)] += [tube[i, 1:]] 133 | gt_label_num[label] += 1 134 | return gt_dict, gt_label_num 135 | 136 | 137 | def calc_pr(all_frame_boxes_list, dataset): 138 | output_file = './pr_data_{}_{}.pkl'.format(dataset.DNAME, dataset.modality) 139 | labels = dataset.get_labels() 140 | gt_tubes = dataset.get_gttubes() 141 | gt_dict, gt_label_num = get_ground_truth(dataset.videos_list, labels, gt_tubes) 142 | all_frame_boxes = np.vstack(all_frame_boxes_list) 143 | label_pr_dict = {} 144 | for label in range(labels.__len__()): 145 | pre_idx = np.where(all_frame_boxes[:, 2] == label)[0] 146 | label_pre_box = all_frame_boxes[pre_idx] 147 | pre_idx = np.argsort(-label_pre_box[:, 3]) 148 | pr = np.empty((pre_idx.shape[0]+1, 2)) 149 | pr[0, 0] = 1.0 # precision 150 | pr[0, 1] = 0.0 # recall 151 | pr_cnt = 1 152 | fn = gt_label_num[label] 153 | fp = 0 154 | tp = 0 155 | for id in pre_idx: 156 | pre_box = label_pre_box[id, :] 157 | positive = False 158 | if (int(pre_box[0]), int(pre_box[1]), int(pre_box[2])) in gt_dict: 159 | _gt = gt_dict[(int(pre_box[0]), int(pre_box[1]), int(pre_box[2]))] 160 | ious = np.zeros(_gt.__len__()) 161 | for i, g in enumerate(_gt): 162 | ious[i] = box.jaccard_overlap_boxes(pre_box[4:], g) 163 | i_max = np.argmax(ious) 164 | if ious[i_max] > 0.5: 165 | positive = True 166 | del _gt[i_max] 167 | if _gt.__len__() == 0: 168 | del gt_dict[(int(pre_box[0]), int(pre_box[1]), int(pre_box[2]))] 169 | if positive: 170 | tp += 1 171 | fn -= 1 172 | else: 173 | fp += 1 174 | pr[pr_cnt, 0] = tp / (fp + tp) 175 | pr[pr_cnt, 1] = tp / (tp + fn) 176 | pr_cnt += 1 177 | label_pr_dict[label] = pr 178 | with open(output_file, 'wb') as f: 179 | pickle.dump(label_pr_dict, f) 180 | ap = np.empty(labels.__len__()) 181 | for label in label_pr_dict: 182 | prdif = label_pr_dict[label][1:, 1] - label_pr_dict[label][:-1, 1] 183 | prsum = label_pr_dict[label][1:, 0] + label_pr_dict[label][:-1, 0] 184 | ap[label] = np.sum(prdif * prsum * 0.5) 185 | mmap = np.mean(ap) 186 | print("map:", mmap) 187 | return mmap 188 | 189 | 190 | 191 | 192 | if __name__ == '__main__': 193 | data_cache = tube_dataset.TubeDataset('UCFSports') 194 | get_pr(data_cache, K=6) 195 | 196 | -------------------------------------------------------------------------------- /utils/prior_tubes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 
import torch 4 | from layers import act_cuboid_loss 5 | 6 | 7 | class RGB_TUBES: 8 | def __init__(self, phase, use_gpu, variance=(0.1, 0.1, 0.2, 0.2), sequence_length=6): 9 | center_mode = phase == 'eval' 10 | self.conv4_3_norm_tubes = self.generate_prior_tubes(min_size=30.0, max_size=60.0, aspect_ratio=(2,), flip=True, 11 | clip=False, layer_size=(38, 38), image_size=(300, 300), 12 | step=8, offset=0.5, 13 | sequence_length=sequence_length, center_mode=center_mode).reshape(-1, 4*sequence_length) 14 | self.fc_conv7_tubes = self.generate_prior_tubes(min_size=60.0, max_size=111.0, aspect_ratio=(2, 3,), flip=True, 15 | clip=False, layer_size=(18, 18), image_size=(300, 300), step=16, 16 | offset=0.5, sequence_length=sequence_length, center_mode=center_mode).reshape(-1, 4*sequence_length) 17 | self.conv6_tubes = self.generate_prior_tubes(min_size=111.0, max_size=162.0, aspect_ratio=(2, 3,), flip=True, 18 | clip=False, layer_size=(9, 9), image_size=(300, 300), step=32, 19 | offset=0.5, sequence_length=sequence_length, center_mode=center_mode).reshape(-1, 4*sequence_length) 20 | self.conv7_tubes = self.generate_prior_tubes(min_size=162.0, max_size=213.0, aspect_ratio=(2, 3,), flip=True, 21 | clip=False, layer_size=(5, 5), image_size=(300, 300), step=64, 22 | offset=0.5, sequence_length=sequence_length, center_mode=center_mode).reshape(-1, 4*sequence_length) 23 | self.conv8_tubes = self.generate_prior_tubes(min_size=213.0, max_size=264.0, aspect_ratio=(2,), flip=True, 24 | clip=False, layer_size=(3, 3), image_size=(300, 300), step=100, 25 | offset=0.5, sequence_length=sequence_length, center_mode=center_mode).reshape(-1, 4*sequence_length) 26 | self.conv9_tubes = self.generate_prior_tubes(min_size=264.0, max_size=315.0, aspect_ratio=(2,), flip=True, 27 | clip=False, layer_size=(1, 1), image_size=(300, 300), step=300, 28 | offset=0.5, sequence_length=sequence_length, center_mode=center_mode).reshape(-1, 4*sequence_length) 29 | if use_gpu: 30 | self.all_tubes = torch.from_numpy(np.vstack([self.conv4_3_norm_tubes, self.fc_conv7_tubes, self.conv6_tubes, self.conv7_tubes, self.conv8_tubes, self.conv9_tubes])).cuda() 31 | # self.all_tubes = torch.clamp(self.all_tubes, min=0, max=1) 32 | else: 33 | self.all_tubes = torch.from_numpy(np.vstack( 34 | [self.conv4_3_norm_tubes, self.fc_conv7_tubes, self.conv6_tubes, self.conv7_tubes, self.conv8_tubes, 35 | self.conv9_tubes])) 36 | # self.all_tubes = torch.clamp(self.all_tubes, min=0, max=1) 37 | self.sequence_length = sequence_length 38 | self.variance = variance 39 | 40 | def generate_prior_tubes(self, min_size=30.0, max_size=None, aspect_ratio=(2,), flip=True, clip=False, 41 | layer_size=(38, 38), image_size=(300, 300), 42 | step=None, offset=0.5, sequence_length=6, center_mode=False): 43 | tubes_list = [] 44 | if max_size is not None: 45 | num_priors = aspect_ratio.__len__() * 2 + 2 46 | else: 47 | num_priors = aspect_ratio.__len__() * 2 + 1 48 | if step is None: 49 | step_w = image_size[0] / layer_size[0] 50 | step_h = image_size[1] / layer_size[1] 51 | else: 52 | step_w = step 53 | step_h = step 54 | ar_list = [] 55 | for a in aspect_ratio: 56 | ar_list.append(a) 57 | if flip: 58 | ar_list.append(1 / a) 59 | for h in range(layer_size[1]): 60 | for w in range(layer_size[0]): 61 | tube_set = np.zeros((num_priors, sequence_length, 4), dtype='float32') 62 | center_x, center_y = (w + offset) * step_w, (h + offset) * step_h 63 | box_width, box_height = min_size, min_size 64 | if center_mode: 65 | tube_set[0, :, 0] = center_x / image_size[0] 66 | tube_set[0, :, 
1] = center_y / image_size[1]
67 | tube_set[0, :, 2] = box_width / image_size[0]
68 | tube_set[0, :, 3] = box_height / image_size[1]
69 | if max_size is not None:
70 | box_width, box_height = np.sqrt(min_size * max_size), np.sqrt(min_size * max_size)
71 | tube_set[1, :, 0] = center_x / image_size[0]
72 | tube_set[1, :, 1] = center_y / image_size[1]
73 | tube_set[1, :, 2] = box_width / image_size[0]
74 | tube_set[1, :, 3] = box_height / image_size[1]
75 | prior_index = 2
76 | for a in ar_list:
77 | if (np.abs(a - 1.0) < 0.000001) or a < 0.000001:
78 | continue
79 | box_width, box_height = min_size * np.sqrt(a), min_size / np.sqrt(a)
80 | tube_set[prior_index, :, 0] = center_x / image_size[0]
81 | tube_set[prior_index, :, 1] = center_y / image_size[1]
82 | tube_set[prior_index, :, 2] = box_width / image_size[0]
83 | tube_set[prior_index, :, 3] = box_height / image_size[1]
84 | prior_index += 1
85 | else:
86 | tube_set[0, :, 0] = (center_x - box_width/2.0) / image_size[0] # xmin
87 | tube_set[0, :, 1] = (center_y - box_height/2.0) / image_size[1] # ymin
88 | tube_set[0, :, 2] = (center_x + box_width/2.0) / image_size[0]
89 | tube_set[0, :, 3] = (center_y + box_height/2.0) / image_size[1] # ymax
90 | if max_size is not None:
91 | box_width, box_height = np.sqrt(min_size * max_size), np.sqrt(min_size * max_size)
92 | tube_set[1, :, 0] = (center_x - box_width / 2.0) / image_size[0] # xmin
93 | tube_set[1, :, 1] = (center_y - box_height / 2.0) / image_size[1] # ymin
94 | tube_set[1, :, 2] = (center_x + box_width / 2.0) / image_size[0]
95 | tube_set[1, :, 3] = (center_y + box_height / 2.0) / image_size[1] # ymax
96 | prior_index = 2
97 | for a in ar_list:
98 | if (np.abs(a - 1.0) < 0.000001) or a < 0.000001:
99 | continue
100 | box_width, box_height = min_size * np.sqrt(a), min_size / np.sqrt(a)
101 | tube_set[prior_index, :, 0] = (center_x - box_width / 2.0) / image_size[0] # xmin
102 | tube_set[prior_index, :, 1] = (center_y - box_height / 2.0) / image_size[1] # ymin
103 | tube_set[prior_index, :, 2] = (center_x + box_width / 2.0) / image_size[0]
104 | tube_set[prior_index, :, 3] = (center_y + box_height / 2.0) / image_size[1] # ymax
105 | prior_index += 1
106 | if clip:
107 | tube_set[tube_set > 1.0] = 1.0
108 | tube_set[tube_set < 0.0] = 0.0
109 | tubes_list.append(tube_set)
110 | return np.vstack(tubes_list) # the stacked tubes are in scan order, corresponding one-to-one with positions on the feature maps
111 | 
112 | 
113 | def get_all_video_tubes(tubes):
114 | return copy.deepcopy(tubes.all_tubes)
115 | 
116 | 
117 | def decode_tubes(tubes, loc_preds): # just for one video
118 | decode_video_tubes = get_all_video_tubes(tubes)
119 | decode_video_tubes = torch.stack([decode_video_tubes for i in range(loc_preds.shape[0])], dim=0)
120 | var = tubes.variance
121 | center_x = decode_video_tubes[:, :, 0::4]
122 | center_y = decode_video_tubes[:, :, 1::4]
123 | width = decode_video_tubes[:, :, 2::4]
124 | height = decode_video_tubes[:, :, 3::4]
125 | new_center_x = var[0] * loc_preds[:, :, 0::4] * width + center_x
126 | new_center_y = var[1] * loc_preds[:, :, 1::4] * height + center_y
127 | new_width = torch.exp(var[2] * loc_preds[:, :, 2::4]) * width
128 | new_height = torch.exp(var[3] * loc_preds[:, :, 3::4]) * height
129 | decode_video_tubes[:, :, 0::4] = new_center_x - new_width / 2.0 # x_min
130 | decode_video_tubes[:, :, 1::4] = new_center_y - new_height / 2.0 # y_min
131 | decode_video_tubes[:, :, 2::4] = new_center_x + new_width / 2.0 # x_max
132 | decode_video_tubes[:, :, 3::4] = new_center_y + new_height / 2.0 # y_max
133 | 
decode_video_tubes[:, :, 0::4] = torch.clamp(decode_video_tubes[:, :, 0::4], min=0)
134 | decode_video_tubes[:, :, 1::4] = torch.clamp(decode_video_tubes[:, :, 1::4], min=0)
135 | decode_video_tubes[:, :, 2::4] = torch.clamp(decode_video_tubes[:, :, 2::4], max=1)
136 | decode_video_tubes[:, :, 3::4] = torch.clamp(decode_video_tubes[:, :, 3::4], max=1)
137 | return decode_video_tubes
138 | 
139 | 
140 | def get_tubes_conf(conf_preds_list=None, num_class=25):
141 | # gathers every confidence score in the same order as the prior tubes
142 | conf_list = []
143 | for conf_preds in conf_preds_list:
144 | batch_num, channel_num, w, h = conf_preds.shape
145 | feature_flat = conf_preds.detach().numpy().reshape((channel_num, w * h))
146 | prior_num = int(channel_num / num_class)
147 | for i in range(w * h):
148 | for j in range(prior_num):
149 | conf = feature_flat[j * num_class:(j + 1) * num_class, i].reshape(1, -1)
150 | conf_list.append(conf)
151 | return np.vstack(conf_list)
152 | 
153 | 
154 | def apply_nms(tubes_conf, decode_tubes, conf_threshold=0.01, nms_threshold=0.45, nms_top_k=400, keep_topk=200,
155 | num_class=25):
156 | nms_tubes_list = []
157 | nms_scores_list = []
158 | nms_label_list = []
159 | for i in range(1, num_class): # class 0 is background, so skip it
160 | scores_c = tubes_conf[:, i]
161 | select = scores_c > conf_threshold
162 | if select.sum() > 0:
163 | select_tubes = decode_tubes[select, :]
164 | scores_c = scores_c[select]
165 | sort_index = torch.argsort(-scores_c)
166 | if sort_index.shape[0] > nms_top_k:
167 | sort_index = sort_index[:nms_top_k]
168 | class_nms_index_list = [sort_index[0]]
169 | class_nms_tube = select_tubes[sort_index[0], :].unsqueeze(dim=0)
170 | for index in sort_index[1:]:
171 | ioutable = torch.zeros(class_nms_index_list.__len__())
172 | act_cuboid_loss.get_tube_overlap(class_nms_tube, select_tubes[index, :], ioutable)
173 | if (ioutable > nms_threshold).sum() == 0:
174 | class_nms_index_list += [index]
175 | class_nms_tube = torch.cat([class_nms_tube, select_tubes[index, :].unsqueeze(dim=0)], dim=0)
176 | for index in class_nms_index_list:
177 | nms_tubes_list += [select_tubes[index, :]]
178 | nms_scores_list += [scores_c[index]]
179 | nms_label_list += [i]
180 | return_tubes_list = []
181 | return_scores_list = []
182 | return_label_list = []
183 | nms_scores = torch.Tensor(nms_scores_list)
184 | nms_scores_index = torch.argsort(-nms_scores)
185 | if nms_tubes_list.__len__() > keep_topk:
186 | for index in nms_scores_index[:keep_topk]:
187 | return_tubes_list += [nms_tubes_list[index]]
188 | return_scores_list += [nms_scores[index]]
189 | return_label_list += [nms_label_list[index]]
190 | else:
191 | for index in nms_scores_index:
192 | return_tubes_list += [nms_tubes_list[index]]
193 | return_scores_list += [nms_scores[index]]
194 | return_label_list += [nms_label_list[index]]
195 | return return_tubes_list, return_scores_list, return_label_list
-------------------------------------------------------------------------------- /visual_featuremaps.py: --------------------------------------------------------------------------------
1 | import torch
2 | from layers import ssd
3 | import numpy as np
4 | import cv2
5 | import os
6 | 
7 | def main():
8 | dataset = 'UCFSports'
9 | modality = 'rgb'
10 | data_path = "/mnt/data/qzw/data/UCFSports/"
11 | feature_map_path = "/mnt/data/qzw/result/pytorch-act-detector/{}/feature_maps/".format(dataset)
12 | MEAN = np.array([[[104, 117, 123]]], dtype=np.float32)
13 | test_net = ssd.SSD_NET(dataset=dataset, frezze_init=True, num_classes=11,
14 | 
modality=modality)
15 | 
16 | data_dict = torch.load("/mnt/data/qzw/model/pytorch-act-detector/{}/best-{}-cpu-0.8601.pkl".format(dataset, modality))
17 | # net_state_dict = {}
18 | # for key in data_dict['net_state_dict']:
19 | # if 'module.' in key:
20 | # new_key = key.replace('module.', '')
21 | # else:
22 | # new_key = key
23 | # net_state_dict[new_key] = data_dict['net_state_dict'][key]
24 | test_net.load_state_dict(data_dict)
25 | 
26 | image = cv2.imread(os.path.join(data_path, "Frames", '084', '%06d.jpg' % 1))
27 | image = cv2.resize(image, (300, 300), interpolation=cv2.INTER_LINEAR)
28 | image = np.transpose(image - MEAN, (2, 0, 1))[None, :, :, :]
29 | image = torch.from_numpy(image.astype('float32'))
30 | 
31 | conv6_dilation = (6, 6)
32 | conv6 = test_net.__getattr__('fc_conv6_{}'.format(test_net.layer_name))
33 | conv6.dilation = conv6_dilation
34 | conv6.padding = conv6_dilation
35 | 
36 | output = image
37 | for name, layer in test_net._modules.items():
38 | output = layer(output)
39 | if 'conv' in name or '9' in name:
40 | continue
41 | save_path = os.path.join(feature_map_path, name)
42 | if not os.path.exists(save_path):
43 | os.makedirs(save_path)
44 | feature_maps = output.squeeze().detach().numpy()
45 | for i in range(feature_maps.shape[0]):
46 | feature_map = feature_maps[i, :, :][:, :, None]
47 | if np.max(feature_map) > 0.001:
48 | feature_map = feature_map*255.0/np.max(feature_map)
49 | feature_map = cv2.resize(feature_map, (300, 300), interpolation=cv2.INTER_LINEAR)
50 | cv2.imwrite(os.path.join(save_path, "%03d.jpg" % (i+1)), feature_map)
51 | 
52 | 
53 | if __name__ == '__main__':
54 | main()
55 | 
--------------------------------------------------------------------------------
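
Usage sketch (illustrative, not part of the repository): the snippet below shows how a trained checkpoint could be loaded and run for one forward pass. It mirrors the loading logic in test.py's __main__ and the checkpoint layout written by train.py; the path (args.new_trained_model) and the 'net_state_dict'/'module.' handling are assumptions based on that code, so adjust them to your setup.

import torch
import config
from layers import ssd

args = config.Config()
num_class = 25 if args.dataset == 'UCF101v2' else 11  # UCF101v2 vs UCFSports

net = ssd.SSD_NET(dataset=args.dataset, frezze_init=args.freeze_init,
                  num_classes=num_class, modality=args.modality)
# checkpoints written by train.py store the weights under 'net_state_dict',
# possibly prefixed with 'module.' by DataParallel, so strip that first
data_dict = torch.load(args.new_trained_model, map_location='cpu')
state = {k.replace('module.', ''): v for k, v in data_dict['net_state_dict'].items()}
net.load_state_dict(state)
net.eval()

# one clip: sequence_length RGB frames stacked along the channel axis,
# resized to the 300x300 SSD input (random data here, for shape checking)
clip = torch.randn(1, 3 * args.sequence_length, 300, 300)
with torch.no_grad():
    loc_preds, conf_preds = net(clip)
# loc_preds:  (1, num_priors, sequence_length*4) tubelet regression offsets
# conf_preds: (1, num_priors, num_class) raw scores; test.py applies softmax
print(loc_preds.shape, conf_preds.shape)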