├── .idea
│   ├── deployment.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── pytorch-act-detector.iml
│   └── vcs.xml
├── README.md
├── config.py
├── data
│   ├── __init__.py
│   ├── dataset.py
│   └── transforms.py
├── error_handle.py
├── layers
│   ├── __init__.py
│   ├── act_cuboid_loss.py
│   └── ssd.py
├── test.py
├── train.py
├── utils
│   ├── __init__.py
│   ├── act_tubes.py
│   ├── box.py
│   ├── map_eval.py
│   └── prior_tubes.py
└── visual_featuremaps.py
/README.md:
--------------------------------------------------------------------------------
1 | # pytorch-act-detector
2 | paper: https://arxiv.org/abs/1705.01861
3 |
4 | |model|modality|mAP|
5 | |:----:|:--------:|:---:|
6 | |UCFSports|rgb|0.8259|
7 |
8 |
9 |
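  | Dataset paths, the input modality and the training hyper-parameters are set in `config.py`; training and evaluation are driven by `train.py` and `test.py`.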
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import os.path as osp
3 | import sys
4 |
5 |
6 | def add_path(path):
7 | if path not in sys.path:
8 | sys.path.insert(0, path)
9 |
10 |
11 | root_dir = osp.abspath(osp.join(osp.dirname(__file__), '..', '..'))
12 |
13 |
14 | class Config:
15 | # -------------------------data config ------------------------------#
16 | dataset = 'UCF101v2'
17 | # dataset = 'UCFSports'
18 | if dataset == 'UCF101v2':
19 | data_path = "/mnt/data/qzw/data/UCF101/"
20 | elif dataset == 'UCFSports':
21 | data_path = "/mnt/data/qzw/data/UCFSports/"
22 | else:
23 | data_path = " "
24 | print("dataset not found!!")
25 |         exit(-1)
26 |
27 | modality = 'rgb'
28 | init_model = "/mnt/data/qzw/model/pytorch-act-detector/{}/{}-init-model-{}-pytorch-single.pkl".format(dataset, dataset, modality)
29 | trained_model = "/mnt/data/qzw/model/pytorch-act-detector/{}/my_trained_pytorch_model_{}-{}.pkl".format(dataset, dataset, modality)
30 | new_trained_model = "/mnt/data/qzw/model/pytorch-act-detector/{}/my_new_trained_pytorch_model_{}-{}.pkl".format(dataset, dataset, modality)
31 | best_trained_model = "/mnt/data/qzw/model/pytorch-act-detector/{}/best_trained_pytorch_model_{}-{}-%.4f.pkl".format(dataset, dataset, modality)
32 | all_frame_boxes_list_result = "/mnt/data/qzw/result/pytorch-act-detector/{}/all_frame_boxes_list-{}-{}.pkl".format(dataset, dataset, modality)
33 |
34 | variance = [0.1, 0.1, 0.2, 0.2]
35 | sequence_length = 6
36 |
37 | # -------------------------model config ------------------------------#
38 |
39 | base_model_name = 'vgg16'
40 | freeze_init = True
41 |
42 | # -------------------------train config ------------------------------#
43 | reinit_all = False
44 | use_gpu = True
45 | warm_up_epoch = 1
46 | warm_up_ratio = 1 / 100
47 | epochs = 200
48 | train_batch_size = 192
49 | valid_batch_size = 1
50 | workers = 16
51 |
52 | lr = 0.001
53 | momentum = 0.9
54 | weight_decay = 5e-4
55 |
56 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
57 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
58 |
59 |
60 | # config = Config()
61 |
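  | # Typical usage elsewhere in this repo (see the __main__ blocks of data/dataset.py
  | # and error_handle.py):
  | #   import config
  | #   args = config.Config()
  | #   train_dataset = dataset.TubeDataset(args.dataset, data_path=args.data_path,
  | #                                       phase='train', modality=args.modality,
  | #                                       sequence_length=args.sequence_length)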
--------------------------------------------------------------------------------
/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import dataset
2 | from . import transforms
3 |
--------------------------------------------------------------------------------
/data/dataset.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import os
3 | import torch
4 | import torch.utils.data as data
5 | import numpy as np
6 | import cv2
7 | from . import transforms
8 | from torchvision.transforms import functional as F
9 |
10 |
11 | class TubeDataset(data.Dataset):
12 | def __init__(self, DNAME, data_path, phase, modality, sequence_length):
13 | ground_truth_file = os.path.join(data_path, './cache/{}-GT.pkl'.format(DNAME))
14 | with open(ground_truth_file, 'rb') as fid:
15 | cache = pickle.load(fid, encoding='iso-8859-1')
16 | for k in cache:
17 | setattr(self, k, cache[k])
18 | self.MEAN = np.array([[[104, 117, 123]]], dtype=np.float32)
19 | self.DNAME = DNAME
20 | if DNAME == 'UCF101v2':
21 | self.image_format = '%05d.jpg'
22 | elif DNAME == 'UCFSports':
23 | self.image_format = '%06d.jpg'
24 | else:
25 | print("TubeDataset.DNAME value error!!")
26 | exit(-1)
27 | self.color_jitter = transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3)
28 | self.expand = transforms.Expand(self.MEAN)
29 | self.modality = modality
30 | self.data_path = data_path
31 | self.sequence_length = sequence_length
32 | self.image_shape = cache['resolution']
33 | if modality == 'rgb':
34 | self.rgb = True
35 | elif modality == 'flow':
36 | self.rgb = False
37 | else:
38 | print("dataset mode value error!")
39 | exit(-1)
40 | self.ground_tube_list = []
41 | self.videos_list = []
42 | self.label_list = []
43 | video_cnt = 0
44 | if phase == 'train':
45 | self.train = True
46 | for video in self.train_videos[0]:
47 | gttube = self.gttubes[video]
48 | if gttube.__len__() > 1:
49 |                     print("dataset.py: error! more than one label in gttubes for this video; multi-label videos are not supported")
50 | print("video:{}".format(video))
51 | exit(-1)
52 | for key in gttube:
53 | ts = gttube[key]
54 | for t in ts:
55 | for stf in range(t.shape[0]-self.sequence_length+1):
56 | self.label_list += [key + 1] # background label is 0
57 | self.ground_tube_list += [t[stf:stf+self.sequence_length, :]]
58 | self.videos_list += [video]
59 | video_cnt += 1
60 | # if video_cnt >= 8:
61 | # return
62 | elif phase == 'eval':
63 | self.train = False
64 | self.videos_list = self.test_videos[0]
65 | # self.videos_list = self.train_videos[0]
66 | else:
67 | print("dataset phase value error!")
68 | exit(-1)
69 |
70 | def __getitem__(self, index):
71 | # index = 10
72 | if self.rgb:
73 | root_path = os.path.join(self.data_path, 'Frames', self.videos_list[index])
74 | else:
75 | root_path = os.path.join(self.data_path, 'FlowBrox04', self.videos_list[index])
76 | image_list = []
77 | all_frames = os.listdir(root_path)
78 | nframes = all_frames.__len__()
79 | if self.train:
80 | gttube = self.ground_tube_list[index]
81 | ground_truth = np.zeros((1 + self.sequence_length * 4), dtype='float32')
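  |             # ground_truth layout: [label, x1min, y1min, x1max, y1max, ..., xKmin, yKmin, xKmax, yKmax],
  |             # i.e. one class label followed by one box per frame of the K-frame sequence (K = sequence_length).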
82 | ground_truth[0] = self.label_list[index]
83 | for i in range(self.sequence_length):
84 | ground_truth[4*i+1:4*i+5] = gttube[i, 1:]
85 | image_path = os.path.join(root_path, self.image_format % min(int(gttube[i, 0]), nframes))
86 | im = cv2.imread(image_path)
87 | if im is None:
88 |                     print("{} not found!!".format(image_path))
89 | exit(-1)
90 | image_list += [im]
91 |
92 | image_list, ground_truth, _random_crop_data = transforms.random_crop(image_list, ground_truth)
93 |             if (ground_truth[1::4] < ground_truth[3::4]).sum() != self.sequence_length or (ground_truth[2::4] < ground_truth[4::4]).sum() != self.sequence_length:
94 | print("_random_crop_data saved!!")
95 | with open('./error_random_crop_data.pkl', 'wb') as f:
96 | pickle.dump(index, f)
97 | pickle.dump(image_list, f)
98 | pickle.dump(ground_truth, f)
99 | pickle.dump(_random_crop_data, f)
100 | exit(-1)
101 |
102 | image_list, ground_truth, _random_flip_data = transforms.random_flip(image_list, ground_truth)
103 |             if (ground_truth[1::4] < ground_truth[3::4]).sum() != self.sequence_length or (ground_truth[2::4] < ground_truth[4::4]).sum() != self.sequence_length:
104 |                 print("_random_flip_data saved!!")
105 | with open('./error_random_flip_data.pkl', 'wb') as f:
106 | pickle.dump(index, f)
107 | pickle.dump(image_list, f)
108 | pickle.dump(ground_truth, f)
109 | pickle.dump(_random_crop_data, f)
110 | pickle.dump(_random_flip_data, f)
111 | exit(-1)
112 |
113 | image_list = self.color_jitter(image_list)
114 | image_list, ground_truth, _random_expand_data = self.expand(image_list, ground_truth)
115 |             if (ground_truth[1::4] < ground_truth[3::4]).sum() != self.sequence_length or (
116 |                     ground_truth[2::4] < ground_truth[4::4]).sum() != self.sequence_length:
117 | print("_random_expand_data saved!!")
118 | with open('./error_random_expand_data.pkl', 'wb') as f:
119 | pickle.dump(index, f)
120 | pickle.dump(image_list, f)
121 | pickle.dump(ground_truth, f)
122 | pickle.dump(_random_crop_data, f)
123 | pickle.dump(_random_flip_data, f)
124 | pickle.dump(_random_expand_data, f)
125 | exit(-1)
126 |
127 | height_new, width_new, _ = image_list[0].shape
128 | ground_truth[1::4] = ground_truth[1::4] / width_new
129 | ground_truth[2::4] = ground_truth[2::4] / height_new
130 | ground_truth[3::4] = ground_truth[3::4] / width_new
131 | ground_truth[4::4] = ground_truth[4::4] / height_new
132 | for i in range(image_list.__len__()):
133 | image_list[i] = image_list[i] - self.MEAN
134 | image_data = np.concatenate(image_list, axis=2).astype('float32')
135 | image_data = cv2.resize(image_data, (300, 300), interpolation=cv2.INTER_LINEAR)
136 | image_data = np.transpose(image_data, (2, 0, 1))
137 | image_data = torch.from_numpy(image_data)
138 | ground_truth = torch.from_numpy(ground_truth)
139 | else:
140 | if self.rgb:
141 | for i in range(1, nframes+1):
142 | image_path = os.path.join(root_path, self.image_format % i)
143 | im = cv2.imread(image_path)
144 | if im is None:
145 |                         print("{} not found!!".format(image_path))
146 | exit(-1)
147 | im = cv2.resize(im, (300, 300), interpolation=cv2.INTER_LINEAR)
148 | im = np.transpose(im - self.MEAN, (2, 0, 1))
149 | image_list += [im]
150 | image_data = torch.from_numpy(np.vstack(image_list).astype('float32'))
151 | ground_truth = torch.Tensor([index])
152 | else:
153 | for i in range(1, nframes+1):
154 | flow_path = os.path.join(root_path, self.image_format % i)
155 | im = cv2.imread(flow_path)
156 | im = cv2.resize(im, (300, 300), interpolation=cv2.INTER_LINEAR)
157 | im = np.transpose(im - self.MEAN, (2, 0, 1))
158 | # im = np.transpose(im, (2, 0, 1))
159 | image_list += [im]
160 | image_data = torch.from_numpy(np.vstack(image_list).astype('float32'))
161 | ground_truth = torch.Tensor([index])
162 | return image_data, ground_truth
163 |
164 | def __len__(self):
165 | return self.videos_list.__len__()
166 | # return 48
167 |
168 | def get_test_videos(self):
169 | return self.test_videos
170 |
171 | def get_nframes(self):
172 | return self.nframes
173 |
174 | def get_resolution(self):
175 | return self.resolution
176 |
177 | def get_labels(self):
178 | return self.labels
179 |
180 | def get_gttubes(self):
181 | return self.gttubes
182 |
183 |
184 | if __name__ == '__main__':
185 | import config
186 | args = config.Config()
187 |     with open('./error_random_flip_data.pkl', 'rb') as f:
188 | index = pickle.load(f)
189 | image_list = pickle.load(f)
190 | ground_truth = pickle.load(f)
191 | _random_crop_data = pickle.load(f)
192 | _random_flip_data = pickle.load(f)
193 | train_dataset = TubeDataset(args.dataset, data_path=args.data_path, phase='train',
194 | modality=args.modality,
195 | sequence_length=6)
196 | a, b = train_dataset[index]
197 |
--------------------------------------------------------------------------------
/data/transforms.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | import cv2
4 | from torchvision.transforms import functional as F
5 | from PIL import Image
6 |
7 |
8 | def random_flip(image_list, target):
9 | pro = np.random.random()
10 | if pro < 0.5:
11 | height, width, _ = image_list[0].shape
12 | for i in range(image_list.__len__()):
13 | image_list[i] = np.flip(image_list[i], axis=1)
14 | xmin_new = width - target[3::4]
15 | target[3::4] = width - target[1::4]
16 | target[1::4] = xmin_new
17 | return image_list, target, pro
18 |
19 |
20 | def random_crop(image_list, target):
21 | target = target.copy()
22 | scale = 0.5
23 | height, width, _ = image_list[0].shape
24 | gt_w = target[3::4] - target[1::4]
25 | gt_h = target[4::4] - target[2::4]
26 | gt_area = gt_w * gt_h
27 | gt_area = gt_area.sum()
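  |     # Rejection-sample a crop window (the same relative extent is used for x and y)
  |     # until it keeps at least 80% of the total ground-truth box area over the sequence.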
28 | while True:
29 | xmin_crop_ratio = (1 - scale) * np.random.random()
30 | ymin_crop_ratio = xmin_crop_ratio
31 | xmax_crop_ratio = xmin_crop_ratio + (1 - xmin_crop_ratio - scale) * np.random.random() + scale
32 | ymax_crop_ratio = xmax_crop_ratio
33 | xmin_crop, ymin_crop, xmax_crop, ymax_crop = int(width * xmin_crop_ratio), int(height * ymin_crop_ratio), int(
34 | width * xmax_crop_ratio), int(height * ymax_crop_ratio)
35 |
36 | xmin_cross = np.maximum(target[1::4], xmin_crop)
37 | ymin_cross = np.maximum(target[2::4], ymin_crop)
38 | xmax_cross = np.minimum(target[3::4], xmax_crop)
39 | ymax_cross = np.minimum(target[4::4], ymax_crop)
40 |
41 | cross_w = xmax_cross - xmin_cross
42 | cross_h = ymax_cross - ymin_cross
43 | if (cross_w < 0.01).sum() > 0:
44 | continue
45 | if (cross_h < 0.01).sum() > 0:
46 | continue
47 | cross_area = cross_w * cross_h
48 | cross_area = cross_area.sum()
49 | if cross_area / gt_area < 0.8:
50 | continue
51 | target[1::4] = xmin_cross - xmin_crop
52 | target[2::4] = ymin_cross - ymin_crop
53 | target[3::4] = xmax_cross - xmin_crop
54 | target[4::4] = ymax_cross - ymin_crop
55 | break
56 |
57 | image_list_new = [image_list[i][ymin_crop:ymax_crop+1, xmin_crop:xmax_crop+1, :] for i in range(len(image_list))]
58 |
59 | return image_list_new, target, (xmin_crop_ratio, xmax_crop_ratio)
60 |
61 |
62 | class ColorJitter(object):
63 | def __init__(self, brightness=0.0, contrast=0.0, saturation=0.0, hue=0.0):
64 | self.brightness = brightness
65 | self.contrast = contrast
66 | self.saturation = saturation
67 | self.hue = hue
68 |
69 | @staticmethod
70 | def get_params(brightness, contrast, saturation, hue):
71 | transforms = []
72 | if brightness > 0:
73 | brightness_factor = np.random.uniform(max(0, 1 - brightness), 1 + brightness)
74 | transforms.append(lambda img: F.adjust_brightness(img, brightness_factor))
75 |
76 | if contrast > 0:
77 | contrast_factor = np.random.uniform(max(0, 1 - contrast), 1 + contrast)
78 | transforms.append(lambda img: F.adjust_contrast(img, contrast_factor))
79 |
80 | if saturation > 0:
81 | saturation_factor = np.random.uniform(max(0, 1 - saturation), 1 + saturation)
82 | transforms.append(lambda img: F.adjust_saturation(img, saturation_factor))
83 |
84 | if hue > 0:
85 | hue_factor = np.random.uniform(-hue, hue)
86 | transforms.append(lambda img: F.adjust_hue(img, hue_factor))
87 | np.random.shuffle(transforms)
88 | return transforms
89 |
90 | def __call__(self, img_list):
91 | transforms = self.get_params(self.brightness, self.contrast,
92 | self.saturation, self.hue)
93 | for i in range(img_list.__len__()):
94 | img = img_list[i][..., -1::-1] # bgr2rgb
95 | img = Image.fromarray(np.uint8(img))
96 | for t in transforms:
97 | img = t(img)
98 | img = np.asarray(img)
99 | img_list[i] = img[..., -1::-1] # rgb2bgr
100 | return img_list
101 |
102 |
103 | class Normalize(object):
104 | def __init__(self, mean, std):
105 | self.mean = mean
106 | self.std = std
107 |
108 | def __call__(self, tensor):
109 | tensor = F.normalize(tensor, self.mean, self.std)
110 | return tensor
111 |
112 |
113 | class Expand(object):
114 | def __init__(self, mean):
115 | self.expand_prob = 0.5
116 | self.max_expand_ratio = 4.0
117 | self.mean = mean
118 |
119 | def __call__(self, image_list, ground_truth):
120 | out_image_list = image_list
121 | pro = np.random.random()
122 | if pro < self.expand_prob:
123 | expand_ratio = np.random.uniform(1, self.max_expand_ratio)
124 | ori_h, ori_w, _ = image_list[0].shape
125 | new_h, new_w = int(ori_h * expand_ratio), int(ori_w * expand_ratio)
126 | out_image_list = [(np.zeros((new_h, new_w, 3), dtype=np.float32) + self.mean) for i in
127 | range(len(image_list))]
128 |             h_off, w_off = int(np.floor(np.random.uniform(0, new_h - ori_h))), int(np.floor(np.random.uniform(0, new_w - ori_w)))  # random placement of the frames inside the expanded canvas
129 | for i in range(len(image_list)):
130 | out_image_list[i][h_off:h_off + ori_h, w_off:w_off + ori_w] = image_list[i]
131 | ground_truth[1:] += np.array([w_off, h_off, w_off, h_off] * len(image_list), dtype=np.float32)
132 | else:
133 | expand_ratio = 1.0
134 | return out_image_list, ground_truth, (pro, expand_ratio)
135 |
136 |
137 | def PCA_Jittering(img):
138 | img_size = img.size / 3
139 | # print(img.size, img_size)
140 | img1 = img.reshape(int(img_size), 3)
141 | img1 = np.transpose(img1)
142 | img_cov = np.cov([img1[0], img1[1], img1[2]])
143 |     # compute the eigenvalues and eigenvectors of the channel covariance matrix
144 | lamda, p = np.linalg.eig(img_cov)
145 |
146 | p = np.transpose(p)
147 |     # draw normally distributed random coefficients
148 |     alpha1 = random.normalvariate(0, 0.05)
149 |     alpha2 = random.normalvariate(0, 0.05)
150 |     alpha3 = random.normalvariate(0, 0.05)
151 |     v = np.transpose((alpha1 * lamda[0], alpha2 * lamda[1], alpha3 * lamda[2]))  # perturb along the principal components
152 | add_num = np.dot(p, v)
153 | img2 = np.array([img[:, :, 0] + add_num[0], img[:, :, 1] + add_num[1], img[:, :, 2] + add_num[2]])
154 | img2 = np.swapaxes(img2, 0, 2)
155 | img2 = np.swapaxes(img2, 0, 1)
156 | img2[img2 < 0] = 0
157 | img2[img2 > 255] = 255
158 | # max_t = np.max(img2[:, :, 0])
159 | # min_t = np.min(img2[:, :, 0])
160 | # img2[:, :, 0] = 255 / (max_t - min_t) * (img2[:, :, 0] - min_t)
161 | #
162 | # max_t = np.max(img2[:, :, 1])
163 | # min_t = np.min(img2[:, :, 1])
164 | # img2[:, :, 1] = 255 / (max_t - min_t) * (img2[:, :, 1] - min_t)
165 | #
166 | # max_t = np.max(img2[:, :, 2])
167 | # min_t = np.min(img2[:, :, 2])
168 | # img2[:, :, 2] = 255 / (max_t - min_t) * (img2[:, :, 2] - min_t)
169 | return img2
170 |
171 | if __name__ == '__main__':
172 |     import os
173 |     from data import dataset as tube_dataset
174 |     video = '002'
175 |     color = ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3)
176 |     root_path = os.path.join("/mnt/data/qzw/data/UCFSports/", 'Frames', video)
177 |     image_list = []
178 |     data_path = "/mnt/data/qzw/data/UCFSports/"
179 |     dataset_name = "UCFSports"
180 |     train_dataset = tube_dataset.TubeDataset(dataset_name, data_path=data_path, phase='eval',
181 |                                              modality='rgb',
182 |                                              sequence_length=6)
183 |     s = 10
184 |     target = np.zeros(25)
185 |     target[1:] = train_dataset.gttubes[video][0][0][s: s+6, 1: 5].reshape(-1)  # pixel coords, as random_crop expects
186 |     for i in range(s, s+6):
187 |         path = os.path.join(root_path, '%06d.jpg' % (i+1))
188 |         image = cv2.imread(path)
189 |         # image = PCA_Jittering(image)
190 |         image_list += [image]
191 |     image_list = color(image_list)  # ColorJitter takes a list of frames
192 |     image_list, target_new, _ = random_crop(image_list, target)  # random_crop returns (images, target, crop_ratios)
193 |     target_new[1::2] = target_new[1::2] / image_list[0].shape[1]  # normalize x by the cropped width
194 |     target_new[2::2] = target_new[2::2] / image_list[0].shape[0]  # normalize y by the cropped height
195 |     image = cv2.resize(np.concatenate(image_list, axis=2), (300, 300), interpolation=cv2.INTER_LINEAR)
196 |     height_new, width_new, _ = image.shape
197 | for i in range(6):
198 | p1 = (int(target_new[i * 4 + 1]*width_new), int(target_new[i * 4 + 2]*height_new))
199 | p2 = (int(target_new[i * 4 + 3]*width_new), int(target_new[i * 4 + 4]*height_new))
200 | im1 = (image[:, :, 3*i:3*(i+1)]).astype('uint8')
201 | cv2.rectangle(im1, p1, p2, (255, 0, 0))
202 | cv2.imwrite('./image/test{}.jpg'.format(i), im1)
203 | # image_list += [image]
204 |
205 | # img_new = cv2.imread('./test_img.jpg')
206 | # print(img_new)
207 | ss = 0
208 |
--------------------------------------------------------------------------------
/error_handle.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pickle
3 | import os
4 | from utils import map_eval
5 | from utils import box
6 | import cv2
7 | import matplotlib.pyplot as plt
8 |
9 |
10 | def error_analyse(dataset):
11 | error_images_path = "/mnt/data/qzw/result/pytorch-act-detector/{}/error_images/".format(dataset.DNAME)
12 | result_file = "/mnt/data/qzw/result/pytorch-act-detector/{}/all_frame_boxes_list-{}-{}-0.8601.pkl".format(dataset.DNAME, dataset.DNAME, dataset.modality)
13 | if not os.path.isfile(result_file):
14 | raise ValueError("file:{} not found".format(result_file))
15 | with open(result_file, "rb") as file:
16 | all_frame_boxes_list = pickle.load(file)
17 | labels = dataset.get_labels()
18 | gt_tubes = dataset.get_gttubes()
19 | gt_dict, gt_label_num = map_eval.get_ground_truth(dataset.videos_list, labels, gt_tubes)
20 | all_frame_boxes = np.vstack(all_frame_boxes_list)
21 | frame_gt_box_dict = {}
22 | frame_error_box_dict = {}
23 | frame_correct_box_dict = {}
24 | label_pr_dict = {}
25 | for label in range(labels.__len__()):
26 | pre_idx = np.where(all_frame_boxes[:, 2] == label)[0]
27 | label_pre_box = all_frame_boxes[pre_idx]
28 | pre_idx = np.argsort(-label_pre_box[:, 3])
29 | pr = np.empty((pre_idx.shape[0]+1, 2))
30 | pr[0, 0] = 1.0 # precision
31 | pr[0, 1] = 0.0 # recall
32 | pr_cnt = 1
33 | fn = gt_label_num[label]
34 | fp = 0
35 | tp = 0
36 | video_name_list = []
37 | for cnt, id in enumerate(pre_idx):
38 | pre_box = label_pre_box[id, :]
39 | video_name_list += [int(pre_box[0])-1]
40 | # video_label = list(dataset.gttubes[dataset.videos_list[int(pre_box[0])-1]].keys())[0]
41 | # if labels[label] == 'Run' and dataset.labels[video_label] == 'SkateBoarding':
42 | # continue
43 | positive = False
44 | if (int(pre_box[0]), int(pre_box[1]), int(pre_box[2])) in gt_dict:
45 | _gt = gt_dict[(int(pre_box[0]), int(pre_box[1]), int(pre_box[2]))]
46 |
47 | if (int(pre_box[0]), int(pre_box[1])) not in frame_gt_box_dict:
48 | frame_gt_box_dict[(int(pre_box[0]), int(pre_box[1]))] = []
49 | frame_gt_box_dict[(int(pre_box[0]), int(pre_box[1]))] += _gt.copy()
50 |
51 | ious = np.zeros(_gt.__len__())
52 | for i, g in enumerate(_gt):
53 | ious[i] = box.jaccard_overlap_boxes(pre_box[4:], g)
54 | i_max = np.argmax(ious)
55 | if ious[i_max] > 0.5:
56 | positive = True
57 | del _gt[i_max]
58 | if _gt.__len__() == 0:
59 | del gt_dict[(int(pre_box[0]), int(pre_box[1]), int(pre_box[2]))]
60 | if positive:
61 | tp += 1
62 | fn -= 1
63 | if (int(pre_box[0]), int(pre_box[1])) not in frame_correct_box_dict:
64 | frame_correct_box_dict[(int(pre_box[0]), int(pre_box[1]))] = []
65 | frame_correct_box_dict[(int(pre_box[0]), int(pre_box[1]))] += [pre_box]
66 | else:
67 | fp += 1
68 | if (int(pre_box[0]), int(pre_box[1])) not in frame_error_box_dict:
69 | frame_error_box_dict[(int(pre_box[0]), int(pre_box[1]))] = []
70 | frame_error_box_dict[(int(pre_box[0]), int(pre_box[1]))] += [pre_box]
71 | # video_name = dataset.videos_list[int(pre_box[0])-1]
72 | # video_label = list(dataset.gttubes[video_name].keys())[0]
73 | # err_image_root = os.path.join(error_images_path, video_name+"-"+dataset.labels[video_label])
74 | # if os.path.exists(err_image_root) is not True:
75 | # os.mkdir(err_image_root)
76 | # image = cv2.imread(os.path.join(dataset.data_path, 'Frames', video_name, dataset.image_format % int(pre_box[1])))
77 | # draw_rec_and_save_image(_gt, pre_box, dataset.labels[int(pre_box[2])], image, err_image_root)
78 | pr[pr_cnt, 0] = tp / (fp + tp)
79 | pr[pr_cnt, 1] = tp / (tp + fn)
80 | if labels[label] == 'SkateBoarding' and cnt < 1000:
81 | pause = 0
82 | image = cv2.imread(os.path.join(dataset.data_path, 'Frames', dataset.videos_list[int(pre_box[0]) - 1],
83 | dataset.image_format % int(pre_box[1])))
84 | err_image_root = os.path.join(error_images_path, labels[label])
85 | if os.path.exists(err_image_root) is not True:
86 | os.mkdir(err_image_root)
87 | if positive:
88 | draw_rec_and_save_image([], [pre_box], [], labels, image, err_image_root, cnt)
89 | else:
90 | draw_rec_and_save_image([], [], [pre_box], labels, image, err_image_root, cnt)
91 | pr_cnt += 1
92 | video_name_list = np.array(video_name_list).reshape(-1, 1)
93 | label_pr_dict[label] = pr
94 | # plt.cla()
95 | # plt.plot(pr[:, 1], pr[:, 0], color='blue')
96 | # plt.xlabel('recall')
97 | # plt.ylabel('precision')
98 | # plt.savefig('./{}.jpg'.format(labels[label]))
99 | # exit(0)
100 | # for i, video in enumerate(dataset.videos_list):
101 | # video_label = list(dataset.gttubes[video].keys())[0]
102 | # if dataset.labels[video_label] != 'Walk':
103 | # continue
104 | # nframes = os.listdir(os.path.join(dataset.data_path, 'Frames', video)).__len__()
105 | # print("video index:", i)
106 | # for j in range(nframes):
107 | # image = cv2.imread(os.path.join(dataset.data_path, 'Frames', video, dataset.image_format % int(j+1)))
108 | # video_label = list(dataset.gttubes[video].keys())[0]
109 | # err_image_root = os.path.join(error_images_path, video + "-" + dataset.labels[video_label])
110 | # if os.path.exists(err_image_root) is not True:
111 | # os.mkdir(err_image_root)
112 | # if (i+1, j+1) in frame_gt_box_dict:
113 | # gt = frame_gt_box_dict[(i+1, j+1)]
114 | # else:
115 | # gt = []
116 | # if (i+1, j+1) in frame_correct_box_dict:
117 | # cpb = frame_correct_box_dict[(i+1, j+1)]
118 | # else:
119 | # cpb = []
120 | # if (i+1, j+1) in frame_error_box_dict:
121 | # epb = frame_error_box_dict[(i+1, j+1)]
122 | # else:
123 | # epb = []
124 | # draw_rec_and_save_image(gt, cpb, epb, labels, image, err_image_root, j+1)
125 |
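  |     # Average precision per class: trapezoidal integration of the precision-recall curve,
  |     # then mAP is the mean over classes.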
126 | ap = np.empty(labels.__len__())
127 | for label in label_pr_dict:
128 | prdif = label_pr_dict[label][1:, 1] - label_pr_dict[label][:-1, 1]
129 | prsum = label_pr_dict[label][1:, 0] + label_pr_dict[label][:-1, 0]
130 | ap[label] = np.sum(prdif * prsum * 0.5)
131 | mmap = np.mean(ap)
132 | print("map:", mmap)
133 | return mmap
134 |
135 |
136 | def draw_rec_and_save_image(ground_truths, correct_pre_boxes, error_pre_boxes, labels, image, image_save_path, frame_index):
137 | for gt in ground_truths:
138 | p1 = (int(gt[0]), int(gt[1]))
139 | p2 = (int(gt[2]), int(gt[3]))
140 | cv2.rectangle(image, p1, p2, (0, 255, 0))
141 | for pb in error_pre_boxes:
142 | if pb[3] < 0.1:
143 | continue
144 | p1 = (int(pb[4]), int(pb[5]))
145 | p2 = (int(pb[6]), int(pb[7]))
146 | pt = (int(pb[4]), int(pb[7]))
147 | cv2.rectangle(image, p1, p2, (0, 0, 255))
148 | cv2.putText(image, "conf:%.3f " % pb[3] + labels[int(pb[2])], pt, cv2.FONT_HERSHEY_SIMPLEX, 0.4, (32, 32, 32))
149 | for pb in correct_pre_boxes:
150 | p1 = (int(pb[4]), int(pb[5]))
151 | p2 = (int(pb[6]), int(pb[7]))
152 | pt = (int(pb[4]), int(pb[5]+10))
153 | cv2.rectangle(image, p1, p2, (255, 0, 0))
154 | cv2.putText(image, "conf:%.3f " % pb[3] + labels[int(pb[2])], pt, cv2.FONT_HERSHEY_SIMPLEX, 0.4, (132, 132, 32))
155 |     cv2.imwrite(os.path.join(image_save_path, "frame-{}.jpg".format(int(frame_index))), image)
156 |
157 |
158 | if __name__ == '__main__':
159 | # from data import dataset
160 | # dataset_name = 'UCFSports'
161 | # modality = 'rgb'
162 | # data_path = "/mnt/data/qzw/data/{}/".format(dataset_name)
163 | # analyse_data_set = dataset.TubeDataset(dataset_name, data_path=data_path, phase='eval',
164 | # modality=modality,
165 | # sequence_length=6)
166 | # error_analyse(analyse_data_set)
167 | import config
168 | from data import dataset
169 | args = config.Config()
170 | with open('./error_random_crop_data.pkl', 'rb') as f:
171 | index = pickle.load(f)
172 | image_list = pickle.load(f)
173 | ground_truth = pickle.load(f)
174 | _random_crop_data = pickle.load(f)
175 | # _random_flip_data = pickle.load(f)
176 | train_dataset = dataset.TubeDataset(args.dataset, data_path=args.data_path, phase='train',
177 | modality=args.modality,
178 | sequence_length=6)
179 | a, b = train_dataset[index]
180 |
181 |
182 |
--------------------------------------------------------------------------------
/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from . import act_cuboid_loss
2 | from . import ssd
3 |
--------------------------------------------------------------------------------
/layers/act_cuboid_loss.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import numpy as np
6 | from utils import prior_tubes
7 |
8 | def get_tube_overlap(tube1, tube2, ioutable):
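  |     # For every prior tube in tube1, compute the mean per-frame IoU with the single
  |     # ground-truth tube tube2 and write it into ioutable (entries stay 0 where the
  |     # tubes do not intersect at all). Tubes are flat (xmin, ymin, xmax, ymax) * K vectors.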
9 | ground_truth = tube2.expand(tube1.shape)
10 | total_tube = torch.cat([tube1.unsqueeze(0), ground_truth.unsqueeze(0)], dim=0)
11 |
12 | xmin = torch.max(total_tube[:, :, 0::4], dim=0)[0]
13 | ymin = torch.max(total_tube[:, :, 1::4], dim=0)[0]
14 | xmax = torch.min(total_tube[:, :, 2::4], dim=0)[0]
15 | ymax = torch.min(total_tube[:, :, 3::4], dim=0)[0]
16 |
17 | cross_area = torch.clamp(xmax - xmin, min=0)*torch.clamp(ymax - ymin, min=0)
18 |
19 | valid_area = cross_area.sum(dim=1) > 0
20 | valid = valid_area.unsqueeze(1).expand(tube1.shape)
21 | valid_priortubes = tube1[valid].view((-1, tube1.shape[1]))
22 |
23 | prior_area = (valid_priortubes[:, 2::4] - valid_priortubes[:, 0::4])*(valid_priortubes[:, 3::4] - valid_priortubes[:, 1::4])
24 | valid = valid_area.unsqueeze(1).expand(-1, prior_area.shape[1])
25 | valid_cross_area = cross_area[valid].view((-1, prior_area.shape[1]))
26 |
27 | gt_area = (tube2[2::4] - tube2[0::4])*(tube2[3::4] - tube2[1::4])
28 |
29 | ratio = valid_cross_area/(gt_area + prior_area - valid_cross_area)
30 | ratio = ratio.sum(dim=1)
31 | ioutable[valid_area] = ratio/prior_area.shape[1]
32 |
33 |
34 | class CuboidLoss(nn.Module):
35 | def __init__(self, use_gpu, variance, num_class, k_frames):
36 | super(CuboidLoss, self).__init__()
37 | self.use_gpu = use_gpu
38 | self.variance = variance
39 | self.k_frames = k_frames
40 | self.num_class = num_class
41 | self.tubes_init = prior_tubes.RGB_TUBES(phase='train', use_gpu=use_gpu)
42 |
43 |
44 | def ACTMatchTube(self, prior_tubes, ground_truth):
45 |         # prior_tubes.shape = (8396, 24); the same prior tubes are shared by every batch item
46 |         # ground_truth is a tensor; each batch item carries exactly one ground truth; ground_truth.shape = (batch_num, 1 + sequence_length * 4)
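  |         # Matching: the prior tube with the highest IoU is always assigned to the ground
  |         # truth, and so is every prior tube whose mean per-frame IoU exceeds 0.5.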
47 | # if self.use_gpu:
48 | # _ground_truth = ground_truth.cpu().numpy()
49 | # else:
50 | # _ground_truth = ground_truth.numpy()
51 | batch_num = ground_truth.shape[0]
52 | prior_tubes_num = prior_tubes.shape[0]
53 | tubes_label = torch.zeros((batch_num, prior_tubes_num, self.num_class), dtype=torch.uint8)
54 | tubes_label_index = torch.zeros((batch_num, prior_tubes_num), dtype=torch.int64)
55 | tubes_label[:, :, 0] = 1
56 | positive_samples_index_list = []
57 | N = 0
58 | if self.use_gpu:
59 | iou_table = torch.zeros(prior_tubes_num, dtype=torch.float32).cuda()
60 | else:
61 | iou_table = torch.zeros(prior_tubes_num, dtype=torch.float32)
62 | for i in range(batch_num):
63 | iou_table.fill_(0)
64 | # for prior in range(prior_tubes_num):
65 | # iou_table[prior] = get_tube_overlap(prior_tubes[prior, :], _ground_truth[i, 1:], self.k_frames)
66 | get_tube_overlap(prior_tubes, ground_truth[i, 1:], iou_table)
67 | positive_sample_index = []
68 | max_prior_index = torch.argmax(iou_table, 0)
69 | positive_sample_index += [(max_prior_index, i)]
70 | tubes_label_index[i, max_prior_index] = int(ground_truth[i, 0])
71 | tubes_label[i, max_prior_index, int(ground_truth[i, 0])] = 1
72 | tubes_label[i, max_prior_index, 0] = 0
73 | pp = torch.argsort(-iou_table)
74 | for tt in pp:
75 | if iou_table[tt] < 0.5:
76 | break
77 | if tubes_label[i, tt, 0] == 1:
78 | positive_sample_index += [(tt, i)]
79 | tubes_label_index[i, tt] = int(ground_truth[i, 0])
80 | tubes_label[i, tt, int(ground_truth[i, 0])] = 1
81 | tubes_label[i, tt, 0] = 0
82 | N += positive_sample_index.__len__()
83 | positive_samples_index_list += [torch.tensor(positive_sample_index)]
84 | if N == 0:
85 | print("no positive samples!")
86 | exit(-1)
87 | return positive_samples_index_list, tubes_label, tubes_label_index, N
88 |
89 | def ACTComputeConfLoss(self, conf_preds, tubes_label):
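  |         # Per-tube confidence loss: softmax (with the usual max-subtraction for numerical
  |         # stability) followed by the negative log-likelihood of the matched label.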
90 | conf_preds_max = torch.max(conf_preds, dim=-1)[0].unsqueeze(-1)
91 | my_conf_preds = F.softmax(conf_preds - conf_preds_max, dim=-1)
92 | aa = my_conf_preds[tubes_label]
93 | aa = aa.view(tubes_label.shape[0], tubes_label.shape[1])
94 | tubes_loss = -torch.log(aa + 0.000001)
95 | return tubes_loss
96 |
97 | def ACTMineHardExamples(self, tubes_loss, positive_samples_index_list):
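  |         # Hard negative mining: for each batch item, keep the 3 * |positives| prior tubes
  |         # with the highest confidence loss that are not already matched as positives
  |         # (the standard SSD 3:1 negative:positive ratio).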
98 | negtive_samples_index_list = []
99 | if self.use_gpu:
100 | tubes_loss = tubes_loss.cpu().detach().numpy()
101 | else:
102 | tubes_loss = tubes_loss.detach().numpy()
103 | for i in range(tubes_loss.shape[0]):
104 | positive_sample_index = positive_samples_index_list[i]
105 | positive_num = positive_sample_index.shape[0]
106 | negtive_num = 3 * positive_num
107 | hard_examples_index = []
108 | tube_loss = tubes_loss[i, :]
109 | max_index = np.argsort(-tube_loss)
110 | negtive_count = 0
111 | for index in max_index:
112 | if index not in positive_sample_index[:, 0]:
113 | hard_examples_index += [index]
114 | negtive_count += 1
115 | if negtive_count >= negtive_num:
116 | break
117 | negtive_samples_index_list += [np.array(hard_examples_index)]
118 | return negtive_samples_index_list
119 |
120 | def ACTGetLocLoss(self, loc_preds, positive_samples_index_list, prior_tubes, ground_truth):
121 |         # ground_truth.shape = (batch_num, 1 + sequence_length * 4); ground_truth[i, 0] is the class label
122 | batch_num = loc_preds.shape[0]
123 | if self.use_gpu:
124 | # _prior_tubes = torch.from_numpy(prior_tubes).cuda()
125 | encode_loc = torch.zeros(loc_preds.shape, requires_grad=False).cuda()
126 | pos_index = torch.zeros(loc_preds.shape, dtype=torch.uint8).cuda()
127 | else:
128 | # _prior_tubes = torch.from_numpy(prior_tubes)
129 | encode_loc = torch.zeros(loc_preds.shape, requires_grad=False)
130 | pos_index = torch.zeros(loc_preds.shape, dtype=torch.uint8)
131 | for i in range(batch_num):
132 | positive_samples_index = positive_samples_index_list[i]
133 | for j in range(positive_samples_index.shape[0]):
134 | pos_index[i, positive_samples_index[j, 0], :] = 1
135 | self.EncodeTube(prior_tubes[positive_samples_index[j, 0], :], ground_truth[i, 1:],
136 | encode_loc[i, positive_samples_index[j, 0], :])
137 | loc_p = loc_preds[pos_index].view(-1, self.k_frames * 4)
138 | loc_t = encode_loc[pos_index].view(-1, self.k_frames * 4)
139 | loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum') / self.k_frames
140 | return loss_l
141 |
142 | def ACTGetConfLoss(self, conf_preds, positive_samples_index_list, negtive_samples_index_list, tubes_label):
143 | '''
144 | :param conf_preds:
145 | :param positive_samples_index_list:
146 | :param negtive_samples_index_list:
147 | :param tubes_label: (batch_num * 8396)
148 | :return:
149 | '''
150 | batch_num = conf_preds.shape[0]
151 | prior_num = conf_preds.shape[1]
152 | conf_pos_index = torch.zeros(conf_preds.shape, dtype=torch.uint8)
153 | target_pos_index = torch.zeros((batch_num, prior_num), dtype=torch.uint8)
154 | for i in range(batch_num):
155 | positive_samples_index = positive_samples_index_list[i]
156 | negtive_samples_index = negtive_samples_index_list[i]
157 | for j in range(positive_samples_index.shape[0]):
158 | conf_pos_index[i, positive_samples_index[j, 0], :] = 1
159 | target_pos_index[i, positive_samples_index[j, 0]] = 1
160 | for j in range(negtive_samples_index.shape[0]):
161 | conf_pos_index[i, negtive_samples_index[j], :] = 1
162 | target_pos_index[i, negtive_samples_index[j]] = 1
163 | tubes_label.requires_grad = False
164 | if self.use_gpu:
165 | tubes_label = tubes_label.cuda()
166 | conf_p = conf_preds[conf_pos_index].view(-1, self.num_class)
167 | target_weights = tubes_label[target_pos_index]
168 | loss_c = F.cross_entropy(conf_p, target_weights, reduction='mean')
169 | return loss_c
170 |
171 | def EncodeTube(self, prior_tube, gt_tube, encode):
172 | '''
173 | prior_tube=(xmin, ymin, xmax, ymax)*sequence_length
174 | gt_tube=(xmin, ymin, xmax, ymax)*sequence_length
175 | '''
176 | # encode = torch.zeros_like(prior_tube)
177 | p_x_min = prior_tube[0::4]
178 | p_y_min = prior_tube[1::4]
179 | p_x_max = prior_tube[2::4]
180 | p_y_max = prior_tube[3::4]
181 | prior_center_x = (p_x_min + p_x_max) / 2
182 | prior_center_y = (p_y_max + p_y_min) / 2
183 | prior_w = p_x_max - p_x_min
184 | prior_h = p_y_max - p_y_min
185 |
186 | g_x_min = gt_tube[0::4]
187 | g_y_min = gt_tube[1::4]
188 | g_x_max = gt_tube[2::4]
189 | g_y_max = gt_tube[3::4]
190 | gt_center_x = (g_x_min + g_x_max) / 2
191 | gt_center_y = (g_y_min + g_y_max) / 2
192 | gt_w = g_x_max - g_x_min
193 | gt_h = g_y_max - g_y_min
194 |
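  |         # SSD-style box encoding, per frame:
  |         #   t_x = (g_cx - p_cx) / p_w / var[0]    t_y = (g_cy - p_cy) / p_h / var[1]
  |         #   t_w = log(g_w / p_w) / var[2]         t_h = log(g_h / p_h) / var[3]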
195 | encode[0::4] = (gt_center_x - prior_center_x) / prior_w / self.variance[0]
196 | encode[1::4] = (gt_center_y - prior_center_y) / prior_h / self.variance[1]
197 | encode[2::4] = torch.log(gt_w / prior_w) / self.variance[2]
198 | encode[3::4] = torch.log(gt_h / prior_h) / self.variance[3]
199 |
200 | def forward(self, output, ground_truth):
201 | loc_preds, conf_preds = output
202 | positive_samples_index_list, tubes_label, tubes_label_index, N = self.ACTMatchTube(self.tubes_init.all_tubes, ground_truth)
203 | loss_l = self.ACTGetLocLoss(loc_preds, positive_samples_index_list, self.tubes_init.all_tubes, ground_truth)
204 | tubes_loss = self.ACTComputeConfLoss(conf_preds, tubes_label)
205 | negtive_samples_index_list = self.ACTMineHardExamples(tubes_loss, positive_samples_index_list)
206 | loss_c = self.ACTGetConfLoss(conf_preds, positive_samples_index_list, negtive_samples_index_list, tubes_label_index)
207 | loss_l /= N
208 | # loss_c /= N
209 | if torch.isinf(loss_l) or torch.isnan(loss_l) or torch.isinf(loss_c) or torch.isnan(loss_c):
210 | with open("./inf_temp_data.pkl", 'wb') as f:
211 | pickle.dump(loc_preds.cpu().detach(), f)
212 | pickle.dump(conf_preds.cpu().detach(), f)
213 | pickle.dump(ground_truth.cpu().detach(), f)
214 | print("get inf or nan data!!")
215 | exit(-1)
216 | return loss_l, loss_c
217 |
218 |
219 | if __name__ == '__main__':
220 |     with open("./debugfile.pkl", 'rb') as f:
221 |         tube1 = pickle.load(f)
222 |         tube2 = pickle.load(f)
223 |     # get_tube_overlap writes the IoUs into a preallocated table, one entry per prior tube
224 |     iou_table = torch.zeros(tube1.shape[0], dtype=torch.float32)
225 |     get_tube_overlap(tube1, tube2, iou_table)
226 |
--------------------------------------------------------------------------------
/layers/ssd.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import pickle
5 | import numpy as np
6 |
7 |
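  | # L2-normalizes each spatial position across channels and rescales it by a learnable
  | # per-channel factor (initialized to 20), i.e. the SSD/ParseNet-style L2Norm layer
  | # applied to conv4_3 below.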
8 | class scale_norm(nn.Module):
9 | def __init__(self, channels):
10 | super(scale_norm, self).__init__()
11 | temp = torch.zeros(channels)
12 | temp.fill_(20)
13 | self.scale = nn.Parameter(temp.reshape(1, channels, 1, 1))
14 |
15 | def forward(self, input):
16 | output = F.normalize(input, p=2, dim=1)
17 | return output * self.scale
18 |
19 |
20 | class SSD_NET(nn.Module):
21 | def __init__(self, dataset, frezze_init, num_classes=11, modality='rgb', k=6):
22 | super(SSD_NET, self).__init__()
23 | self.frezze_init = frezze_init
24 | self.k_frames = k
25 | self.dataset = dataset
26 | self.modality = modality
27 | if modality == 'rgb':
28 | self.rgb = True
29 | elif modality == 'flow':
30 | self.rgb = False
31 | else:
32 | print("modality value error!!")
33 | exit(-1)
34 | self.num_classes = num_classes
35 | if self.rgb:
36 | self.in_channels = 3
37 | self.layer_name = 'frame'
38 | else:
39 | self.in_channels = 15
40 | self.layer_name = 'flow'
41 |
42 | self.__setattr__('conv1_1_{}'.format(self.layer_name),
43 | nn.Conv2d(in_channels=self.in_channels, out_channels=64, kernel_size=3, stride=1, padding=1))
44 | self.__setattr__('relu1_1_{}'.format(self.layer_name), nn.ReLU())
45 | self.__setattr__('conv1_2_{}'.format(self.layer_name),
46 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1))
47 | self.__setattr__('relu1_2_{}'.format(self.layer_name), nn.ReLU())
48 | self.__setattr__('pool1_{}'.format(self.layer_name), nn.MaxPool2d(kernel_size=2, stride=2))
49 | #####################################
50 |
51 | self.__setattr__('conv2_1_{}'.format(self.layer_name),
52 | nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1))
53 | self.__setattr__('relu2_1_{}'.format(self.layer_name), nn.ReLU())
54 | self.__setattr__('conv2_2_{}'.format(self.layer_name),
55 | nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1))
56 | self.__setattr__('relu2_2_{}'.format(self.layer_name), nn.ReLU())
57 | self.__setattr__('pool2_{}'.format(self.layer_name), nn.MaxPool2d(kernel_size=2, stride=2))
58 | #####################################
59 |
60 | self.__setattr__('conv3_1_{}'.format(self.layer_name),
61 | nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1))
62 | self.__setattr__('relu3_1_{}'.format(self.layer_name), nn.ReLU())
63 | self.__setattr__('conv3_2_{}'.format(self.layer_name),
64 | nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1))
65 | self.__setattr__('relu3_2_{}'.format(self.layer_name), nn.ReLU())
66 | self.__setattr__('conv3_3_{}'.format(self.layer_name),
67 | nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1))
68 | self.__setattr__('relu3_3_{}'.format(self.layer_name), nn.ReLU())
69 | self.__setattr__('pool3_{}'.format(self.layer_name), nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True))
70 | #####################################
71 |
72 | self.__setattr__('conv4_1_{}'.format(self.layer_name),
73 | nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1))
74 | self.__setattr__('relu4_1_{}'.format(self.layer_name), nn.ReLU())
75 | self.__setattr__('conv4_2_{}'.format(self.layer_name),
76 | nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1))
77 | self.__setattr__('relu4_2_{}'.format(self.layer_name), nn.ReLU())
78 | self.__setattr__('conv4_3_{}'.format(self.layer_name),
79 | nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1))
80 | self.__setattr__('relu4_3_{}'.format(self.layer_name), nn.ReLU())
81 | self.__setattr__('conv4_3_norm_{}'.format(self.layer_name), scale_norm(512))
82 | self.__setattr__('pool4_{}'.format(self.layer_name), nn.MaxPool2d(kernel_size=2, stride=2))
83 | #####################################
84 |
85 | self.__setattr__('conv5_1_{}'.format(self.layer_name),
86 | nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1))
87 | self.__setattr__('relu5_1_{}'.format(self.layer_name), nn.ReLU())
88 | self.__setattr__('conv5_2_{}'.format(self.layer_name),
89 | nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1))
90 | self.__setattr__('relu5_2_{}'.format(self.layer_name), nn.ReLU())
91 | self.__setattr__('conv5_3_{}'.format(self.layer_name),
92 | nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1))
93 | self.__setattr__('relu5_3_{}'.format(self.layer_name), nn.ReLU())
94 | self.__setattr__('pool5_{}'.format(self.layer_name), nn.MaxPool2d(kernel_size=2, stride=1))
95 | #####################################
96 |
97 | self.__setattr__('fc_conv6_{}'.format(self.layer_name),
98 | nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1,
99 | dilation=6,
100 | padding=6))
101 | self.__setattr__('fc_relu6_{}'.format(self.layer_name), nn.ReLU())
102 | self.__setattr__('fc_conv7_{}'.format(self.layer_name),
103 | nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1, stride=1, dilation=1,
104 | padding=0))
105 | self.__setattr__('fc_relu7_{}'.format(self.layer_name), nn.ReLU())
106 | #####################################
107 |
108 | self.__setattr__('conv6_1_{}'.format(self.layer_name),
109 | nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=1, stride=1, padding=0))
110 | self.__setattr__('relu6_1_{}'.format(self.layer_name), nn.ReLU())
111 | self.__setattr__('conv6_2_{}'.format(self.layer_name),
112 | nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1))
113 | self.__setattr__('relu6_2_{}'.format(self.layer_name), nn.ReLU())
114 |
115 | self.__setattr__('conv7_1_{}'.format(self.layer_name),
116 | nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1, stride=1, padding=0))
117 | self.__setattr__('relu7_1_{}'.format(self.layer_name), nn.ReLU())
118 | self.__setattr__('conv7_2_{}'.format(self.layer_name),
119 | nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1))
120 | self.__setattr__('relu7_2_{}'.format(self.layer_name), nn.ReLU())
121 |
122 | self.__setattr__('conv8_1_{}'.format(self.layer_name),
123 | nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1, stride=1, padding=0))
124 | self.__setattr__('relu8_1_{}'.format(self.layer_name), nn.ReLU())
125 | self.__setattr__('conv8_2_{}'.format(self.layer_name),
126 | nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=0))
127 | self.__setattr__('relu8_2_{}'.format(self.layer_name), nn.ReLU())
128 |
129 | self.__setattr__('conv9_1_{}'.format(self.layer_name),
130 | nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1, stride=1, padding=0))
131 | self.__setattr__('relu9_1_{}'.format(self.layer_name), nn.ReLU())
132 | self.__setattr__('conv9_2_{}'.format(self.layer_name),
133 | nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=0))
134 | self.__setattr__('relu9_2_{}'.format(self.layer_name), nn.ReLU())
135 | #####################################
136 | self.conv4_3_norm_loc_conv = nn.Conv2d(in_channels=3072, out_channels=96, kernel_size=3, stride=1, padding=1)
137 | self.conv4_3_norm_conf_conv = nn.Conv2d(in_channels=3072, out_channels=self.num_classes*4, kernel_size=3, stride=1, padding=1)
138 |
139 | self.fc_conv7_loc_conv = nn.Conv2d(in_channels=6144, out_channels=144, kernel_size=3, stride=1, padding=1)
140 | self.fc_conv7_conf_conv = nn.Conv2d(in_channels=6144, out_channels=self.num_classes*6, kernel_size=3, stride=1, padding=1)
141 |
142 | self.conv6_loc_conv = nn.Conv2d(in_channels=3072, out_channels=144, kernel_size=3, stride=1, padding=1)
143 | self.conv6_conf_conv = nn.Conv2d(in_channels=3072, out_channels=self.num_classes*6, kernel_size=3, stride=1, padding=1)
144 |
145 | self.conv7_loc_conv = nn.Conv2d(in_channels=1536, out_channels=144, kernel_size=3, stride=1, padding=1)
146 | self.conv7_conf_conv = nn.Conv2d(in_channels=1536, out_channels=self.num_classes*6, kernel_size=3, stride=1, padding=1)
147 |
148 | self.conv8_loc_conv = nn.Conv2d(in_channels=1536, out_channels=96, kernel_size=3, stride=1, padding=1)
149 | self.conv8_conf_conv = nn.Conv2d(in_channels=1536, out_channels=self.num_classes*4, kernel_size=3, stride=1, padding=1)
150 |
151 | self.conv9_loc_conv = nn.Conv2d(in_channels=1536, out_channels=96, kernel_size=3, stride=1, padding=1)
152 | self.conv9_conf_conv = nn.Conv2d(in_channels=1536, out_channels=self.num_classes*4, kernel_size=3, stride=1, padding=1)
153 |
154 | def copy_weights(self, conv_name_caffe, conv_name_pytorch, init_dict):
155 | my_conv = self.__getattr__(conv_name_pytorch)
156 | conv_name_caffe_list = init_dict[conv_name_caffe]
157 | caffe_weight = conv_name_caffe_list[0]
158 | my_conv.weight.data.copy_(torch.from_numpy(caffe_weight))
159 | if conv_name_caffe_list.__len__() == 2:
160 | caffe_bias = init_dict[conv_name_caffe][1]
161 | else:
162 | caffe_bias = np.zeros_like(my_conv.bias.data.numpy())
163 | my_conv.bias.data.copy_(torch.from_numpy(caffe_bias))
164 |
165 | def load_trained_weights(self, pkl_file):
166 | print('load trained weights^^^^^^')
167 | f = open(pkl_file, 'rb')
168 | init_dict = pickle.load(f, encoding='iso-8859-1')
169 | f.close()
170 | if self.rgb is True:
171 | cn = ''
172 | else:
173 | cn = 'flow'
174 | for j in range(1, 3):
175 | self.copy_weights('conv{}_1_stream{}{}'.format(j, 0, cn), 'conv{}_1_{}'.format(j, self.layer_name), init_dict)
176 | self.copy_weights('conv{}_2_stream{}{}'.format(j, 0, cn), 'conv{}_2_{}'.format(j, self.layer_name), init_dict)
177 |
178 | for j in range(3, 6):
179 | self.copy_weights('conv{}_1_stream{}{}'.format(j, 0, cn), 'conv{}_1_{}'.format(j, self.layer_name), init_dict)
180 | self.copy_weights('conv{}_2_stream{}{}'.format(j, 0, cn), 'conv{}_2_{}'.format(j, self.layer_name), init_dict)
181 | self.copy_weights('conv{}_3_stream{}{}'.format(j, 0, cn), 'conv{}_3_{}'.format(j, self.layer_name), init_dict)
182 |
183 | self.copy_weights('fc6_stream{}{}'.format(0, cn), 'fc_conv6_{}'.format(self.layer_name), init_dict)
184 | self.copy_weights('fc7_stream{}{}'.format(0, cn), 'fc_conv7_{}'.format(self.layer_name), init_dict)
185 |
186 | for j in range(6, 10):
187 | self.copy_weights('conv{}_1_stream{}{}'.format(j, 0, cn), 'conv{}_1_{}'.format(j, self.layer_name), init_dict)
188 | self.copy_weights('conv{}_2_stream{}{}'.format(j, 0, cn), 'conv{}_2_{}'.format(j, self.layer_name), init_dict)
189 | self.copy_weights('conv4_3_norm_concat_mbox_conf', 'conv4_3_norm_conf_conv', init_dict)
190 | self.copy_weights('conv4_3_norm_concat_mbox_loc', 'conv4_3_norm_loc_conv', init_dict)
191 | self.copy_weights('fc7_concat_mbox_loc', 'fc_conv7_loc_conv', init_dict)
192 | self.copy_weights('fc7_concat_mbox_conf', 'fc_conv7_conf_conv', init_dict)
193 | for j in range(6, 10):
194 | self.copy_weights('conv{}_2_concat_mbox_conf'.format(j), 'conv{}_conf_conv'.format(j), init_dict)
195 | self.copy_weights('conv{}_2_concat_mbox_loc'.format(j), 'conv{}_loc_conv'.format(j), init_dict)
196 | for j in range(0, 6):
197 | my_norm = self.__getattr__('conv4_3_norm_{}'.format(self.layer_name))
198 | caffe_weight = init_dict['conv4_3_norm_stream{}{}'.format(j, cn)][0]
199 | my_norm.scale.data.copy_(torch.from_numpy(caffe_weight).reshape(1, 512, 1, 1))
200 | torch.save(self.state_dict(), './pytorch-models/{}/{}-trained-model-{}-pytorch-single.pkl'.format(self.dataset, self.dataset, self.modality))
201 | print("pytorch model saved!!!")
202 | exit(0)
203 |
204 | def load_init_weights(self, pkl_file):
205 | print("load_init_weights")
206 | init_weights = torch.load(pkl_file)
207 | if self.rgb is True:
208 | cn = ''
209 | else:
210 | cn = 'flow'
211 | for j in range(1, 3):
212 | for k in range(1, 3):
213 | conv = self.__getattr__('conv{}_{}_{}'.format(j, k, self.layer_name))
214 | conv.weight.data.copy_(init_weights['conv{}_{}_stream{}{}.weight'.format(j, k, 0, cn)])
215 | conv.bias.data.copy_(init_weights['conv{}_{}_stream{}{}.bias'.format(j, k, 0, cn)])
216 | for j in range(3, 6):
217 | for k in range(1, 4):
218 | conv = self.__getattr__('conv{}_{}_{}'.format(j, k, self.layer_name))
219 | conv.weight.data.copy_(init_weights['conv{}_{}_stream{}{}.weight'.format(j, k, 0, cn)])
220 | conv.bias.data.copy_(init_weights['conv{}_{}_stream{}{}.bias'.format(j, k, 0, cn)])
221 | for j in range(6, 8):
222 | conv = self.__getattr__('fc_conv{}_{}'.format(j, self.layer_name))
223 | conv.weight.data.copy_(init_weights['fc{}_stream{}.weight'.format(j, 0)])
224 | conv.bias.data.copy_(init_weights['fc{}_stream{}.bias'.format(j, 0)])
225 | print("load ok!, save it!")
226 | torch.save(self.state_dict(), '/home/qzw/code/my-act-detector/pytorch-models/{}/{}-init-model-{}-pytorch-single.pkl'.format(self.dataset, self.dataset, self.modality))
227 | print("pytorch model saved!!!")
228 | exit(0)
229 |
230 | def forward(self, input):
231 | conv4_3_list = []
232 | fc_conv7_list = []
233 | conv6_list = []
234 | conv7_list = []
235 | conv8_list = []
236 | conv9_list = []
237 |
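  |         # Run the shared VGG/SSD trunk over each of the K input frames independently and
  |         # collect the six SSD source feature maps; they are concatenated channel-wise below
  |         # before the loc/conf heads predict whole tubes (K boxes per anchor) at once.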
238 | for i in range(self.k_frames):
239 | output = input[:, self.in_channels * i:self.in_channels * (i + 1), :, :]
240 | output = self.__getattr__('conv1_1_{}'.format(self.layer_name))(output)
241 | output = self.__getattr__('relu1_1_{}'.format(self.layer_name))(output)
242 | output = self.__getattr__('conv1_2_{}'.format(self.layer_name))(output)
243 | output = self.__getattr__('relu1_2_{}'.format(self.layer_name))(output)
244 | output = self.__getattr__('pool1_{}'.format(self.layer_name))(output)
245 |
246 | output = self.__getattr__('conv2_1_{}'.format(self.layer_name))(output)
247 | output = self.__getattr__('relu2_1_{}'.format(self.layer_name))(output)
248 | output = self.__getattr__('conv2_2_{}'.format(self.layer_name))(output)
249 | output = self.__getattr__('relu2_2_{}'.format(self.layer_name))(output)
250 | output = self.__getattr__('pool2_{}'.format(self.layer_name))(output)
251 |
252 | output = self.__getattr__('conv3_1_{}'.format(self.layer_name))(output)
253 | output = self.__getattr__('relu3_1_{}'.format(self.layer_name))(output)
254 | output = self.__getattr__('conv3_2_{}'.format(self.layer_name))(output)
255 | output = self.__getattr__('relu3_2_{}'.format(self.layer_name))(output)
256 | output = self.__getattr__('conv3_3_{}'.format(self.layer_name))(output)
257 | output = self.__getattr__('relu3_3_{}'.format(self.layer_name))(output)
258 | output = self.__getattr__('pool3_{}'.format(self.layer_name))(output)
259 |
260 | output = self.__getattr__('conv4_1_{}'.format(self.layer_name))(output)
261 | output = self.__getattr__('relu4_1_{}'.format(self.layer_name))(output)
262 | output = self.__getattr__('conv4_2_{}'.format(self.layer_name))(output)
263 | output = self.__getattr__('relu4_2_{}'.format(self.layer_name))(output)
264 | output = self.__getattr__('conv4_3_{}'.format(self.layer_name))(output)
265 | output = self.__getattr__('relu4_3_{}'.format(self.layer_name))(output)
266 | conv4_3_list.append(self.__getattr__('conv4_3_norm_{}'.format(self.layer_name))(output))
267 |
268 | output = self.__getattr__('pool4_{}'.format(self.layer_name))(output)
269 |
270 | output = self.__getattr__('conv5_1_{}'.format(self.layer_name))(output)
271 | output = self.__getattr__('relu5_1_{}'.format(self.layer_name))(output)
272 | output = self.__getattr__('conv5_2_{}'.format(self.layer_name))(output)
273 | output = self.__getattr__('relu5_2_{}'.format(self.layer_name))(output)
274 | output = self.__getattr__('conv5_3_{}'.format(self.layer_name))(output)
275 | output = self.__getattr__('relu5_3_{}'.format(self.layer_name))(output)
276 | output = self.__getattr__('pool5_{}'.format(self.layer_name))(output)
277 |
278 | conv6 = self.__getattr__('fc_conv6_{}'.format(self.layer_name))
279 | conv6.dilation = (6 ** (i + 1), 6 ** (i + 1))
280 | conv6.padding = (6 ** (i + 1), 6 ** (i + 1))
281 | output = conv6(output)
282 | output = self.__getattr__('fc_relu6_{}'.format(self.layer_name))(output)
283 | output = self.__getattr__('fc_conv7_{}'.format(self.layer_name))(output)
284 | output = self.__getattr__('fc_relu7_{}'.format(self.layer_name))(output)
285 | fc_conv7_list.append(output)
286 |
287 | output = self.__getattr__('conv6_1_{}'.format(self.layer_name))(output)
288 | output = self.__getattr__('relu6_1_{}'.format(self.layer_name))(output)
289 | output = self.__getattr__('conv6_2_{}'.format(self.layer_name))(output)
290 | output = self.__getattr__('relu6_2_{}'.format(self.layer_name))(output)
291 | conv6_list.append(output)
292 |
293 | output = self.__getattr__('conv7_1_{}'.format(self.layer_name))(output)
294 | output = self.__getattr__('relu7_1_{}'.format(self.layer_name))(output)
295 | output = self.__getattr__('conv7_2_{}'.format(self.layer_name))(output)
296 | output = self.__getattr__('relu7_2_{}'.format(self.layer_name))(output)
297 | conv7_list.append(output)
298 |
299 | output = self.__getattr__('conv8_1_{}'.format(self.layer_name))(output)
300 | output = self.__getattr__('relu8_1_{}'.format(self.layer_name))(output)
301 | output = self.__getattr__('conv8_2_{}'.format(self.layer_name))(output)
302 | output = self.__getattr__('relu8_2_{}'.format(self.layer_name))(output)
303 | conv8_list.append(output)
304 |
305 | output = self.__getattr__('conv9_1_{}'.format(self.layer_name))(output)
306 | output = self.__getattr__('relu9_1_{}'.format(self.layer_name))(output)
307 | output = self.__getattr__('conv9_2_{}'.format(self.layer_name))(output)
308 | output = self.__getattr__('relu9_2_{}'.format(self.layer_name))(output)
309 | conv9_list.append(output)
310 | conv4_3_fm = torch.cat(conv4_3_list, dim=1).contiguous()
311 | fc_conv7_fm = torch.cat(fc_conv7_list, dim=1).contiguous()
312 | conv6_fm = torch.cat(conv6_list, dim=1).contiguous()
313 | conv7_fm = torch.cat(conv7_list, dim=1).contiguous()
314 | conv8_fm = torch.cat(conv8_list, dim=1).contiguous()
315 | conv9_fm = torch.cat(conv9_list, dim=1).contiguous()
316 |
317 | conv4_3_norm_localization = self.conv4_3_norm_loc_conv(conv4_3_fm)
318 | conv4_3_norm_localization = conv4_3_norm_localization.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.k_frames*4)
319 | conv4_3_norm_confidence = self.conv4_3_norm_conf_conv(conv4_3_fm)
320 | conv4_3_norm_confidence = conv4_3_norm_confidence.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.num_classes)
321 | # conv4_3_norm_confidence = F.softmax(conv4_3_norm_confidence, dim=-1)
322 |
323 | fc_conv7_localization = self.fc_conv7_loc_conv(fc_conv7_fm)
324 | fc_conv7_localization = fc_conv7_localization.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.k_frames*4)
325 | fc_conv7_confidence = self.fc_conv7_conf_conv(fc_conv7_fm)
326 | fc_conv7_confidence = fc_conv7_confidence.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.num_classes)
327 | # fc_conv7_confidence = F.softmax(fc_conv7_confidence, dim=-1)
328 |
329 | conv6_localization = self.conv6_loc_conv(conv6_fm)
330 | conv6_localization = conv6_localization.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.k_frames*4)
331 | conv6_confidence = self.conv6_conf_conv(conv6_fm)
332 | conv6_confidence = conv6_confidence.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.num_classes)
333 | # conv6_confidence = F.softmax(conv6_confidence, dim=-1)
334 |
335 | conv7_localization = self.conv7_loc_conv(conv7_fm)
336 | conv7_localization = conv7_localization.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.k_frames*4)
337 | conv7_confidence = self.conv7_conf_conv(conv7_fm)
338 | conv7_confidence = conv7_confidence.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.num_classes)
339 | # conv7_confidence = F.softmax(conv7_confidence, dim=-1)
340 |
341 | conv8_localization = self.conv8_loc_conv(conv8_fm)
342 | conv8_localization = conv8_localization.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.k_frames*4)
343 | conv8_confidence = self.conv8_conf_conv(conv8_fm)
344 | conv8_confidence = conv8_confidence.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.num_classes)
345 | # conv8_confidence = F.softmax(conv8_confidence, dim=-1)
346 |
347 | conv9_localization = self.conv9_loc_conv(conv9_fm)
348 | conv9_localization = conv9_localization.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.k_frames*4)
349 | conv9_confidence = self.conv9_conf_conv(conv9_fm)
350 | conv9_confidence = conv9_confidence.permute(0, 2, 3, 1).contiguous().view(input.shape[0], -1, self.num_classes)
351 | # conv9_confidence = F.softmax(conv9_confidence, dim=-1)
352 |
353 | loc_preds = torch.cat([conv4_3_norm_localization, fc_conv7_localization, conv6_localization, conv7_localization,
354 | conv8_localization, conv9_localization], dim=1)
355 | conf_preds = torch.cat([conv4_3_norm_confidence, fc_conv7_confidence, conv6_confidence, conv7_confidence,
356 | conv8_confidence, conv9_confidence], dim=1)
357 | return loc_preds, conf_preds
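
# Back-of-envelope sanity check (annotation, not executed by the network): with
# the feature-map sizes and per-cell prior counts that utils/prior_tubes.py
# assumes for a 300x300 input,
#
#     feature_maps = [(38, 4), (18, 6), (9, 6), (5, 6), (3, 4), (1, 4)]  # (side, priors per cell)
#     sum(s * s * p for s, p in feature_maps)  # = 8396
#
# so loc_preds has shape (B, 8396, k_frames*4) and conf_preds (B, 8396, num_classes).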
358 |
359 | def get_feature_map(self, input, conv6_dilation):
360 | conv6 = self.__getattr__('fc_conv6_{}'.format(self.layer_name))
361 | conv6.dilation = conv6_dilation
362 | conv6.padding = conv6_dilation
363 | output = self.__getattr__('conv1_1_{}'.format(self.layer_name))(input)
364 | output = self.__getattr__('relu1_1_{}'.format(self.layer_name))(output)
365 | output = self.__getattr__('conv1_2_{}'.format(self.layer_name))(output)
366 | output = self.__getattr__('relu1_2_{}'.format(self.layer_name))(output)
367 | output = self.__getattr__('pool1_{}'.format(self.layer_name))(output)
368 |
369 | output = self.__getattr__('conv2_1_{}'.format(self.layer_name))(output)
370 | output = self.__getattr__('relu2_1_{}'.format(self.layer_name))(output)
371 | output = self.__getattr__('conv2_2_{}'.format(self.layer_name))(output)
372 | output = self.__getattr__('relu2_2_{}'.format(self.layer_name))(output)
373 | output = self.__getattr__('pool2_{}'.format(self.layer_name))(output)
374 |
375 | output = self.__getattr__('conv3_1_{}'.format(self.layer_name))(output)
376 | output = self.__getattr__('relu3_1_{}'.format(self.layer_name))(output)
377 | output = self.__getattr__('conv3_2_{}'.format(self.layer_name))(output)
378 | output = self.__getattr__('relu3_2_{}'.format(self.layer_name))(output)
379 | output = self.__getattr__('conv3_3_{}'.format(self.layer_name))(output)
380 | output = self.__getattr__('relu3_3_{}'.format(self.layer_name))(output)
381 | output = self.__getattr__('pool3_{}'.format(self.layer_name))(output)
382 |
383 | output = self.__getattr__('conv4_1_{}'.format(self.layer_name))(output)
384 | output = self.__getattr__('relu4_1_{}'.format(self.layer_name))(output)
385 | output = self.__getattr__('conv4_2_{}'.format(self.layer_name))(output)
386 | output = self.__getattr__('relu4_2_{}'.format(self.layer_name))(output)
387 | output = self.__getattr__('conv4_3_{}'.format(self.layer_name))(output)
388 | output = self.__getattr__('relu4_3_{}'.format(self.layer_name))(output)
389 | conv4_3 = self.__getattr__('conv4_3_norm_{}'.format(self.layer_name))(output)
390 |
391 | output = self.__getattr__('pool4_{}'.format(self.layer_name))(output)
392 |
393 | output = self.__getattr__('conv5_1_{}'.format(self.layer_name))(output)
394 | output = self.__getattr__('relu5_1_{}'.format(self.layer_name))(output)
395 | output = self.__getattr__('conv5_2_{}'.format(self.layer_name))(output)
396 | output = self.__getattr__('relu5_2_{}'.format(self.layer_name))(output)
397 | output = self.__getattr__('conv5_3_{}'.format(self.layer_name))(output)
398 | output = self.__getattr__('relu5_3_{}'.format(self.layer_name))(output)
399 | output = self.__getattr__('pool5_{}'.format(self.layer_name))(output)
400 |
401 | output = self.__getattr__('fc_conv6_{}'.format(self.layer_name))(output)
402 | output = self.__getattr__('fc_relu6_{}'.format(self.layer_name))(output)
403 | output = self.__getattr__('fc_conv7_{}'.format(self.layer_name))(output)
404 | output = self.__getattr__('fc_relu7_{}'.format(self.layer_name))(output)
405 | fc_conv7 = output
406 |
407 | output = self.__getattr__('conv6_1_{}'.format(self.layer_name))(output)
408 | output = self.__getattr__('relu6_1_{}'.format(self.layer_name))(output)
409 | output = self.__getattr__('conv6_2_{}'.format(self.layer_name))(output)
410 | output = self.__getattr__('relu6_2_{}'.format(self.layer_name))(output)
411 | conv6 = output
412 |
413 | output = self.__getattr__('conv7_1_{}'.format(self.layer_name))(output)
414 | output = self.__getattr__('relu7_1_{}'.format(self.layer_name))(output)
415 | output = self.__getattr__('conv7_2_{}'.format(self.layer_name))(output)
416 | output = self.__getattr__('relu7_2_{}'.format(self.layer_name))(output)
417 | conv7 = output
418 |
419 | output = self.__getattr__('conv8_1_{}'.format(self.layer_name))(output)
420 | output = self.__getattr__('relu8_1_{}'.format(self.layer_name))(output)
421 | output = self.__getattr__('conv8_2_{}'.format(self.layer_name))(output)
422 | output = self.__getattr__('relu8_2_{}'.format(self.layer_name))(output)
423 | conv8 = output
424 |
425 | output = self.__getattr__('conv9_1_{}'.format(self.layer_name))(output)
426 | output = self.__getattr__('relu9_1_{}'.format(self.layer_name))(output)
427 | output = self.__getattr__('conv9_2_{}'.format(self.layer_name))(output)
428 | output = self.__getattr__('relu9_2_{}'.format(self.layer_name))(output)
429 | conv9 = output
430 | return conv4_3, fc_conv7, conv6, conv7, conv8, conv9
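
# Usage sketch (illustrative; assumes `net` is an already-constructed SSD_NET
# in eval mode). test.py calls this with dilation (6, 6) for a window's first
# frame and (36, 36) for its look-ahead frames:
#
#     frame = torch.randn(1, 3, 300, 300)  # one frame at the SSD300 input size
#     with torch.no_grad():
#         conv4_3, fc_conv7, conv6, conv7, conv8, conv9 = net.get_feature_map(frame, conv6_dilation=(6, 6))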
431 |
432 | def get_loc_conf(self, conv4_3_data, fc_conv7_data, conv6_data, conv7_data, conv8_data, conv9_data):
433 | conv4_3_norm_localization = self.conv4_3_norm_loc_conv(conv4_3_data.cuda())
434 | conv4_3_norm_localization = conv4_3_norm_localization.permute(0, 2, 3, 1).contiguous().view(
435 | conv4_3_data.shape[0],
436 | -1,
437 | self.k_frames * 4)
438 | conv4_3_norm_confidence = self.conv4_3_norm_conf_conv(conv4_3_data.cuda())
439 | conv4_3_norm_confidence = conv4_3_norm_confidence.permute(0, 2, 3, 1).contiguous().view(conv4_3_data.shape[0],
440 | -1,
441 | self.num_classes)
442 | fc_conv7_localization = self.fc_conv7_loc_conv(fc_conv7_data.cuda())
443 | fc_conv7_localization = fc_conv7_localization.permute(0, 2, 3, 1).contiguous().view(fc_conv7_data.shape[0], -1,
444 | self.k_frames * 4)
445 |
446 | fc_conv7_confidence = self.fc_conv7_conf_conv(fc_conv7_data.cuda())
447 | fc_conv7_confidence = fc_conv7_confidence.permute(0, 2, 3, 1).contiguous().view(fc_conv7_data.shape[0], -1,
448 | self.num_classes)
449 |
450 | conv6_localization = self.conv6_loc_conv(conv6_data.cuda())
451 | conv6_localization = conv6_localization.permute(0, 2, 3, 1).contiguous().view(conv6_data.shape[0], -1,
452 | self.k_frames * 4)
453 | conv6_confidence = self.conv6_conf_conv(conv6_data.cuda())
454 | conv6_confidence = conv6_confidence.permute(0, 2, 3, 1).contiguous().view(conv6_data.shape[0], -1,
455 | self.num_classes)
456 |
457 | conv7_localization = self.conv7_loc_conv(conv7_data.cuda())
458 | conv7_localization = conv7_localization.permute(0, 2, 3, 1).contiguous().view(conv7_data.shape[0], -1,
459 | self.k_frames * 4)
460 | conv7_confidence = self.conv7_conf_conv(conv7_data.cuda())
461 | conv7_confidence = conv7_confidence.permute(0, 2, 3, 1).contiguous().view(conv7_data.shape[0], -1,
462 | self.num_classes)
463 |
464 | conv8_localization = self.conv8_loc_conv(conv8_data.cuda())
465 | conv8_localization = conv8_localization.permute(0, 2, 3, 1).contiguous().view(conv8_data.shape[0], -1,
466 | self.k_frames * 4)
467 | conv8_confidence = self.conv8_conf_conv(conv8_data.cuda())
468 | conv8_confidence = conv8_confidence.permute(0, 2, 3, 1).contiguous().view(conv8_data.shape[0], -1,
469 | self.num_classes)
470 |
471 | conv9_localization = self.conv9_loc_conv(conv9_data.cuda())
472 | conv9_localization = conv9_localization.permute(0, 2, 3, 1).contiguous().view(conv9_data.shape[0], -1,
473 | self.k_frames * 4)
474 | conv9_confidence = self.conv9_conf_conv(conv9_data.cuda())
475 | conv9_confidence = conv9_confidence.permute(0, 2, 3, 1).contiguous().view(conv9_data.shape[0], -1,
476 | self.num_classes)
477 | loc_preds = torch.cat(
478 | [conv4_3_norm_localization, fc_conv7_localization, conv6_localization, conv7_localization,
479 | conv8_localization, conv9_localization], dim=1)
480 | conf_preds = torch.cat([conv4_3_norm_confidence, fc_conv7_confidence, conv6_confidence, conv7_confidence,
481 | conv8_confidence, conv9_confidence], dim=1)
482 | return loc_preds, conf_preds
483 |
484 | def train(self, mode=True):
485 | super(SSD_NET, self).train(mode)
486 | for m in self.modules():
487 | ps = list(m.parameters())
488 | for p in ps:
489 | p.requires_grad = True
490 | self.conv4_3_norm_conf_conv.bias.data.fill_(0)
491 | self.conv4_3_norm_conf_conv.bias.requires_grad = False
492 | self.conv4_3_norm_loc_conv.bias.data.fill_(0)
493 | self.conv4_3_norm_loc_conv.bias.requires_grad = False
494 | if self.frezze_init:
495 | self.frezze_init_func(freeze_norm_layer=False)
496 |
497 | def eval(self):
498 | super(SSD_NET, self).eval()
499 | for m in self.modules():
500 | ps = list(m.parameters())
501 | for p in ps:
502 | p.requires_grad = False
503 |
504 | def get_optim_policies(self):
505 | parameters_list = []
506 | for m in self.modules():
507 | if isinstance(m, torch.nn.Conv2d):
508 | ps = list(m.parameters())
509 | if ps[0].requires_grad:
510 | parameters_list.append(ps[0])
511 | if ps[1].requires_grad:
512 | parameters_list.append(ps[1])
513 | elif isinstance(m, scale_norm):
514 | ps = list(m.parameters())
515 | if ps[0].requires_grad:
516 | parameters_list.append(ps[0])
517 | return parameters_list
518 |
519 | def get_loc_conf_optim_policies(self):
520 | parameters_list = []
521 | parameters_list.append(self.conv4_3_norm_loc_conv.weight)
522 | parameters_list.append(self.conv4_3_norm_conf_conv.weight)
523 |
524 | parameters_list.append(self.fc_conv7_loc_conv.weight)
525 | parameters_list.append(self.fc_conv7_loc_conv.bias)
526 | parameters_list.append(self.fc_conv7_conf_conv.weight)
527 | parameters_list.append(self.fc_conv7_conf_conv.bias)
528 |
529 | parameters_list.append(self.conv6_loc_conv.weight)
530 | parameters_list.append(self.conv6_loc_conv.bias)
531 | parameters_list.append(self.conv6_conf_conv.weight)
532 | parameters_list.append(self.conv6_conf_conv.bias)
533 |
534 | parameters_list.append(self.conv7_loc_conv.weight)
535 | parameters_list.append(self.conv7_loc_conv.bias)
536 | parameters_list.append(self.conv7_conf_conv.weight)
537 | parameters_list.append(self.conv7_conf_conv.bias)
538 |
539 | parameters_list.append(self.conv8_loc_conv.weight)
540 | parameters_list.append(self.conv8_loc_conv.bias)
541 | parameters_list.append(self.conv8_conf_conv.weight)
542 | parameters_list.append(self.conv8_conf_conv.bias)
543 |
544 | parameters_list.append(self.conv9_loc_conv.weight)
545 | parameters_list.append(self.conv9_loc_conv.bias)
546 | parameters_list.append(self.conv9_conf_conv.weight)
547 | parameters_list.append(self.conv9_conf_conv.bias)
548 | return parameters_list
549 |
550 | def get_vgg_optim_policies(self):
551 | parameters_list = []
552 | conv = self.__getattr__('conv1_1_{}'.format(self.layer_name))
553 | parameters_list.append(conv.weight)
554 | parameters_list.append(conv.bias)
555 | conv = self.__getattr__('conv1_2_{}'.format(self.layer_name))
556 | parameters_list.append(conv.weight)
557 | parameters_list.append(conv.bias)
558 | conv = self.__getattr__('conv2_1_{}'.format(self.layer_name))
559 | parameters_list.append(conv.weight)
560 | parameters_list.append(conv.bias)
561 | conv = self.__getattr__('conv2_2_{}'.format(self.layer_name))
562 | parameters_list.append(conv.weight)
563 | parameters_list.append(conv.bias)
564 | conv = self.__getattr__('conv3_1_{}'.format(self.layer_name))
565 | parameters_list.append(conv.weight)
566 | parameters_list.append(conv.bias)
567 | conv = self.__getattr__('conv3_2_{}'.format(self.layer_name))
568 | parameters_list.append(conv.weight)
569 | parameters_list.append(conv.bias)
570 | conv = self.__getattr__('conv3_3_{}'.format(self.layer_name))
571 | parameters_list.append(conv.weight)
572 | parameters_list.append(conv.bias)
573 | conv = self.__getattr__('conv4_1_{}'.format(self.layer_name))
574 | parameters_list.append(conv.weight)
575 | parameters_list.append(conv.bias)
576 | conv = self.__getattr__('conv4_2_{}'.format(self.layer_name))
577 | parameters_list.append(conv.weight)
578 | parameters_list.append(conv.bias)
579 | conv = self.__getattr__('conv4_3_{}'.format(self.layer_name))
580 | parameters_list.append(conv.weight)
581 | parameters_list.append(conv.bias)
582 | conv = self.__getattr__('conv4_3_norm_{}'.format(self.layer_name))
583 | parameters_list.append(conv.weight)
584 | parameters_list.append(conv.bias)
585 | conv = self.__getattr__('conv5_1_{}'.format(self.layer_name))
586 | parameters_list.append(conv.weight)
587 | parameters_list.append(conv.bias)
588 | conv = self.__getattr__('conv5_2_{}'.format(self.layer_name))
589 | parameters_list.append(conv.weight)
590 | parameters_list.append(conv.bias)
591 | conv = self.__getattr__('conv5_3_{}'.format(self.layer_name))
592 | parameters_list.append(conv.weight)
593 | parameters_list.append(conv.bias)
594 | return parameters_list
595 |
596 | def get_ssd_optim_policies(self):
597 | parameters_list = []
598 | conv = self.__getattr__('fc_conv6_{}'.format(self.layer_name))
599 | parameters_list.append(conv.weight)
600 | parameters_list.append(conv.bias)
601 | conv = self.__getattr__('fc_conv7_{}'.format(self.layer_name))
602 | parameters_list.append(conv.weight)
603 | parameters_list.append(conv.bias)
604 | conv = self.__getattr__('conv6_1_{}'.format(self.layer_name))
605 | parameters_list.append(conv.weight)
606 | parameters_list.append(conv.bias)
607 | conv = self.__getattr__('conv6_2_{}'.format(self.layer_name))
608 | parameters_list.append(conv.weight)
609 | parameters_list.append(conv.bias)
610 | conv = self.__getattr__('conv7_1_{}'.format(self.layer_name))
611 | parameters_list.append(conv.weight)
612 | parameters_list.append(conv.bias)
613 | conv = self.__getattr__('conv7_2_{}'.format(self.layer_name))
614 | parameters_list.append(conv.weight)
615 | parameters_list.append(conv.bias)
616 | conv = self.__getattr__('conv8_1_{}'.format(self.layer_name))
617 | parameters_list.append(conv.weight)
618 | parameters_list.append(conv.bias)
619 | conv = self.__getattr__('conv8_2_{}'.format(self.layer_name))
620 | parameters_list.append(conv.weight)
621 | parameters_list.append(conv.bias)
622 | conv = self.__getattr__('conv9_1_{}'.format(self.layer_name))
623 | parameters_list.append(conv.weight)
624 | parameters_list.append(conv.bias)
625 | conv = self.__getattr__('conv9_2_{}'.format(self.layer_name))
626 | parameters_list.append(conv.weight)
627 | parameters_list.append(conv.bias)
628 | return parameters_list
629 |
630 | def frezze_init_func(self, freeze_norm_layer=False):
631 | m = self.__getattr__('conv1_1_{}'.format(self.layer_name))
632 | m.eval()
633 | m.weight.requires_grad = False
634 | m.bias.requires_grad = False
635 | m = self.__getattr__('conv1_2_{}'.format(self.layer_name))
636 | m.eval()
637 | m.weight.requires_grad = False
638 | m.bias.requires_grad = False
639 | m = self.__getattr__('conv2_1_{}'.format(self.layer_name))
640 | m.eval()
641 | m.weight.requires_grad = False
642 | m.bias.requires_grad = False
643 | m = self.__getattr__('conv2_2_{}'.format(self.layer_name))
644 | m.eval()
645 | m.weight.requires_grad = False
646 | m.bias.requires_grad = False
647 | m = self.__getattr__('conv3_1_{}'.format(self.layer_name))
648 | m.eval()
649 | m.weight.requires_grad = False
650 | m.bias.requires_grad = False
651 | m = self.__getattr__('conv3_2_{}'.format(self.layer_name))
652 | m.eval()
653 | m.weight.requires_grad = False
654 | m.bias.requires_grad = False
655 | m = self.__getattr__('conv3_3_{}'.format(self.layer_name))
656 | m.eval()
657 | m.weight.requires_grad = False
658 | m.bias.requires_grad = False
659 | m = self.__getattr__('conv4_1_{}'.format(self.layer_name))
660 | m.eval()
661 | m.weight.requires_grad = False
662 | m.bias.requires_grad = False
663 | m = self.__getattr__('conv4_2_{}'.format(self.layer_name))
664 | m.eval()
665 | m.weight.requires_grad = False
666 | m.bias.requires_grad = False
667 | m = self.__getattr__('conv4_3_{}'.format(self.layer_name))
668 | m.eval()
669 | m.weight.requires_grad = False
670 | m.bias.requires_grad = False
671 |
672 | m = self.__getattr__('conv5_1_{}'.format(self.layer_name))
673 | m.eval()
674 | m.weight.requires_grad = False
675 | m.bias.requires_grad = False
676 | m = self.__getattr__('conv5_2_{}'.format(self.layer_name))
677 | m.eval()
678 | m.weight.requires_grad = False
679 | m.bias.requires_grad = False
680 | m = self.__getattr__('conv5_3_{}'.format(self.layer_name))
681 | m.eval()
682 | m.weight.requires_grad = False
683 | m.bias.requires_grad = False
684 | m = self.__getattr__('fc_conv6_{}'.format(self.layer_name))
685 | m.eval()
686 | m.weight.requires_grad = False
687 | m.bias.requires_grad = False
688 | m = self.__getattr__('fc_conv7_{}'.format(self.layer_name))
689 | m.eval()
690 | m.weight.requires_grad = False
691 | m.bias.requires_grad = False
692 | if freeze_norm_layer:
693 | m = self.__getattr__('conv4_3_norm_{}'.format(self.layer_name))
694 | m.eval()
695 | m.scale.requires_grad = False
696 |
697 | def freeze_ssd(self, freeze_norm_layer):
698 | self.frezze_vgg(freeze_norm_layer)
699 | m = self.__getattr__('fc_conv6_{}'.format(self.layer_name))
700 | m.eval()
701 | m.weight.requires_grad = False
702 | m.bias.requires_grad = False
703 | m = self.__getattr__('fc_conv7_{}'.format(self.layer_name))
704 | m.eval()
705 | m.weight.requires_grad = False
706 | m.bias.requires_grad = False
707 | m = self.__getattr__('conv6_1_{}'.format(self.layer_name))
708 | m.eval()
709 | m.weight.requires_grad = False
710 | m.bias.requires_grad = False
711 | m = self.__getattr__('conv6_2_{}'.format(self.layer_name))
712 | m.eval()
713 | m.weight.requires_grad = False
714 | m.bias.requires_grad = False
715 |
716 | m = self.__getattr__('conv7_1_{}'.format(self.layer_name))
717 | m.eval()
718 | m.weight.requires_grad = False
719 | m.bias.requires_grad = False
720 | m = self.__getattr__('conv7_2_{}'.format(self.layer_name))
721 | m.eval()
722 | m.weight.requires_grad = False
723 | m.bias.requires_grad = False
724 |
725 | m = self.__getattr__('conv8_1_{}'.format(self.layer_name))
726 | m.eval()
727 | m.weight.requires_grad = False
728 | m.bias.requires_grad = False
729 | m = self.__getattr__('conv8_2_{}'.format(self.layer_name))
730 | m.eval()
731 | m.weight.requires_grad = False
732 | m.bias.requires_grad = False
733 |
734 | m = self.__getattr__('conv9_1_{}'.format(self.layer_name))
735 | m.eval()
736 | m.weight.requires_grad = False
737 | m.bias.requires_grad = False
738 | m = self.__getattr__('conv9_2_{}'.format(self.layer_name))
739 | m.eval()
740 | m.weight.requires_grad = False
741 | m.bias.requires_grad = False
742 |
743 |
744 | def vgg(cfg, i, batch_norm=False):
745 | layers = []
746 | in_channels = i
747 | for v in cfg:
748 | if v == 'M':
749 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
750 | elif v == 'C':
751 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
752 | else:
753 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
754 | if batch_norm:
755 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
756 | else:
757 | layers += [conv2d, nn.ReLU(inplace=True)]
758 | in_channels = v
759 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
760 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
761 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
762 | layers += [pool5, conv6,
763 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)]
764 | return layers
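
# Example call (an assumption, since no caller of vgg() appears in this repo):
# the standard SSD-300 VGG-16 base configuration, where 'M' is a max-pool and
# 'C' a ceil-mode max-pool.
def _vgg300_example():
    base_cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C',
                512, 512, 512, 'M', 512, 512, 512]
    return vgg(base_cfg, i=3)  # i=3 input channels for RGB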
765 |
766 | if __name__ == '__main__':
767 | # with open('/media/main/_sdc/qzw/ACT-Detector/my-act-detector/caffe-models/initialization_VGG_ILSVRC16_K6_RGB.pkl', 'rb') as f:
768 | # initialization_dict = pickle.load(f)
769 | # f.close()
770 | # rgb_net = SSD_NET(num_classes=25, rgb=True)
771 |
772 | # rgb_net.load_init_weights(
773 | # './caffe-models/UCF101v2/RGB-UCF101v2-numpy.pkl')
774 | # torch.save(rgb_net.state_dict(), 'RGB-UCF101v2-pytorch.pkl')
775 | # print("RGB OK!!!")
776 |
777 |     flow_net = SSD_NET(num_classes=25, rgb=False)  # NOTE: older constructor signature; train.py/test.py build SSD_NET with dataset=/modality= kwargs
778 | flow_net.load_init_weights(
779 | './caffe-models/UCF101v2/FLOW5-UCF101v2-numpy.pkl')
780 | torch.save(flow_net.state_dict(), 'FLOW5-UCF101v2-pytorch.pkl')
781 | print("FLOW5 OK!!!")
782 |
783 |
784 |
785 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from utils import prior_tubes
4 | from utils import map_eval
5 | from data import dataset
6 | import multiprocessing
7 | import time
8 | import torch.nn.functional as F
9 | import pickle
10 |
11 |
12 | def data_handle_and_save_process(all_frame_boxes_dict, video_index, conf_preds, decode_video_tubes, num_class,
13 | sequence_length, height, width):
14 | all_frame_boxes_list = []
15 | start_frame = 0
16 | frame_boxes = {}
17 | for batch in range(conf_preds.shape[0]):
18 | start_frame += 1
19 | nms_tubes_blist, nms_scores, nms_label_list = prior_tubes.apply_nms(conf_preds[batch, :],
20 | decode_video_tubes[batch, :],
21 | nms_threshold=0.45,
22 | num_class=num_class)
23 | if nms_scores.__len__() > 0:
24 | tt1 = (torch.Tensor(nms_label_list) - 1).view(-1, 1).numpy()
25 | tt2 = torch.Tensor(nms_scores).view(-1, 1).numpy()
26 | tt3 = np.vstack([tt.view(1, -1).cpu().numpy() for tt in nms_tubes_blist])
27 | best_tube = np.hstack([tt1, tt2, tt3])
28 | else:
29 | best_tube = np.array([])
30 | for m in range(best_tube.shape[0]):
31 | for n in range(sequence_length):
32 | if (n + start_frame) not in frame_boxes:
33 | frame_boxes[n + start_frame] = []
34 | frame_boxes[n + start_frame] += [
35 | best_tube[m, np.array([0, 1, 2 + 4 * n, 3 + 4 * n, 4 + 4 * n, 5 + 4 * n])]]
36 | # print("video:{}/{}ok!".format(video_index, eval_dataset.__len__()), "\ttime:", time.time() - time_start,
37 | # "frame:{}".format(nframes))
38 | for frame_index in frame_boxes:
39 |         frame_label = {}  # all boxes on the current frame, grouped by label
40 | for bb in frame_boxes[frame_index]:
41 | if bb[0] not in frame_label:
42 | frame_label[bb[0]] = []
43 | frame_label[bb[0]] += [bb[1:]]
44 | for tt in frame_label:
45 | idx = map_eval.nms_class(frame_label[tt], nms_threshold=0.3)
46 | for id in idx:
47 | all_frame_boxes_list += [np.hstack([np.array([video_index, frame_index, tt]),
48 | frame_label[tt][id] * np.array([1, width, height, width, height])])]
49 | all_frame_boxes_dict[video_index] = all_frame_boxes_list
50 | print("video_index:{} OK!!".format(video_index))
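
# Layout note: each detection row appended above has the form
#     [video_index, frame_index, label, score, x1*width, y1*height, x2*width, y2*height]
# A sketch with made-up numbers for a 320x240 video (not called anywhere):
def _detection_row_example():
    return np.hstack([np.array([3, 17, 1.0]),                  # video_index, frame_index, label
                      np.array([0.92, 0.1, 0.2, 0.5, 0.9])     # score, x1, y1, x2, y2 in [0, 1]
                      * np.array([1, 320, 240, 320, 240])])    # rescale to pixel coordinates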
51 |
52 |
53 | def eval_rgb_or_flow(model, eval_dataset, eval_dataloader, args, GEN_NUM):  # eval_dataset/eval_dataloader arguments are ignored and rebuilt below
54 | if args.dataset == 'UCF101v2':
55 | num_class = 25
56 | elif args.dataset == 'UCFSports':
57 | num_class = 11
58 | else:
59 | num_class = 0
60 | print("No dataset name {}".format(args.dataset))
61 | exit(0)
62 | rgb = args.modality == 'rgb'
63 | use_gpu = args.use_gpu
64 | variance = args.variance
65 | # model = ssd_net_ucf101.SSD_NET(dataset=args.dataset, num_classes=num_class, modality=args.modality)
66 | # if args.reinit_all:
67 | # print("reinit all data!!!")
68 | # # model.load_trained_weights('/home/qzw/code/my-act-detector/caffe-models/UCFSports/FLOW5-UCFSports.pkl')
69 | # # pytorch_model = '/home/qzw/code/my-act-detector/pytorch-models/{}/{}-trained-model-{}-pytorch-single.pkl'.format(args.dataset, args.dataset, args.modality)
70 | # # model.load_state_dict(torch.load(pytorch_model))
71 | # # GEN_NUM = 0
72 | # pytorch_model = '/home/qzw/code/my-act-detector-12-13/my_trained_pytorch_model_{}-{}.pkl'.format(args.dataset, args.modality)
73 | # data_dict = torch.load(pytorch_model)
74 | # GEN_NUM = data_dict['gen_num']
75 | # net_state_dict = {}
76 | # for key in data_dict['net_state_dict']:
77 | # if 'module.' in key:
78 | # new_key = key.replace('module.', '')
79 | # else:
80 | # new_key = key
81 | # net_state_dict[new_key] = data_dict['net_state_dict'][key]
82 | # model.load_state_dict(net_state_dict)
83 | # if use_gpu:
84 | # model.cuda()
85 | # # model = torch.nn.DataParallel(model).cuda()
86 | model.eval()
87 | eval_dataset = dataset.TubeDataset(args.dataset, data_path=args.data_path, phase='eval',
88 | modality=args.modality, sequence_length=6)
89 | eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=1, shuffle=False,
90 | num_workers=8, pin_memory=True)
91 | tubes_init = prior_tubes.RGB_TUBES(phase='eval', use_gpu=use_gpu, variance=variance, sequence_length=6)
92 | manager = multiprocessing.Manager()
93 | all_frame_boxes_dict = manager.dict()
94 | pool = multiprocessing.Pool(processes=16)
95 | resolution = eval_dataset.get_resolution()
96 | start_time = time.time()
97 | nframes_sum = 0
98 | for i, (input, target) in enumerate(eval_dataloader):
99 | video_index = i + 1
100 |         nframes = input.shape[1] // 3  # frames are stacked along the channel axis, 3 channels per frame
101 | print("GEN_NUM:{} video_index:{}/{} start!! frame num:{} nframes_sum:{} fps:{}".format(GEN_NUM, video_index,
102 | eval_dataset.__len__(),
103 | nframes, nframes_sum,
104 | nframes_sum / (
105 | time.time() - start_time)))
106 | nframes_sum += nframes
107 | height, width = resolution[eval_dataset.videos_list[int(target[0, 0])]]
108 | if use_gpu:
109 | input = input.cuda()
110 | d36_dict = {}
111 | d36_dict['conv4_3'] = [0]
112 | d36_dict['fc_conv7'] = [0]
113 | d36_dict['conv6'] = [0]
114 | d36_dict['conv7'] = [0]
115 | d36_dict['conv8'] = [0]
116 | d36_dict['conv9'] = [0]
117 | conf_preds_list = []
118 | decode_video_tubes_list = []
119 | for d in range(1, args.sequence_length - 1):
120 | conv4_3_d36, fc_conv7_d36, conv6_d36, conv7_d36, conv8_d36, conv9_d36 = model.get_feature_map(
121 | input[0, 3 * d:3 * (1 + d), :, :].unsqueeze(0), (36, 36))
122 | d36_dict['conv4_3'] += [conv4_3_d36]
123 | d36_dict['fc_conv7'] += [fc_conv7_d36]
124 | d36_dict['conv6'] += [conv6_d36]
125 | d36_dict['conv7'] += [conv7_d36]
126 | d36_dict['conv8'] += [conv8_d36]
127 | d36_dict['conv9'] += [conv9_d36]
128 |
129 | for frame_index in range(nframes - args.sequence_length + 1):
130 | if rgb:
131 | conv4_3_d6, fc_conv7_d6, conv6_d6, conv7_d6, conv8_d6, conv9_d6 = model.get_feature_map(
132 | input[0, 3 * frame_index:3 * (frame_index + 1), :, :].unsqueeze(0), (6, 6))
133 | conv4_3_d36, fc_conv7_d36, conv6_d36, conv7_d36, conv8_d36, conv9_d36 = model.get_feature_map(
134 | input[0, 3 * (frame_index + args.sequence_length - 1):3 * (frame_index + args.sequence_length), :,
135 | :].unsqueeze(0), (36, 36))
136 | else:
137 | conv4_3_d6, fc_conv7_d6, conv6_d6, conv7_d6, conv8_d6, conv9_d6 = model.get_feature_map(
138 | input[0, 3 * frame_index:3 * (frame_index + args.sequence_length - 1), :, :].unsqueeze(0), (6, 6))
139 | conv4_3_d36, fc_conv7_d36, conv6_d36, conv7_d36, conv8_d36, conv9_d36 = model.get_feature_map(
140 | torch.cat([input[0, 3 * min(frame_index + args.sequence_length - 1 + ff, nframes - 1):3 * (
141 | min(frame_index + args.sequence_length - 1 + ff, nframes - 1) + 1), :, :].unsqueeze(0) for
142 | ff in range(args.sequence_length - 1)], dim=1), (36, 36))
143 | d36_dict['conv4_3'] += [conv4_3_d36]
144 | d36_dict['fc_conv7'] += [fc_conv7_d36]
145 | d36_dict['conv6'] += [conv6_d36]
146 | d36_dict['conv7'] += [conv7_d36]
147 | d36_dict['conv8'] += [conv8_d36]
148 | d36_dict['conv9'] += [conv9_d36]
149 |             d36_dict['conv4_3'][frame_index] = 0  # drop cached features that scrolled out of the window so they can be freed
150 | d36_dict['fc_conv7'][frame_index] = 0
151 | d36_dict['conv6'][frame_index] = 0
152 | d36_dict['conv7'][frame_index] = 0
153 | d36_dict['conv8'][frame_index] = 0
154 | d36_dict['conv9'][frame_index] = 0
155 | conv4_3_data = torch.cat([conv4_3_d6] + [d36_dict['conv4_3'][ff] for ff in
156 | range(frame_index + 1, frame_index + args.sequence_length)],
157 | dim=1)
158 | fc_conv7_data = torch.cat([fc_conv7_d6] + [d36_dict['fc_conv7'][ff] for ff in
159 | range(frame_index + 1, frame_index + args.sequence_length)],
160 | dim=1)
161 | conv6_data = torch.cat([conv6_d6] + [d36_dict['conv6'][ff] for ff in
162 | range(frame_index + 1, frame_index + args.sequence_length)],
163 | dim=1)
164 | conv7_data = torch.cat([conv7_d6] + [d36_dict['conv7'][ff] for ff in
165 | range(frame_index + 1, frame_index + args.sequence_length)],
166 | dim=1)
167 | conv8_data = torch.cat([conv8_d6] + [d36_dict['conv8'][ff] for ff in
168 | range(frame_index + 1, frame_index + args.sequence_length)],
169 | dim=1)
170 | conv9_data = torch.cat([conv9_d6] + [d36_dict['conv9'][ff] for ff in
171 | range(frame_index + 1, frame_index + args.sequence_length)],
172 | dim=1)
173 | loc_preds, conf_preds = model.get_loc_conf(conv4_3_data, fc_conv7_data, conv6_data, conv7_data,
174 | conv8_data, conv9_data)
175 | conf_preds = F.softmax(conf_preds, dim=-1)
176 | decode_video_tubes = prior_tubes.decode_tubes(tubes_init, loc_preds)
177 | conf_preds_list += [conf_preds.cpu()]
178 | decode_video_tubes_list += [decode_video_tubes.cpu()]
179 | conf_preds = torch.cat(conf_preds_list, dim=0)
180 | decode_video_tubes = torch.cat(decode_video_tubes_list, dim=0)
181 | # data_handle_and_save_process(all_frame_boxes_dict, video_index, conf_preds, decode_video_tubes, num_class,
182 | # args.sequence_length, height, width)
183 | pool.apply_async(data_handle_and_save_process, (all_frame_boxes_dict, video_index, conf_preds,
184 | decode_video_tubes, num_class, args.sequence_length,
185 | height, width, ))
186 | print("waiting calc!!")
187 | pool.close()
188 | pool.join()
189 | print("all ok!!")
190 | all_frame_boxes_list = []
191 | for key in all_frame_boxes_dict:
192 | all_frame_boxes_list += all_frame_boxes_dict[key]
193 | with open(args.all_frame_boxes_list_result, "wb") as file:
194 | pickle.dump(all_frame_boxes_list, file)
195 | return map_eval.calc_pr(all_frame_boxes_list, eval_dataset)
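
# The d6/d36 bookkeeping above in miniature (illustrative sketch; `get_fm`
# stands in for model.get_feature_map): each frame passes through the backbone
# only twice, once at dilation 6 as a window's first frame and once at
# dilation 36 as a look-ahead frame, instead of once per window it belongs to.
def _sliding_cache_sketch(get_fm, nframes, K=6):
    d36_cache = {}
    windows = []
    for t in range(nframes - K + 1):
        d6 = get_fm(t, (6, 6))
        d36 = [d36_cache.setdefault(t + k, get_fm(t + k, (36, 36))) for k in range(1, K)]
        windows.append([d6] + d36)
    return windows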
196 |
197 |
198 | if __name__ == '__main__':
199 | import config
200 | from layers import ssd
201 | args = config.Config()
202 | if args.dataset == 'UCF101v2':
203 | num_class = 25
204 | elif args.dataset == 'UCFSports':
205 | num_class = 11
206 | else:
207 | num_class = 0
208 | print("No dataset name {}".format(args.dataset))
209 | exit(0)
210 | eval_net = ssd.SSD_NET(dataset=args.dataset, frezze_init=args.freeze_init, num_classes=num_class,
211 | modality=args.modality)
212 | data_dict = torch.load("/mnt/data/qzw/model/pytorch-act-detector/{}/best-rgb-0.8601.pkl".format(args.dataset))
213 | net_state_dict = {}
214 | for key in data_dict['net_state_dict']:
215 | if 'module.' in key:
216 | new_key = key.replace('module.', '')
217 | else:
218 | new_key = key
219 | net_state_dict[new_key] = data_dict['net_state_dict'][key]
220 | eval_net.load_state_dict(net_state_dict)
221 | if args.use_gpu:
222 | eval_net = eval_net.cuda()
223 | mmap = eval_rgb_or_flow(model=eval_net, eval_dataset=None, eval_dataloader=None, args=args,
224 | GEN_NUM=data_dict['gen_num'])
225 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import config
2 | from layers import act_cuboid_loss
3 | from data import dataset
4 | from layers import ssd
5 | import torch
6 | import time
7 | import torch.optim.lr_scheduler as lr_scheduler
8 | import test
9 |
10 |
11 | def main():
12 | args = config.Config()
13 | train(args)
14 | exit(0)
15 |
16 |
17 | def train(args):
18 | use_gpu = args.use_gpu
19 | if args.dataset == 'UCF101v2':
20 | num_class = 25
21 | elif args.dataset == 'UCFSports':
22 | num_class = 11
23 | else:
24 | num_class = 0
25 | print("No dataset name {}".format(args.dataset))
26 | exit(0)
27 | variance = args.variance
28 | MAX_GEN = args.epochs
29 | k_frames = args.sequence_length
30 |     print("train batch size:", args.train_batch_size, "lr:", args.lr)
31 | train_net = ssd.SSD_NET(dataset=args.dataset, frezze_init=args.freeze_init, num_classes=num_class, modality=args.modality)
32 | if args.reinit_all:
33 | print("reinit all data!!!")
34 | start_gen = 0
35 | train_net.load_state_dict(
36 | torch.load(args.init_model))
37 | train_net.train(True)
38 | optimizer = torch.optim.SGD(train_net.get_optim_policies(),
39 | lr=args.lr,
40 | momentum=args.momentum,
41 | weight_decay=args.weight_decay)
42 | scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.94)
43 | loss_class_list = []
44 | loss_loc_list = []
45 | loss_list = []
46 | else:
47 | print("load last train data!!!")
48 | data_dict = torch.load(args.new_trained_model)
49 | start_gen = data_dict['gen_num']
50 | # start_gen = 0
51 | net_state_dict = {}
52 | for key in data_dict['net_state_dict']:
53 | if 'module.' in key:
54 | new_key = key.replace('module.', '')
55 | else:
56 | new_key = key
57 | net_state_dict[new_key] = data_dict['net_state_dict'][key]
58 | train_net.load_state_dict(net_state_dict)
59 | train_net.train(True)
60 | optimizer = torch.optim.SGD(train_net.get_optim_policies(),
61 | lr=args.lr,
62 | momentum=args.momentum,
63 | weight_decay=args.weight_decay)
64 | # optimizer.load_state_dict(data_dict['optimizer'])
65 | for group in optimizer.param_groups:
66 | if 'initial_lr' not in group:
67 | group['initial_lr'] = args.lr
68 | # optimizer.defaults['lr'] = args.lr
69 | scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.94, last_epoch=start_gen - 1)
70 | loss_class_list = data_dict['loss_class_list']
71 | loss_loc_list = data_dict['loss_loc_list']
72 | loss_list = data_dict['loss_list']
73 | print("last data: GEN:", start_gen, "\tloss loc:", loss_loc_list[-1], "\tloss conf:", loss_class_list[-1],
74 | "\tloss:", loss_list[-1],
75 | "\tlr:", scheduler.get_lr())
76 |
77 | if use_gpu:
78 | train_net = torch.nn.DataParallel(train_net).cuda()
79 | print('all net loaded ok!!!')
80 | train_dataset = dataset.TubeDataset(args.dataset, data_path=args.data_path, phase='train',
81 | modality=args.modality,
82 | sequence_length=6)
83 | dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True,
84 | num_workers=args.workers, pin_memory=True)
85 | criterion = act_cuboid_loss.CuboidLoss(use_gpu, variance, num_class, k_frames)
86 | mmap_best = 0
87 | mmap_list = []
88 | if args.reinit_all:
89 | warm_up(train_net, dataloader, train_dataset, criterion, optimizer, scheduler, use_gpu, args, loss_loc_list,
90 | loss_class_list, loss_list)
91 | for gen in range(start_gen, MAX_GEN):
92 | train_epoch(train_net, dataloader, train_dataset, criterion, optimizer, scheduler, use_gpu, args, gen,
93 | loss_loc_list, loss_class_list, loss_list)
94 |         if (gen + 1) % 1 == 0:  # always true; kept so the checkpoint interval is easy to change
95 | temp_dict = {}
96 | temp_dict['net_state_dict'] = train_net.module.state_dict()
97 | temp_dict['gen_num'] = gen + 1
98 | temp_dict['optimizer'] = optimizer.state_dict()
99 | temp_dict['loss_loc_list'] = loss_loc_list
100 | temp_dict['loss_class_list'] = loss_class_list
101 | temp_dict['loss_list'] = loss_list
102 | temp_dict['mmap_list'] = mmap_list
103 | torch.save(temp_dict, args.new_trained_model)
104 | print("net save ok!!")
105 |         if loss_list[-1] < 1.0:  # only run the (slow) mAP evaluation once the training loss is low enough
106 | mmap = test.eval_rgb_or_flow(model=train_net.module, eval_dataset=None, eval_dataloader=None, args=args,
107 | GEN_NUM=gen + 1)
108 | with open('./train_log_{}.txt'.format(args.dataset), 'a') as train_log:
109 | log = "GEN:{}".format(gen) + "\tmap:{}".format(mmap) + "\tbest map:{}\n".format(mmap_best)
110 | train_log.write(log)
111 | train_net.module.train(True)
112 | mmap_list += [mmap]
113 | if mmap > mmap_best:
114 | mmap_best = mmap
115 | temp_dict['mmap_best'] = mmap_best
116 | torch.save(temp_dict, args.best_trained_model % mmap_best)
117 | print("current map:{} best map:{}, best model saved ok!".format(mmap, mmap_best))
118 |
119 |
120 | def train_epoch(train_net, dataloader, train_dataset, criterion, optimizer, scheduler, use_gpu, args, gen, loss_loc_list,
121 | loss_class_list, loss_list, warm_up_lr_inc=None):
122 | total_loss = AverageMeter()
123 | loss_ls = AverageMeter()
124 | loss_cs = AverageMeter()
125 | if warm_up_lr_inc is None:
126 | scheduler.step()
127 | total_loss.reset()
128 | loss_ls.reset()
129 | loss_cs.reset()
130 | for i, (input, target) in enumerate(dataloader):
131 | # st = time.time()
132 | if warm_up_lr_inc is not None:
133 | for lr in range(len(optimizer.param_groups)):
134 | optimizer.param_groups[lr]['lr'] += warm_up_lr_inc[lr]
135 | if use_gpu:
136 | input = input.cuda()
137 | target = target.cuda()
138 | loc_preds, conf_preds = train_net(input)
139 | loss_l, loss_c = criterion((loc_preds, conf_preds), target)
140 | loss = loss_l + loss_c
141 | optimizer.zero_grad()
142 | loss.backward()
143 | optimizer.step()
144 | if use_gpu:
145 | total_loss.update(loss.cpu().detach().numpy())
146 | loss_ls.update(loss_l.cpu().detach().numpy())
147 | loss_cs.update(loss_c.cpu().detach().numpy())
148 | else:
149 | total_loss.update(loss.detach().numpy())
150 | loss_ls.update(loss_l.detach().numpy())
151 | loss_cs.update(loss_c.detach().numpy())
152 | # print("{}s one batch".format(time.time() - st))
153 | if (i+1) % 100 == 0:
154 | print("GEN:", gen, "\tnum:{}/{}".format((i + 1) * args.train_batch_size, train_dataset.__len__()),
155 | "\tloss loc:", loss_ls.avg, "\tloss conf:", loss_cs.avg, "\tloss:", total_loss.avg,
156 | "\tlr:", scheduler.get_lr(), time.strftime('\t%m/%d %H:%M:%S', time.localtime(time.time())))
157 | print("\tloss loc:", loss_ls.avg, "\tloss conf:", loss_cs.avg, "\tloss:", total_loss.avg)
158 | with open('./train_log_{}.txt'.format(args.dataset), 'a') as train_log:
159 | log = "GEN:{}".format(gen) + "\tloss loc:{}".format(loss_ls.avg) + "\tloss conf:{}".format(loss_cs.avg) + \
160 | "\tloss:{}".format(total_loss.avg) + "\tlr:{}".format(scheduler.get_lr()) + time.strftime(
161 | '\t%m/%d %H:%M:%S\n', time.localtime(time.time()))
162 | train_log.write(log)
163 | loss_loc_list += [loss_ls.avg]
164 | loss_class_list += [loss_cs.avg]
165 | loss_list += [total_loss.avg]
166 |
167 |
168 | def warm_up(train_net, dataloader, train_dataset, criterion, optimizer, scheduler, use_gpu, args, loss_loc_list,
169 | loss_class_list, loss_list):
170 | warm_up_ratio = args.warm_up_ratio
171 | warm_up_epoch = args.warm_up_epoch
172 | lr_inc = []
173 | for i in range(len(optimizer.param_groups)):
174 | lr_inc.append(optimizer.param_groups[i]['lr'] * (1 - warm_up_ratio)
175 | / (len(dataloader) * warm_up_epoch))
176 | optimizer.param_groups[i]['lr'] *= warm_up_ratio
177 | for warm_up_index in range(warm_up_epoch):
178 | train_epoch(train_net, dataloader, train_dataset, criterion, optimizer, scheduler, use_gpu, args,
179 | warm_up_index - warm_up_epoch, loss_loc_list, loss_class_list, loss_list, warm_up_lr_inc=lr_inc)
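
# Warm-up arithmetic used above (example numbers, not the repo's defaults):
# with base lr, ratio r and E warm-up epochs of B batches each, the lr starts
# at lr*r and grows by lr*(1-r)/(B*E) per batch, reaching exactly lr at the end.
def _warm_up_example():
    lr, r, E, B = 0.001, 0.1, 2, 500
    inc = lr * (1 - r) / (B * E)
    return lr * r + inc * B * E  # == lr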
180 |
181 |
182 | class AverageMeter(object):
183 | """Computes and stores the average and current value"""
184 |
185 | def __init__(self):
186 | self.reset()
187 |
188 | def reset(self):
189 | self.val = 0
190 | self.avg = 0
191 | self.sum = 0
192 | self.count = 0
193 |
194 | def update(self, val, n=1):
195 | self.val = val
196 | self.sum += val * n
197 | self.count += n
198 | self.avg = self.sum / self.count
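
# Usage sketch (not called anywhere): update(val, n) keeps a weighted running average.
def _average_meter_example():
    meter = AverageMeter()
    meter.update(2.0)
    meter.update(4.0, n=3)
    return meter.avg  # (2.0 + 4.0 * 3) / 4 = 3.5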
199 |
200 |
201 | if __name__ == '__main__':
202 | main()
203 | # image_test_from_file('./ucf101_test.pkl')
204 |
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from . import box
2 | from . import map_eval
3 | from . import prior_tubes
4 |
--------------------------------------------------------------------------------
/utils/act_tubes.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pickle
3 | import tube_dataset  # NOTE: not part of this repository; legacy dependency
4 | import ACT_utils  # NOTE: not part of this repository; legacy dependency
5 | from . import box
6 | from . import prior_tubes
7 |
8 |
9 | def nms_tublets(tubes_conf, decode_tubes, nms_threshold=0.45, top_k=400):
10 | cnt = 0
11 | # tubes_conf = tubes_conf.squeeze().detach().numpy()
12 | # decode_tubes = decode_tubes.reshape(-1, 6, 4)
13 | # max_class = np.max(tubes_conf[:, 1:], axis=1)
14 | class_index = np.argsort(-tubes_conf)
15 | class_nms_index_list = [class_index[0]]
16 | for index in class_index[1:]:
17 | keep = True
18 | for _max_index in class_nms_index_list:
19 | if prior_tubes.jaccard_overlap_tubes(decode_tubes[index, :], decode_tubes[_max_index, :]) > nms_threshold:
20 | keep = False
21 | break
22 | if keep:
23 | class_nms_index_list += [index]
24 | cnt += 1
25 | if cnt >= top_k:
26 | break
27 | return np.array(class_nms_index_list)[:top_k]
28 |
29 |
30 | def nms_tublets_caffe(tubes_conf, decode_tubes, nms_threshold=0.45, top_k=400, K=6):
31 | counter = 0
32 | x1 = [decode_tubes[:, i, 0] for i in range(K)]
33 | y1 = [decode_tubes[:, i, 1] for i in range(K)]
34 | x2 = [decode_tubes[:, i, 2] for i in range(K)]
35 | y2 = [decode_tubes[:, i, 3] for i in range(K)]
36 | dets = tubes_conf
37 | area = [(x2[k] - x1[k]) * (y2[k] - y1[k]) for k in range(K)]
38 | I = np.argsort(dets)
39 | indices = np.empty(top_k, dtype=np.int32)
40 |
41 | while I.size > 0:
42 | i = I[-1]
43 | indices[counter] = i
44 | counter += 1
45 |
46 | # Compute overlap
47 | xx1 = [np.maximum(x1[k][i], x1[k][I[:-1]]) for k in range(K)]
48 | yy1 = [np.maximum(y1[k][i], y1[k][I[:-1]]) for k in range(K)]
49 | xx2 = [np.minimum(x2[k][i], x2[k][I[:-1]]) for k in range(K)]
50 | yy2 = [np.minimum(y2[k][i], y2[k][I[:-1]]) for k in range(K)]
51 |
52 | w = [np.maximum(0, xx2[k] - xx1[k]) for k in range(K)]
53 | h = [np.maximum(0, yy2[k] - yy1[k]) for k in range(K)]
54 |
55 | inter_area = [w[k] * h[k] for k in range(K)]
56 | ious = sum([inter_area[k] / (area[k][I[:-1]] + area[k][i] - inter_area[k]) for k in range(K)])
57 |
58 | I = I[np.where(ious <= nms_threshold * K)[0]]
59 |
60 | if counter == top_k: break
61 |
62 | return indices[:counter]
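
# The suppression test above thresholds the *summed* per-frame IoU against
# nms_threshold * K, which is the same as thresholding the mean tube overlap
# (sketch, not called anywhere):
def _mean_iou_equivalence_example():
    ious = np.array([0.5, 0.4, 0.6, 0.3, 0.5, 0.4])  # per-frame overlaps, K = 6
    return (ious.sum() <= 0.45 * 6) == (ious.mean() <= 0.45)  # True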
63 |
64 |
65 | if __name__ == '__main__':
66 | data_cache = tube_dataset.TubeDataset('UCFSports')
67 |     build_tubes(data_cache, K=6)  # NOTE: build_tubes is not defined anywhere in this repo; legacy entry point
68 |
69 |
--------------------------------------------------------------------------------
/utils/box.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def jaccard_overlap_boxes(box1, box2):
4 | # box.shape=[xmin,ymin,xmax,ymax]
5 | if box1[0] > box2[2] or box1[2] < box2[0] or box1[1] > box2[3] or box1[3] < box2[1]:
6 | return 0.0
7 | else:
8 | box = np.array([max(box1[0], box2[0]), max(box1[1], box2[1]), min(box1[2], box2[2]), min(box1[3], box2[3])])
9 |         size = box_size(box)
10 |         # the intersection area can never exceed the union, so the
11 |         # denominator below is always positive
12 |         return size / (box_size(box1) + box_size(box2) - size)
13 |
14 |
15 | def box_size(box):
16 | # box.shape=[xmin,ymin,xmax,ymax]
17 | return (box[2] - box[0]) * (box[3] - box[1])
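
# Sanity-check sketch (not called anywhere): two unit squares overlapping on
# half their area have IoU 0.5 / (1 + 1 - 0.5) = 1/3.
def _jaccard_example():
    a = np.array([0.0, 0.0, 1.0, 1.0])
    b = np.array([0.5, 0.0, 1.5, 1.0])
    return jaccard_overlap_boxes(a, b)  # ~0.3333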
18 |
--------------------------------------------------------------------------------
/utils/map_eval.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 | from . import box
4 |
5 |
6 | def nms_class(boxes_scores_list, nms_threshold):
7 | if boxes_scores_list.__len__() <= 1:
8 | return [0]
9 | boxes = np.vstack(boxes_scores_list)
10 | scores_index = np.argsort(-boxes[:, 0])
11 | class_nms_index_list = [scores_index[0]]
12 | for index in scores_index[1:]:
13 | keep = True
14 | for _max_index in class_nms_index_list:
15 | if box.jaccard_overlap_boxes(boxes[index, 1:], boxes[_max_index, 1:]) > nms_threshold:
16 | keep = False
17 | break
18 | if keep:
19 | class_nms_index_list += [index]
20 | return class_nms_index_list
21 |
22 |
23 | def get_pr(data_cache, K=6):
24 | tubelet_file = './data/UCFSports/{}/{}.pkl'
25 | output_file = './data/UCFSports_build_tubes/pr_data.pkl'
26 | test_videos_list = data_cache.get_test_videos()
27 | nframes_dict = data_cache.get_nframes()
28 | labels = data_cache.get_labels()
29 | gt_tubes = data_cache.get_gttubes()
30 | resolution = data_cache._resolution
31 | gt_dict = {}
32 | gt_label_num = np.zeros(labels.__len__())
33 | video_index = 0
34 | for video in test_videos_list[0]:
35 | video_index += 1
36 | for label in gt_tubes[video]:
37 | for tube in gt_tubes[video][label]:
38 | for i in range(tube.shape[0]):
39 | if (video_index, int(tube[i, 0]), label) not in gt_dict:
40 | gt_dict[(video_index, int(tube[i, 0]), label)] = []
41 | gt_dict[(video_index, int(tube[i, 0]), label)] += [tube[i, 1:]]
42 | gt_label_num[label] += 1
43 |
44 | frame_boxes = {}
45 | all_frame_boxes_list = []
46 | video_index = 0
47 | for videos in test_videos_list[0]:
48 | nframes = nframes_dict[videos]
49 | frame_boxes[videos] = {}
50 | video_index += 1
51 | height, width = resolution[videos]
52 | for start_frame in range(1, nframes - K + 2):
53 | file = open(tubelet_file.format(videos, start_frame), 'rb')
54 | _, __, best_tube = pickle.load(file)
55 | file.close()
56 | for i in range(best_tube.shape[0]):
57 | for j in range(K):
58 | if (j+start_frame) not in frame_boxes[videos]:
59 | frame_boxes[videos][j+start_frame] = []
60 | frame_boxes[videos][j+start_frame] += [best_tube[i, np.array([0, 1, 2+4*j, 3+4*j, 4+4*j, 5+4*j])]]
61 |
62 | for frame_index in range(1, nframes+1):
63 | frame_label = {}
64 | for bb in frame_boxes[videos][frame_index]:
65 | if bb[0] not in frame_label:
66 | frame_label[bb[0]] = []
67 | frame_label[bb[0]] += [bb[1:]]
68 |
69 | for tt in frame_label:
70 | idx = nms_class(frame_label[tt], nms_threshold=0.3)
71 | for id in idx:
72 | all_frame_boxes_list += [np.hstack([np.array([video_index, frame_index, tt]), frame_label[tt][id] * np.array([1, width, height, width, height])])]
73 |
74 | all_frame_boxes = np.vstack(all_frame_boxes_list)
75 | label_pr_dict = {}
76 | for label in range(labels.__len__()):
77 | print("label:", label)
78 | pre_idx = np.where(all_frame_boxes[:, 2] == label)[0]
79 | label_pre_box = all_frame_boxes[pre_idx]
80 | pre_idx = np.argsort(-label_pre_box[:, 3])
81 | pr = np.empty((pre_idx.shape[0]+1, 2))
82 | pr[0, 0] = 1.0 # precision
83 | pr[0, 1] = 0.0 # recall
84 | pr_cnt = 1
85 | fn = gt_label_num[label]
86 | fp = 0
87 | tp = 0
88 | for id in pre_idx:
89 | pre_box = label_pre_box[id, :]
90 | positive = False
91 | if (int(pre_box[0]), int(pre_box[1]), int(pre_box[2])) in gt_dict:
92 | _gt = gt_dict[(int(pre_box[0]), int(pre_box[1]), int(pre_box[2]))]
93 | ious = np.zeros(_gt.__len__())
94 | for i, g in enumerate(_gt):
95 | ious[i] = box.jaccard_overlap_boxes(pre_box[4:], g)
96 | i_max = np.argmax(ious)
97 | if ious[i_max] > 0.5:
98 | positive = True
99 | del _gt[i_max]
100 | if _gt.__len__() == 0:
101 | del gt_dict[(int(pre_box[0]), int(pre_box[1]), int(pre_box[2]))]
102 | if positive:
103 | tp += 1
104 | fn -= 1
105 | else:
106 | fp += 1
107 | pr[pr_cnt, 0] = tp / (fp + tp)
108 | pr[pr_cnt, 1] = tp / (tp + fn)
109 | pr_cnt += 1
110 | label_pr_dict[label] = pr
111 | with open(output_file, 'wb') as f:
112 | pickle.dump(label_pr_dict, f)
113 | ap = np.empty(labels.__len__())
114 | for label in label_pr_dict:
115 | prdif = label_pr_dict[label][1:, 1] - label_pr_dict[label][:-1, 1]
116 | prsum = label_pr_dict[label][1:, 0] + label_pr_dict[label][:-1, 0]
117 | ap[label] = np.sum(prdif * prsum * 0.5)
118 | print("map:", np.mean(ap))
119 |
120 |
121 | def get_ground_truth(test_videos_list, labels, gt_tubes):
122 | gt_dict = {}
123 | gt_label_num = np.zeros(labels.__len__())
124 | video_index = 0
125 | for video in test_videos_list:
126 | video_index += 1
127 | for label in gt_tubes[video]:
128 | for tube in gt_tubes[video][label]:
129 | for i in range(tube.shape[0]):
130 | if (video_index, int(tube[i, 0]), label) not in gt_dict:
131 | gt_dict[(video_index, int(tube[i, 0]), label)] = []
132 | gt_dict[(video_index, int(tube[i, 0]), label)] += [tube[i, 1:]]
133 | gt_label_num[label] += 1
134 | return gt_dict, gt_label_num
135 |
136 |
137 | def calc_pr(all_frame_boxes_list, dataset):
138 | output_file = './pr_data_{}_{}.pkl'.format(dataset.DNAME, dataset.modality)
139 | labels = dataset.get_labels()
140 | gt_tubes = dataset.get_gttubes()
141 | gt_dict, gt_label_num = get_ground_truth(dataset.videos_list, labels, gt_tubes)
142 | all_frame_boxes = np.vstack(all_frame_boxes_list)
143 | label_pr_dict = {}
144 | for label in range(labels.__len__()):
145 | pre_idx = np.where(all_frame_boxes[:, 2] == label)[0]
146 | label_pre_box = all_frame_boxes[pre_idx]
147 | pre_idx = np.argsort(-label_pre_box[:, 3])
148 | pr = np.empty((pre_idx.shape[0]+1, 2))
149 | pr[0, 0] = 1.0 # precision
150 | pr[0, 1] = 0.0 # recall
151 | pr_cnt = 1
152 | fn = gt_label_num[label]
153 | fp = 0
154 | tp = 0
155 | for id in pre_idx:
156 | pre_box = label_pre_box[id, :]
157 | positive = False
158 | if (int(pre_box[0]), int(pre_box[1]), int(pre_box[2])) in gt_dict:
159 | _gt = gt_dict[(int(pre_box[0]), int(pre_box[1]), int(pre_box[2]))]
160 | ious = np.zeros(_gt.__len__())
161 | for i, g in enumerate(_gt):
162 | ious[i] = box.jaccard_overlap_boxes(pre_box[4:], g)
163 | i_max = np.argmax(ious)
164 | if ious[i_max] > 0.5:
165 | positive = True
166 | del _gt[i_max]
167 | if _gt.__len__() == 0:
168 | del gt_dict[(int(pre_box[0]), int(pre_box[1]), int(pre_box[2]))]
169 | if positive:
170 | tp += 1
171 | fn -= 1
172 | else:
173 | fp += 1
174 | pr[pr_cnt, 0] = tp / (fp + tp)
175 | pr[pr_cnt, 1] = tp / (tp + fn)
176 | pr_cnt += 1
177 | label_pr_dict[label] = pr
178 | with open(output_file, 'wb') as f:
179 | pickle.dump(label_pr_dict, f)
180 | ap = np.empty(labels.__len__())
181 | for label in label_pr_dict:
182 | prdif = label_pr_dict[label][1:, 1] - label_pr_dict[label][:-1, 1]
183 | prsum = label_pr_dict[label][1:, 0] + label_pr_dict[label][:-1, 0]
184 | ap[label] = np.sum(prdif * prsum * 0.5)
185 | mmap = np.mean(ap)
186 | print("map:", mmap)
187 | return mmap
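
# The AP above is the trapezoidal area under the precision-recall curve,
#     ap = sum_i (r[i+1] - r[i]) * (p[i+1] + p[i]) / 2
# A tiny worked example (not called anywhere):
def _ap_example():
    pr = np.array([[1.0, 0.0], [1.0, 0.5], [0.5, 1.0]])  # columns: precision, recall
    return np.sum((pr[1:, 1] - pr[:-1, 1]) * (pr[1:, 0] + pr[:-1, 0]) * 0.5)  # 0.875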
188 |
189 |
190 |
191 |
192 | if __name__ == '__main__':
193 |     data_cache = tube_dataset.TubeDataset('UCFSports')  # NOTE: tube_dataset is not imported above; legacy entry point
194 | get_pr(data_cache, K=6)
195 |
196 |
--------------------------------------------------------------------------------
/utils/prior_tubes.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 | import torch
4 | from layers import act_cuboid_loss
5 |
6 |
7 | class RGB_TUBES:
8 | def __init__(self, phase, use_gpu, variance=(0.1, 0.1, 0.2, 0.2), sequence_length=6):
9 | center_mode = phase == 'eval'
10 | self.conv4_3_norm_tubes = self.generate_prior_tubes(min_size=30.0, max_size=60.0, aspect_ratio=(2,), flip=True,
11 | clip=False, layer_size=(38, 38), image_size=(300, 300),
12 | step=8, offset=0.5,
13 | sequence_length=sequence_length, center_mode=center_mode).reshape(-1, 4*sequence_length)
14 | self.fc_conv7_tubes = self.generate_prior_tubes(min_size=60.0, max_size=111.0, aspect_ratio=(2, 3,), flip=True,
15 | clip=False, layer_size=(18, 18), image_size=(300, 300), step=16,
16 | offset=0.5, sequence_length=sequence_length, center_mode=center_mode).reshape(-1, 4*sequence_length)
17 | self.conv6_tubes = self.generate_prior_tubes(min_size=111.0, max_size=162.0, aspect_ratio=(2, 3,), flip=True,
18 | clip=False, layer_size=(9, 9), image_size=(300, 300), step=32,
19 | offset=0.5, sequence_length=sequence_length, center_mode=center_mode).reshape(-1, 4*sequence_length)
20 | self.conv7_tubes = self.generate_prior_tubes(min_size=162.0, max_size=213.0, aspect_ratio=(2, 3,), flip=True,
21 | clip=False, layer_size=(5, 5), image_size=(300, 300), step=64,
22 | offset=0.5, sequence_length=sequence_length, center_mode=center_mode).reshape(-1, 4*sequence_length)
23 | self.conv8_tubes = self.generate_prior_tubes(min_size=213.0, max_size=264.0, aspect_ratio=(2,), flip=True,
24 | clip=False, layer_size=(3, 3), image_size=(300, 300), step=100,
25 | offset=0.5, sequence_length=sequence_length, center_mode=center_mode).reshape(-1, 4*sequence_length)
26 | self.conv9_tubes = self.generate_prior_tubes(min_size=264.0, max_size=315.0, aspect_ratio=(2,), flip=True,
27 | clip=False, layer_size=(1, 1), image_size=(300, 300), step=300,
28 | offset=0.5, sequence_length=sequence_length, center_mode=center_mode).reshape(-1, 4*sequence_length)
29 | if use_gpu:
30 | self.all_tubes = torch.from_numpy(np.vstack([self.conv4_3_norm_tubes, self.fc_conv7_tubes, self.conv6_tubes, self.conv7_tubes, self.conv8_tubes, self.conv9_tubes])).cuda()
31 | # self.all_tubes = torch.clamp(self.all_tubes, min=0, max=1)
32 | else:
33 | self.all_tubes = torch.from_numpy(np.vstack(
34 | [self.conv4_3_norm_tubes, self.fc_conv7_tubes, self.conv6_tubes, self.conv7_tubes, self.conv8_tubes,
35 | self.conv9_tubes]))
36 | # self.all_tubes = torch.clamp(self.all_tubes, min=0, max=1)
37 | self.sequence_length = sequence_length
38 | self.variance = variance
39 |
40 | def generate_prior_tubes(self, min_size=30.0, max_size=None, aspect_ratio=(2,), flip=True, clip=False,
41 | layer_size=(38, 38), image_size=(300, 300),
42 | step=None, offset=0.5, sequence_length=6, center_mode=False):
43 | tubes_list = []
44 | if max_size is not None:
45 | num_priors = aspect_ratio.__len__() * 2 + 2
46 | else:
47 | num_priors = aspect_ratio.__len__() * 2 + 1
48 | if step is None:
49 | step_w = image_size[0] / layer_size[0]
50 | step_h = image_size[1] / layer_size[1]
51 | else:
52 | step_w = step
53 | step_h = step
54 | ar_list = []
55 | for a in aspect_ratio:
56 | ar_list.append(a)
57 | if flip:
58 | ar_list.append(1 / a)
59 | for h in range(layer_size[1]):
60 | for w in range(layer_size[0]):
61 | tube_set = np.zeros((num_priors, sequence_length, 4), dtype='float32')
62 | center_x, center_y = (w + offset) * step_w, (h + offset) * step_h
63 | box_width, box_height = min_size, min_size
64 | if center_mode:
65 | tube_set[0, :, 0] = center_x / image_size[0]
66 | tube_set[0, :, 1] = center_y / image_size[1]
67 | tube_set[0, :, 2] = box_width / image_size[0]
68 | tube_set[0, :, 3] = box_height / image_size[1]
69 |                 prior_index = 1
70 |                 if max_size is not None:
71 |                     box_width, box_height = np.sqrt(min_size * max_size), np.sqrt(min_size * max_size)
72 |                     tube_set[1, :, 0], tube_set[1, :, 1] = center_x / image_size[0], center_y / image_size[1]
73 |                     tube_set[1, :, 2], tube_set[1, :, 3] = box_width / image_size[0], box_height / image_size[1]
74 |                     # the sqrt(min*max) box fills slot 1, so aspect-ratio boxes start at slot 2
75 |                     prior_index = 2
76 | for a in ar_list:
77 | if (np.abs(a - 1.0) < 0.000001) or a < 0.000001:
78 | continue
79 | box_width, box_height = min_size * np.sqrt(a), min_size / np.sqrt(a)
80 | tube_set[prior_index, :, 0] = center_x / image_size[0]
81 | tube_set[prior_index, :, 1] = center_y / image_size[1]
82 | tube_set[prior_index, :, 2] = box_width / image_size[0]
83 | tube_set[prior_index, :, 3] = box_height / image_size[1]
84 | prior_index += 1
85 | else:
86 | tube_set[0, :, 0] = (center_x - box_width/2.0) / image_size[0] # xmin
87 | tube_set[0, :, 1] = (center_y - box_height/2.0) / image_size[1] # ymin
88 |                 tube_set[0, :, 2] = (center_x + box_width/2.0) / image_size[0]  # xmax
89 | tube_set[0, :, 3] = (center_y + box_height/2.0) / image_size[1] # ymax
90 |                 prior_index = 1
91 |                 if max_size is not None:
92 |                     box_width, box_height = np.sqrt(min_size * max_size), np.sqrt(min_size * max_size)
93 |                     tube_set[1, :, 0], tube_set[1, :, 1] = (center_x - box_width / 2.0) / image_size[0], (center_y - box_height / 2.0) / image_size[1]  # xmin, ymin
94 |                     tube_set[1, :, 2], tube_set[1, :, 3] = (center_x + box_width / 2.0) / image_size[0], (center_y + box_height / 2.0) / image_size[1]  # xmax, ymax
95 |                     # the sqrt(min*max) box fills slot 1, so aspect-ratio boxes start at slot 2
96 |                     prior_index = 2
97 | for a in ar_list:
98 | if (np.abs(a - 1.0) < 0.000001) or a < 0.000001:
99 | continue
100 | box_width, box_height = min_size * np.sqrt(a), min_size / np.sqrt(a)
101 | tube_set[prior_index, :, 0] = (center_x - box_width / 2.0) / image_size[0] # xmin
102 | tube_set[prior_index, :, 1] = (center_y - box_height / 2.0) / image_size[1] # ymin
103 | tube_set[prior_index, :, 2] = (center_x + box_width / 2.0) / image_size[0]
104 | tube_set[prior_index, :, 3] = (center_y + box_height / 2.0) / image_size[1] # ymax
105 | prior_index += 1
106 | if clip:
107 | tube_set[tube_set > 1.0] = 1.0
108 | tube_set[tube_set < 0.0] = 0.0
109 | tubes_list.append(tube_set)
110 | return np.vstack(tubes_list)  # tubes come out in feature-map scan order, exactly matching the layout of the prediction maps
111 |
112 |
113 | def get_all_video_tubes(tubes):
114 | return copy.deepcopy(tubes.all_tubes)  # a fresh copy, since decode_tubes() modifies it in place
115 |
116 |
117 | def decode_tubes(tubes, loc_preds):  # just for one video
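   | # Inverts the SSD box encoding frame by frame, treating the stored priors as
   | # (cx, cy, w, h), i.e. priors built with center_mode=True:
   | #   cx' = cx + var[0] * dx * w,   cy' = cy + var[1] * dy * h
   | #   w'  = w * exp(var[2] * dw),   h'  = h * exp(var[3] * dh)
   | # and returns clamped (xmin, ymin, xmax, ymax) tubes.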
118 | decode_video_tubes = get_all_video_tubes(tubes)
119 | decode_video_tubes = torch.stack([decode_video_tubes for _ in range(loc_preds.shape[0])], dim=0)  # replicate the priors once per batch item
120 | var = tubes.variance
121 | center_x = decode_video_tubes[:, :, 0::4]
122 | center_y = decode_video_tubes[:, :, 1::4]
123 | width = decode_video_tubes[:, :, 2::4]
124 | height = decode_video_tubes[:, :, 3::4]
125 | new_center_x = var[0] * loc_preds[:, :, 0::4] * width + center_x
126 | new_center_y = var[1] * loc_preds[:, :, 1::4] * height + center_y
127 | new_width = torch.exp(var[2] * loc_preds[:, :, 2::4]) * width
128 | new_height = torch.exp(var[3] * loc_preds[:, :, 3::4]) * height
129 | decode_video_tubes[:, :, 0::4] = new_center_x - new_width / 2.0 # x_min
130 | decode_video_tubes[:, :, 1::4] = new_center_y - new_height / 2.0 # y_min
131 | decode_video_tubes[:, :, 2::4] = new_center_x + new_width / 2.0 # x_max
132 | decode_video_tubes[:, :, 3::4] = new_center_y + new_height / 2.0 # y_max
133 | decode_video_tubes[:, :, 0::4] = torch.clamp(decode_video_tubes[:, :, 0::4], min=0)
134 | decode_video_tubes[:, :, 1::4] = torch.clamp(decode_video_tubes[:, :, 1::4], min=0)
135 | decode_video_tubes[:, :, 2::4] = torch.clamp(decode_video_tubes[:, :, 2::4], max=1)
136 | decode_video_tubes[:, :, 3::4] = torch.clamp(decode_video_tubes[:, :, 3::4], max=1)
137 | return decode_video_tubes
138 |
139 |
140 | def get_tubes_conf(conf_preds_list, num_class=25):
141 | # extract every confidence score, in the same order as the prior tubes
142 | conf_list = []
143 | for conf_preds in conf_preds_list:
144 | batch_num, channel_num, w, h = conf_preds.shape
145 | feature_flat = conf_preds.detach().numpy().reshape((channel_num, w * h))  # assumes batch_num == 1: the batch axis is folded away
146 | prior_num = int(channel_num / num_class)
147 | for i in range(w * h):
148 | for j in range(prior_num):
149 | conf = feature_flat[j * num_class:(j + 1) * num_class, i].reshape(1, -1)
150 | conf_list.append(conf)
151 | return np.vstack(conf_list)
152 |
153 |
154 | def apply_nms(tubes_conf, decode_tubes, conf_threshold=0.01, nms_threshold=0.45, nms_top_k=400, keep_topk=200,
155 | num_class=25):
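   | # Greedy per-class NMS over whole tubes: take detections above conf_threshold in
   | # descending score order (at most nms_top_k), drop any tube whose tube-level IoU with
   | # an already-kept tube (act_cuboid_loss.get_tube_overlap) exceeds nms_threshold, then
   | # keep the keep_topk best detections across all classes. Note that the decode_tubes
   | # argument is the decoded tube tensor, shadowing the decode_tubes() function above.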
156 | nms_tubes_list = []
157 | nms_scores_list = []
158 | nms_label_list = []
159 | for i in range(1, num_class):  # class 0 is the background, skip it
160 | scores_c = tubes_conf[:, i]
161 | select = scores_c > conf_threshold
162 | if select.sum() > 0:
163 | select_tubes = decode_tubes[select, :]
164 | scores_c = scores_c[select]
165 | sort_index = torch.argsort(-scores_c)
166 | if sort_index.shape[0] > nms_top_k:
167 | sort_index = sort_index[:nms_top_k]
168 | class_nms_index_list = [sort_index[0]]
169 | class_nms_tube = select_tubes[sort_index[0], :].unsqueeze(dim=0)
170 | for index in sort_index[1:]:
171 | ioutable = torch.zeros(len(class_nms_index_list))
172 | act_cuboid_loss.get_tube_overlap(class_nms_tube, select_tubes[index, :], ioutable)
173 | if (ioutable > nms_threshold).sum() == 0:
174 | class_nms_index_list += [index]
175 | class_nms_tube = torch.cat([class_nms_tube, select_tubes[index, :].unsqueeze(dim=0)], dim=0)
176 | for index in class_nms_index_list:
177 | nms_tubes_list += [select_tubes[index, :]]
178 | nms_scores_list += [scores_c[index]]
179 | nms_label_list += [i]
180 | return_tubes_list = []
181 | return_scores_list = []
182 | return_label_list = []
183 | nms_scores = torch.Tensor(nms_scores_list)
184 | nms_scores_index = torch.argsort(-nms_scores)
185 | if len(nms_tubes_list) > keep_topk:
186 | for index in nms_scores_index[:keep_topk]:
187 | return_tubes_list += [nms_tubes_list[index]]
188 | return_scores_list += [nms_scores[index]]
189 | return_label_list += [nms_label_list[index]]
190 | else:
191 | for index in nms_scores_index:
192 | return_tubes_list += [nms_tubes_list[index]]
193 | return_scores_list += [nms_scores[index]]
194 | return_label_list += [nms_label_list[index]]
195 | return return_tubes_list, return_scores_list, return_label_list
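   | # Sketch of the intended test-time flow (the prior-tube class name and the exact
   | # call sites live in test.py and are assumptions here, not part of this file):
   | #   tubes = PriorTubes(sequence_length=K, use_gpu=True, center_mode=True)
   | #   decoded = decode_tubes(tubes, loc_preds)             # (batch, num_tubes, 4*K)
   | #   conf = get_tubes_conf(conf_preds_list, num_class)    # (num_tubes, num_class)
   | #   apply_nms(torch.from_numpy(conf), decoded[0])        # -> tubes, scores, labels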
--------------------------------------------------------------------------------
/visual_featuremaps.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from layers import ssd
3 | import numpy as np
4 | import cv2
5 | import os
6 |
7 | def main():
8 | dataset = 'UCFSports'
9 | modality = 'rgb'
10 | data_path = "/mnt/data/qzw/data/UCFSports/"
11 | feature_map_path = "/mnt/data/qzw/result/pytorch-act-detector/{}/feature_maps/".format(dataset)
12 | MEAN = np.array([[[104, 117, 123]]], dtype=np.float32)
13 | test_net = ssd.SSD_NET(dataset=dataset, frezze_init=True, num_classes=11,
14 | modality=modality)
15 |
16 | data_dict = torch.load("/mnt/data/qzw/model/pytorch-act-detector/{}/best-{}-cpu-0.8601.pkl".format(dataset, modality))
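   | # The commented-out block below would strip the 'module.' prefix that nn.DataParallel
   | # adds to state-dict keys; kept for checkpoints saved that way.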
17 | # net_state_dict = {}
18 | # for key in data_dict['net_state_dict']:
19 | # if 'module.' in key:
20 | # new_key = key.replace('module.', '')
21 | # else:
22 | # new_key = key
23 | # net_state_dict[new_key] = data_dict['net_state_dict'][key]
24 | test_net.load_state_dict(data_dict)
25 |
26 | image = cv2.imread(os.path.join(data_path, "Frames", '084', '%06d.jpg' % 1))
27 | image = cv2.resize(image, (300, 300), interpolation=cv2.INTER_LINEAR)
28 | image = np.transpose(image - MEAN, (2, 0, 1))[None, :, :, :]  # subtract the BGR mean, HWC -> CHW, add a batch dim
29 | image = torch.from_numpy(image.astype('float32'))
30 |
31 | conv6_dilation = (6, 6)
32 | conv6 = getattr(test_net, 'fc_conv6_{}'.format(test_net.layer_name))
33 | conv6.dilation = conv6_dilation
34 | conv6.padding = conv6_dilation  # matching padding to dilation preserves the spatial size for the 3x3 kernel
35 |
36 | output = image
37 | for name, layer in test_net._modules.items():  # relies on modules being registered in forward order
38 | output = layer(output)
39 | if 'conv' in name or '9' in name:  # save feature maps only for non-conv layers; names containing '9' are skipped too
40 | continue
41 | save_path = os.path.join(feature_map_path, name)
42 | if not os.path.exists(save_path):
43 | os.mkdir(save_path)
44 | feature_maps = output.squeeze().detach().numpy()
45 | for i in range(feature_maps.shape[0]):
46 | feature_map = feature_maps[i, :, :][:, :, None]
47 | if np.max(feature_map) > 0.001:
48 | feature_map = feature_map*255.0/np.max(feature_map)  # scale the peak activation to 255 for visualization
49 | feature_map = cv2.resize(feature_map, (300, 300), interpolation=cv2.INTER_LINEAR)
50 | cv2.imwrite(os.path.join(save_path, "%03d.jpg" % (i+1)), feature_map)
51 |
52 |
53 | if __name__ == '__main__':
54 | main()
55 |
--------------------------------------------------------------------------------