├── models
│   ├── __init__.py
│   ├── wide_resnet.py
│   ├── resnext.py
│   ├── pre_act_resnet.py
│   ├── densenet.py
│   ├── resnet.py
│   ├── resnet2p1d.py
│   └── resnet_strg.py
├── datasets
│   ├── __init__.py
│   ├── loader.py
│   ├── videodataset_multiclips.py
│   ├── videodataset.py
│   └── activitynet.py
├── util_scripts
│   ├── __init__.py
│   ├── utils.py
│   ├── remove_dataparallel.py
│   ├── add_fps_into_activitynet_json.py
│   ├── vid2img_sthv2.py
│   ├── hmdb51_json.py
│   ├── eval_accuracy.py
│   ├── ucf101_json.py
│   ├── mit_json.py
│   ├── generate_video_jpgs.py
│   ├── sthv2_json.py
│   ├── sthv1_json.py
│   ├── kinetics_json.py
│   └── generate_video_hdf5.py
├── requirements.txt
├── gen.sh
├── mean.py
├── LICENSE
├── module
│   ├── roi_graph.py
│   └── gcn.py
├── utils.py
├── inference.py
├── validation.py
├── strg.py
├── temporal_transforms.py
├── rpn.py
├── training.py
├── model.py
├── spatial_transforms.py
├── README.md
├── rgcn_models.py
├── dataset.py
├── transform.py
├── opts.py
└── main.py

/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/util_scripts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | scikit-learn
3 | pandas
4 | numpy
5 | torch==2.2.0
6 | torchvision==0.6
7 | wandb
8 | h5py
9 | tensorboardX
10 |
--------------------------------------------------------------------------------
/gen.sh:
--------------------------------------------------------------------------------
1 | python sthv1_json.py 'data/something/v1' 'data/something/v1/img' 'data/sthv1.json'
2 | #python sthv2_json.py 'data/something/v2' 'data/something/v2/img' 'data/sthv2.json'
3 |
--------------------------------------------------------------------------------
/util_scripts/utils.py:
--------------------------------------------------------------------------------
1 | import h5py
2 |
3 |
4 | def get_n_frames(video_path):
5 |     return len([
6 |         x for x in video_path.iterdir()
7 |         if 'jpg' in x.name and x.name[0] != '.'
8 | ]) 9 | 10 | 11 | def get_n_frames_hdf5(video_path): 12 | with h5py.File(video_path, 'r') as f: 13 | video_data = f['video'] 14 | return len(video_data) 15 | -------------------------------------------------------------------------------- /mean.py: -------------------------------------------------------------------------------- 1 | def get_mean_std(value_scale, dataset): 2 | assert dataset in ['activitynet', 'kinetics', '0.5'] 3 | 4 | if dataset == 'activitynet': 5 | mean = [0.4477, 0.4209, 0.3906] 6 | std = [0.2767, 0.2695, 0.2714] 7 | elif dataset == 'kinetics': 8 | mean = [0.4345, 0.4051, 0.3775] 9 | std = [0.2768, 0.2713, 0.2737] 10 | elif dataset == '0.5': 11 | mean = [0.5, 0.5, 0.5] 12 | std = [0.5, 0.5, 0.5] 13 | 14 | mean = [x * value_scale for x in mean] 15 | std = [x * value_scale for x in std] 16 | 17 | return mean, std -------------------------------------------------------------------------------- /util_scripts/remove_dataparallel.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import OrderedDict 3 | 4 | import torch 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('file_path', type=str) 8 | parser.add_argument('--dst_file_path', default=None, type=str) 9 | args = parser.parse_args() 10 | 11 | if args.dst_file_path is None: 12 | args.dst_file_path = args.file_path 13 | 14 | x = torch.load(args.file_path) 15 | state_dict = x['state_dict'] 16 | new_state_dict = OrderedDict() 17 | 18 | for k, v in state_dict.items(): 19 | new_k = '.'.join(k.split('.')[1:]) 20 | new_state_dict[new_k] = v 21 | 22 | x['state_dict'] = new_state_dict 23 | 24 | torch.save(x, args.dst_file_path) -------------------------------------------------------------------------------- /models/wide_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from . 
import resnet 6 | 7 | 8 | class WideBottleneck(resnet.Bottleneck): 9 | expansion = 2 10 | 11 | 12 | def generate_model(model_depth, k, **kwargs): 13 | assert model_depth in [50, 101, 152, 200] 14 | 15 | inplanes = [x * k for x in resnet.get_inplanes()] 16 | if model_depth == 50: 17 | model = resnet.ResNet(WideBottleneck, [3, 4, 6, 3], inplanes, **kwargs) 18 | elif model_depth == 101: 19 | model = resnet.ResNet(WideBottleneck, [3, 4, 23, 3], inplanes, **kwargs) 20 | elif model_depth == 152: 21 | model = resnet.ResNet(WideBottleneck, [3, 8, 36, 3], inplanes, **kwargs) 22 | elif model_depth == 200: 23 | model = resnet.ResNet(WideBottleneck, [3, 24, 36, 3], inplanes, 24 | **kwargs) 25 | 26 | return model 27 | -------------------------------------------------------------------------------- /util_scripts/add_fps_into_activitynet_json.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import subprocess 4 | from pathlib import Path 5 | 6 | if __name__ == '__main__': 7 | video_dir_path = Path(sys.argv[1]) 8 | json_path = Path(sys.argv[2]) 9 | if len(sys.argv) > 3: 10 | dst_json_path = Path(sys.argv[3]) 11 | else: 12 | dst_json_path = json_path 13 | 14 | with json_path.open('r') as f: 15 | json_data = json.load(f) 16 | 17 | for video_file_path in sorted(video_dir_path.iterdir()): 18 | file_name = video_file_path.name 19 | if '.mp4' not in file_name: 20 | continue 21 | name = video_file_path.stem 22 | 23 | ffprobe_cmd = ['ffprobe', str(video_file_path)] 24 | p = subprocess.Popen( 25 | ffprobe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 26 | res = p.communicate()[1].decode('utf-8') 27 | 28 | fps = float([x for x in res.split(',') if 'fps' in x][0].rstrip('fps')) 29 | json_data['database'][name[2:]]['fps'] = fps 30 | 31 | with dst_json_path.open('w') as f: 32 | json.dump(json_data, f) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Kensho Hara 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 |
--------------------------------------------------------------------------------
/util_scripts/vid2img_sthv2.py:
--------------------------------------------------------------------------------
1 | # Code for "TSM: Temporal Shift Module for Efficient Video Understanding"
2 | # arXiv:1811.08383
3 | # Ji Lin*, Chuang Gan, Song Han
4 | # {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu
5 |
6 | import os
7 | import pdb
8 |
9 | import threading
10 |
11 | NUM_THREADS = 1000
12 | VIDEO_ROOT = 'data/something/v2/video'
13 | FRAME_ROOT = 'data/something/v2/img'
14 |
15 |
16 | def split(l, n):
17 |     """Yield successive n-sized chunks from l."""
18 |     for i in range(0, len(l), n):
19 |         yield l[i:i + n]
20 |
21 |
22 | def extract(video, tmpl='%06d.jpg'):
23 |     # os.system(f'ffmpeg -i {VIDEO_ROOT}/{video} -vf -threads 1 -vf scale=-1:256 -q:v 0 '
24 |     #           f'{FRAME_ROOT}/{video[:-5]}/{tmpl}')
25 |     cmd = 'ffmpeg -i \"{}/{}\" -threads 1 -vf scale=-1:256 -q:v 0 \"{}/{}/%06d.jpg\"'.format(VIDEO_ROOT, video,
26 |                                                                                              FRAME_ROOT, video[:-5])
27 |     os.system(cmd)
28 |
29 |
30 | def target(video_list):
31 |     for video in video_list:
32 |         os.makedirs(os.path.join(FRAME_ROOT, video[:-5]))
33 |         extract(video)
34 |
35 |
36 | if __name__ == '__main__':
37 |     if not os.path.exists(VIDEO_ROOT):
38 |         raise ValueError('Please download videos and set VIDEO_ROOT variable.')
39 |     if not os.path.exists(FRAME_ROOT):
40 |         os.makedirs(FRAME_ROOT)
41 |
42 |     video_list = os.listdir(VIDEO_ROOT)
43 |     splits = list(split(video_list, NUM_THREADS))
44 |     threads = []
45 |     for i, split in enumerate(splits):
46 |         thread = threading.Thread(target=target, args=(split,))
47 |         thread.start()
48 |         threads.append(thread)
49 |
50 |     for thread in threads:
51 |         thread.join()
52 |
--------------------------------------------------------------------------------
/module/roi_graph.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 |
4 | import numpy as np
5 |
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | import pdb
10 |
11 |
12 |
13 | def get_iou(roi, rois, area, areas):
14 |     y_min = torch.max(roi[:,0:1], rois[:,:,0])
15 |     x_min = torch.max(roi[:,1:2], rois[:,:,1])
16 |     y_max = torch.min(roi[:,2:3], rois[:,:,2])
17 |     x_max = torch.min(roi[:,3:4], rois[:,:,3])
18 |     axis0 = x_max - x_min + 1
19 |     axis1 = y_max - y_min + 1
20 |     axis0[axis0 < 0] = 0
21 |     axis1[axis1 < 0] = 0
22 |     intersection = axis0 * axis1
23 |     iou = intersection / (areas + area - intersection)
24 |     return iou
25 |
26 |
27 | def get_st_graph(rois, threshold=0):
28 |     B, T, N, _ = rois.size()
29 |
30 |     M = T*N
31 |     front_graph = torch.zeros((B,M,M))
32 |
33 |     if M == 0:
34 |         return front_graph, front_graph.transpose(1,2)
35 |     areas = (rois[:,:,:,3] - rois[:,:,:,1] + 1) * (rois[:,:,:,2] - rois[:,:,:,0] + 1)
36 |
37 |     for t in range(T-1):
38 |         for i in range(N):
39 |             ious = get_iou(rois[:,t,i], rois[:,t+1], areas[:,t,i:i+1], areas[:,t+1])
40 |             ious[ious < threshold] = 0
41 |             front_graph[:, t*N+i, (t+1)*N:(t+2)*N] = ious
42 |
43 |     back_graph = front_graph.transpose(1,2)
44 |
45 |     # Normalize
46 |     front_graph = front_graph / front_graph.sum(dim=-1, keepdim=True)
47 |     back_graph = back_graph / back_graph.sum(dim=-1, keepdim=True)
48 |     # NaN to zero
49 |     front_graph[front_graph != front_graph] = 0
50 |     back_graph[back_graph != back_graph] = 0
51 |
52 |     return front_graph, back_graph
53 |
54 |
55 |
56 |
57 | if __name__ == '__main__':
58 |     rois = torch.rand((4,8,10,4))
59 |     front_graph, back_graph
= get_st_graph(rois) 60 | 61 | pdb.set_trace() 62 | 63 | 64 | -------------------------------------------------------------------------------- /module/gcn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from torch.nn.parameter import Parameter 5 | import math 6 | import pdb 7 | 8 | class GraphConvolution(nn.Module): 9 | """ 10 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 11 | """ 12 | 13 | def __init__(self, in_features, out_features, bias=True, batch=False): 14 | super(GraphConvolution, self).__init__() 15 | self.in_features = in_features 16 | self.out_features = out_features 17 | self.weight = Parameter(torch.Tensor(in_features, out_features)) 18 | if bias: 19 | self.bias = Parameter(torch.Tensor(out_features)) 20 | else: 21 | self.register_parameter('bias', None) 22 | self.reset_parameters() 23 | self.batch = batch 24 | 25 | def reset_parameters(self): 26 | stdv = 1. / math.sqrt(self.weight.size(1)) 27 | self.weight.data.uniform_(-stdv, stdv) 28 | if self.bias is not None: 29 | self.bias.data.uniform_(-stdv, stdv) 30 | 31 | def forward(self, input, adj): 32 | if self.batch: 33 | support = torch.matmul(input, self.weight) 34 | output = torch.matmul(adj, support) 35 | else: 36 | support = torch.mm(input, self.weight) 37 | output = torch.mm(adj, support) 38 | #output = SparseMM(adj)(support) 39 | if self.bias is not None: 40 | return output + self.bias 41 | else: 42 | return output 43 | 44 | def __repr__(self): 45 | return self.__class__.__name__ + ' (' \ 46 | + str(self.in_features) + ' -> ' \ 47 | + str(self.out_features) + ')' 48 | 49 | 50 | class GCN(nn.Module): 51 | def __init__(self, nfeat, nhid, nclass, dropout): 52 | super(GCN, self).__init__() 53 | 54 | self.gc1 = GraphConvolution(nfeat, nhid) 55 | self.gc2 = GraphConvolution(nhid, nclass) 56 | self.dropout = dropout 57 | 58 | def forward(self, x, adj): 59 | x = F.relu(self.gc1(x, adj)) 60 | x = F.dropout(x, self.dropout, training=self.training) 61 | x = self.gc2(x, adj) 62 | # x = F.relu(self.gc2(x, adj)) 63 | # x = F.dropout(x, self.dropout, training=self.training) 64 | return x 65 | -------------------------------------------------------------------------------- /datasets/loader.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | import h5py 4 | from PIL import Image 5 | 6 | 7 | class ImageLoaderPIL(object): 8 | 9 | def __call__(self, path): 10 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 11 | with path.open('rb') as f: 12 | with Image.open(f) as img: 13 | return img.convert('RGB') 14 | 15 | 16 | class ImageLoaderAccImage(object): 17 | 18 | def __call__(self, path): 19 | import accimage 20 | return accimage.Image(str(path)) 21 | 22 | 23 | class VideoLoader(object): 24 | 25 | def __init__(self, image_name_formatter, image_loader=None): 26 | self.image_name_formatter = image_name_formatter 27 | if image_loader is None: 28 | self.image_loader = ImageLoaderPIL() 29 | else: 30 | self.image_loader = image_loader 31 | 32 | def __call__(self, video_path, frame_indices): 33 | video = [] 34 | for i in frame_indices: 35 | image_path = video_path / self.image_name_formatter(i) 36 | if image_path.exists(): 37 | video.append(self.image_loader(image_path)) 38 | 39 | return video 40 | 41 | 42 | class VideoLoaderHDF5(object): 43 | 44 | def __call__(self, video_path, frame_indices): 45 | with 
h5py.File(video_path, 'r') as f: 46 | video_data = f['video'] 47 | 48 | video = [] 49 | for i in frame_indices: 50 | if i < len(video_data): 51 | video.append(Image.open(io.BytesIO(video_data[i]))) 52 | else: 53 | return video 54 | 55 | return video 56 | 57 | 58 | class VideoLoaderFlowHDF5(object): 59 | 60 | def __init__(self): 61 | self.flows = ['u', 'v'] 62 | 63 | def __call__(self, video_path, frame_indices): 64 | with h5py.File(video_path, 'r') as f: 65 | 66 | flow_data = [] 67 | for flow in self.flows: 68 | flow_data.append(f['video_{}'.format(flow)]) 69 | 70 | video = [] 71 | for i in frame_indices: 72 | if i < len(flow_data[0]): 73 | frame = [ 74 | Image.open(io.BytesIO(video_data[i])) 75 | for video_data in flow_data 76 | ] 77 | frame.append(frame[-1]) # add dummy data into third channel 78 | video.append(Image.merge('RGB', frame)) 79 | 80 | return video 81 | -------------------------------------------------------------------------------- /datasets/videodataset_multiclips.py: -------------------------------------------------------------------------------- 1 | import json 2 | import copy 3 | import functools 4 | import pdb 5 | import torch 6 | from torch.utils.data.dataloader import default_collate 7 | 8 | from .videodataset import VideoDataset 9 | 10 | 11 | def collate_fn(batch): 12 | batch_clips, batch_targets = zip(*batch) 13 | 14 | batch_clips = [clip for multi_clips in batch_clips for clip in multi_clips] 15 | batch_targets = [ 16 | target for multi_targets in batch_targets for target in multi_targets 17 | ] 18 | 19 | target_element = batch_targets[0] 20 | if isinstance(target_element, int) or isinstance(target_element, str): 21 | return default_collate(batch_clips), default_collate(batch_targets) 22 | else: 23 | return default_collate(batch_clips), batch_targets 24 | 25 | 26 | class VideoDatasetMultiClips(VideoDataset): 27 | 28 | def __loading(self, path, video_frame_indices): 29 | clips = [] 30 | segments = [] 31 | for clip_frame_indices in video_frame_indices: 32 | clip = self.loader(path, clip_frame_indices) 33 | if self.spatial_transform is not None: 34 | self.spatial_transform.randomize_parameters() 35 | clip = [self.spatial_transform(img) for img in clip] 36 | clips.append(torch.stack(clip, 0).permute(1, 0, 2, 3)) 37 | segments.append( 38 | [min(clip_frame_indices), 39 | max(clip_frame_indices) + 1]) 40 | 41 | return clips, segments 42 | 43 | def __getitem__(self, index): 44 | path = self.data[index]['video'] 45 | 46 | video_frame_indices = self.data[index]['frame_indices'] 47 | if self.temporal_transform is not None: 48 | video_frame_indices = self.temporal_transform(video_frame_indices) 49 | clips, segments = self.__loading(path, video_frame_indices) 50 | 51 | if isinstance(self.target_type, list): 52 | target = [self.data[index][t] for t in self.target_type] 53 | else: 54 | target = self.data[index][self.target_type] 55 | 56 | if 'segment' in self.target_type: 57 | if isinstance(self.target_type, list): 58 | segment_index = self.target_type.index('segment') 59 | targets = [] 60 | for s in segments: 61 | targets.append(copy.deepcopy(target)) 62 | targets[-1][segment_index] = s 63 | else: 64 | targets = segments 65 | else: 66 | targets = [target for _ in range(len(segments))] 67 | 68 | return clips, targets 69 | -------------------------------------------------------------------------------- /models/resnext.py: -------------------------------------------------------------------------------- 1 | import math 2 | from functools import partial 3 | 4 | import torch 5 | 
import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from .resnet import conv1x1x1, Bottleneck, ResNet 9 | from utils import partialclass 10 | 11 | 12 | def get_inplanes(): 13 | return [128, 256, 512, 1024] 14 | 15 | 16 | class ResNeXtBottleneck(Bottleneck): 17 | expansion = 2 18 | 19 | def __init__(self, inplanes, planes, cardinality, stride=1, 20 | downsample=None): 21 | super().__init__(inplanes, planes, stride, downsample) 22 | 23 | mid_planes = cardinality * planes // 32 24 | self.conv1 = conv1x1x1(inplanes, mid_planes) 25 | self.bn1 = nn.BatchNorm3d(mid_planes) 26 | self.conv2 = nn.Conv3d(mid_planes, 27 | mid_planes, 28 | kernel_size=3, 29 | stride=stride, 30 | padding=1, 31 | groups=cardinality, 32 | bias=False) 33 | self.bn2 = nn.BatchNorm3d(mid_planes) 34 | self.conv3 = conv1x1x1(mid_planes, planes * self.expansion) 35 | 36 | 37 | class ResNeXt(ResNet): 38 | 39 | def __init__(self, 40 | block, 41 | layers, 42 | block_inplanes, 43 | n_input_channels=3, 44 | conv1_t_size=7, 45 | conv1_t_stride=1, 46 | no_max_pool=False, 47 | shortcut_type='B', 48 | cardinality=32, 49 | n_classes=400): 50 | block = partialclass(block, cardinality=cardinality) 51 | super().__init__(block, layers, block_inplanes, n_input_channels, 52 | conv1_t_size, conv1_t_stride, no_max_pool, 53 | shortcut_type, n_classes) 54 | 55 | self.fc = nn.Linear(cardinality * 32 * block.expansion, n_classes) 56 | 57 | 58 | def generate_model(model_depth, **kwargs): 59 | assert model_depth in [50, 101, 152, 200] 60 | 61 | if model_depth == 50: 62 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], get_inplanes(), 63 | **kwargs) 64 | elif model_depth == 101: 65 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], get_inplanes(), 66 | **kwargs) 67 | elif model_depth == 152: 68 | model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], get_inplanes(), 69 | **kwargs) 70 | elif model_depth == 200: 71 | model = ResNeXt(ResNeXtBottleneck, [3, 24, 36, 3], get_inplanes(), 72 | **kwargs) 73 | 74 | return model 75 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import random 3 | from functools import partialmethod 4 | 5 | import torch 6 | import numpy as np 7 | from sklearn.metrics import precision_recall_fscore_support 8 | 9 | 10 | class AverageMeter(object): 11 | """Computes and stores the average and current value""" 12 | 13 | def __init__(self): 14 | self.reset() 15 | 16 | def reset(self): 17 | self.val = 0 18 | self.avg = 0 19 | self.sum = 0 20 | self.count = 0 21 | 22 | def update(self, val, n=1): 23 | self.val = val 24 | self.sum += val * n 25 | self.count += n 26 | self.avg = self.sum / self.count 27 | 28 | 29 | class Logger(object): 30 | 31 | def __init__(self, path, header): 32 | self.log_file = path.open('w') 33 | self.logger = csv.writer(self.log_file, delimiter='\t') 34 | 35 | self.logger.writerow(header) 36 | self.header = header 37 | 38 | def __del(self): 39 | self.log_file.close() 40 | 41 | def log(self, values): 42 | write_values = [] 43 | for col in self.header: 44 | assert col in values 45 | write_values.append(values[col]) 46 | 47 | self.logger.writerow(write_values) 48 | self.log_file.flush() 49 | 50 | 51 | def calculate_accuracy(outputs, targets): 52 | with torch.no_grad(): 53 | batch_size = targets.size(0) 54 | 55 | _, pred = outputs.topk(1, 1, largest=True, sorted=True) 56 | pred = pred.t() 57 | correct = pred.eq(targets.view(1, -1)) 58 | n_correct_elems = 
correct.float().sum().item() 59 | 60 | return n_correct_elems / batch_size 61 | 62 | 63 | def calculate_precision_and_recall(outputs, targets, pos_label=1): 64 | with torch.no_grad(): 65 | _, pred = outputs.topk(1, 1, largest=True, sorted=True) 66 | precision, recall, _, _ = precision_recall_fscore_support( 67 | targets.view(-1, 1).cpu().numpy(), 68 | pred.cpu().numpy()) 69 | 70 | return precision[pos_label], recall[pos_label] 71 | 72 | 73 | def worker_init_fn(worker_id): 74 | torch_seed = torch.initial_seed() 75 | 76 | random.seed(torch_seed + worker_id) 77 | 78 | if torch_seed >= 2**32: 79 | torch_seed = torch_seed % 2**32 80 | np.random.seed(torch_seed + worker_id) 81 | 82 | 83 | def get_lr(optimizer): 84 | lrs = [] 85 | for param_group in optimizer.param_groups: 86 | lr = float(param_group['lr']) 87 | lrs.append(lr) 88 | 89 | return max(lrs) 90 | 91 | 92 | def partialclass(cls, *args, **kwargs): 93 | 94 | class PartialClass(cls): 95 | __init__ = partialmethod(cls.__init__, *args, **kwargs) 96 | 97 | return PartialClass -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | from collections import defaultdict 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from utils import AverageMeter 9 | 10 | 11 | def get_video_results(outputs, class_names, output_topk): 12 | sorted_scores, locs = torch.topk(outputs, 13 | k=min(output_topk, len(class_names))) 14 | 15 | video_results = [] 16 | for i in range(sorted_scores.size(0)): 17 | video_results.append({ 18 | 'label': class_names[locs[i].item()], 19 | 'score': sorted_scores[i].item() 20 | }) 21 | 22 | return video_results 23 | 24 | 25 | def inference(data_loader, model, result_path, class_names, no_average, 26 | output_topk): 27 | print('inference') 28 | 29 | model.eval() 30 | 31 | batch_time = AverageMeter() 32 | data_time = AverageMeter() 33 | results = {'results': defaultdict(list)} 34 | 35 | end_time = time.time() 36 | 37 | with torch.no_grad(): 38 | for i, (inputs, targets) in enumerate(data_loader): 39 | data_time.update(time.time() - end_time) 40 | 41 | video_ids, segments = zip(*targets) 42 | outputs = model(inputs) 43 | outputs = F.softmax(outputs, dim=1).cpu() 44 | 45 | for j in range(outputs.size(0)): 46 | results['results'][video_ids[j]].append({ 47 | 'segment': segments[j], 48 | 'output': outputs[j] 49 | }) 50 | 51 | batch_time.update(time.time() - end_time) 52 | end_time = time.time() 53 | 54 | print('[{}/{}]\t' 55 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 56 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format( 57 | i + 1, 58 | len(data_loader), 59 | batch_time=batch_time, 60 | data_time=data_time)) 61 | 62 | inference_results = {'results': {}} 63 | if not no_average: 64 | for video_id, video_results in results['results'].items(): 65 | video_outputs = [ 66 | segment_result['output'] for segment_result in video_results 67 | ] 68 | video_outputs = torch.stack(video_outputs) 69 | average_scores = torch.mean(video_outputs, dim=0) 70 | inference_results['results'][video_id] = get_video_results( 71 | average_scores, class_names, output_topk) 72 | else: 73 | for video_id, video_results in results['results'].items(): 74 | inference_results['results'][video_id] = [] 75 | for segment_result in video_results: 76 | segment = segment_result['segment'] 77 | result = get_video_results(segment_result['output'], 78 | class_names, output_topk) 79 | 
inference_results['results'][video_id].append({ 80 | 'segment': segment, 81 | 'result': result 82 | }) 83 | 84 | with result_path.open('w') as f: 85 | json.dump(inference_results, f) 86 | -------------------------------------------------------------------------------- /util_scripts/hmdb51_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from .utils import get_n_frames 8 | 9 | 10 | def convert_csv_to_dict(csv_dir_path, split_index): 11 | database = {} 12 | for file_path in csv_dir_path.iterdir(): 13 | filename = file_path.name 14 | if 'split{}'.format(split_index) not in filename: 15 | continue 16 | 17 | data = pd.read_csv(csv_dir_path / filename, delimiter=' ', header=None) 18 | keys = [] 19 | subsets = [] 20 | for i in range(data.shape[0]): 21 | row = data.iloc[i, :] 22 | if row[1] == 0: 23 | continue 24 | elif row[1] == 1: 25 | subset = 'training' 26 | elif row[1] == 2: 27 | subset = 'validation' 28 | 29 | keys.append(row[0].split('.')[0]) 30 | subsets.append(subset) 31 | 32 | for i in range(len(keys)): 33 | key = keys[i] 34 | database[key] = {} 35 | database[key]['subset'] = subsets[i] 36 | label = '_'.join(filename.split('_')[:-2]) 37 | database[key]['annotations'] = {'label': label} 38 | 39 | return database 40 | 41 | 42 | def get_labels(csv_dir_path): 43 | labels = [] 44 | for file_path in csv_dir_path.iterdir(): 45 | labels.append('_'.join(file_path.name.split('_')[:-2])) 46 | return sorted(list(set(labels))) 47 | 48 | 49 | def convert_hmdb51_csv_to_json(csv_dir_path, split_index, video_dir_path, 50 | dst_json_path): 51 | labels = get_labels(csv_dir_path) 52 | database = convert_csv_to_dict(csv_dir_path, split_index) 53 | 54 | dst_data = {} 55 | dst_data['labels'] = labels 56 | dst_data['database'] = {} 57 | dst_data['database'].update(database) 58 | 59 | for k, v in dst_data['database'].items(): 60 | if v['annotations'] is not None: 61 | label = v['annotations']['label'] 62 | else: 63 | label = 'test' 64 | 65 | video_path = video_dir_path / label / k 66 | n_frames = get_n_frames(video_path) 67 | v['annotations']['segment'] = (1, n_frames + 1) 68 | 69 | with dst_json_path.open('w') as dst_file: 70 | json.dump(dst_data, dst_file) 71 | 72 | 73 | if __name__ == '__main__': 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument('dir_path', 76 | default=None, 77 | type=Path, 78 | help='Directory path of HMDB51 annotation files.') 79 | parser.add_argument('video_path', 80 | default=None, 81 | type=Path, 82 | help=('Path of video directory (jpg).' 
83 | 'Using to get n_frames of each video.')) 84 | parser.add_argument('dst_dir_path', 85 | default=None, 86 | type=Path, 87 | help='Directory path of dst json file.') 88 | 89 | args = parser.parse_args() 90 | 91 | for split_index in range(1, 4): 92 | dst_json_path = args.dst_dir_path / 'hmdb51_{}.json'.format(split_index) 93 | convert_hmdb51_csv_to_json(args.dir_path, split_index, args.video_path, 94 | dst_json_path) 95 | -------------------------------------------------------------------------------- /util_scripts/eval_accuracy.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from pathlib import Path 4 | 5 | 6 | def get_class_labels(data): 7 | class_labels_map = {} 8 | index = 0 9 | for class_label in data['labels']: 10 | class_labels_map[class_label] = index 11 | index += 1 12 | return class_labels_map 13 | 14 | 15 | def load_ground_truth(ground_truth_path, subset): 16 | with ground_truth_path.open('r') as f: 17 | data = json.load(f) 18 | 19 | class_labels_map = get_class_labels(data) 20 | 21 | ground_truth = [] 22 | for video_id, v in data['database'].items(): 23 | if subset != v['subset']: 24 | continue 25 | this_label = v['annotations']['label'] 26 | ground_truth.append((video_id, class_labels_map[this_label])) 27 | 28 | return ground_truth, class_labels_map 29 | 30 | 31 | def load_result(result_path, top_k, class_labels_map): 32 | with result_path.open('r') as f: 33 | data = json.load(f) 34 | 35 | result = {} 36 | for video_id, v in data['results'].items(): 37 | labels_and_scores = [] 38 | for this_result in v: 39 | label = class_labels_map[this_result['label']] 40 | score = this_result['score'] 41 | labels_and_scores.append((label, score)) 42 | labels_and_scores.sort(key=lambda x: x[1], reverse=True) 43 | result[video_id] = list(zip(*labels_and_scores[:top_k]))[0] 44 | return result 45 | 46 | 47 | def remove_nonexistent_ground_truth(ground_truth, result): 48 | exist_ground_truth = [line for line in ground_truth if line[0] in result] 49 | 50 | return exist_ground_truth 51 | 52 | 53 | def evaluate(ground_truth_path, result_path, subset, top_k, ignore): 54 | print('load ground truth') 55 | ground_truth, class_labels_map = load_ground_truth(ground_truth_path, 56 | subset) 57 | print('number of ground truth: {}'.format(len(ground_truth))) 58 | 59 | print('load result') 60 | result = load_result(result_path, top_k, class_labels_map) 61 | print('number of result: {}'.format(len(result))) 62 | 63 | n_ground_truth = len(ground_truth) 64 | ground_truth = remove_nonexistent_ground_truth(ground_truth, result) 65 | if ignore: 66 | n_ground_truth = len(ground_truth) 67 | 68 | print('calculate top-{} accuracy'.format(top_k)) 69 | correct = [1 if line[1] in result[line[0]] else 0 for line in ground_truth] 70 | accuracy = sum(correct) / n_ground_truth 71 | 72 | print('top-{} accuracy: {}'.format(top_k, accuracy)) 73 | return accuracy 74 | 75 | 76 | if __name__ == '__main__': 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('ground_truth_path', type=Path) 79 | parser.add_argument('result_path', type=Path) 80 | parser.add_argument('-k', type=int, default=1) 81 | parser.add_argument('--subset', type=str, default='validation') 82 | parser.add_argument('--save', action='store_true') 83 | parser.add_argument( 84 | '--ignore', 85 | action='store_true', 86 | help='ignore nonexistent videos in result') 87 | 88 | args = parser.parse_args() 89 | 90 | accuracy = evaluate(args.ground_truth_path, args.result_path, 
args.subset, 91 | args.k, args.ignore) 92 | 93 | if args.save: 94 | with (args.result_path.parent / 'top{}.txt'.format( 95 | args.k)).open('w') as f: 96 | f.write(str(accuracy)) 97 | -------------------------------------------------------------------------------- /util_scripts/ucf101_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from .utils import get_n_frames 8 | 9 | 10 | def convert_csv_to_dict(csv_path, subset): 11 | data = pd.read_csv(csv_path, delimiter=' ', header=None) 12 | keys = [] 13 | key_labels = [] 14 | for i in range(data.shape[0]): 15 | row = data.iloc[i, :] 16 | slash_rows = data.iloc[i, 0].split('/') 17 | class_name = slash_rows[0] 18 | basename = slash_rows[1].split('.')[0] 19 | 20 | keys.append(basename) 21 | key_labels.append(class_name) 22 | 23 | database = {} 24 | for i in range(len(keys)): 25 | key = keys[i] 26 | database[key] = {} 27 | database[key]['subset'] = subset 28 | label = key_labels[i] 29 | database[key]['annotations'] = {'label': label} 30 | 31 | return database 32 | 33 | 34 | def load_labels(label_csv_path): 35 | data = pd.read_csv(label_csv_path, delimiter=' ', header=None) 36 | labels = [] 37 | for i in range(data.shape[0]): 38 | labels.append(data.iloc[i, 1]) 39 | return labels 40 | 41 | 42 | def convert_ucf101_csv_to_json(label_csv_path, train_csv_path, val_csv_path, 43 | video_dir_path, dst_json_path): 44 | labels = load_labels(label_csv_path) 45 | train_database = convert_csv_to_dict(train_csv_path, 'training') 46 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 47 | 48 | dst_data = {} 49 | dst_data['labels'] = labels 50 | dst_data['database'] = {} 51 | dst_data['database'].update(train_database) 52 | dst_data['database'].update(val_database) 53 | 54 | for k, v in dst_data['database'].items(): 55 | if v['annotations'] is not None: 56 | label = v['annotations']['label'] 57 | else: 58 | label = 'test' 59 | 60 | video_path = video_dir_path / label / k 61 | n_frames = get_n_frames(video_path) 62 | v['annotations']['segment'] = (1, n_frames + 1) 63 | 64 | with dst_json_path.open('w') as dst_file: 65 | json.dump(dst_data, dst_file) 66 | 67 | 68 | if __name__ == '__main__': 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument('dir_path', 71 | default=None, 72 | type=Path, 73 | help=('Directory path including classInd.txt, ' 74 | 'trainlist0-.txt, testlist0-.txt')) 75 | parser.add_argument('video_path', 76 | default=None, 77 | type=Path, 78 | help=('Path of video directory (jpg).' 
79 | 'Using to get n_frames of each video.')) 80 | parser.add_argument('dst_path', 81 | default=None, 82 | type=Path, 83 | help='Directory path of dst json file.') 84 | 85 | args = parser.parse_args() 86 | 87 | for split_index in range(1, 4): 88 | label_csv_path = args.dir_path / 'classInd.txt' 89 | train_csv_path = args.dir_path / 'trainlist0{}.txt'.format(split_index) 90 | val_csv_path = args.dir_path / 'testlist0{}.txt'.format(split_index) 91 | dst_json_path = args.dst_path / 'ucf101_0{}.json'.format(split_index) 92 | 93 | convert_ucf101_csv_to_json(label_csv_path, train_csv_path, val_csv_path, 94 | args.video_path, dst_json_path) 95 | -------------------------------------------------------------------------------- /models/pre_act_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .resnet import conv3x3x3, conv1x1x1, get_inplanes, ResNet 6 | 7 | 8 | class PreActivationBasicBlock(nn.Module): 9 | expansion = 1 10 | 11 | def __init__(self, inplanes, planes, stride=1, downsample=None): 12 | super().__init__() 13 | 14 | self.bn1 = nn.BatchNorm3d(inplanes) 15 | self.conv1 = conv3x3x3(inplanes, planes, stride) 16 | self.bn2 = nn.BatchNorm3d(planes) 17 | self.conv2 = conv3x3x3(planes, planes) 18 | self.relu = nn.ReLU(inplace=True) 19 | self.downsample = downsample 20 | self.stride = stride 21 | 22 | def forward(self, x): 23 | residual = x 24 | 25 | out = self.bn1(x) 26 | out = self.relu(out) 27 | out = self.conv1(out) 28 | 29 | out = self.bn2(out) 30 | out = self.relu(out) 31 | out = self.conv2(out) 32 | 33 | if self.downsample is not None: 34 | residual = self.downsample(x) 35 | 36 | out += residual 37 | 38 | return out 39 | 40 | 41 | class PreActivationBottleneck(nn.Module): 42 | expansion = 4 43 | 44 | def __init__(self, inplanes, planes, stride=1, downsample=None): 45 | super().__init__() 46 | 47 | self.bn1 = nn.BatchNorm3d(inplanes) 48 | self.conv1 = conv1x1x1(inplanes, planes) 49 | self.bn2 = nn.BatchNorm3d(planes) 50 | self.conv2 = conv3x3x3(planes, planes, stride) 51 | self.bn3 = nn.BatchNorm3d(planes) 52 | self.conv3 = conv1x1x1(planes, planes * self.expansion) 53 | self.relu = nn.ReLU(inplace=True) 54 | self.downsample = downsample 55 | self.stride = stride 56 | 57 | def forward(self, x): 58 | residual = x 59 | 60 | out = self.bn1(x) 61 | out = self.relu(out) 62 | out = self.conv1(out) 63 | 64 | out = self.bn2(out) 65 | out = self.relu(out) 66 | out = self.conv2(out) 67 | 68 | out = self.bn3(out) 69 | out = self.relu(out) 70 | out = self.conv3(out) 71 | 72 | if self.downsample is not None: 73 | residual = self.downsample(x) 74 | 75 | out += residual 76 | 77 | return out 78 | 79 | 80 | def generate_model(model_depth, **kwargs): 81 | assert model_depth in [10, 18, 34, 50, 101, 152, 200] 82 | 83 | if model_depth == 10: 84 | model = ResNet(PreActivationBasicBlock, [1, 1, 1, 1], get_inplanes(), 85 | **kwargs) 86 | elif model_depth == 18: 87 | model = ResNet(PreActivationBasicBlock, [2, 2, 2, 2], get_inplanes(), 88 | **kwargs) 89 | elif model_depth == 34: 90 | model = ResNet(PreActivationBasicBlock, [3, 4, 6, 3], get_inplanes(), 91 | **kwargs) 92 | elif model_depth == 50: 93 | model = ResNet(PreActivationBottleneck, [3, 4, 6, 3], get_inplanes(), 94 | **kwargs) 95 | elif model_depth == 101: 96 | model = ResNet(PreActivationBottleneck, [3, 4, 23, 3], get_inplanes(), 97 | **kwargs) 98 | elif model_depth == 152: 99 | model = ResNet(PreActivationBottleneck, [3, 8, 
36, 3], get_inplanes(), 100 | **kwargs) 101 | elif model_depth == 200: 102 | model = ResNet(PreActivationBottleneck, [3, 24, 36, 3], get_inplanes(), 103 | **kwargs) 104 | 105 | return model 106 | -------------------------------------------------------------------------------- /util_scripts/mit_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from .utils import get_n_frames 8 | 9 | 10 | def convert_csv_to_dict(csv_path, subset): 11 | data = pd.read_csv(csv_path, header=None) 12 | keys = [] 13 | key_labels = [] 14 | if subset == 'testing': 15 | for i in range(data.shape[0]): 16 | basename = data.iloc[i, 0].split('/') 17 | assert len(basename) == 1 18 | basename = Path(basename[0]).stem 19 | 20 | keys.append(basename) 21 | else: 22 | for i in range(data.shape[0]): 23 | basename = data.iloc[i, 0].split('/') 24 | assert len(basename) == 2 25 | basename = Path(basename[1]).stem 26 | 27 | keys.append(basename) 28 | key_labels.append(data.iloc[i, 1]) 29 | 30 | database = {} 31 | for i in range(len(keys)): 32 | key = keys[i] 33 | database[key] = {} 34 | database[key]['subset'] = subset 35 | if subset != 'testing': 36 | label = key_labels[i] 37 | database[key]['annotations'] = {'label': label} 38 | else: 39 | database[key]['annotations'] = {} 40 | 41 | return database 42 | 43 | 44 | def load_labels(train_csv_path): 45 | data = pd.read_csv(train_csv_path, header=None) 46 | return data.iloc[:, 0].tolist() 47 | 48 | 49 | def convert_mit_csv_to_json(class_file_path, train_csv_path, val_csv_path, 50 | test_csv_path, video_dir_path, dst_json_path): 51 | labels = load_labels(class_file_path) 52 | train_database = convert_csv_to_dict(train_csv_path, 'training') 53 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 54 | if test_csv_path.exists(): 55 | test_database = convert_csv_to_dict(test_csv_path, 'testing') 56 | 57 | dst_data = {} 58 | dst_data['labels'] = labels 59 | dst_data['database'] = {} 60 | dst_data['database'].update(train_database) 61 | dst_data['database'].update(val_database) 62 | if test_csv_path.exists(): 63 | dst_data['database'].update(test_database) 64 | 65 | for k, v in dst_data['database'].items(): 66 | if 'label' in v['annotations']: 67 | label = v['annotations']['label'] 68 | else: 69 | label = 'test' 70 | 71 | video_path = video_dir_path / label / k 72 | n_frames = get_n_frames(video_path) 73 | v['annotations']['segment'] = (1, n_frames + 1) 74 | 75 | with dst_json_path.open('w') as dst_file: 76 | json.dump(dst_data, dst_file) 77 | 78 | 79 | if __name__ == '__main__': 80 | parser = argparse.ArgumentParser() 81 | parser.add_argument( 82 | 'dir_path', 83 | default=None, 84 | type=Path, 85 | help=('Directory path including moments_categories.txt, ' 86 | 'trainingSet.csv, validationSet.csv, ' 87 | '(testingSet.csv (optional))')) 88 | parser.add_argument('video_path', 89 | default=None, 90 | type=Path, 91 | help=('Path of video directory (jpg).' 
92 | 'Using to get n_frames of each video.')) 93 | parser.add_argument('dst_path', 94 | default=None, 95 | type=Path, 96 | help='Path of dst json file.') 97 | 98 | args = parser.parse_args() 99 | 100 | class_file_path = args.dir_path / 'moments_categories.txt' 101 | train_csv_path = args.dir_path / 'trainingSet.csv' 102 | val_csv_path = args.dir_path / 'validationSet.csv' 103 | test_csv_path = args.dir_path / 'testingSet.csv' 104 | 105 | convert_mit_csv_to_json(class_file_path, train_csv_path, val_csv_path, 106 | test_csv_path, args.video_path, args.dst_path) 107 | -------------------------------------------------------------------------------- /util_scripts/generate_video_jpgs.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import argparse 3 | from pathlib import Path 4 | 5 | from joblib import Parallel, delayed 6 | 7 | 8 | def video_process(video_file_path, dst_root_path, ext, fps=-1, size=240): 9 | if ext != video_file_path.suffix: 10 | return 11 | 12 | ffprobe_cmd = ('ffprobe -v error -select_streams v:0 ' 13 | '-of default=noprint_wrappers=1:nokey=1 -show_entries ' 14 | 'stream=width,height,avg_frame_rate,duration').split() 15 | ffprobe_cmd.append(str(video_file_path)) 16 | 17 | p = subprocess.run(ffprobe_cmd, capture_output=True) 18 | res = p.stdout.decode('utf-8').splitlines() 19 | if len(res) < 4: 20 | return 21 | 22 | frame_rate = [float(r) for r in res[2].split('/')] 23 | frame_rate = frame_rate[0] / frame_rate[1] 24 | duration = float(res[3]) 25 | n_frames = int(frame_rate * duration) 26 | 27 | name = video_file_path.stem 28 | dst_dir_path = dst_root_path / name 29 | dst_dir_path.mkdir(exist_ok=True) 30 | n_exist_frames = len([ 31 | x for x in dst_dir_path.iterdir() 32 | if x.suffix == '.jpg' and x.name[0] != '.' 33 | ]) 34 | 35 | if n_exist_frames >= n_frames: 36 | return 37 | 38 | width = int(res[0]) 39 | height = int(res[1]) 40 | 41 | if width > height: 42 | vf_param = 'scale=-1:{}'.format(size) 43 | else: 44 | vf_param = 'scale={}:-1'.format(size) 45 | 46 | if fps > 0: 47 | vf_param += ',minterpolate={}'.format(fps) 48 | 49 | ffmpeg_cmd = ['ffmpeg', '-i', str(video_file_path), '-vf', vf_param] 50 | ffmpeg_cmd += ['-threads', '1', '{}/image_%05d.jpg'.format(dst_dir_path)] 51 | print(ffmpeg_cmd) 52 | subprocess.run(ffmpeg_cmd) 53 | print('\n') 54 | 55 | 56 | def class_process(class_dir_path, dst_root_path, ext, fps=-1, size=240): 57 | if not class_dir_path.is_dir(): 58 | return 59 | 60 | dst_class_path = dst_root_path / class_dir_path.name 61 | dst_class_path.mkdir(exist_ok=True) 62 | 63 | for video_file_path in sorted(class_dir_path.iterdir()): 64 | video_process(video_file_path, dst_class_path, ext, fps, size) 65 | 66 | 67 | if __name__ == '__main__': 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument( 70 | 'dir_path', default=None, type=Path, help='Directory path of videos') 71 | parser.add_argument( 72 | 'dst_path', 73 | default=None, 74 | type=Path, 75 | help='Directory path of jpg videos') 76 | parser.add_argument( 77 | 'dataset', 78 | default='', 79 | type=str, 80 | help='Dataset name (kinetics | mit | ucf101 | hmdb51 | activitynet)') 81 | parser.add_argument( 82 | '--n_jobs', default=-1, type=int, help='Number of parallel jobs') 83 | parser.add_argument( 84 | '--fps', 85 | default=-1, 86 | type=int, 87 | help=('Frame rates of output videos. 
' 88 | '-1 means original frame rates.')) 89 | parser.add_argument( 90 | '--size', default=240, type=int, help='Frame size of output videos.') 91 | args = parser.parse_args() 92 | 93 | if args.dataset in ['kinetics', 'mit', 'activitynet']: 94 | ext = '.mp4' 95 | else: 96 | ext = '.avi' 97 | 98 | if args.dataset == 'activitynet': 99 | video_file_paths = [x for x in sorted(args.dir_path.iterdir())] 100 | status_list = Parallel( 101 | n_jobs=args.n_jobs, 102 | backend='threading')(delayed(video_process)( 103 | video_file_path, args.dst_path, ext, args.fps, args.size) 104 | for video_file_path in video_file_paths) 105 | else: 106 | class_dir_paths = [x for x in sorted(args.dir_path.iterdir())] 107 | test_set_video_path = args.dir_path / 'test' 108 | if test_set_video_path.exists(): 109 | class_dir_paths.append(test_set_video_path) 110 | 111 | status_list = Parallel( 112 | n_jobs=args.n_jobs, 113 | backend='threading')(delayed(class_process)( 114 | class_dir_path, args.dst_path, ext, args.fps, args.size) 115 | for class_dir_path in class_dir_paths) 116 | -------------------------------------------------------------------------------- /util_scripts/sthv2_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from utils import get_n_frames 8 | import pdb 9 | import tqdm 10 | import json 11 | 12 | 13 | def convert_json_to_dict(csv_path, subset): 14 | lines = json.load(open(csv_path,'r')) 15 | database = {} 16 | 17 | for line in lines: 18 | video_id = line['id'] 19 | database[video_id] = {} 20 | database[video_id]['subset'] = subset 21 | if subset != 'testing': 22 | label = line['template'].replace('[','').replace(']','') 23 | database[video_id]['annotations'] = {'label': label} 24 | else: 25 | database[video_id]['annotations'] = {} 26 | 27 | return database 28 | 29 | 30 | 31 | def convert_csv_to_dict(csv_path, subset): 32 | lines = open(csv_path, 'r').readlines() 33 | keys = [] 34 | key_labels = [] 35 | database = {} 36 | 37 | for line in lines: 38 | video_id, nframe, label = line.strip('\n').split(' ') 39 | 40 | database[video_id] = {} 41 | database[video_id]['subset'] = subset 42 | if subset != 'testing': 43 | database[video_id]['annotations'] = {'label': label} 44 | else: 45 | database[video_id]['annotations'] = {} 46 | 47 | return database 48 | 49 | 50 | def load_labels(train_csv_path): 51 | data = open(train_csv_path, 'r').readlines() 52 | data = [e.strip('\n') for e in data] 53 | return data 54 | # data = pd.read_csv(train_csv_path, header=None) 55 | # return data.iloc[:, 0].tolist() 56 | 57 | 58 | def convert_sthv2_csv_to_json(class_file_path, train_csv_path, val_csv_path, 59 | test_csv_path, video_dir_path, dst_json_path): 60 | labels = load_labels(class_file_path) 61 | train_database = convert_json_to_dict(train_csv_path, 'training') 62 | val_database = convert_json_to_dict(val_csv_path, 'validation') 63 | if test_csv_path.exists(): 64 | test_database = convert_json_to_dict(test_csv_path, 'testing') 65 | 66 | dst_data = {} 67 | dst_data['labels'] = labels 68 | dst_data['database'] = {} 69 | dst_data['database'].update(train_database) 70 | dst_data['database'].update(val_database) 71 | if test_csv_path.exists(): 72 | dst_data['database'].update(test_database) 73 | 74 | count = 0 75 | for k, v in tqdm.tqdm(dst_data['database'].items()): 76 | if 'label' in v['annotations']: 77 | label = v['annotations']['label'] 78 | else: 79 | label = 'test' 80 | 81 | 
video_path = video_dir_path / k 82 | n_frames = get_n_frames(video_path) 83 | v['annotations']['segment'] = (1, n_frames + 1) 84 | v['video_path'] = str(video_path) 85 | # count += 1 86 | # if count == 1000: 87 | # break 88 | 89 | with dst_json_path.open('w') as dst_file: 90 | json.dump(dst_data, dst_file) 91 | 92 | 93 | if __name__ == '__main__': 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument( 96 | 'dir_path', 97 | default='data/something/v2', 98 | type=Path, 99 | help=('Directory path including moments_categories.txt, ' 100 | 'trainingSet.csv, validationSet.csv, ' 101 | '(testingSet.csv (optional))')) 102 | parser.add_argument('video_path', 103 | default='data/something/v2/img', 104 | type=Path, 105 | help=('Path of video directory (jpg).' 106 | 'Using to get n_frames of each video.')) 107 | parser.add_argument('dst_path', 108 | default='./', 109 | type=Path, 110 | help='Path of dst json file.') 111 | 112 | args = parser.parse_args() 113 | 114 | class_file_path = args.dir_path / 'category.txt' 115 | train_csv_path = args.dir_path / 'something-something-v2-train.json' 116 | val_csv_path = args.dir_path / 'something-something-v2-validation.json' 117 | test_csv_path = args.dir_path / 'something-something-v2-test.json' 118 | # train_csv_path = args.dir_path / 'train_videofolder.txt' 119 | # val_csv_path = args.dir_path / 'val_videofolder.txt' 120 | # test_csv_path = args.dir_path / 'test_videofolder.txt' 121 | 122 | convert_sthv2_csv_to_json(class_file_path, train_csv_path, val_csv_path, 123 | test_csv_path, args.video_path, args.dst_path) 124 | -------------------------------------------------------------------------------- /util_scripts/sthv1_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from utils import get_n_frames 8 | import pdb 9 | import tqdm 10 | import json 11 | 12 | 13 | def convert_json_to_dict(csv_path, subset): 14 | lines = json.load(open(csv_path,'r')) 15 | database = {} 16 | 17 | for line in lines: 18 | video_id = line['id'] 19 | database[video_id] = {} 20 | database[video_id]['subset'] = subset 21 | if subset != 'testing': 22 | label = line['template'].replace('[','').replace(']','') 23 | database[video_id]['annotations'] = {'label': label} 24 | else: 25 | database[video_id]['annotations'] = {} 26 | 27 | return database 28 | 29 | 30 | 31 | def convert_csv_to_dict(csv_path, subset): 32 | lines = open(csv_path, 'r').readlines() 33 | keys = [] 34 | key_labels = [] 35 | database = {} 36 | 37 | for line in lines: 38 | if subset != 'testing': 39 | video_id, label = line.strip('\n').split(';') 40 | else: 41 | video_id = line.strip('\n') 42 | 43 | database[video_id] = {} 44 | database[video_id]['subset'] = subset 45 | if subset != 'testing': 46 | database[video_id]['annotations'] = {'label': label} 47 | else: 48 | database[video_id]['annotations'] = {} 49 | 50 | return database 51 | 52 | 53 | def load_labels(train_csv_path): 54 | data = open(train_csv_path, 'r').readlines() 55 | data = [e.strip('\n') for e in data] 56 | return data 57 | # data = pd.read_csv(train_csv_path, header=None) 58 | # return data.iloc[:, 0].tolist() 59 | 60 | 61 | def convert_sthv1_csv_to_json(class_file_path, train_csv_path, val_csv_path, 62 | test_csv_path, video_dir_path, dst_json_path): 63 | labels = load_labels(class_file_path) 64 | train_database = convert_csv_to_dict(train_csv_path, 'training') 65 | val_database = 
convert_csv_to_dict(val_csv_path, 'validation') 66 | if test_csv_path.exists(): 67 | test_database = convert_csv_to_dict(test_csv_path, 'testing') 68 | 69 | dst_data = {} 70 | dst_data['labels'] = labels 71 | dst_data['database'] = {} 72 | dst_data['database'].update(train_database) 73 | dst_data['database'].update(val_database) 74 | if test_csv_path.exists(): 75 | dst_data['database'].update(test_database) 76 | 77 | count = 0 78 | for k, v in tqdm.tqdm(dst_data['database'].items()): 79 | if 'label' in v['annotations']: 80 | label = v['annotations']['label'] 81 | else: 82 | label = 'test' 83 | 84 | video_path = video_dir_path / k 85 | n_frames = get_n_frames(video_path) 86 | v['annotations']['segment'] = (1, n_frames + 1) 87 | v['video_path'] = str(video_path) 88 | # count += 1 89 | # if count == 1000: 90 | # break 91 | 92 | with dst_json_path.open('w') as dst_file: 93 | json.dump(dst_data, dst_file) 94 | 95 | 96 | if __name__ == '__main__': 97 | parser = argparse.ArgumentParser() 98 | parser.add_argument( 99 | 'dir_path', 100 | default='data/something/v1', 101 | type=Path, 102 | help=('Directory path including moments_categories.txt, ' 103 | 'trainingSet.csv, validationSet.csv, ' 104 | '(testingSet.csv (optional))')) 105 | parser.add_argument('video_path', 106 | default='data/something/v1/img', 107 | type=Path, 108 | help=('Path of video directory (jpg).' 109 | 'Using to get n_frames of each video.')) 110 | parser.add_argument('dst_path', 111 | default='./', 112 | type=Path, 113 | help='Path of dst json file.') 114 | 115 | args = parser.parse_args() 116 | 117 | class_file_path = args.dir_path / 'category.txt' 118 | train_csv_path = args.dir_path / 'something-something-v1-train.csv' 119 | val_csv_path = args.dir_path / 'something-something-v1-validation.csv' 120 | test_csv_path = args.dir_path / 'something-something-v1-test.csv' 121 | # train_csv_path = args.dir_path / 'train_videofolder.txt' 122 | # val_csv_path = args.dir_path / 'val_videofolder.txt' 123 | # test_csv_path = args.dir_path / 'test_videofolder.txt' 124 | 125 | convert_sthv1_csv_to_json(class_file_path, train_csv_path, val_csv_path, 126 | test_csv_path, args.video_path, args.dst_path) 127 | -------------------------------------------------------------------------------- /util_scripts/kinetics_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from .utils import get_n_frames, get_n_frames_hdf5 8 | 9 | 10 | def convert_csv_to_dict(csv_path, subset): 11 | data = pd.read_csv(csv_path) 12 | keys = [] 13 | key_labels = [] 14 | for i in range(data.shape[0]): 15 | row = data.iloc[i, :] 16 | basename = '%s_%s_%s' % (row['youtube_id'], '%06d' % row['time_start'], 17 | '%06d' % row['time_end']) 18 | keys.append(basename) 19 | if subset != 'testing': 20 | key_labels.append(row['label']) 21 | 22 | database = {} 23 | for i in range(len(keys)): 24 | key = keys[i] 25 | database[key] = {} 26 | database[key]['subset'] = subset 27 | if subset != 'testing': 28 | label = key_labels[i] 29 | database[key]['annotations'] = {'label': label} 30 | else: 31 | database[key]['annotations'] = {} 32 | 33 | return database 34 | 35 | 36 | def load_labels(train_csv_path): 37 | data = pd.read_csv(train_csv_path) 38 | return data['label'].unique().tolist() 39 | 40 | 41 | def convert_kinetics_csv_to_json(train_csv_path, val_csv_path, test_csv_path, 42 | video_dir_path, video_type, dst_json_path): 43 | labels = 
load_labels(train_csv_path) 44 | train_database = convert_csv_to_dict(train_csv_path, 'training') 45 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 46 | if test_csv_path.exists(): 47 | test_database = convert_csv_to_dict(test_csv_path, 'testing') 48 | 49 | dst_data = {} 50 | dst_data['labels'] = labels 51 | dst_data['database'] = {} 52 | dst_data['database'].update(train_database) 53 | dst_data['database'].update(val_database) 54 | if test_csv_path.exists(): 55 | dst_data['database'].update(test_database) 56 | 57 | for k, v in dst_data['database'].items(): 58 | if 'label' in v['annotations']: 59 | label = v['annotations']['label'] 60 | else: 61 | label = 'test' 62 | 63 | if video_type == 'jpg': 64 | video_path = video_dir_path / label / k 65 | if video_path.exists(): 66 | n_frames = get_n_frames(video_path) 67 | v['annotations']['segment'] = (1, n_frames + 1) 68 | else: 69 | video_path = video_dir_path / label / f'{k}.hdf5' 70 | if video_path.exists(): 71 | n_frames = get_n_frames_hdf5(video_path) 72 | v['annotations']['segment'] = (0, n_frames) 73 | 74 | with dst_json_path.open('w') as dst_file: 75 | json.dump(dst_data, dst_file) 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument('dir_path', 81 | default=None, 82 | type=Path, 83 | help=('Directory path including ' 84 | 'kinetics_train.csv, kinetics_val.csv, ' 85 | '(kinetics_test.csv (optional))')) 86 | parser.add_argument( 87 | 'n_classes', 88 | default=700, 89 | type=int, 90 | help='400, 600, or 700 (Kinetics-400, Kinetics-600, or Kinetics-700)') 91 | parser.add_argument('video_path', 92 | default=None, 93 | type=Path, 94 | help=('Path of video directory (jpg or hdf5).' 95 | 'Using to get n_frames of each video.')) 96 | parser.add_argument('video_type', 97 | default='jpg', 98 | type=str, 99 | help=('jpg or hdf5')) 100 | parser.add_argument('dst_path', 101 | default=None, 102 | type=Path, 103 | help='Path of dst json file.') 104 | 105 | args = parser.parse_args() 106 | 107 | assert args.video_type in ['jpg', 'hdf5'] 108 | 109 | train_csv_path = (args.dir_path / 110 | 'kinetics-{}_train.csv'.format(args.n_classes)) 111 | val_csv_path = (args.dir_path / 112 | 'kinetics-{}_val.csv'.format(args.n_classes)) 113 | test_csv_path = (args.dir_path / 114 | 'kinetics-{}_test.csv'.format(args.n_classes)) 115 | 116 | convert_kinetics_csv_to_json(train_csv_path, val_csv_path, test_csv_path, 117 | args.video_path, args.video_type, 118 | args.dst_path) 119 | -------------------------------------------------------------------------------- /validation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import sys 4 | import pdb 5 | import torch 6 | import torch.distributed as dist 7 | 8 | from utils import AverageMeter, calculate_accuracy 9 | 10 | 11 | def val_epoch(epoch, 12 | data_loader, 13 | model, 14 | criterion, 15 | device, 16 | logger, 17 | tb_writer=None, 18 | distributed=False, 19 | rpn=None, 20 | det_interval=2, 21 | nrois=10): 22 | print('validation at epoch {}'.format(epoch)) 23 | 24 | model.eval() 25 | if rpn is not None: 26 | rpn.eval() 27 | 28 | batch_time = AverageMeter() 29 | data_time = AverageMeter() 30 | losses = AverageMeter() 31 | accuracies = AverageMeter() 32 | 33 | end_time = time.time() 34 | 35 | with torch.no_grad(): 36 | for i, (inputs, targets) in enumerate(data_loader): 37 | data_time.update(time.time() - end_time) 38 | targets = targets.to(device, non_blocking=True) 39 | if rpn is 
not None: 40 | ''' 41 | There was an unexpected CUDNN_ERROR when len(rpn_inputs) is 42 | decrased. 43 | ''' 44 | T = inputs.shape[2] 45 | N, C, T, H, W = inputs.size() 46 | if i == 0: 47 | max_N = N 48 | # sample frames for RPN 49 | sample = torch.arange(0,T,det_interval) 50 | rpn_inputs = inputs[:,:,sample].transpose(1,2).contiguous() 51 | rpn_inputs = rpn_inputs.view(-1,C,H,W) 52 | if len(inputs) < max_N: 53 | print("Modified from {} to {}".format(len(inputs), max_N)) 54 | rpn_inputs = torch.cat((rpn_inputs, rpn_inputs[:(max_N-len(inputs))*(T//det_interval)])) 55 | with torch.no_grad(): 56 | proposals = rpn(rpn_inputs) 57 | proposals = proposals.view(-1,T//det_interval,nrois,4) 58 | if len(inputs) < max_N: 59 | proposals = proposals[:len(inputs)] 60 | outputs = model(inputs, proposals.detach()) 61 | # update to the largest batch_size 62 | max_N = max(N, max_N) 63 | else: 64 | outputs = model(inputs) 65 | 66 | loss = criterion(outputs, targets) 67 | acc = calculate_accuracy(outputs, targets) 68 | 69 | losses.update(loss.item(), inputs.size(0)) 70 | accuracies.update(acc, inputs.size(0)) 71 | 72 | batch_time.update(time.time() - end_time) 73 | end_time = time.time() 74 | 75 | print('Epoch: [{0}][{1}/{2}]\t' 76 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 77 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 78 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 79 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 80 | epoch, 81 | i + 1, 82 | len(data_loader), 83 | batch_time=batch_time, 84 | data_time=data_time, 85 | loss=losses, 86 | acc=accuracies)) 87 | 88 | if distributed: 89 | loss_sum = torch.tensor([losses.sum], 90 | dtype=torch.float32, 91 | device=device) 92 | loss_count = torch.tensor([losses.count], 93 | dtype=torch.float32, 94 | device=device) 95 | acc_sum = torch.tensor([accuracies.sum], 96 | dtype=torch.float32, 97 | device=device) 98 | acc_count = torch.tensor([accuracies.count], 99 | dtype=torch.float32, 100 | device=device) 101 | 102 | dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) 103 | dist.all_reduce(loss_count, op=dist.ReduceOp.SUM) 104 | dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) 105 | dist.all_reduce(acc_count, op=dist.ReduceOp.SUM) 106 | 107 | losses.avg = loss_sum.item() / loss_count.item() 108 | accuracies.avg = acc_sum.item() / acc_count.item() 109 | 110 | if logger is not None: 111 | logger.log({'epoch': epoch, 'loss': losses.avg, 'acc': accuracies.avg}) 112 | 113 | if tb_writer is not None: 114 | tb_writer.add_scalar('val/loss', losses.avg, epoch) 115 | tb_writer.add_scalar('val/acc', accuracies.avg, epoch) 116 | 117 | return losses.avg 118 | -------------------------------------------------------------------------------- /strg.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import pdb 3 | import os 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torchvision.ops import RoIAlign 9 | 10 | from rgcn_models import RGCN 11 | 12 | from model import (generate_model, load_pretrained_model, make_data_parallel, 13 | get_fine_tuning_parameters) 14 | 15 | from models import resnet, resnet2p1d, pre_act_resnet, wide_resnet, resnext, densenet 16 | 17 | class STRG(nn.Module): 18 | def __init__(self, base_model, in_channel=2048, out_channel=512, 19 | nclass=174, dropout=0.3, nrois=10, 20 | freeze_bn=True, freeze_bn_affine=True, 21 | roi_size=7 22 | ): 23 | super(STRG,self).__init__() 24 | self.base_model = base_model 25 | self.in_channel = in_channel 
26 | self.out_channel = out_channel 27 | self.nclass = nclass 28 | self.nrois = nrois 29 | 30 | self.freeze_bn = freeze_bn 31 | self.freeze_bn_affine = freeze_bn_affine 32 | 33 | self.base_model.fc = nn.Identity() 34 | self.base_model.avgpool = nn.Identity() 35 | if False: 36 | self.base_model.maxpool.stride = (1,2,2) 37 | self.base_model.layer3[0].conv2.stride=(1,2,2) 38 | self.base_model.layer3[0].downsample[0].stride=(1,2,2) 39 | self.base_model.layer4[0].conv2.stride=(1,1,1) 40 | self.base_model.layer4[0].downsample[0].stride=(1,1,1) 41 | 42 | self.reducer = nn.Conv3d(self.in_channel, self.out_channel,1) 43 | self.classifier = nn.Linear(2*self.out_channel, nclass) 44 | self.avg_pool = nn.Sequential( 45 | nn.AdaptiveAvgPool3d(1), 46 | nn.Dropout(p=dropout) 47 | ) 48 | self.max_pool = nn.AdaptiveAvgPool2d(1) 49 | 50 | self.strg_gcn = RGCN() 51 | self.roi_align = RoIAlign((roi_size,roi_size), 1/8, -1, aligned=True) 52 | 53 | def extract_feature(self, x): 54 | return self.base_model.extract_feature(x) 55 | 56 | 57 | # x = self.base_model.conv1(x) 58 | # x = self.base_model.bn1(x) 59 | # x = self.base_model.relu(x) 60 | # if not self.base_model.no_max_pool: 61 | # x = self.base_model.maxpool(x) 62 | 63 | # x = self.base_model.layer1(x) 64 | # x = self.base_model.layer2(x) 65 | # x = self.base_model.layer3(x) 66 | # x = self.base_model.layer4(x) 67 | # return x 68 | 69 | 70 | def forward(self, inputs, rois=None): 71 | features = self.extract_feature(inputs) 72 | features = self.reducer(features) # N C T H W 73 | pooled_features = self.avg_pool(features).squeeze(-1).squeeze(-1).squeeze(-1) 74 | N, C, T, H, W = features.shape 75 | 76 | rois_list = rois.view(-1, self.nrois, 4) 77 | rois_list = [r for r in rois_list] 78 | 79 | features = features.transpose(1,2).contiguous().view(N*T,C,H,W) 80 | rois_features = self.roi_align(features, rois_list) 81 | rois_features = self.max_pool(rois_features) 82 | rois_features = rois_features.view(N,T,self.nrois,C) 83 | gcn_features = self.strg_gcn(rois_features, rois) 84 | 85 | features = torch.cat((pooled_features, gcn_features), dim=-1) 86 | outputs = self.classifier(features) 87 | 88 | return outputs 89 | 90 | 91 | def train(self, mode=True): 92 | """ 93 | Override the default train() to freeze the BN parameters 94 | """ 95 | 96 | super(STRG, self).train(mode) 97 | if self.freeze_bn: 98 | print("Freezing Mean/Var of BatchNorm2D.") 99 | if self.freeze_bn_affine: 100 | print("Freezing Weight/Bias of BatchNorm2D.") 101 | if self.freeze_bn: 102 | for m in self.base_model.modules(): 103 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm1d): 104 | m.eval() 105 | if self.freeze_bn_affine: 106 | m.weight.requires_grad = False 107 | m.bias.requires_grad = False 108 | 109 | 110 | if __name__ == '__main__': 111 | 112 | model = resnet.generate_model(model_depth=50, 113 | n_classes=174, 114 | n_input_channels=3, 115 | shortcut_type='B', 116 | conv1_t_size=7, 117 | conv1_t_stride=1, 118 | no_max_pool=False, 119 | widen_factor=1.0) 120 | 121 | rois = torch.rand((4,8,10,4)) 122 | inputs = torch.rand((4,3,16,224,224)) 123 | strg = STRG(model) 124 | out = strg(inputs, rois) 125 | 126 | pdb.set_trace() 127 | print(out.shape) 128 | -------------------------------------------------------------------------------- /temporal_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class Compose(object): 6 | 7 | def __init__(self, transforms): 8 | 
self.transforms = transforms 9 | 10 | def __call__(self, frame_indices): 11 | for i, t in enumerate(self.transforms): 12 | if isinstance(frame_indices[0], list): 13 | next_transforms = Compose(self.transforms[i:]) 14 | dst_frame_indices = [ 15 | next_transforms(clip_frame_indices) 16 | for clip_frame_indices in frame_indices 17 | ] 18 | 19 | return dst_frame_indices 20 | else: 21 | frame_indices = t(frame_indices) 22 | return frame_indices 23 | 24 | 25 | class LoopPadding(object): 26 | 27 | def __init__(self, size): 28 | self.size = size 29 | 30 | def __call__(self, frame_indices): 31 | out = frame_indices 32 | 33 | for index in out: 34 | if len(out) >= self.size: 35 | break 36 | out.append(index) 37 | 38 | return out 39 | 40 | 41 | class TemporalBeginCrop(object): 42 | 43 | def __init__(self, size): 44 | self.size = size 45 | 46 | def __call__(self, frame_indices): 47 | out = frame_indices[:self.size] 48 | 49 | for index in out: 50 | if len(out) >= self.size: 51 | break 52 | out.append(index) 53 | 54 | return out 55 | 56 | 57 | class TemporalCenterCrop(object): 58 | 59 | def __init__(self, size): 60 | self.size = size 61 | 62 | def __call__(self, frame_indices): 63 | 64 | center_index = len(frame_indices) // 2 65 | begin_index = max(0, center_index - (self.size // 2)) 66 | end_index = min(begin_index + self.size, len(frame_indices)) 67 | 68 | out = frame_indices[begin_index:end_index] 69 | 70 | for index in out: 71 | if len(out) >= self.size: 72 | break 73 | out.append(index) 74 | 75 | return out 76 | 77 | 78 | class TemporalRandomCrop(object): 79 | 80 | def __init__(self, size): 81 | self.size = size 82 | self.loop = LoopPadding(size) 83 | 84 | def __call__(self, frame_indices): 85 | 86 | rand_end = max(0, len(frame_indices) - self.size - 1) 87 | begin_index = random.randint(0, rand_end) 88 | end_index = min(begin_index + self.size, len(frame_indices)) 89 | 90 | out = frame_indices[begin_index:end_index] 91 | 92 | if len(out) < self.size: 93 | out = self.loop(out) 94 | 95 | return out 96 | 97 | 98 | class TemporalEvenCrop(object): 99 | 100 | def __init__(self, size, n_samples=1): 101 | self.size = size 102 | self.n_samples = n_samples 103 | self.loop = LoopPadding(size) 104 | 105 | def __call__(self, frame_indices): 106 | n_frames = len(frame_indices) 107 | stride = max( 108 | 1, math.ceil((n_frames - 1 - self.size) / (self.n_samples - 1))) 109 | 110 | out = [] 111 | for begin_index in frame_indices[::stride]: 112 | if len(out) >= self.n_samples: 113 | break 114 | end_index = min(frame_indices[-1] + 1, begin_index + self.size) 115 | sample = list(range(begin_index, end_index)) 116 | 117 | if len(sample) < self.size: 118 | out.append(self.loop(sample)) 119 | break 120 | else: 121 | out.append(sample) 122 | 123 | return out 124 | 125 | 126 | class SlidingWindow(object): 127 | 128 | def __init__(self, size, stride=0): 129 | self.size = size 130 | if stride == 0: 131 | self.stride = self.size 132 | else: 133 | self.stride = stride 134 | self.loop = LoopPadding(size) 135 | 136 | def __call__(self, frame_indices): 137 | out = [] 138 | for begin_index in frame_indices[::self.stride]: 139 | end_index = min(frame_indices[-1] + 1, begin_index + self.size) 140 | sample = list(range(begin_index, end_index)) 141 | 142 | if len(sample) < self.size: 143 | out.append(self.loop(sample)) 144 | break 145 | else: 146 | out.append(sample) 147 | 148 | return out 149 | 150 | 151 | class TemporalSubsampling(object): 152 | 153 | def __init__(self, stride): 154 | self.stride = stride 155 | 156 | def 
__call__(self, frame_indices): 157 | return frame_indices[::self.stride] 158 | 159 | 160 | class Shuffle(object): 161 | 162 | def __init__(self, block_size): 163 | self.block_size = block_size 164 | 165 | def __call__(self, frame_indices): 166 | frame_indices = [ 167 | frame_indices[i:(i + self.block_size)] 168 | for i in range(0, len(frame_indices), self.block_size) 169 | ] 170 | random.shuffle(frame_indices) 171 | frame_indices = [t for block in frame_indices for t in block] 172 | return frame_indices -------------------------------------------------------------------------------- /datasets/videodataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import torch 5 | import torch.utils.data as data 6 | 7 | from .loader import VideoLoader 8 | import pdb 9 | 10 | def get_class_labels(data): 11 | class_labels_map = {} 12 | index = 0 13 | for class_label in data['labels']: 14 | class_labels_map[class_label] = index 15 | index += 1 16 | return class_labels_map 17 | 18 | 19 | def get_database(data, subset, root_path, video_path_formatter): 20 | video_ids = [] 21 | video_paths = [] 22 | annotations = [] 23 | for key, value in data['database'].items(): 24 | this_subset = value['subset'] 25 | if this_subset == subset: 26 | video_ids.append(key) 27 | annotations.append(value['annotations']) 28 | if 'video_path' in value: 29 | video_paths.append(Path(value['video_path'])) 30 | else: 31 | label = value['annotations']['label'] 32 | video_paths.append(video_path_formatter(root_path, label, key)) 33 | 34 | return video_ids, video_paths, annotations 35 | 36 | 37 | class VideoDataset(data.Dataset): 38 | 39 | def __init__(self, 40 | root_path, 41 | annotation_path, 42 | subset, 43 | spatial_transform=None, 44 | temporal_transform=None, 45 | target_transform=None, 46 | video_loader=None, 47 | video_path_formatter=(lambda root_path, label, video_id: 48 | root_path / label / video_id), 49 | image_name_formatter=lambda x: 'image_{:05d}.jpg'.format(x), 50 | target_type='label'): 51 | self.data, self.class_names = self.__make_dataset( 52 | root_path, annotation_path, subset, video_path_formatter) 53 | 54 | self.spatial_transform = spatial_transform 55 | self.temporal_transform = temporal_transform 56 | self.target_transform = target_transform 57 | if video_loader is None: 58 | self.loader = VideoLoader(image_name_formatter) 59 | else: 60 | self.loader = video_loader 61 | 62 | self.target_type = target_type 63 | 64 | def __make_dataset(self, root_path, annotation_path, subset, 65 | video_path_formatter): 66 | with annotation_path.open('r') as f: 67 | data = json.load(f) 68 | video_ids, video_paths, annotations = get_database( 69 | data, subset, root_path, video_path_formatter) 70 | class_to_idx = get_class_labels(data) 71 | idx_to_class = {} 72 | for name, label in class_to_idx.items(): 73 | idx_to_class[label] = name 74 | 75 | n_videos = len(video_ids) 76 | dataset = [] 77 | for i in range(n_videos): 78 | if i % (n_videos // 5) == 0: 79 | print('dataset loading [{}/{}]'.format(i, len(video_ids))) 80 | 81 | if 'label' in annotations[i]: 82 | label = annotations[i]['label'] 83 | label_id = class_to_idx[label] 84 | else: 85 | label = 'test' 86 | label_id = -1 87 | 88 | video_path = video_paths[i] 89 | if not video_path.exists(): 90 | print(video_path) 91 | continue 92 | 93 | segment = annotations[i]['segment'] 94 | if segment[1] == 1: 95 | continue 96 | 97 | frame_indices = list(range(segment[0], segment[1])) 98 | sample = { 
99 | 'video': video_path, 100 | 'segment': segment, 101 | 'frame_indices': frame_indices, 102 | 'video_id': video_ids[i], 103 | 'label': label_id 104 | } 105 | dataset.append(sample) 106 | 107 | return dataset, idx_to_class 108 | 109 | def __loading(self, path, frame_indices): 110 | clip = self.loader(path, frame_indices) 111 | if self.spatial_transform is not None: 112 | self.spatial_transform.randomize_parameters() 113 | clip = [self.spatial_transform(img) for img in clip] 114 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 115 | 116 | return clip 117 | 118 | def __getitem__(self, index): 119 | path = self.data[index]['video'] 120 | if isinstance(self.target_type, list): 121 | target = [self.data[index][t] for t in self.target_type] 122 | else: 123 | target = self.data[index][self.target_type] 124 | 125 | frame_indices = self.data[index]['frame_indices'] 126 | if self.temporal_transform is not None: 127 | frame_indices = self.temporal_transform(frame_indices) 128 | 129 | clip = self.__loading(path, frame_indices) 130 | 131 | if self.target_transform is not None: 132 | target = self.target_transform(target) 133 | 134 | return clip, target 135 | 136 | def __len__(self): 137 | return len(self.data) 138 | -------------------------------------------------------------------------------- /rpn.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | from collections import OrderedDict 3 | import copy 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torchvision.models.detection import fasterrcnn_resnet50_fpn 8 | from torchvision.models.detection.image_list import ImageList 9 | 10 | from torch.jit.annotations import Tuple, List, Dict, Optional 11 | from torch import Tensor 12 | import warnings 13 | from transform import STRGTransform 14 | 15 | class RPN(nn.Module): 16 | def __init__(self, pretrained=True, nrois=10): 17 | super(RPN,self).__init__() 18 | model = fasterrcnn_resnet50_fpn(pretrained=True).eval() 19 | self.transform = STRGTransform(model.transform.min_size, 20 | model.transform.max_size, 21 | 0,0) #copy.deepcopy(model.transform) 22 | self.backbone = copy.deepcopy(model.backbone) 23 | self.rpn = copy.deepcopy(model.rpn) 24 | # self.eaget_outputs = copy.deepcopy(model.eaget_outputs) 25 | self.roi_heads = copy.deepcopy(model.roi_heads) 26 | self.rpn._pre_nms_top_n = {'training':3*nrois, 'testing':3*nrois} 27 | self.rpn._post_nms_top_n = {'training':nrois, 'testing':nrois} 28 | self.rpn.fg_bg_sampler.positive_fraction = 1.0 29 | del model 30 | 31 | def forward(self, images, targets=None): 32 | # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]] 33 | """ 34 | Arguments: 35 | images (list[Tensor]): images to be processed 36 | targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional) 37 | Returns: 38 | result (list[BoxList] or dict[Tensor]): the output from the model. 39 | During training, it returns a dict[Tensor] which contains the losses. 40 | During testing, it returns list[BoxList] contains additional fields 41 | like `scores`, `labels` and `mask` (for Mask R-CNN models). 
42 | """ 43 | bs = len(images) 44 | if self.training and targets is None: 45 | raise ValueError("In training mode, targets should be passed") 46 | if self.training: 47 | assert targets is not None 48 | for target in targets: 49 | boxes = target["boxes"] 50 | if isinstance(boxes, torch.Tensor): 51 | if len(boxes.shape) != 2 or boxes.shape[-1] != 4: 52 | raise ValueError("Expected target boxes to be a tensor" 53 | "of shape [N, 4], got {:}.".format( 54 | boxes.shape)) 55 | else: 56 | raise ValueError("Expected target boxes to be of type " 57 | "Tensor, got {:}.".format(type(boxes))) 58 | original_image_sizes = torch.jit.annotate(List[Tuple[int, int]], []) 59 | for img in images: 60 | val = img.shape[-2:] 61 | assert len(val) == 2 62 | original_image_sizes.append((val[0], val[1])) 63 | 64 | images, targets = self.transform(images, targets) 65 | # Check for degenerate boxes 66 | # TODO: Move this to a function 67 | if targets is not None: 68 | for target_idx, target in enumerate(targets): 69 | boxes = target["boxes"] 70 | degenerate_boxes = boxes[:, 2:] <= boxes[:, :2] 71 | if degenerate_boxes.any(): 72 | # print the first degenrate box 73 | bb_idx = degenerate_boxes.any(dim=1).nonzero().view(-1)[0] 74 | degen_bb: List[float] = boxes[bb_idx].tolist() 75 | raise ValueError("All bounding boxes should have positive height and width." 76 | " Found invaid box {} for target at index {}." 77 | .format(degen_bb, target_idx)) 78 | 79 | features = self.backbone(images.tensors) 80 | if isinstance(features, torch.Tensor): 81 | features = OrderedDict([('0', features)]) 82 | proposals, proposal_losses = self.rpn(images, features, targets) 83 | proposals = self.transform.rpn_postprocess(proposals, images.image_sizes, original_image_sizes) 84 | if False: 85 | for i in range(len(proposals)): 86 | delta = self.rpn._post_nms_top_n['testing'] - len(proposals[i]) 87 | if delta != 0: 88 | print("RPN finds only {} among {}".format(len(proposals[i]), 89 | len(proposals[i])+delta)) 90 | dummy = -torch.ones((delta, 4)).to(proposals[i].device()) 91 | proposals[i] = torch.cat((proposals[i], dummy)) 92 | return torch.cat(proposals).view(bs, -1, 4) 93 | 94 | 95 | if __name__ == '__main__': 96 | rpn = RPN().eval() 97 | # rpn = nn.DataParallel(rpn, device_ids=None).cuda() 98 | inputs = torch.rand((5,3,224,224)) 99 | out = rpn(inputs) 100 | pdb.set_trace() 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /util_scripts/generate_video_hdf5.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import argparse 3 | from pathlib import Path 4 | 5 | from joblib import Parallel, delayed 6 | import h5py 7 | import numpy as np 8 | 9 | 10 | def video_process(video_file_path, dst_root_path, ext, fps=-1, size=240): 11 | if ext != video_file_path.suffix: 12 | return 13 | 14 | ffprobe_cmd = ('ffprobe -v error -select_streams v:0 ' 15 | '-of default=noprint_wrappers=1:nokey=1 -show_entries ' 16 | 'stream=width,height,avg_frame_rate,duration').split() 17 | ffprobe_cmd.append(str(video_file_path)) 18 | 19 | p = subprocess.run(ffprobe_cmd, capture_output=True) 20 | res = p.stdout.decode('utf-8').splitlines() 21 | if len(res) < 4: 22 | return 23 | 24 | name = video_file_path.stem 25 | dst_dir_path = dst_root_path / name 26 | dst_dir_path.mkdir(exist_ok=True) 27 | 28 | width = int(res[0]) 29 | height = int(res[1]) 30 | 31 | if width > height: 32 | vf_param = f'scale=-1:{size}' 33 | else: 34 | vf_param = f'scale={size}:-1' 35 | 36 | if 
fps > 0: 37 | vf_param += f',minterpolate={fps}' 38 | 39 | ffmpeg_cmd = ['ffmpeg', '-i', str(video_file_path), '-vf', vf_param] 40 | ffmpeg_cmd += ['-threads', '1', f'{dst_dir_path}/image_%05d.jpg'] 41 | print(ffmpeg_cmd) 42 | subprocess.run(ffmpeg_cmd) 43 | 44 | hdf5_path = dst_dir_path.parent / f'{dst_dir_path.name}.hdf5' 45 | try: 46 | with h5py.File(hdf5_path, 'w') as f: 47 | dtype = h5py.special_dtype(vlen='uint8') 48 | video = f.create_dataset('video', 49 | (len(list(dst_dir_path.glob('*.jpg'))),), 50 | dtype=dtype) 51 | except OSError as exc: 52 | if 'errno = 36' in exc.args[0]: 53 | hdf5_path = dst_dir_path.parent / f'{dst_dir_path.name[:250]}.hdf5' 54 | with h5py.File(hdf5_path, 'w') as f: 55 | dtype = h5py.special_dtype(vlen='uint8') 56 | video = f.create_dataset( 57 | 'video', (len(list(dst_dir_path.glob('*.jpg'))),), 58 | dtype=dtype) 59 | else: 60 | raise 61 | 62 | for i, file_path in enumerate(sorted(dst_dir_path.glob('*.jpg'))): 63 | with file_path.open('rb') as f: 64 | data = f.read() 65 | with h5py.File(hdf5_path, 'r+') as f: 66 | video = f['video'] 67 | video[i] = np.frombuffer(data, dtype='uint8') 68 | 69 | for file_path in dst_dir_path.glob('*.jpg'): 70 | file_path.unlink() 71 | dst_dir_path.rmdir() 72 | 73 | 74 | def class_process(class_dir_path, dst_root_path, ext, fps=-1, size=240): 75 | if not class_dir_path.is_dir(): 76 | return 77 | 78 | dst_class_path = dst_root_path / class_dir_path.name 79 | dst_class_path.mkdir(exist_ok=True) 80 | 81 | for video_file_path in sorted(class_dir_path.iterdir()): 82 | video_process(video_file_path, dst_class_path, ext, fps, size) 83 | 84 | 85 | if __name__ == '__main__': 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument('dir_path', 88 | default=None, 89 | type=Path, 90 | help='Directory path of videos') 91 | parser.add_argument('dst_path', 92 | default=None, 93 | type=Path, 94 | help='Directory path of jpg videos') 95 | parser.add_argument( 96 | 'dataset', 97 | default='', 98 | type=str, 99 | help='Dataset name (kinetics | mit | ucf101 | hmdb51 | activitynet)') 100 | parser.add_argument('--n_jobs', 101 | default=-1, 102 | type=int, 103 | help='Number of parallel jobs') 104 | parser.add_argument('--fps', 105 | default=-1, 106 | type=int, 107 | help=('Frame rates of output videos. 
' 108 | '-1 means original frame rates.')) 109 | parser.add_argument('--size', 110 | default=240, 111 | type=int, 112 | help='Frame size of output videos.') 113 | args = parser.parse_args() 114 | 115 | if args.dataset in ['kinetics', 'mit', 'activitynet']: 116 | ext = '.mp4' 117 | else: 118 | ext = '.avi' 119 | 120 | if args.dataset == 'activitynet': 121 | video_file_paths = [x for x in sorted(args.dir_path.iterdir())] 122 | status_list = Parallel(n_jobs=args.n_jobs, backend='threading')( 123 | delayed(video_process)(video_file_path, args.dst_path, ext, 124 | args.fps, args.size) 125 | for video_file_path in video_file_paths) 126 | else: 127 | class_dir_paths = [x for x in sorted(args.dir_path.iterdir())] 128 | test_set_video_path = args.dir_path / 'test' 129 | if test_set_video_path.exists(): 130 | class_dir_paths.append(test_set_video_path) 131 | 132 | status_list = Parallel(n_jobs=args.n_jobs, backend='threading')( 133 | delayed(class_process)(class_dir_path, args.dst_path, ext, args.fps, 134 | args.size) 135 | for class_dir_path in class_dir_paths) 136 | -------------------------------------------------------------------------------- /training.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import os 4 | import sys 5 | import pdb 6 | 7 | import torch 8 | import torch.distributed as dist 9 | import torch.nn as nn 10 | 11 | from utils import AverageMeter, calculate_accuracy 12 | 13 | 14 | def freeze_bn(model): 15 | print("Freezing Mean/Var of BatchNorm2D.") 16 | print("Freezing Weight/Bias of BatchNorm2D.") 17 | for m in model.modules(): 18 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm1d): 19 | m.eval() 20 | m.weight.requires_grad = False 21 | m.bias.requires_grad = False 22 | 23 | 24 | def train_epoch(epoch, 25 | data_loader, 26 | model, 27 | criterion, 28 | optimizer, 29 | device, 30 | current_lr, 31 | epoch_logger, 32 | batch_logger, 33 | tb_writer=None, 34 | distributed=False, 35 | rpn=None, 36 | det_interval=2, 37 | nrois=10): 38 | print('train at epoch {}'.format(epoch)) 39 | 40 | model.train() 41 | if rpn is not None: 42 | rpn.eval() 43 | else: 44 | freeze_bn(model) 45 | 46 | batch_time = AverageMeter() 47 | data_time = AverageMeter() 48 | losses = AverageMeter() 49 | accuracies = AverageMeter() 50 | 51 | end_time = time.time() 52 | for i, (inputs, targets) in enumerate(data_loader): 53 | data_time.update(time.time() - end_time) 54 | targets = targets.to(device, non_blocking=True) 55 | if rpn is not None: 56 | ''' 57 | There was an unexpected CUDNN_ERROR when len(rpn_inputs) is 58 | decrased. 
59 | ''' 60 | N, C, T, H, W = inputs.size() 61 | if i == 0: 62 | max_N = N 63 | # sample frames for RPN 64 | sample = torch.arange(0,T,det_interval) 65 | rpn_inputs = inputs[:,:,sample].transpose(1,2).contiguous() 66 | rpn_inputs = rpn_inputs.view(-1,C,H,W) 67 | if len(inputs) < max_N: 68 | print("Modified from {} to {}".format(len(inputs), max_N)) 69 | while len(rpn_inputs) < max_N * (T // det_interval): 70 | rpn_inputs = torch.cat((rpn_inputs, rpn_inputs[:(max_N-len(inputs))*(T//det_interval)])) 71 | with torch.no_grad(): 72 | proposals = rpn(rpn_inputs) 73 | proposals = proposals.view(-1,T//det_interval,nrois,4) 74 | if len(inputs) < max_N: 75 | proposals = proposals[:len(inputs)] 76 | outputs = model(inputs, proposals.detach()) 77 | # update to the largest batch_size 78 | max_N = max(N, max_N) 79 | else: 80 | outputs = model(inputs) 81 | loss = criterion(outputs, targets) 82 | acc = calculate_accuracy(outputs, targets) 83 | 84 | losses.update(loss.item(), inputs.size(0)) 85 | accuracies.update(acc, inputs.size(0)) 86 | 87 | optimizer.zero_grad() 88 | loss.backward() 89 | optimizer.step() 90 | 91 | batch_time.update(time.time() - end_time) 92 | end_time = time.time() 93 | 94 | if batch_logger is not None: 95 | batch_logger.log({ 96 | 'epoch': epoch, 97 | 'batch': i + 1, 98 | 'iter': (epoch - 1) * len(data_loader) + (i + 1), 99 | 'loss': losses.val, 100 | 'acc': accuracies.val, 101 | 'lr': current_lr 102 | }) 103 | if i % 20 == 0: 104 | print('Epoch: [{0}][{1}/{2}]\t' 105 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 106 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 107 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 108 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(epoch, 109 | i + 1, 110 | len(data_loader), 111 | batch_time=batch_time, 112 | data_time=data_time, 113 | loss=losses, 114 | acc=accuracies)) 115 | 116 | if distributed: 117 | loss_sum = torch.tensor([losses.sum], 118 | dtype=torch.float32, 119 | device=device) 120 | loss_count = torch.tensor([losses.count], 121 | dtype=torch.float32, 122 | device=device) 123 | acc_sum = torch.tensor([accuracies.sum], 124 | dtype=torch.float32, 125 | device=device) 126 | acc_count = torch.tensor([accuracies.count], 127 | dtype=torch.float32, 128 | device=device) 129 | 130 | dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) 131 | dist.all_reduce(loss_count, op=dist.ReduceOp.SUM) 132 | dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) 133 | dist.all_reduce(acc_count, op=dist.ReduceOp.SUM) 134 | 135 | losses.avg = loss_sum.item() / loss_count.item() 136 | accuracies.avg = acc_sum.item() / acc_count.item() 137 | 138 | if epoch_logger is not None: 139 | epoch_logger.log({ 140 | 'epoch': epoch, 141 | 'loss': losses.avg, 142 | 'acc': accuracies.avg, 143 | 'lr': current_lr 144 | }) 145 | 146 | if tb_writer is not None: 147 | tb_writer.add_scalar('train/loss', losses.avg, epoch) 148 | tb_writer.add_scalar('train/acc', accuracies.avg, epoch) 149 | tb_writer.add_scalar('train/lr', current_lr, epoch) 150 | -------------------------------------------------------------------------------- /datasets/activitynet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import json 3 | 4 | import torch 5 | import torch.utils.data as data 6 | 7 | from .loader import VideoLoader 8 | from .videodataset import VideoDataset 9 | 10 | 11 | def get_n_frames(video_path): 12 | return len([ 13 | x for x in video_path.iterdir() 14 | if 'image' in x.name and x.name[0] != '.' 
15 | ]) 16 | 17 | 18 | def get_class_labels(data): 19 | class_names = [] 20 | for node1 in data['taxonomy']: 21 | is_leaf = True 22 | for node2 in data['taxonomy']: 23 | if node2['parentId'] == node1['nodeId']: 24 | is_leaf = False 25 | break 26 | if is_leaf: 27 | class_names.append(node1['nodeName']) 28 | 29 | class_labels_map = {} 30 | 31 | for i, class_name in enumerate(class_names): 32 | class_labels_map[class_name] = i 33 | 34 | return class_labels_map 35 | 36 | 37 | def get_video_ids_annotations_and_fps(data, subset): 38 | video_ids = [] 39 | annotations = [] 40 | fps_values = [] 41 | 42 | for key, value in data['database'].items(): 43 | this_subset = value['subset'] 44 | if this_subset == subset: 45 | video_ids.append(key) 46 | annotations.append(value['annotations']) 47 | fps_values.append(value['fps']) 48 | 49 | return video_ids, annotations, fps_values 50 | 51 | 52 | class ActivityNet(VideoDataset): 53 | 54 | def __init__( 55 | self, 56 | root_path, 57 | annotation_path, 58 | subset, 59 | spatial_transform=None, 60 | temporal_transform=None, 61 | target_transform=None, 62 | video_loader=None, 63 | video_path_formatter=( 64 | lambda root_path, label, video_id: root_path / 'v_{}'.format(video_id)), 65 | image_name_formatter=lambda x: 'image_{:05d}.jpg'.format(x), 66 | is_untrimmed_setting=False): 67 | if is_untrimmed_setting: 68 | self.data, self.class_names = self.__make_untrimmed_dataset( 69 | root_path, annotation_path, subset, video_path_formatter) 70 | else: 71 | self.data, self.class_names = self.__make_dataset( 72 | root_path, annotation_path, subset, video_path_formatter) 73 | 74 | self.spatial_transform = spatial_transform 75 | self.temporal_transform = temporal_transform 76 | self.target_transform = target_transform 77 | 78 | if video_loader is None: 79 | self.loader = VideoLoader(image_name_formatter) 80 | else: 81 | self.loader = video_loader 82 | 83 | def __make_dataset(self, root_path, annotation_path, subset, 84 | video_path_formatter): 85 | with annotation_path.open('r') as f: 86 | data = json.load(f) 87 | video_ids, annotations, fps_values = get_video_ids_annotations_and_fps( 88 | data, subset) 89 | class_to_idx = get_class_labels(data) 90 | idx_to_class = {} 91 | for name, label in class_to_idx.items(): 92 | idx_to_class[label] = name 93 | 94 | dataset = [] 95 | for i in range(len(video_ids)): 96 | if i % 1000 == 0: 97 | print('dataset loading [{}/{}]'.format(i, len(video_ids))) 98 | 99 | video_path = video_path_formatter(root_path, label, video_ids[i]) 100 | if not video_path.exists(): 101 | continue 102 | 103 | fps = fps_values[i] 104 | 105 | for annotation in annotations[i]: 106 | t_begin = math.floor(annotation['segment'][0] * fps) + 1 107 | t_end = math.floor(annotation['segment'][1] * fps) + 1 108 | n_video_frames = get_n_frames(video_path) 109 | t_end = min(t_end, n_video_frames) 110 | frame_indices = list(range(t_begin, t_end)) 111 | 112 | sample = { 113 | 'video': video_path, 114 | 'segment': (frame_indices[0], frame_indices[-1] + 1), 115 | 'frame_indices': frame_indices, 116 | 'fps': fps, 117 | 'video_id': video_ids[i] 118 | } 119 | if annotations is not None: 120 | sample['label'] = class_to_idx[annotation['label']] 121 | else: 122 | sample['label'] = -1 123 | 124 | if len(sample['frame_indices']) < 8: 125 | continue 126 | dataset.append(sample) 127 | 128 | return dataset, idx_to_class 129 | 130 | def __make_untrimmed_dataset(self, root_path, annotation_path, subset, 131 | video_path_formatter): 132 | with annotation_path.open('r') as f: 133 | data = 
json.load(f) 134 | video_ids, annotations, fps_values = get_video_ids_annotations_and_fps( 135 | data, subset) 136 | class_to_idx = get_class_labels(data) 137 | idx_to_class = {} 138 | for name, label in class_to_idx.items(): 139 | idx_to_class[label] = name 140 | 141 | dataset = [] 142 | for i in range(len(video_ids)): 143 | if i % 1000 == 0: 144 | print('dataset loading [{}/{}]'.format(i, len(video_ids))) 145 | 146 | video_path = video_path_formatter(root_path, label, video_ids[i]) 147 | if not video_path.exists(): 148 | continue 149 | 150 | fps = fps_values[i] 151 | 152 | t_begin = 1 153 | t_end = get_n_frames(video_path) + 1 154 | frame_indices = list(range(t_begin, t_end)) 155 | 156 | sample = { 157 | 'video': video_path, 158 | 'segment': (frame_indices[0], frame_indices[-1] + 1), 159 | 'frame_indices': frame_indices, 160 | 'fps': fps, 161 | 'video_id': video_ids[i] 162 | } 163 | dataset.append(sample) 164 | 165 | return dataset, idx_to_class 166 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from models import resnet, resnet2p1d, pre_act_resnet, wide_resnet, resnext, densenet, resnet_strg 5 | 6 | 7 | def get_module_name(name): 8 | name = name.split('.') 9 | if name[0] == 'module': 10 | i = 1 11 | else: 12 | i = 0 13 | if name[i] == 'features': 14 | i += 1 15 | 16 | return name[i] 17 | 18 | 19 | def get_fine_tuning_parameters(model, ft_begin_module): 20 | if not ft_begin_module: 21 | return model.parameters() 22 | 23 | parameters = [] 24 | add_flag = False 25 | for k, v in model.named_parameters(): 26 | if ft_begin_module == get_module_name(k): 27 | add_flag = True 28 | 29 | if add_flag: 30 | parameters.append({'params': v}) 31 | 32 | return parameters 33 | 34 | 35 | def generate_model(opt): 36 | assert opt.model in [ 37 | 'resnet', 'resnet2p1d', 'preresnet', 'wideresnet', 'resnext', 'densenet', 38 | 'resnet_strg' 39 | ] 40 | if opt.model == 'resnet': 41 | model = resnet.generate_model(model_depth=opt.model_depth, 42 | n_classes=opt.n_classes, 43 | n_input_channels=opt.n_input_channels, 44 | shortcut_type=opt.resnet_shortcut, 45 | conv1_t_size=opt.conv1_t_size, 46 | conv1_t_stride=opt.conv1_t_stride, 47 | no_max_pool=opt.no_max_pool, 48 | widen_factor=opt.resnet_widen_factor) 49 | elif opt.model == 'resnet_strg': 50 | model = resnet_strg.generate_model(model_depth=opt.model_depth, 51 | n_classes=opt.n_classes, 52 | n_input_channels=opt.n_input_channels, 53 | shortcut_type=opt.resnet_shortcut, 54 | conv1_t_size=opt.conv1_t_size, 55 | conv1_t_stride=opt.conv1_t_stride, 56 | no_max_pool=opt.no_max_pool, 57 | widen_factor=opt.resnet_widen_factor) 58 | elif opt.model == 'resnet2p1d': 59 | model = resnet2p1d.generate_model(model_depth=opt.model_depth, 60 | n_classes=opt.n_classes, 61 | n_input_channels=opt.n_input_channels, 62 | shortcut_type=opt.resnet_shortcut, 63 | conv1_t_size=opt.conv1_t_size, 64 | conv1_t_stride=opt.conv1_t_stride, 65 | no_max_pool=opt.no_max_pool, 66 | widen_factor=opt.resnet_widen_factor) 67 | elif opt.model == 'wideresnet': 68 | model = wide_resnet.generate_model( 69 | model_depth=opt.model_depth, 70 | k=opt.wide_resnet_k, 71 | n_classes=opt.n_classes, 72 | n_input_channels=opt.n_input_channels, 73 | shortcut_type=opt.resnet_shortcut, 74 | conv1_t_size=opt.conv1_t_size, 75 | conv1_t_stride=opt.conv1_t_stride, 76 | no_max_pool=opt.no_max_pool) 77 | elif opt.model == 'resnext': 78 | model = 
resnext.generate_model(model_depth=opt.model_depth, 79 | cardinality=opt.resnext_cardinality, 80 | n_classes=opt.n_classes, 81 | n_input_channels=opt.n_input_channels, 82 | shortcut_type=opt.resnet_shortcut, 83 | conv1_t_size=opt.conv1_t_size, 84 | conv1_t_stride=opt.conv1_t_stride, 85 | no_max_pool=opt.no_max_pool) 86 | elif opt.model == 'preresnet': 87 | model = pre_act_resnet.generate_model( 88 | model_depth=opt.model_depth, 89 | n_classes=opt.n_classes, 90 | n_input_channels=opt.n_input_channels, 91 | shortcut_type=opt.resnet_shortcut, 92 | conv1_t_size=opt.conv1_t_size, 93 | conv1_t_stride=opt.conv1_t_stride, 94 | no_max_pool=opt.no_max_pool) 95 | elif opt.model == 'densenet': 96 | model = densenet.generate_model(model_depth=opt.model_depth, 97 | n_classes=opt.n_classes, 98 | n_input_channels=opt.n_input_channels, 99 | conv1_t_size=opt.conv1_t_size, 100 | conv1_t_stride=opt.conv1_t_stride, 101 | no_max_pool=opt.no_max_pool) 102 | 103 | return model 104 | 105 | 106 | def load_pretrained_model(model, pretrain_path, model_name, n_finetune_classes, 107 | is_strg=False): 108 | if pretrain_path: 109 | print('loading pretrained model {}'.format(pretrain_path)) 110 | pretrain = torch.load(pretrain_path, map_location='cpu') 111 | 112 | model.load_state_dict(pretrain['state_dict']) 113 | if is_strg: 114 | return model 115 | 116 | tmp_model = model 117 | if model_name == 'densenet': 118 | tmp_model.classifier = nn.Linear(tmp_model.classifier.in_features, 119 | n_finetune_classes) 120 | else: 121 | tmp_model.fc = nn.Linear(tmp_model.fc.in_features, 122 | n_finetune_classes) 123 | 124 | return model 125 | 126 | 127 | def make_data_parallel(model, is_distributed, device): 128 | if is_distributed: 129 | if device.type == 'cuda' and device.index is not None: 130 | torch.cuda.set_device(device) 131 | model.to(device) 132 | 133 | model = nn.parallel.DistributedDataParallel(model, 134 | device_ids=[device]) 135 | else: 136 | model.to(device) 137 | model = nn.parallel.DistributedDataParallel(model) 138 | elif device.type == 'cuda': 139 | model = nn.DataParallel(model, device_ids=None).cuda() 140 | 141 | return model 142 | -------------------------------------------------------------------------------- /spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from torchvision.transforms import transforms 4 | from torchvision.transforms import functional as F 5 | from PIL import Image 6 | 7 | 8 | class Compose(transforms.Compose): 9 | 10 | def randomize_parameters(self): 11 | for t in self.transforms: 12 | t.randomize_parameters() 13 | 14 | 15 | class ToTensor(transforms.ToTensor): 16 | 17 | def randomize_parameters(self): 18 | pass 19 | 20 | 21 | class Normalize(transforms.Normalize): 22 | 23 | def randomize_parameters(self): 24 | pass 25 | 26 | 27 | class ScaleValue(object): 28 | 29 | def __init__(self, s): 30 | self.s = s 31 | 32 | def __call__(self, tensor): 33 | tensor *= self.s 34 | return tensor 35 | 36 | def randomize_parameters(self): 37 | pass 38 | 39 | 40 | class Resize(transforms.Resize): 41 | 42 | def randomize_parameters(self): 43 | pass 44 | 45 | 46 | class Scale(transforms.Scale): 47 | 48 | def randomize_parameters(self): 49 | pass 50 | 51 | 52 | class CenterCrop(transforms.CenterCrop): 53 | 54 | def randomize_parameters(self): 55 | pass 56 | 57 | 58 | class CornerCrop(object): 59 | 60 | def __init__(self, 61 | size, 62 | crop_position=None, 63 | crop_positions=['c', 'tl', 'tr', 'bl', 'br']): 64 | self.size = size 65 | 
self.crop_position = crop_position 66 | self.crop_positions = crop_positions 67 | 68 | if crop_position is None: 69 | self.randomize = True 70 | else: 71 | self.randomize = False 72 | self.randomize_parameters() 73 | 74 | def __call__(self, img): 75 | image_width = img.size[0] 76 | image_height = img.size[1] 77 | 78 | h, w = (self.size, self.size) 79 | if self.crop_position == 'c': 80 | i = int(round((image_height - h) / 2.)) 81 | j = int(round((image_width - w) / 2.)) 82 | elif self.crop_position == 'tl': 83 | i = 0 84 | j = 0 85 | elif self.crop_position == 'tr': 86 | i = 0 87 | j = image_width - self.size 88 | elif self.crop_position == 'bl': 89 | i = image_height - self.size 90 | j = 0 91 | elif self.crop_position == 'br': 92 | i = image_height - self.size 93 | j = image_width - self.size 94 | 95 | img = F.crop(img, i, j, h, w) 96 | 97 | return img 98 | 99 | def randomize_parameters(self): 100 | if self.randomize: 101 | self.crop_position = self.crop_positions[random.randint( 102 | 0, 103 | len(self.crop_positions) - 1)] 104 | 105 | def __repr__(self): 106 | return self.__class__.__name__ + '(size={0}, crop_position={1}, randomize={2})'.format( 107 | self.size, self.crop_position, self.randomize) 108 | 109 | 110 | class RandomHorizontalFlip(transforms.RandomHorizontalFlip): 111 | 112 | def __init__(self, p=0.5): 113 | super().__init__(p) 114 | self.randomize_parameters() 115 | 116 | def __call__(self, img): 117 | """ 118 | Args: 119 | img (PIL.Image): Image to be flipped. 120 | Returns: 121 | PIL.Image: Randomly flipped image. 122 | """ 123 | if self.random_p < self.p: 124 | return F.hflip(img) 125 | return img 126 | 127 | def randomize_parameters(self): 128 | self.random_p = random.random() 129 | 130 | 131 | class MultiScaleCornerCrop(object): 132 | 133 | def __init__(self, 134 | size, 135 | scales, 136 | crop_positions=['c', 'tl', 'tr', 'bl', 'br'], 137 | interpolation=Image.BILINEAR): 138 | self.size = size 139 | self.scales = scales 140 | self.interpolation = interpolation 141 | self.crop_positions = crop_positions 142 | 143 | self.randomize_parameters() 144 | 145 | def __call__(self, img): 146 | short_side = min(img.size[0], img.size[1]) 147 | crop_size = int(short_side * self.scale) 148 | self.corner_crop.size = crop_size 149 | 150 | img = self.corner_crop(img) 151 | return img.resize((self.size, self.size), self.interpolation) 152 | 153 | def randomize_parameters(self): 154 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 155 | crop_position = self.crop_positions[random.randint( 156 | 0, 157 | len(self.crop_positions) - 1)] 158 | 159 | self.corner_crop = CornerCrop(None, crop_position) 160 | 161 | def __repr__(self): 162 | return self.__class__.__name__ + '(size={0}, scales={1}, interpolation={2})'.format( 163 | self.size, self.scales, self.interpolation) 164 | 165 | 166 | class RandomResizedCrop(transforms.RandomResizedCrop): 167 | 168 | def __init__(self, 169 | size, 170 | scale=(0.08, 1.0), 171 | ratio=(3. / 4., 4. 
/ 3.), 172 | interpolation=Image.BILINEAR): 173 | super().__init__(size, scale, ratio, interpolation) 174 | self.randomize_parameters() 175 | 176 | def __call__(self, img): 177 | if self.randomize: 178 | self.random_crop = self.get_params(img, self.scale, self.ratio) 179 | self.randomize = False 180 | 181 | i, j, h, w = self.random_crop 182 | return F.resized_crop(img, i, j, h, w, self.size, self.interpolation) 183 | 184 | def randomize_parameters(self): 185 | self.randomize = True 186 | 187 | 188 | class ColorJitter(transforms.ColorJitter): 189 | 190 | def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): 191 | super().__init__(brightness, contrast, saturation, hue) 192 | self.randomize_parameters() 193 | 194 | def __call__(self, img): 195 | if self.randomize: 196 | self.transform = self.get_params(self.brightness, self.contrast, 197 | self.saturation, self.hue) 198 | self.randomize = False 199 | 200 | return self.transform(img) 201 | 202 | def randomize_parameters(self): 203 | self.randomize = True 204 | 205 | 206 | class PickFirstChannels(object): 207 | 208 | def __init__(self, n): 209 | self.n = n 210 | 211 | def __call__(self, tensor): 212 | return tensor[:self.n, :, :] 213 | 214 | def randomize_parameters(self): 215 | pass -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Videos as Space-Time Region Graph 2 | 3 | ## Summary 4 | 5 | * This repository is for testing the idea of the following paper: 6 | 7 | [ 8 | Wang, Xiaolong, and Abhinav Gupta. "Videos as space-time region graphs." Proceedings of the European conference on computer vision (ECCV). 2018. 9 | ](http://openaccess.thecvf.com/content_ECCV_2018/papers/Xiaolong_Wang_Videos_as_Space-Time_ECCV_2018_paper.pdf) 10 | 11 | * This means that it may contain several mismatches with the original implementation introduced in the paper. 12 | 13 | * Also, the performance is much lower than in the publication (24 vs 43), and I have never tested the Kinetics pre-trained ResNet-50-I3D. 14 | 15 | ## Notes 16 | 17 | * This repository is based on https://github.com/kenshohara/3D-ResNets-PyTorch. 18 | 19 | * The architecture of ResNet-50-I3D in the paper is different from that in the above repository. I did not use a Kinetics pre-trained model but an ImageNet pre-trained model. 20 | 21 | * Currently, the RPN is run on every iteration, which requires approximately 3 times more training time. 22 | 23 | * A Kinetics pre-trained model can be found [here](https://github.com/joaanna/something_else). 24 | 25 | 26 | ## Requirements 27 | 28 | * [PyTorch](http://pytorch.org/) (ver. 1.2+ required) 29 | * [Torchvision](http://pytorch.org/) (ver. 0.4+ required) 30 | 31 | ```bash 32 | conda install pytorch torchvision cudatoolkit=10.1 -c pytorch 33 | ``` 34 | 35 | ```bash 36 | pip install -r requirements.txt 37 | ``` 38 | 39 | 40 | * FFmpeg, FFprobe 41 | 42 | * Python 3 43 | 44 | ## Preparation 45 | 46 | ### Kinetics 47 | 48 | * Download videos using [the official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). 49 | * Locate test set in ```video_directory/test```.
50 | * Convert from avi to jpg files using ```util_scripts/generate_video_jpgs.py``` 51 | 52 | ```bash 53 | python -m util_scripts.generate_video_jpgs mp4_video_dir_path jpg_video_dir_path kinetics 54 | ``` 55 | 56 | * Generate an annotation file in JSON format, similar to ActivityNet, using ```util_scripts/kinetics_json.py``` 57 | * The CSV files (kinetics_{train, val, test}.csv) are included in the crawler. 58 | 59 | ```bash 60 | python -m util_scripts.kinetics_json csv_dir_path 700 jpg_video_dir_path jpg dst_json_path 61 | ``` 62 | 63 | 64 | ### Something-Something v1/v2 65 | 66 | * Download videos from the official [website](https://20bn.com/datasets/something-something/v2#download). 67 | * For Something-Something v2, please run `util_scripts/vid2img_sthv2.py` 68 | 69 | ```bash 70 | python util_scripts/sthv1_json.py 'data/something/v1' 'data/something/v1/img' 'data/sthv1.json' 71 | ``` 72 | 73 | ```bash 74 | python util_scripts/sthv2_json.py 'data/something/v2' 'data/something/v2/img' 'data/sthv2.json' 75 | ``` 76 | 77 | 78 | 79 | ## Running the code 80 | 81 | ### Data Path 82 | 83 | Assume that the data directories are structured as follows: 84 | 85 | ```misc 86 | ~/ 87 | data/ 88 | something/ 89 | v1/ 90 | img/ 91 | .../ (directories of video names) 92 | ... (jpg files) 93 | v2/ 94 | img/ 95 | .../ (directories of video names) 96 | ... (jpg files) 97 | kinetics_videos/ 98 | jpg/ 99 | .../ (directories of class names) 100 | .../ (directories of video names) 101 | ... (jpg files) 102 | results/ 103 | save_100.pth 104 | kinetics.json 105 | ``` 106 | 107 | Confirm all options. 108 | 109 | ```bash 110 | python main.py -h 111 | ``` 112 | 113 | ### Kinetics Pre-training 114 | 115 | Train ResNet-50 on the Kinetics-700 dataset (700 classes) with 4 CPU threads (for data loading). 116 | Batch size is 128. 117 | Save models every 5 epochs. 118 | All GPUs are used for training. 119 | If you want to use only some of the GPUs, use ```CUDA_VISIBLE_DEVICES=...```. 120 | 121 | ```bash 122 | python main.py --root_path ~/data --video_path kinetics_videos/jpg --annotation_path kinetics.json \ 123 | --result_path results --dataset kinetics --model resnet \ 124 | --model_depth 50 --n_classes 700 --batch_size 128 --n_threads 4 --checkpoint 5 125 | ``` 126 | 127 | 128 | Calculate top-5 class probabilities of each video using a trained model (~/data/results/save_200.pth). 129 | Note that ```inference_batch_size``` should be small because the actual batch size is ```inference_batch_size * (n_video_frames / inference_stride)```. 130 | 131 | ```bash 132 | python main.py --root_path ~/data --video_path kinetics_videos/jpg --annotation_path kinetics.json \ 133 | --result_path results --dataset kinetics --resume_path results/save_200.pth \ 134 | --model_depth 50 --n_classes 700 --n_threads 4 --no_train --no_val --inference --output_topk 5 --inference_batch_size 1 135 | ``` 136 | 137 | Evaluate the top-1 video accuracy of a recognition result (data/results/val.json). 138 | 139 | ```bash 140 | python -m util_scripts.eval_accuracy data/sthv2.json data/results/val.json --subset val -k 1 --ignore 141 | ``` 142 | 143 | ### Something-Something-v1 144 | 145 | First, we need to train the backbone network (ResNet-50-I3D) for 100 epochs with a learning rate of 0.00125 (decayed to 0.000125 at epoch 90). 146 | The original batch size is 8, but in this implementation we use 32 to reduce the training time.
147 | 148 | ```bash 149 | python main.py --root_path data --video_path data/something/v1/img --annotation_path sthv1.json \ 150 | --result_path resnet_strg_imgnet_bs32 --dataset somethingv1 --n_classes 174 --n_pretrain_classes 700 \ 151 | --ft_begin_module fc --tensorboard --wandb --conv1_t_size 5 --learning_rate 0.00125 --sample_duration 32 \ 152 | --n_epochs 100 --multistep_milestones 90 --model resnet_strg --model_depth 50 --batch_size 32 \ 153 | --n_threads 8 --checkpoint 1 154 | ``` 155 | 156 | Then, we train with the GCN module for 30 epochs with a learning rate of 0.000125. 157 | 158 | ```bash 159 | python main.py --root_path data --video_path data/something/v1/img --annotation_path sthv1.json \ 160 | --result_path resnet_strg_imgnet_32_gcn --dataset somethingv1 --n_classes 174 --n_pretrain_classes 174 \ 161 | --ft_begin_module fc --tensorboard --wandb --conv1_t_size 5 --learning_rate 0.000125 \ 162 | --sample_duration 32 --n_epochs 30 --model resnet_strg --model_depth 50 --batch_size 32 \ 163 | --nrois 10 --det_interval 2 --strg \ 164 | --n_threads 8 --checkpoint 1 --pretrain_path resnet_strg_imgnet_bs32/save_100.pth 165 | ``` 166 | 167 | ## Results on Something-Something-v1 168 | 169 | ### The published results 170 | 171 | | Model name | ResNet-50-I3D | ResNet-50-I3D + STRG | 172 | | ------------------ |---------------- | -------------- | 173 | | Top-1 Accuracy | 41.6% | 43.3% | 174 | 175 | 176 | ### This repo's results (without a Kinetics pre-trained model) 177 | 178 | | Model name | ResNet-50-I3D | ResNet-50-I3D + STRG | 179 | | ------------------ |---------------- | -------------- | 180 | | Top-1 Accuracy | 23.2% | 24.5% | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /rgcn_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from torch.nn.parameter import Parameter 5 | import math 6 | import pdb 7 | import time 8 | 9 | from module.gcn import GCN, GraphConvolution 10 | from module.roi_graph import get_st_graph 11 | 12 | 13 | class RGCN(torch.nn.Module): 14 | def __init__(self, in_channel=512, out_channel=512, test_mode=False, 15 | dropout=0.5, 16 | separate_fb=True): 17 | super(RGCN, self).__init__() 18 | 19 | # 1 by 1 conv -> 512 wang: 2048 -> 512 20 | self.out_channel = out_channel 21 | in_channel = in_channel # 512 22 | dropout = dropout 23 | self.separate_fb = separate_fb 24 | 25 | 26 | # wang2018video differentiates forward graph and backward graph, 27 | # but in this implementation we ignore this.
28 | 29 | self.sim_embed1 = nn.Linear(in_channel, in_channel, bias=False) 30 | self.sim_embed2 = nn.Linear(in_channel, in_channel, bias=False) 31 | 32 | self.st_gc1 = GraphConvolution(in_channel, in_channel, bias=False, batch=True) 33 | self.st_gc2 = GraphConvolution(in_channel, in_channel, bias=False, batch=True) 34 | self.st_gc3 = GraphConvolution(in_channel, self.out_channel, bias=False, batch=True) 35 | if self.separate_fb: 36 | self.st_gc1_back = GraphConvolution(in_channel, in_channel, bias=False, batch=True) 37 | self.st_gc2_back = GraphConvolution(in_channel, in_channel, bias=False, batch=True) 38 | self.st_gc3_back = GraphConvolution(in_channel, self.out_channel, bias=False, batch=True) 39 | 40 | self.sim_gc1 = GraphConvolution(in_channel, in_channel, bias=False, batch=True) 41 | self.sim_gc2 = GraphConvolution(in_channel, in_channel, bias=False, batch=True) 42 | self.sim_gc3 = GraphConvolution(in_channel, self.out_channel, bias=False, batch=True) 43 | 44 | self.dropout = nn.Dropout(dropout) 45 | self.init_weight() 46 | 47 | 48 | def st_GCN(self, input, front_graph, back_graph=None): 49 | input = input.squeeze(2) 50 | out = F.relu(self.st_gc1(input,front_graph)) 51 | if self.separate_fb: 52 | out += F.relu(self.st_gc1_back(input,back_graph)) 53 | # out = self.dropout(out) 54 | 55 | out2 = F.relu(self.st_gc2(out,front_graph)) 56 | if self.separate_fb: 57 | out2 += F.relu(self.st_gc2_back(out,back_graph)) 58 | out = out2 59 | # out = self.dropout(out2) 60 | 61 | out2 = F.relu(self.st_gc3(out,front_graph)) 62 | if self.separate_fb: 63 | out2 += F.relu(self.st_gc3_back(out,back_graph)) 64 | return out2 65 | 66 | 67 | def sim_GCN(self, input, adj): 68 | out = F.relu(self.sim_gc1(input,adj)) 69 | # out = self.dropout(out) 70 | out = F.relu(self.sim_gc2(out,adj)) 71 | # out = self.dropout(out) 72 | out = F.relu(self.sim_gc3(out,adj)) 73 | return out 74 | 75 | 76 | 77 | def init_weight(self): 78 | # nn.init.constant_(self.sim_gc1.bias.data, 0) 79 | # nn.init.constant_(self.sim_gc2.bias.data, 0) 80 | # nn.init.constant_(self.sim_gc3.bias.data, 0) 81 | # 82 | # nn.init.constant_(self.st_gc1.bias.data, 0) 83 | # nn.init.constant_(self.st_gc2.bias.data, 0) 84 | # nn.init.constant_(self.st_gc3.bias.data, 0) 85 | 86 | nn.init.normal_(self.sim_gc1.weight.data, 0, 0.001) 87 | nn.init.normal_(self.sim_gc2.weight.data, 0, 0.001) 88 | nn.init.normal_(self.st_gc1.weight.data, 0, 0.001) 89 | nn.init.normal_(self.st_gc2.weight.data, 0, 0.001) 90 | 91 | nn.init.normal_(self.st_gc3.weight.data, 0, 0.001) 92 | nn.init.normal_(self.sim_gc3.weight.data, 0, 0.001) 93 | # nn.init.constant_(self.sim_gc3.weight.data, 0) 94 | # nn.init.constant_(self.st_gc3.weight.data, 0) 95 | 96 | if self.separate_fb: 97 | nn.init.normal_(self.st_gc1_back.weight.data, 0, 0.001) 98 | nn.init.normal_(self.st_gc2_back.weight.data, 0, 0.001) 99 | nn.init.constant_(self.st_gc3_back.weight.data, 0) 100 | 101 | 102 | 103 | 104 | def generate_st_graphs(self, rois, connection, return_dict, st=0): 105 | for i, (r, c) in enumerate(zip(rois, connection)): 106 | return_dict[i+st] = get_st_graph(r,c) 107 | 108 | 109 | 110 | def forward(self, rois_features, rois): 111 | front_graph, back_graph = get_st_graph(rois) 112 | 113 | front_graph = front_graph.to(rois.device).detach() 114 | back_graph = back_graph.to(rois.device).detach() 115 | 116 | B, T, N, C = rois_features.size() 117 | N_rois = T*N 118 | rois_features = rois_features.view(B, N_rois, -1) 119 | sim_graph = self.sim_graph(rois_features).detach() 120 | sim_gcn = 
self.sim_GCN(rois_features, sim_graph) 121 | st_gcn = self.st_GCN(rois_features, front_graph, back_graph) 122 | gcn_out = sim_gcn + st_gcn 123 | gcn_out = gcn_out.mean(1) 124 | gcn_out = self.dropout(gcn_out) 125 | return gcn_out 126 | 127 | 128 | 129 | def sim_graph(self, features): 130 | sim1 = self.sim_embed1(features) 131 | sim2 = self.sim_embed2(features) 132 | sim_features = torch.matmul(sim1, sim2.transpose(1,2)) # d x d mat. 133 | sim_graph = F.softmax(sim_features, dim=-1) 134 | return sim_graph 135 | 136 | 137 | def get_optim_policies(self): 138 | 139 | normal_weight = [] 140 | normal_bias = [] 141 | 142 | for m in self.modules(): 143 | if isinstance(m, torch.nn.Linear): 144 | ps = list(m.parameters()) 145 | normal_weight.append(ps[0]) 146 | if len(ps) == 2: 147 | normal_bias.append(ps[1]) 148 | elif isinstance(m, GraphConvolution): 149 | ps = list(m.parameters()) 150 | normal_weight.append(ps[0]) 151 | if len(ps) == 2: 152 | normal_bias.append(ps[1]) 153 | elif 'Conv' in str(type(m)): 154 | ps = list(m.parameters()) 155 | normal_weight.append(ps[0]) 156 | if len(ps) == 2: 157 | normal_bias.append(ps[1]) 158 | elif len(m._modules) == 0: 159 | if len(list(m.parameters())) > 0: 160 | raise ValueError("New atomic module type: {}. Need to give it a learning policy".format(type(m))) 161 | 162 | return [ 163 | {'params': normal_weight, 'lr_mult': 1, 'decay_mult': 1, 164 | 'name': "normal_weight"}, 165 | {'params': normal_bias, 'lr_mult': 2, 'decay_mult': 0, 166 | 'name': "normal_bias"}, 167 | ] 168 | 169 | 170 | if __name__ == '__main__': 171 | rois = torch.rand((4,8,10,4)) 172 | rois_features = torch.rand((4,8,10,512)) 173 | rgcn = RGCN() 174 | out = rgcn(rois_features, rois) 175 | 176 | pdb.set_trace() 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /models/densenet.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from collections import OrderedDict 7 | 8 | 9 | class _DenseLayer(nn.Sequential): 10 | 11 | def __init__(self, num_input_features, growth_rate, bn_size, drop_rate): 12 | super().__init__() 13 | self.add_module('norm1', nn.BatchNorm3d(num_input_features)) 14 | self.add_module('relu1', nn.ReLU(inplace=True)) 15 | self.add_module( 16 | 'conv1', 17 | nn.Conv3d(num_input_features, 18 | bn_size * growth_rate, 19 | kernel_size=1, 20 | stride=1, 21 | bias=False)) 22 | self.add_module('norm2', nn.BatchNorm3d(bn_size * growth_rate)) 23 | self.add_module('relu2', nn.ReLU(inplace=True)) 24 | self.add_module( 25 | 'conv2', 26 | nn.Conv3d(bn_size * growth_rate, 27 | growth_rate, 28 | kernel_size=3, 29 | stride=1, 30 | padding=1, 31 | bias=False)) 32 | self.drop_rate = drop_rate 33 | 34 | def forward(self, x): 35 | new_features = super().forward(x) 36 | if self.drop_rate > 0: 37 | new_features = F.dropout(new_features, 38 | p=self.drop_rate, 39 | training=self.training) 40 | return torch.cat([x, new_features], 1) 41 | 42 | 43 | class _DenseBlock(nn.Sequential): 44 | 45 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, 46 | drop_rate): 47 | super().__init__() 48 | for i in range(num_layers): 49 | layer = _DenseLayer(num_input_features + i * growth_rate, 50 | growth_rate, bn_size, drop_rate) 51 | self.add_module('denselayer{}'.format(i + 1), layer) 52 | 53 | 54 | class _Transition(nn.Sequential): 55 | 56 | def __init__(self, num_input_features, num_output_features): 57 | 
super().__init__() 58 | self.add_module('norm', nn.BatchNorm3d(num_input_features)) 59 | self.add_module('relu', nn.ReLU(inplace=True)) 60 | self.add_module( 61 | 'conv', 62 | nn.Conv3d(num_input_features, 63 | num_output_features, 64 | kernel_size=1, 65 | stride=1, 66 | bias=False)) 67 | self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2)) 68 | 69 | 70 | class DenseNet(nn.Module): 71 | """Densenet-BC model class 72 | Args: 73 | growth_rate (int) - how many filters to add each layer (k in paper) 74 | block_config (list of 4 ints) - how many layers in each pooling block 75 | num_init_features (int) - the number of filters to learn in the first convolution layer 76 | bn_size (int) - multiplicative factor for number of bottle neck layers 77 | (i.e. bn_size * k features in the bottleneck layer) 78 | drop_rate (float) - dropout rate after each dense layer 79 | num_classes (int) - number of classification classes 80 | """ 81 | 82 | def __init__(self, 83 | n_input_channels=3, 84 | conv1_t_size=7, 85 | conv1_t_stride=1, 86 | no_max_pool=False, 87 | growth_rate=32, 88 | block_config=(6, 12, 24, 16), 89 | num_init_features=64, 90 | bn_size=4, 91 | drop_rate=0, 92 | num_classes=1000): 93 | 94 | super().__init__() 95 | 96 | # First convolution 97 | self.features = [('conv1', 98 | nn.Conv3d(n_input_channels, 99 | num_init_features, 100 | kernel_size=(conv1_t_size, 7, 7), 101 | stride=(conv1_t_stride, 2, 2), 102 | padding=(conv1_t_size // 2, 3, 3), 103 | bias=False)), 104 | ('norm1', nn.BatchNorm3d(num_init_features)), 105 | ('relu1', nn.ReLU(inplace=True))] 106 | if not no_max_pool: 107 | self.features.append( 108 | ('pool1', nn.MaxPool3d(kernel_size=3, stride=2, padding=1))) 109 | self.features = nn.Sequential(OrderedDict(self.features)) 110 | 111 | # Each denseblock 112 | num_features = num_init_features 113 | for i, num_layers in enumerate(block_config): 114 | block = _DenseBlock(num_layers=num_layers, 115 | num_input_features=num_features, 116 | bn_size=bn_size, 117 | growth_rate=growth_rate, 118 | drop_rate=drop_rate) 119 | self.features.add_module('denseblock{}'.format(i + 1), block) 120 | num_features = num_features + num_layers * growth_rate 121 | if i != len(block_config) - 1: 122 | trans = _Transition(num_input_features=num_features, 123 | num_output_features=num_features // 2) 124 | self.features.add_module('transition{}'.format(i + 1), trans) 125 | num_features = num_features // 2 126 | 127 | # Final batch norm 128 | self.features.add_module('norm5', nn.BatchNorm3d(num_features)) 129 | 130 | for m in self.modules(): 131 | if isinstance(m, nn.Conv3d): 132 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 133 | elif isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm2d): 134 | m.weight.data.fill_(1) 135 | m.bias.data.zero_() 136 | 137 | # Linear layer 138 | self.classifier = nn.Linear(num_features, num_classes) 139 | 140 | for m in self.modules(): 141 | if isinstance(m, nn.Conv3d): 142 | nn.init.kaiming_normal_(m.weight, 143 | mode='fan_out', 144 | nonlinearity='relu') 145 | elif isinstance(m, nn.BatchNorm3d): 146 | nn.init.constant_(m.weight, 1) 147 | nn.init.constant_(m.bias, 0) 148 | elif isinstance(m, nn.Linear): 149 | nn.init.constant_(m.bias, 0) 150 | 151 | def forward(self, x): 152 | features = self.features(x) 153 | out = F.relu(features, inplace=True) 154 | out = F.adaptive_avg_pool3d(out, 155 | output_size=(1, 1, 156 | 1)).view(features.size(0), -1) 157 | out = self.classifier(out) 158 | return out 159 | 160 | 161 | def generate_model(model_depth, 
**kwargs): 162 | assert model_depth in [121, 169, 201, 264] 163 | 164 | if model_depth == 121: 165 | model = DenseNet(num_init_features=64, 166 | growth_rate=32, 167 | block_config=(6, 12, 24, 16), 168 | **kwargs) 169 | elif model_depth == 169: 170 | model = DenseNet(num_init_features=64, 171 | growth_rate=32, 172 | block_config=(6, 12, 32, 32), 173 | **kwargs) 174 | elif model_depth == 201: 175 | model = DenseNet(num_init_features=64, 176 | growth_rate=32, 177 | block_config=(6, 12, 48, 32), 178 | **kwargs) 179 | elif model_depth == 264: 180 | model = DenseNet(num_init_features=64, 181 | growth_rate=32, 182 | block_config=(6, 12, 64, 48), 183 | **kwargs) 184 | 185 | return model -------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | from functools import partial 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | def get_inplanes(): 9 | return [64, 128, 256, 512] 10 | 11 | 12 | def conv3x3x3(in_planes, out_planes, stride=1): 13 | return nn.Conv3d(in_planes, 14 | out_planes, 15 | kernel_size=3, 16 | stride=stride, 17 | padding=1, 18 | bias=False) 19 | 20 | 21 | def conv1x1x1(in_planes, out_planes, stride=1): 22 | return nn.Conv3d(in_planes, 23 | out_planes, 24 | kernel_size=1, 25 | stride=stride, 26 | bias=False) 27 | 28 | 29 | class BasicBlock(nn.Module): 30 | expansion = 1 31 | 32 | def __init__(self, in_planes, planes, stride=1, downsample=None): 33 | super().__init__() 34 | 35 | self.conv1 = conv3x3x3(in_planes, planes, stride) 36 | self.bn1 = nn.BatchNorm3d(planes) 37 | self.relu = nn.ReLU(inplace=True) 38 | self.conv2 = conv3x3x3(planes, planes) 39 | self.bn2 = nn.BatchNorm3d(planes) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.conv1(x) 47 | out = self.bn1(out) 48 | out = self.relu(out) 49 | 50 | out = self.conv2(out) 51 | out = self.bn2(out) 52 | 53 | if self.downsample is not None: 54 | residual = self.downsample(x) 55 | 56 | out += residual 57 | out = self.relu(out) 58 | 59 | return out 60 | 61 | 62 | class Bottleneck(nn.Module): 63 | expansion = 4 64 | 65 | def __init__(self, in_planes, planes, stride=1, downsample=None): 66 | super().__init__() 67 | 68 | self.conv1 = conv1x1x1(in_planes, planes) 69 | self.bn1 = nn.BatchNorm3d(planes) 70 | self.conv2 = conv3x3x3(planes, planes, stride) 71 | self.bn2 = nn.BatchNorm3d(planes) 72 | self.conv3 = conv1x1x1(planes, planes * self.expansion) 73 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 74 | self.relu = nn.ReLU(inplace=True) 75 | self.downsample = downsample 76 | self.stride = stride 77 | 78 | def forward(self, x): 79 | residual = x 80 | 81 | out = self.conv1(x) 82 | out = self.bn1(out) 83 | out = self.relu(out) 84 | 85 | out = self.conv2(out) 86 | out = self.bn2(out) 87 | out = self.relu(out) 88 | 89 | out = self.conv3(out) 90 | out = self.bn3(out) 91 | 92 | if self.downsample is not None: 93 | residual = self.downsample(x) 94 | 95 | out += residual 96 | out = self.relu(out) 97 | 98 | return out 99 | 100 | 101 | class ResNet(nn.Module): 102 | 103 | def __init__(self, 104 | block, 105 | layers, 106 | block_inplanes, 107 | n_input_channels=3, 108 | conv1_t_size=7, 109 | conv1_t_stride=1, 110 | no_max_pool=False, 111 | shortcut_type='B', 112 | widen_factor=1.0, 113 | n_classes=400): 114 | super().__init__() 115 | 116 | block_inplanes = [int(x * widen_factor) for x 
in block_inplanes] 117 | 118 | self.in_planes = block_inplanes[0] 119 | self.no_max_pool = no_max_pool 120 | 121 | self.conv1 = nn.Conv3d(n_input_channels, 122 | self.in_planes, 123 | kernel_size=(conv1_t_size, 7, 7), 124 | stride=(conv1_t_stride, 2, 2), 125 | padding=(conv1_t_size // 2, 3, 3), 126 | bias=False) 127 | self.bn1 = nn.BatchNorm3d(self.in_planes) 128 | self.relu = nn.ReLU(inplace=True) 129 | self.maxpool = nn.MaxPool3d(kernel_size=3, stride=2, padding=1) 130 | self.layer1 = self._make_layer(block, block_inplanes[0], layers[0], 131 | shortcut_type) 132 | self.layer2 = self._make_layer(block, 133 | block_inplanes[1], 134 | layers[1], 135 | shortcut_type, 136 | stride=2) 137 | self.layer3 = self._make_layer(block, 138 | block_inplanes[2], 139 | layers[2], 140 | shortcut_type, 141 | stride=2) 142 | self.layer4 = self._make_layer(block, 143 | block_inplanes[3], 144 | layers[3], 145 | shortcut_type, 146 | stride=2) 147 | 148 | self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) 149 | self.fc = nn.Linear(block_inplanes[3] * block.expansion, n_classes) 150 | 151 | for m in self.modules(): 152 | if isinstance(m, nn.Conv3d): 153 | nn.init.kaiming_normal_(m.weight, 154 | mode='fan_out', 155 | nonlinearity='relu') 156 | elif isinstance(m, nn.BatchNorm3d): 157 | nn.init.constant_(m.weight, 1) 158 | nn.init.constant_(m.bias, 0) 159 | 160 | def _downsample_basic_block(self, x, planes, stride): 161 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 162 | zero_pads = torch.zeros(out.size(0), planes - out.size(1), out.size(2), 163 | out.size(3), out.size(4)) 164 | if isinstance(out.data, torch.cuda.FloatTensor): 165 | zero_pads = zero_pads.cuda() 166 | 167 | out = torch.cat([out.data, zero_pads], dim=1) 168 | 169 | return out 170 | 171 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 172 | downsample = None 173 | if stride != 1 or self.in_planes != planes * block.expansion: 174 | if shortcut_type == 'A': 175 | downsample = partial(self._downsample_basic_block, 176 | planes=planes * block.expansion, 177 | stride=stride) 178 | else: 179 | downsample = nn.Sequential( 180 | conv1x1x1(self.in_planes, planes * block.expansion, stride), 181 | nn.BatchNorm3d(planes * block.expansion)) 182 | 183 | layers = [] 184 | layers.append( 185 | block(in_planes=self.in_planes, 186 | planes=planes, 187 | stride=stride, 188 | downsample=downsample)) 189 | self.in_planes = planes * block.expansion 190 | for i in range(1, blocks): 191 | layers.append(block(self.in_planes, planes)) 192 | 193 | return nn.Sequential(*layers) 194 | 195 | 196 | 197 | def extract_feature(self, x): 198 | x = self.conv1(x) 199 | x = self.bn1(x) 200 | x = self.relu(x) 201 | if not self.no_max_pool: 202 | x = self.maxpool(x) 203 | 204 | x = self.layer1(x) 205 | x = self.layer2(x) 206 | x = self.layer3(x) 207 | x = self.layer4(x) 208 | return x 209 | 210 | 211 | 212 | def forward(self, x): 213 | x = self.conv1(x) 214 | x = self.bn1(x) 215 | x = self.relu(x) 216 | if not self.no_max_pool: 217 | x = self.maxpool(x) 218 | 219 | x = self.layer1(x) 220 | x = self.layer2(x) 221 | x = self.layer3(x) 222 | x = self.layer4(x) 223 | 224 | x = self.avgpool(x) 225 | 226 | x = x.view(x.size(0), -1) 227 | x = self.fc(x) 228 | 229 | return x 230 | 231 | 232 | def generate_model(model_depth, **kwargs): 233 | assert model_depth in [10, 18, 34, 50, 101, 152, 200] 234 | 235 | if model_depth == 10: 236 | model = ResNet(BasicBlock, [1, 1, 1, 1], get_inplanes(), **kwargs) 237 | elif model_depth == 18: 238 | model = ResNet(BasicBlock, [2, 2, 2, 
2], get_inplanes(), **kwargs) 239 | elif model_depth == 34: 240 | model = ResNet(BasicBlock, [3, 4, 6, 3], get_inplanes(), **kwargs) 241 | elif model_depth == 50: 242 | model = ResNet(Bottleneck, [3, 4, 6, 3], get_inplanes(), **kwargs) 243 | elif model_depth == 101: 244 | model = ResNet(Bottleneck, [3, 4, 23, 3], get_inplanes(), **kwargs) 245 | elif model_depth == 152: 246 | model = ResNet(Bottleneck, [3, 8, 36, 3], get_inplanes(), **kwargs) 247 | elif model_depth == 200: 248 | model = ResNet(Bottleneck, [3, 24, 36, 3], get_inplanes(), **kwargs) 249 | 250 | return model 251 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | from torchvision import get_image_backend 2 | 3 | from datasets.videodataset import VideoDataset 4 | from datasets.videodataset_multiclips import (VideoDatasetMultiClips, 5 | collate_fn) 6 | from datasets.activitynet import ActivityNet 7 | from datasets.loader import VideoLoader, VideoLoaderHDF5, VideoLoaderFlowHDF5 8 | import pdb 9 | 10 | def image_name_formatter(x): 11 | return 'image_{:05d}.jpg'.format(x) 12 | 13 | def sthv2_image_name_formatter(x): 14 | return '{:06d}.jpg'.format(x) 15 | 16 | def sthv1_image_name_formatter(x): 17 | return '{:05d}.jpg'.format(x) 18 | 19 | def get_training_data(video_path, 20 | annotation_path, 21 | dataset_name, 22 | input_type, 23 | file_type, 24 | spatial_transform=None, 25 | temporal_transform=None, 26 | target_transform=None): 27 | assert dataset_name in [ 28 | 'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'somethingv2', 29 | 'somethingv1' 30 | ] 31 | assert input_type in ['rgb', 'flow'] 32 | assert file_type in ['jpg', 'hdf5'] 33 | if 'somethingv1' in dataset_name: 34 | formatter = sthv1_image_name_formatter 35 | elif 'somethingv2' in dataset_name: 36 | formatter = sthv2_image_name_formatter 37 | else: 38 | formatter = image_name_formatter 39 | if file_type == 'jpg': 40 | assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.' 
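# Loader selection (editor's note): with file_type == 'jpg' each clip is read frame-by-frame
# from individual images named by the dataset-specific formatter chosen above
# (e.g. 'image_00001.jpg' for Kinetics-style folders, '00001.jpg' for Something-Something v1);
# with 'hdf5' every video is expected as a single '<video_id>.hdf5' file and frames (or flow)
# are read from it via VideoLoaderHDF5 / VideoLoaderFlowHDF5.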
41 | 42 | if get_image_backend() == 'accimage': 43 | from datasets.loader import ImageLoaderAccImage 44 | loader = VideoLoader(formatter, ImageLoaderAccImage()) 45 | else: 46 | loader = VideoLoader(formatter) 47 | 48 | video_path_formatter = ( 49 | lambda root_path, label, video_id: root_path / label / video_id) 50 | else: 51 | if input_type == 'rgb': 52 | loader = VideoLoaderHDF5() 53 | else: 54 | loader = VideoLoaderFlowHDF5() 55 | video_path_formatter = (lambda root_path, label, video_id: root_path / 56 | label / '{}.hdf5'.format(video_id)) 57 | 58 | if dataset_name == 'activitynet': 59 | training_data = ActivityNet(video_path, 60 | annotation_path, 61 | 'training', 62 | spatial_transform=spatial_transform, 63 | temporal_transform=temporal_transform, 64 | target_transform=target_transform, 65 | video_loader=loader, 66 | video_path_formatter=video_path_formatter) 67 | else: 68 | training_data = VideoDataset(video_path, 69 | annotation_path, 70 | 'training', 71 | spatial_transform=spatial_transform, 72 | temporal_transform=temporal_transform, 73 | target_transform=target_transform, 74 | video_loader=loader, 75 | video_path_formatter=video_path_formatter) 76 | 77 | return training_data 78 | 79 | 80 | def get_validation_data(video_path, 81 | annotation_path, 82 | dataset_name, 83 | input_type, 84 | file_type, 85 | spatial_transform=None, 86 | temporal_transform=None, 87 | target_transform=None): 88 | assert dataset_name in [ 89 | 'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'somethingv2', 90 | 'somethingv1' 91 | ] 92 | assert input_type in ['rgb', 'flow'] 93 | assert file_type in ['jpg', 'hdf5'] 94 | 95 | if 'somethingv1' in dataset_name: 96 | formatter = sthv1_image_name_formatter 97 | elif 'somethingv2' in dataset_name: 98 | formatter = sthv2_image_name_formatter 99 | else: 100 | formatter = image_name_formatter 101 | 102 | if file_type == 'jpg': 103 | assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.' 
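# Editor's note: unlike training, the validation path below wraps non-ActivityNet datasets in
# VideoDatasetMultiClips and also returns its collate_fn, so each video yields several clips
# (presumably controlled by --n_val_samples elsewhere) whose scores are aggregated downstream.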
104 | if get_image_backend() == 'accimage': 105 | from datasets.loader import ImageLoaderAccImage 106 | loader = VideoLoader(formatter, ImageLoaderAccImage()) 107 | else: 108 | loader = VideoLoader(formatter) 109 | 110 | video_path_formatter = ( 111 | lambda root_path, label, video_id: root_path / label / video_id) 112 | else: 113 | if input_type == 'rgb': 114 | loader = VideoLoaderHDF5() 115 | else: 116 | loader = VideoLoaderFlowHDF5() 117 | video_path_formatter = (lambda root_path, label, video_id: root_path / 118 | label / '{}.hdf5'.format(video_id)) 119 | 120 | if dataset_name == 'activitynet': 121 | validation_data = ActivityNet(video_path, 122 | annotation_path, 123 | 'validation', 124 | spatial_transform=spatial_transform, 125 | temporal_transform=temporal_transform, 126 | target_transform=target_transform, 127 | video_loader=loader, 128 | video_path_formatter=video_path_formatter) 129 | else: 130 | validation_data = VideoDatasetMultiClips( 131 | video_path, 132 | annotation_path, 133 | 'validation', 134 | spatial_transform=spatial_transform, 135 | temporal_transform=temporal_transform, 136 | target_transform=target_transform, 137 | video_loader=loader, 138 | video_path_formatter=video_path_formatter) 139 | 140 | return validation_data, collate_fn 141 | 142 | 143 | def get_inference_data(video_path, 144 | annotation_path, 145 | dataset_name, 146 | input_type, 147 | file_type, 148 | inference_subset, 149 | spatial_transform=None, 150 | temporal_transform=None, 151 | target_transform=None): 152 | assert dataset_name in [ 153 | 'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'somethingv2' 154 | ] 155 | assert input_type in ['rgb', 'flow'] 156 | assert file_type in ['jpg', 'hdf5'] 157 | assert inference_subset in ['train', 'val', 'test'] 158 | 159 | if file_type == 'jpg': 160 | assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.' 
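# Editor's note: this inference path always uses image_name_formatter and 'somethingv1' is not
# listed in the dataset_name assertion above, so jpg-based inference appears to target only the
# Kinetics/UCF101/HMDB51-style frame naming, not the Something-Something layouts.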
161 | 162 | if get_image_backend() == 'accimage': 163 | from datasets.loader import ImageLoaderAccImage 164 | loader = VideoLoader(image_name_formatter, ImageLoaderAccImage()) 165 | else: 166 | loader = VideoLoader(image_name_formatter) 167 | 168 | video_path_formatter = ( 169 | lambda root_path, label, video_id: root_path / label / video_id) 170 | else: 171 | if input_type == 'rgb': 172 | loader = VideoLoaderHDF5() 173 | else: 174 | loader = VideoLoaderFlowHDF5() 175 | video_path_formatter = (lambda root_path, label, video_id: root_path / 176 | label / '{}.hdf5'.format(video_id)) 177 | 178 | if inference_subset == 'train': 179 | subset = 'training' 180 | elif inference_subset == 'val': 181 | subset = 'validation' 182 | elif inference_subset == 'test': 183 | subset = 'testing' 184 | if dataset_name == 'activitynet': 185 | inference_data = ActivityNet(video_path, 186 | annotation_path, 187 | subset, 188 | spatial_transform=spatial_transform, 189 | temporal_transform=temporal_transform, 190 | target_transform=target_transform, 191 | video_loader=loader, 192 | video_path_formatter=video_path_formatter, 193 | is_untrimmed_setting=True) 194 | else: 195 | inference_data = VideoDatasetMultiClips( 196 | video_path, 197 | annotation_path, 198 | subset, 199 | spatial_transform=spatial_transform, 200 | temporal_transform=temporal_transform, 201 | target_transform=target_transform, 202 | video_loader=loader, 203 | video_path_formatter=video_path_formatter, 204 | target_type=['video_id', 'segment']) 205 | 206 | return inference_data, collate_fn 207 | -------------------------------------------------------------------------------- /models/resnet2p1d.py: -------------------------------------------------------------------------------- 1 | import math 2 | from functools import partial 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | def get_inplanes(): 10 | return [64, 128, 256, 512] 11 | 12 | 13 | def conv1x3x3(in_planes, mid_planes, stride=1): 14 | return nn.Conv3d(in_planes, 15 | mid_planes, 16 | kernel_size=(1, 3, 3), 17 | stride=(1, stride, stride), 18 | padding=(0, 1, 1), 19 | bias=False) 20 | 21 | 22 | def conv3x1x1(mid_planes, planes, stride=1): 23 | return nn.Conv3d(mid_planes, 24 | planes, 25 | kernel_size=(3, 1, 1), 26 | stride=(stride, 1, 1), 27 | padding=(1, 0, 0), 28 | bias=False) 29 | 30 | 31 | def conv1x1x1(in_planes, out_planes, stride=1): 32 | return nn.Conv3d(in_planes, 33 | out_planes, 34 | kernel_size=1, 35 | stride=stride, 36 | bias=False) 37 | 38 | 39 | class BasicBlock(nn.Module): 40 | expansion = 1 41 | 42 | def __init__(self, in_planes, planes, stride=1, downsample=None): 43 | super().__init__() 44 | 45 | n_3d_parameters1 = in_planes * planes * 3 * 3 * 3 46 | n_2p1d_parameters1 = in_planes * 3 * 3 + 3 * planes 47 | mid_planes1 = n_3d_parameters1 // n_2p1d_parameters1 48 | self.conv1_s = conv1x3x3(in_planes, mid_planes1, stride) 49 | self.bn1_s = nn.BatchNorm3d(mid_planes1) 50 | self.conv1_t = conv3x1x1(mid_planes1, planes, stride) 51 | self.bn1_t = nn.BatchNorm3d(planes) 52 | 53 | n_3d_parameters2 = planes * planes * 3 * 3 * 3 54 | n_2p1d_parameters2 = planes * 3 * 3 + 3 * planes 55 | mid_planes2 = n_3d_parameters2 // n_2p1d_parameters2 56 | self.conv2_s = conv1x3x3(planes, mid_planes2) 57 | self.bn2_s = nn.BatchNorm3d(mid_planes2) 58 | self.conv2_t = conv3x1x1(mid_planes2, planes) 59 | self.bn2_t = nn.BatchNorm3d(planes) 60 | 61 | self.relu = nn.ReLU(inplace=True) 62 | self.downsample = downsample 63 | self.stride = stride 64 | 65 | 
def forward(self, x): 66 | residual = x 67 | 68 | out = self.conv1_s(x) 69 | out = self.bn1_s(out) 70 | out = self.relu(out) 71 | out = self.conv1_t(out) 72 | out = self.bn1_t(out) 73 | out = self.relu(out) 74 | 75 | out = self.conv2_s(out) 76 | out = self.bn2_s(out) 77 | out = self.relu(out) 78 | out = self.conv2_t(out) 79 | out = self.bn2_t(out) 80 | 81 | if self.downsample is not None: 82 | residual = self.downsample(x) 83 | 84 | out += residual 85 | out = self.relu(out) 86 | 87 | return out 88 | 89 | 90 | class Bottleneck(nn.Module): 91 | expansion = 4 92 | 93 | def __init__(self, in_planes, planes, stride=1, downsample=None): 94 | super().__init__() 95 | 96 | self.conv1 = conv1x1x1(in_planes, planes) 97 | self.bn1 = nn.BatchNorm3d(planes) 98 | 99 | n_3d_parameters = planes * planes * 3 * 3 * 3 100 | n_2p1d_parameters = planes * 3 * 3 + 3 * planes 101 | mid_planes = n_3d_parameters // n_2p1d_parameters 102 | self.conv2_s = conv1x3x3(planes, mid_planes, stride) 103 | self.bn2_s = nn.BatchNorm3d(mid_planes) 104 | self.conv2_t = conv3x1x1(mid_planes, planes, stride) 105 | self.bn2_t = nn.BatchNorm3d(planes) 106 | 107 | self.conv3 = conv1x1x1(planes, planes * self.expansion) 108 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 109 | self.relu = nn.ReLU(inplace=True) 110 | self.downsample = downsample 111 | self.stride = stride 112 | 113 | def forward(self, x): 114 | residual = x 115 | 116 | out = self.conv1(x) 117 | out = self.bn1(out) 118 | out = self.relu(out) 119 | 120 | out = self.conv2_s(out) 121 | out = self.bn2_s(out) 122 | out = self.relu(out) 123 | out = self.conv2_t(out) 124 | out = self.bn2_t(out) 125 | out = self.relu(out) 126 | 127 | out = self.conv3(out) 128 | out = self.bn3(out) 129 | 130 | if self.downsample is not None: 131 | residual = self.downsample(x) 132 | 133 | out += residual 134 | out = self.relu(out) 135 | 136 | return out 137 | 138 | 139 | class ResNet(nn.Module): 140 | 141 | def __init__(self, 142 | block, 143 | layers, 144 | block_inplanes, 145 | n_input_channels=3, 146 | conv1_t_size=7, 147 | conv1_t_stride=1, 148 | no_max_pool=False, 149 | shortcut_type='B', 150 | widen_factor=1.0, 151 | n_classes=400): 152 | super().__init__() 153 | 154 | block_inplanes = [int(x * widen_factor) for x in block_inplanes] 155 | 156 | self.in_planes = block_inplanes[0] 157 | self.no_max_pool = no_max_pool 158 | 159 | n_3d_parameters = 3 * self.in_planes * conv1_t_size * 7 * 7 160 | n_2p1d_parameters = 3 * 7 * 7 + conv1_t_size * self.in_planes 161 | mid_planes = n_3d_parameters // n_2p1d_parameters 162 | self.conv1_s = nn.Conv3d(n_input_channels, 163 | mid_planes, 164 | kernel_size=(1, 7, 7), 165 | stride=(1, 2, 2), 166 | padding=(0, 3, 3), 167 | bias=False) 168 | self.bn1_s = nn.BatchNorm3d(mid_planes) 169 | self.conv1_t = nn.Conv3d(mid_planes, 170 | self.in_planes, 171 | kernel_size=(conv1_t_size, 1, 1), 172 | stride=(conv1_t_stride, 1, 1), 173 | padding=(conv1_t_size // 2, 0, 0), 174 | bias=False) 175 | self.bn1_t = nn.BatchNorm3d(self.in_planes) 176 | self.relu = nn.ReLU(inplace=True) 177 | 178 | self.maxpool = nn.MaxPool3d(kernel_size=3, stride=2, padding=1) 179 | self.layer1 = self._make_layer(block, block_inplanes[0], layers[0], 180 | shortcut_type) 181 | self.layer2 = self._make_layer(block, 182 | block_inplanes[1], 183 | layers[1], 184 | shortcut_type, 185 | stride=2) 186 | self.layer3 = self._make_layer(block, 187 | block_inplanes[2], 188 | layers[2], 189 | shortcut_type, 190 | stride=2) 191 | self.layer4 = self._make_layer(block, 192 | block_inplanes[3], 193 | 
layers[3], 194 | shortcut_type, 195 | stride=2) 196 | 197 | self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) 198 | self.fc = nn.Linear(block_inplanes[3] * block.expansion, n_classes) 199 | 200 | for m in self.modules(): 201 | if isinstance(m, nn.Conv3d): 202 | nn.init.kaiming_normal_(m.weight, 203 | mode='fan_out', 204 | nonlinearity='relu') 205 | elif isinstance(m, nn.BatchNorm3d): 206 | nn.init.constant_(m.weight, 1) 207 | nn.init.constant_(m.bias, 0) 208 | 209 | def _downsample_basic_block(self, x, planes, stride): 210 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 211 | zero_pads = torch.zeros(out.size(0), planes - out.size(1), out.size(2), 212 | out.size(3), out.size(4)) 213 | if isinstance(out.data, torch.cuda.FloatTensor): 214 | zero_pads = zero_pads.cuda() 215 | 216 | out = torch.cat([out.data, zero_pads], dim=1) 217 | 218 | return out 219 | 220 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 221 | downsample = None 222 | if stride != 1 or self.in_planes != planes * block.expansion: 223 | if shortcut_type == 'A': 224 | downsample = partial(self._downsample_basic_block, 225 | planes=planes * block.expansion, 226 | stride=stride) 227 | else: 228 | downsample = nn.Sequential( 229 | conv1x1x1(self.in_planes, planes * block.expansion, stride), 230 | nn.BatchNorm3d(planes * block.expansion)) 231 | 232 | layers = [] 233 | layers.append( 234 | block(in_planes=self.in_planes, 235 | planes=planes, 236 | stride=stride, 237 | downsample=downsample)) 238 | self.in_planes = planes * block.expansion 239 | for i in range(1, blocks): 240 | layers.append(block(self.in_planes, planes)) 241 | 242 | return nn.Sequential(*layers) 243 | 244 | def forward(self, x): 245 | x = self.conv1_s(x) 246 | x = self.bn1_s(x) 247 | x = self.relu(x) 248 | x = self.conv1_t(x) 249 | x = self.bn1_t(x) 250 | x = self.relu(x) 251 | 252 | if not self.no_max_pool: 253 | x = self.maxpool(x) 254 | 255 | x = self.layer1(x) 256 | x = self.layer2(x) 257 | x = self.layer3(x) 258 | x = self.layer4(x) 259 | 260 | x = self.avgpool(x) 261 | 262 | x = x.view(x.size(0), -1) 263 | x = self.fc(x) 264 | 265 | return x 266 | 267 | 268 | def generate_model(model_depth, **kwargs): 269 | assert model_depth in [10, 18, 34, 50, 101, 152, 200] 270 | 271 | if model_depth == 10: 272 | model = ResNet(BasicBlock, [1, 1, 1, 1], get_inplanes(), **kwargs) 273 | elif model_depth == 18: 274 | model = ResNet(BasicBlock, [2, 2, 2, 2], get_inplanes(), **kwargs) 275 | elif model_depth == 34: 276 | model = ResNet(BasicBlock, [3, 4, 6, 3], get_inplanes(), **kwargs) 277 | elif model_depth == 50: 278 | model = ResNet(Bottleneck, [3, 4, 6, 3], get_inplanes(), **kwargs) 279 | elif model_depth == 101: 280 | model = ResNet(Bottleneck, [3, 4, 23, 3], get_inplanes(), **kwargs) 281 | elif model_depth == 152: 282 | model = ResNet(Bottleneck, [3, 8, 36, 3], get_inplanes(), **kwargs) 283 | elif model_depth == 200: 284 | model = ResNet(Bottleneck, [3, 24, 36, 3], get_inplanes(), **kwargs) 285 | 286 | return model -------------------------------------------------------------------------------- /models/resnet_strg.py: -------------------------------------------------------------------------------- 1 | import math 2 | from functools import partial 3 | import pdb 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torchvision.models as models 8 | 9 | def get_inplanes(): 10 | return [64, 128, 256, 512] 11 | 12 | 13 | def conv3x3x3(in_planes, out_planes, stride=1): 14 | return nn.Conv3d(in_planes, 15 | 
out_planes, 16 | kernel_size=3, 17 | stride=stride, 18 | padding=1, 19 | bias=False) 20 | 21 | 22 | def conv1x1x1(in_planes, out_planes, stride=1): 23 | return nn.Conv3d(in_planes, 24 | out_planes, 25 | kernel_size=1, 26 | stride=stride, 27 | bias=False) 28 | 29 | 30 | def conv1x3x3(in_planes, out_planes, stride=1): 31 | return nn.Conv3d(in_planes, 32 | out_planes, 33 | kernel_size=(1,3,3), 34 | stride=stride, 35 | padding=(0,1,1), 36 | bias=False) 37 | 38 | 39 | def conv3x1x1(in_planes, out_planes, stride=1): 40 | return nn.Conv3d(in_planes, 41 | out_planes, 42 | kernel_size=(3,1,1), 43 | stride=stride, 44 | padding=(1,0,0), 45 | bias=False) 46 | 47 | 48 | 49 | class BasicBlock(nn.Module): 50 | expansion = 1 51 | 52 | def __init__(self, in_planes, planes, stride=1, downsample=None): 53 | super().__init__() 54 | 55 | self.conv1 = conv3x3x3(in_planes, planes, stride) 56 | self.bn1 = nn.BatchNorm3d(planes) 57 | self.relu = nn.ReLU(inplace=True) 58 | self.conv2 = conv3x3x3(planes, planes) 59 | self.bn2 = nn.BatchNorm3d(planes) 60 | self.downsample = downsample 61 | self.stride = stride 62 | 63 | def forward(self, x): 64 | residual = x 65 | 66 | out = self.conv1(x) 67 | out = self.bn1(out) 68 | out = self.relu(out) 69 | 70 | out = self.conv2(out) 71 | out = self.bn2(out) 72 | 73 | if self.downsample is not None: 74 | residual = self.downsample(x) 75 | 76 | out += residual 77 | out = self.relu(out) 78 | 79 | return out 80 | 81 | 82 | class Bottleneck(nn.Module): 83 | expansion = 4 84 | 85 | def __init__(self, in_planes, planes, stride=1, downsample=None): 86 | super().__init__() 87 | 88 | self.conv1 = conv3x1x1(in_planes, planes) 89 | self.bn1 = nn.BatchNorm3d(planes) 90 | self.conv2 = conv1x3x3(planes, planes, stride) 91 | self.bn2 = nn.BatchNorm3d(planes) 92 | self.conv3 = conv1x1x1(planes, planes * self.expansion) 93 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 94 | self.relu = nn.ReLU(inplace=True) 95 | self.downsample = downsample 96 | self.stride = stride 97 | 98 | def forward(self, x): 99 | residual = x 100 | 101 | out = self.conv1(x) 102 | out = self.bn1(out) 103 | out = self.relu(out) 104 | 105 | out = self.conv2(out) 106 | out = self.bn2(out) 107 | out = self.relu(out) 108 | 109 | out = self.conv3(out) 110 | out = self.bn3(out) 111 | 112 | if self.downsample is not None: 113 | residual = self.downsample(x) 114 | 115 | out += residual 116 | out = self.relu(out) 117 | 118 | return out 119 | 120 | 121 | class ResNet(nn.Module): 122 | 123 | def __init__(self, 124 | block, 125 | layers, 126 | block_inplanes, 127 | n_input_channels=3, 128 | conv1_t_size=5, 129 | conv1_t_stride=1, 130 | no_max_pool=False, 131 | shortcut_type='B', 132 | widen_factor=1.0, 133 | n_classes=400): 134 | super().__init__() 135 | 136 | block_inplanes = [int(x * widen_factor) for x in block_inplanes] 137 | 138 | self.in_planes = block_inplanes[0] 139 | self.no_max_pool = no_max_pool 140 | 141 | self.conv1 = nn.Conv3d(n_input_channels, 142 | self.in_planes, 143 | kernel_size=(conv1_t_size, 7, 7), 144 | stride=(conv1_t_stride, 2, 2), 145 | padding=(conv1_t_size // 2, 3, 3), 146 | bias=False) 147 | self.bn1 = nn.BatchNorm3d(self.in_planes) 148 | self.relu = nn.ReLU(inplace=True) 149 | self.maxpool1 = nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)) 150 | self.maxpool2 = nn.MaxPool3d(kernel_size=(3,1,1), stride=(2,1,1), padding=(1,0,0)) 151 | self.layer1 = self._make_layer(block, block_inplanes[0], layers[0], 152 | shortcut_type) 153 | self.layer2 = self._make_layer(block, 154 | 
block_inplanes[1], 155 | layers[1], 156 | shortcut_type, 157 | stride=(1,2,2)) 158 | self.layer3 = self._make_layer(block, 159 | block_inplanes[2], 160 | layers[2], 161 | shortcut_type, 162 | stride=(1,2,2)) 163 | self.layer4 = self._make_layer(block, 164 | block_inplanes[3], 165 | layers[3], 166 | shortcut_type, 167 | stride=1) 168 | 169 | self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) 170 | self.fc = nn.Linear(block_inplanes[3] * block.expansion, n_classes) 171 | 172 | for m in self.modules(): 173 | if isinstance(m, nn.Conv3d): 174 | nn.init.kaiming_normal_(m.weight, 175 | mode='fan_out', 176 | nonlinearity='relu') 177 | elif isinstance(m, nn.BatchNorm3d): 178 | nn.init.constant_(m.weight, 1) 179 | nn.init.constant_(m.bias, 0) 180 | 181 | def _downsample_basic_block(self, x, planes, stride): 182 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 183 | zero_pads = torch.zeros(out.size(0), planes - out.size(1), out.size(2), 184 | out.size(3), out.size(4)) 185 | if isinstance(out.data, torch.cuda.FloatTensor): 186 | zero_pads = zero_pads.cuda() 187 | 188 | out = torch.cat([out.data, zero_pads], dim=1) 189 | 190 | return out 191 | 192 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 193 | downsample = None 194 | if stride != 1 or self.in_planes != planes * block.expansion: 195 | if shortcut_type == 'A': 196 | downsample = partial(self._downsample_basic_block, 197 | planes=planes * block.expansion, 198 | stride=stride) 199 | else: 200 | downsample = nn.Sequential( 201 | conv1x1x1(self.in_planes, planes * block.expansion, stride), 202 | nn.BatchNorm3d(planes * block.expansion)) 203 | 204 | layers = [] 205 | layers.append( 206 | block(in_planes=self.in_planes, 207 | planes=planes, 208 | stride=stride, 209 | downsample=downsample)) 210 | self.in_planes = planes * block.expansion 211 | for i in range(1, blocks): 212 | layers.append(block(self.in_planes, planes)) 213 | 214 | return nn.Sequential(*layers) 215 | 216 | def extract_feature(self, x): 217 | x = self.conv1(x) 218 | x = self.bn1(x) 219 | x = self.relu(x) 220 | if not self.no_max_pool: 221 | x = self.maxpool1(x) 222 | 223 | x = self.layer1(x) 224 | if not self.no_max_pool: 225 | x = self.maxpool2(x) 226 | x = self.layer2(x) 227 | x = self.layer3(x) 228 | x = self.layer4(x) 229 | return x 230 | 231 | 232 | def forward(self, x): 233 | x = self.conv1(x) 234 | x = self.bn1(x) 235 | x = self.relu(x) 236 | if not self.no_max_pool: 237 | x = self.maxpool1(x) 238 | 239 | x = self.layer1(x) 240 | if not self.no_max_pool: 241 | x = self.maxpool2(x) 242 | x = self.layer2(x) 243 | x = self.layer3(x) 244 | x = self.layer4(x) 245 | x = self.avgpool(x) 246 | 247 | x = x.view(x.size(0), -1) 248 | x = self.fc(x) 249 | 250 | return x 251 | 252 | def load_imagenet_pretrained(self, resnet2d): # only ResNet 50 implemented 253 | print("Load ImageNet pre-trained weight") 254 | state_dict_2d = resnet2d.state_dict() 255 | state_dict = self.state_dict() 256 | for k in state_dict.keys(): 257 | v_2d = state_dict_2d[k] 258 | if len(state_dict[k].shape) != len(v_2d.shape): 259 | state_dict[k] = v_2d.unsqueeze(2) 260 | else: 261 | state_dict[k] = v_2d 262 | 263 | 264 | 265 | def generate_model(model_depth, **kwargs): 266 | assert model_depth in [10, 18, 34, 50, 101, 152, 200] 267 | 268 | if model_depth == 10: 269 | model = ResNet(BasicBlock, [1, 1, 1, 1], get_inplanes(), **kwargs) 270 | model.load_imagenet_pretrained(models.resnet10(pretrained=True)) 271 | elif model_depth == 18: 272 | model = ResNet(BasicBlock, [2, 2, 2, 2], get_inplanes(), 
**kwargs) 273 | model.load_imagenet_pretrained(models.resnet18(pretrained=True)) 274 | elif model_depth == 34: 275 | model = ResNet(BasicBlock, [3, 4, 6, 3], get_inplanes(), **kwargs) 276 | model.load_imagenet_pretrained(models.resnet34(pretrained=True)) 277 | elif model_depth == 50: 278 | model = ResNet(Bottleneck, [3, 4, 6, 3], get_inplanes(), **kwargs) 279 | model.load_imagenet_pretrained(models.resnet50(pretrained=True)) 280 | elif model_depth == 101: 281 | model = ResNet(Bottleneck, [3, 4, 23, 3], get_inplanes(), **kwargs) 282 | model.load_imagenet_pretrained(models.resnet101(pretrained=True)) 283 | elif model_depth == 152: 284 | model = ResNet(Bottleneck, [3, 8, 36, 3], get_inplanes(), **kwargs) 285 | model.load_imagenet_pretrained(models.resnet152(pretrained=True)) 286 | elif model_depth == 200: 287 | model = ResNet(Bottleneck, [3, 24, 36, 3], get_inplanes(), **kwargs) 288 | model.load_imagenet_pretrained(models.resnet200(pretrained=True)) 289 | 290 | return model 291 | 292 | if __name__ == '__main__': 293 | 294 | model = generate_model(model_depth=50, 295 | n_classes=174, 296 | n_input_channels=3, 297 | shortcut_type='B', 298 | conv1_t_size=5, 299 | conv1_t_stride=1, 300 | no_max_pool=False, 301 | widen_factor=1.0) 302 | model = model#.cuda() 303 | 304 | pdb.set_trace() 305 | inputs = torch.rand((4,3,32,224,224))#.cuda() 306 | out = model(inputs) 307 | pdb.set_trace() 308 | -------------------------------------------------------------------------------- /transform.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import torch 4 | from torch import nn, Tensor 5 | from torch.nn import functional as F 6 | import torchvision 7 | from torch.jit.annotations import List, Tuple, Dict, Optional 8 | 9 | from torchvision.models.detection.image_list import ImageList 10 | from torchvision.models.detection.roi_heads import paste_masks_in_image 11 | import pdb 12 | 13 | @torch.jit.unused 14 | def _resize_image_and_masks_onnx(image, self_min_size, self_max_size, target): 15 | # type: (Tensor, float, float, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]] 16 | from torch.onnx import operators 17 | im_shape = operators.shape_as_tensor(image)[-2:] 18 | min_size = torch.min(im_shape).to(dtype=torch.float32) 19 | max_size = torch.max(im_shape).to(dtype=torch.float32) 20 | scale_factor = torch.min(self_min_size / min_size, self_max_size / max_size) 21 | 22 | image = torch.nn.functional.interpolate( 23 | image[None], scale_factor=scale_factor, mode='bilinear', 24 | align_corners=False)[0] 25 | 26 | if target is None: 27 | return image, target 28 | 29 | if "masks" in target: 30 | mask = target["masks"] 31 | mask = F.interpolate(mask[:, None].float(), scale_factor=scale_factor)[:, 0].byte() 32 | target["masks"] = mask 33 | return image, target 34 | 35 | 36 | def _resize_image_and_masks(image, self_min_size, self_max_size, target): 37 | # type: (Tensor, float, float, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]] 38 | im_shape = torch.tensor(image.shape[-2:]) 39 | min_size = float(torch.min(im_shape)) 40 | max_size = float(torch.max(im_shape)) 41 | scale_factor = self_min_size / min_size 42 | if max_size * scale_factor > self_max_size: 43 | scale_factor = self_max_size / max_size 44 | image = torch.nn.functional.interpolate( 45 | image[None], scale_factor=scale_factor, mode='bilinear', 46 | align_corners=False)[0] 47 | 48 | if target is None: 49 | return image, target 50 | 51 | if 
"masks" in target: 52 | mask = target["masks"] 53 | mask = F.interpolate(mask[:, None].float(), scale_factor=scale_factor)[:, 0].byte() 54 | target["masks"] = mask 55 | return image, target 56 | 57 | 58 | class STRGTransform(nn.Module): 59 | """ 60 | Performs input / target transformation before feeding the data to a GeneralizedRCNN 61 | model. 62 | 63 | The transformations it perform are: 64 | - input normalization (mean subtraction and std division) 65 | - input / target resizing to match min_size / max_size 66 | 67 | It returns a ImageList for the inputs, and a List[Dict[Tensor]] for the targets 68 | """ 69 | 70 | def __init__(self, min_size, max_size, image_mean, image_std): 71 | super(STRGTransform, self).__init__() 72 | if not isinstance(min_size, (list, tuple)): 73 | min_size = (min_size,) 74 | self.min_size = min_size 75 | self.max_size = max_size 76 | self.image_mean = image_mean 77 | self.image_std = image_std 78 | 79 | def forward(self, 80 | images, # type: List[Tensor] 81 | targets=None # type: Optional[List[Dict[str, Tensor]]] 82 | ): 83 | # type: (...) -> Tuple[ImageList, Optional[List[Dict[str, Tensor]]]] 84 | images = [img for img in images] 85 | if targets is not None: 86 | # make a copy of targets to avoid modifying it in-place 87 | # once torchscript supports dict comprehension 88 | # this can be simplified as as follows 89 | # targets = [{k: v for k,v in t.items()} for t in targets] 90 | targets_copy: List[Dict[str, Tensor]] = [] 91 | for t in targets: 92 | data: Dict[str, Tensor] = {} 93 | for k, v in t.items(): 94 | data[k] = v 95 | targets_copy.append(data) 96 | targets = targets_copy 97 | for i in range(len(images)): 98 | image = images[i] 99 | target_index = targets[i] if targets is not None else None 100 | 101 | if image.dim() != 3: 102 | raise ValueError("images is expected to be a list of 3d tensors " 103 | "of shape [C, H, W], got {}".format(image.shape)) 104 | image = self.normalize(image) 105 | image, target_index = self.resize(image, target_index) 106 | images[i] = image 107 | if targets is not None and target_index is not None: 108 | targets[i] = target_index 109 | 110 | image_sizes = [img.shape[-2:] for img in images] 111 | images = self.batch_images(images) 112 | image_sizes_list = torch.jit.annotate(List[Tuple[int, int]], []) 113 | for image_size in image_sizes: 114 | assert len(image_size) == 2 115 | image_sizes_list.append((image_size[0], image_size[1])) 116 | 117 | image_list = ImageList(images, image_sizes_list) 118 | return image_list, targets 119 | 120 | 121 | def normalize(self, image): 122 | return image 123 | # dtype, device = image.dtype, image.device 124 | # mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device) 125 | # std = torch.as_tensor(self.image_std, dtype=dtype, device=device) 126 | # return (image - mean[:, None, None]) / std[:, None, None] 127 | 128 | def torch_choice(self, k): 129 | # type: (List[int]) -> int 130 | """ 131 | Implements `random.choice` via torch ops so it can be compiled with 132 | TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803 133 | is fixed. 
134 | """ 135 | index = int(torch.empty(1).uniform_(0., float(len(k))).item()) 136 | return k[index] 137 | 138 | def resize(self, image, target): 139 | # type: (Tensor, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]] 140 | h, w = image.shape[-2:] 141 | if self.training: 142 | size = float(self.torch_choice(self.min_size)) 143 | else: 144 | # FIXME assume for now that testing uses the largest scale 145 | size = float(self.min_size[-1]) 146 | if torchvision._is_tracing(): 147 | image, target = _resize_image_and_masks_onnx(image, size, float(self.max_size), target) 148 | else: 149 | image, target = _resize_image_and_masks(image, size, float(self.max_size), target) 150 | 151 | if target is None: 152 | return image, target 153 | 154 | bbox = target["boxes"] 155 | bbox = resize_boxes(bbox, (h, w), image.shape[-2:]) 156 | target["boxes"] = bbox 157 | 158 | if "keypoints" in target: 159 | keypoints = target["keypoints"] 160 | keypoints = resize_keypoints(keypoints, (h, w), image.shape[-2:]) 161 | target["keypoints"] = keypoints 162 | return image, target 163 | 164 | # _onnx_batch_images() is an implementation of 165 | # batch_images() that is supported by ONNX tracing. 166 | @torch.jit.unused 167 | def _onnx_batch_images(self, images, size_divisible=32): 168 | # type: (List[Tensor], int) -> Tensor 169 | max_size = [] 170 | for i in range(images[0].dim()): 171 | max_size_i = torch.max(torch.stack([img.shape[i] for img in images]).to(torch.float32)).to(torch.int64) 172 | max_size.append(max_size_i) 173 | stride = size_divisible 174 | max_size[1] = (torch.ceil((max_size[1].to(torch.float32)) / stride) * stride).to(torch.int64) 175 | max_size[2] = (torch.ceil((max_size[2].to(torch.float32)) / stride) * stride).to(torch.int64) 176 | max_size = tuple(max_size) 177 | 178 | # work around for 179 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 180 | # which is not yet supported in onnx 181 | padded_imgs = [] 182 | for img in images: 183 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 184 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 185 | padded_imgs.append(padded_img) 186 | 187 | return torch.stack(padded_imgs) 188 | 189 | def max_by_axis(self, the_list): 190 | # type: (List[List[int]]) -> List[int] 191 | maxes = the_list[0] 192 | for sublist in the_list[1:]: 193 | for index, item in enumerate(sublist): 194 | maxes[index] = max(maxes[index], item) 195 | return maxes 196 | 197 | def batch_images(self, images, size_divisible=32): 198 | # type: (List[Tensor], int) -> Tensor 199 | if torchvision._is_tracing(): 200 | # batch_images() does not export well to ONNX 201 | # call _onnx_batch_images() instead 202 | return self._onnx_batch_images(images, size_divisible) 203 | 204 | max_size = self.max_by_axis([list(img.shape) for img in images]) 205 | stride = float(size_divisible) 206 | max_size = list(max_size) 207 | max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride) 208 | max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride) 209 | 210 | batch_shape = [len(images)] + max_size 211 | batched_imgs = images[0].new_full(batch_shape, 0) 212 | for img, pad_img in zip(images, batched_imgs): 213 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 214 | 215 | return batched_imgs 216 | 217 | 218 | def rpn_postprocess(self, 219 | result, # type: List[Dict[str, Tensor]] 220 | image_shapes, # type: List[Tuple[int, int]] 221 | original_image_sizes # type: 
List[Tuple[int, int]] 222 | ): 223 | # type: (...) -> List[Dict[str, Tensor]] 224 | if self.training: 225 | return result 226 | new_boxes = [] 227 | for i, (boxes, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)): 228 | boxes = resize_boxes(boxes, im_s, o_im_s) 229 | new_boxes.append(boxes) 230 | return new_boxes 231 | 232 | 233 | 234 | def postprocess(self, 235 | result, # type: List[Dict[str, Tensor]] 236 | image_shapes, # type: List[Tuple[int, int]] 237 | original_image_sizes # type: List[Tuple[int, int]] 238 | ): 239 | # type: (...) -> List[Dict[str, Tensor]] 240 | if self.training: 241 | return result 242 | for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)): 243 | boxes = pred["boxes"] 244 | boxes = resize_boxes(boxes, im_s, o_im_s) 245 | result[i]["boxes"] = boxes 246 | if "masks" in pred: 247 | masks = pred["masks"] 248 | masks = paste_masks_in_image(masks, boxes, o_im_s) 249 | result[i]["masks"] = masks 250 | if "keypoints" in pred: 251 | keypoints = pred["keypoints"] 252 | keypoints = resize_keypoints(keypoints, im_s, o_im_s) 253 | result[i]["keypoints"] = keypoints 254 | return result 255 | 256 | def __repr__(self): 257 | format_string = self.__class__.__name__ + '(' 258 | _indent = '\n ' 259 | format_string += "{0}Normalize(mean={1}, std={2})".format(_indent, self.image_mean, self.image_std) 260 | format_string += "{0}Resize(min_size={1}, max_size={2}, mode='bilinear')".format(_indent, self.min_size, 261 | self.max_size) 262 | format_string += '\n)' 263 | return format_string 264 | 265 | 266 | def resize_keypoints(keypoints, original_size, new_size): 267 | # type: (Tensor, List[int], List[int]) -> Tensor 268 | ratios = [ 269 | torch.tensor(s, dtype=torch.float32, device=keypoints.device) / 270 | torch.tensor(s_orig, dtype=torch.float32, device=keypoints.device) 271 | for s, s_orig in zip(new_size, original_size) 272 | ] 273 | ratio_h, ratio_w = ratios 274 | resized_data = keypoints.clone() 275 | if torch._C._get_tracing_state(): 276 | resized_data_0 = resized_data[:, :, 0] * ratio_w 277 | resized_data_1 = resized_data[:, :, 1] * ratio_h 278 | resized_data = torch.stack((resized_data_0, resized_data_1, resized_data[:, :, 2]), dim=2) 279 | else: 280 | resized_data[..., 0] *= ratio_w 281 | resized_data[..., 1] *= ratio_h 282 | return resized_data 283 | 284 | 285 | def resize_boxes(boxes, original_size, new_size): 286 | # type: (Tensor, List[int], List[int]) -> Tensor 287 | ratios = [ 288 | torch.tensor(s, dtype=torch.float32, device=boxes.device) / 289 | torch.tensor(s_orig, dtype=torch.float32, device=boxes.device) 290 | for s, s_orig in zip(new_size, original_size) 291 | ] 292 | ratio_height, ratio_width = ratios 293 | xmin, ymin, xmax, ymax = boxes.unbind(1) 294 | 295 | xmin = xmin * ratio_width 296 | xmax = xmax * ratio_width 297 | ymin = ymin * ratio_height 298 | ymax = ymax * ratio_height 299 | return torch.stack((xmin, ymin, xmax, ymax), dim=1) 300 | -------------------------------------------------------------------------------- /opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | 5 | def parse_opts(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--root_path', 8 | default=None, 9 | type=Path, 10 | help='Root directory path') 11 | parser.add_argument('--video_path', 12 | default=None, 13 | type=Path, 14 | help='Directory path of videos') 15 | parser.add_argument('--annotation_path', 16 | 
default=None, 17 | type=Path, 18 | help='Annotation file path') 19 | parser.add_argument('--result_path', 20 | default=None, 21 | type=Path, 22 | help='Result directory path') 23 | parser.add_argument( 24 | '--dataset', 25 | default='kinetics', 26 | type=str, 27 | help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)') 28 | parser.add_argument( 29 | '--n_classes', 30 | default=400, 31 | type=int, 32 | help= 33 | 'Number of classes (activitynet: 200, kinetics: 400 or 600, ucf101: 101, hmdb51: 51)' 34 | ) 35 | parser.add_argument('--n_pretrain_classes', 36 | default=0, 37 | type=int, 38 | help=('Number of classes of pretraining task.' 39 | 'When using --pretrain_path, this must be set.')) 40 | parser.add_argument('--pretrain_path', 41 | default=None, 42 | type=Path, 43 | help='Pretrained model path (.pth).') 44 | parser.add_argument( 45 | '--ft_begin_module', 46 | default='', 47 | type=str, 48 | help=('Module name of beginning of fine-tuning' 49 | '(conv1, layer1, fc, denseblock1, classifier, ...).' 50 | 'The default means all layers are fine-tuned.')) 51 | parser.add_argument('--sample_size', 52 | default=224, 53 | type=int, 54 | help='Height and width of inputs') 55 | parser.add_argument('--sample_duration', 56 | default=16, 57 | type=int, 58 | help='Temporal duration of inputs') 59 | parser.add_argument( 60 | '--sample_t_stride', 61 | default=1, 62 | type=int, 63 | help='If larger than 1, input frames are subsampled with the stride.') 64 | parser.add_argument( 65 | '--train_crop', 66 | default='random', 67 | type=str, 68 | help=('Spatial cropping method in training. ' 69 | 'random is uniform. ' 70 | 'corner is selection from 4 corners and 1 center. ' 71 | '(random | corner | center)')) 72 | parser.add_argument('--train_crop_min_scale', 73 | default=0.25, 74 | type=float, 75 | help='Min scale for random cropping in training') 76 | parser.add_argument('--train_crop_min_ratio', 77 | default=0.75, 78 | type=float, 79 | help='Min aspect ratio for random cropping in training') 80 | parser.add_argument('--no_hflip', 81 | action='store_true', 82 | help='If true holizontal flipping is not performed.') 83 | parser.add_argument('--colorjitter', 84 | action='store_true', 85 | help='If true colorjitter is performed.') 86 | parser.add_argument('--train_t_crop', 87 | default='random', 88 | type=str, 89 | help=('Temporal cropping method in training. ' 90 | 'random is uniform. ' 91 | '(random | center)')) 92 | parser.add_argument('--learning_rate', 93 | default=0.1, 94 | type=float, 95 | help=('Initial learning rate' 96 | '(divided by 10 while training by lr scheduler)')) 97 | parser.add_argument('--momentum', default=0.9, type=float, help='Momentum') 98 | parser.add_argument('--dampening', 99 | default=0.0, 100 | type=float, 101 | help='dampening of SGD') 102 | parser.add_argument('--weight_decay', 103 | default=1e-3, 104 | type=float, 105 | help='Weight Decay') 106 | parser.add_argument('--mean_dataset', 107 | default='kinetics', 108 | type=str, 109 | help=('dataset for mean values of mean subtraction' 110 | '(activitynet | kinetics | 0.5)')) 111 | parser.add_argument('--no_mean_norm', 112 | action='store_true', 113 | help='If true, inputs are not normalized by mean.') 114 | parser.add_argument( 115 | '--no_std_norm', 116 | action='store_true', 117 | help='If true, inputs are not normalized by standard deviation.') 118 | parser.add_argument( 119 | '--value_scale', 120 | default=1, 121 | type=int, 122 | help= 123 | 'If 1, range of inputs is [0-1]. 
If 255, range of inputs is [0-255].') 124 | parser.add_argument('--nesterov', 125 | action='store_true', 126 | help='Nesterov momentum') 127 | parser.add_argument('--optimizer', 128 | default='sgd', 129 | type=str, 130 | help='Currently only support SGD') 131 | parser.add_argument('--lr_scheduler', 132 | default='multistep', 133 | type=str, 134 | help='Type of LR scheduler (multistep | plateau)') 135 | parser.add_argument( 136 | '--multistep_milestones', 137 | default=[50, 100, 150], 138 | type=int, 139 | nargs='+', 140 | help='Milestones of LR scheduler. See documentation of MultistepLR.') 141 | parser.add_argument( 142 | '--overwrite_milestones', 143 | action='store_true', 144 | help='If true, overwriting multistep_milestones when resuming training.' 145 | ) 146 | parser.add_argument( 147 | '--plateau_patience', 148 | default=10, 149 | type=int, 150 | help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.' 151 | ) 152 | parser.add_argument('--batch_size', 153 | default=128, 154 | type=int, 155 | help='Batch Size') 156 | parser.add_argument( 157 | '--inference_batch_size', 158 | default=0, 159 | type=int, 160 | help='Batch Size for inference. 0 means this is the same as batch_size.' 161 | ) 162 | parser.add_argument( 163 | '--batchnorm_sync', 164 | action='store_true', 165 | help='If true, SyncBatchNorm is used instead of BatchNorm.') 166 | parser.add_argument('--n_epochs', 167 | default=200, 168 | type=int, 169 | help='Number of total epochs to run') 170 | parser.add_argument('--n_val_samples', 171 | default=3, 172 | type=int, 173 | help='Number of validation samples for each activity') 174 | parser.add_argument('--resume_path', 175 | default=None, 176 | type=Path, 177 | help='Save data (.pth) of previous training') 178 | parser.add_argument('--no_train', 179 | action='store_true', 180 | help='If true, training is not performed.') 181 | parser.add_argument('--no_val', 182 | action='store_true', 183 | help='If true, validation is not performed.') 184 | parser.add_argument('--inference', 185 | action='store_true', 186 | help='If true, inference is performed.') 187 | parser.add_argument('--inference_subset', 188 | default='val', 189 | type=str, 190 | help='Used subset in inference (train | val | test)') 191 | parser.add_argument('--inference_stride', 192 | default=16, 193 | type=int, 194 | help='Stride of sliding window in inference.') 195 | parser.add_argument( 196 | '--inference_crop', 197 | default='center', 198 | type=str, 199 | help=('Cropping method in inference. 
(center | nocrop)' 200 | 'When nocrop, fully convolutional inference is performed,' 201 | 'and mini-batch consists of clips of one video.')) 202 | parser.add_argument( 203 | '--inference_no_average', 204 | action='store_true', 205 | help='If true, outputs for segments in a video are not averaged.') 206 | parser.add_argument('--no_cuda', 207 | action='store_true', 208 | help='If true, cuda is not used.') 209 | parser.add_argument('--n_threads', 210 | default=1, 211 | type=int, 212 | help='Number of threads for multi-thread loading') 213 | parser.add_argument('--checkpoint', 214 | default=10, 215 | type=int, 216 | help='Trained model is saved at every this epochs.') 217 | parser.add_argument( 218 | '--model', 219 | default='resnet', 220 | type=str, 221 | help= 222 | '(resnet | resnet2p1d | preresnet | wideresnet | resnext | densenet | ') 223 | parser.add_argument('--model_depth', 224 | default=18, 225 | type=int, 226 | help='Depth of resnet (10 | 18 | 34 | 50 | 101)') 227 | parser.add_argument('--conv1_t_size', 228 | default=7, 229 | type=int, 230 | help='Kernel size in t dim of conv1.') 231 | parser.add_argument('--conv1_t_stride', 232 | default=1, 233 | type=int, 234 | help='Stride in t dim of conv1.') 235 | parser.add_argument('--no_max_pool', 236 | action='store_true', 237 | help='If true, the max pooling after conv1 is removed.') 238 | parser.add_argument('--resnet_shortcut', 239 | default='B', 240 | type=str, 241 | help='Shortcut type of resnet (A | B)') 242 | parser.add_argument( 243 | '--resnet_widen_factor', 244 | default=1.0, 245 | type=float, 246 | help='The number of feature maps of resnet is multiplied by this value') 247 | parser.add_argument('--wide_resnet_k', 248 | default=2, 249 | type=int, 250 | help='Wide resnet k') 251 | parser.add_argument('--resnext_cardinality', 252 | default=32, 253 | type=int, 254 | help='ResNeXt cardinality') 255 | parser.add_argument('--input_type', 256 | default='rgb', 257 | type=str, 258 | help='(rgb | flow)') 259 | parser.add_argument('--manual_seed', 260 | default=1, 261 | type=int, 262 | help='Manually set random seed') 263 | parser.add_argument('--accimage', 264 | action='store_true', 265 | help='If true, accimage is used to load images.') 266 | parser.add_argument('--output_topk', 267 | default=5, 268 | type=int, 269 | help='Top-k scores are saved in json file.') 270 | parser.add_argument('--file_type', 271 | default='jpg', 272 | type=str, 273 | help='(jpg | hdf5)') 274 | parser.add_argument('--tensorboard', 275 | action='store_true', 276 | help='If true, output tensorboard log file.') 277 | parser.add_argument( 278 | '--distributed', 279 | action='store_true', 280 | help='Use multi-processing distributed training to launch ' 281 | 'N processes per node, which has N GPUs.') 282 | parser.add_argument('--dist_url', 283 | default='tcp://127.0.0.1:23456', 284 | type=str, 285 | help='url used to set up distributed training') 286 | parser.add_argument('--world_size', 287 | default=-1, 288 | type=int, 289 | help='number of nodes for distributed training') 290 | 291 | parser.add_argument('--wandb', 292 | action='store_true', 293 | help='Use wandb.') 294 | parser.add_argument('--strg', 295 | action='store_true', 296 | help='Use STRG.') 297 | 298 | parser.add_argument('--det_interval', 299 | default=2, 300 | type=int, 301 | help='Detection Interval which should be aligned with' 302 | 'backbone architecture.') 303 | 304 | parser.add_argument('--nrois', 305 | default=10, 306 | type=int, 307 | help='The number of rois') 308 | 309 | args = 
parser.parse_args() 310 | 311 | return args 312 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import json 3 | import random 4 | import os 5 | import pdb 6 | import numpy as np 7 | import wandb 8 | import torch 9 | from torch.nn import CrossEntropyLoss 10 | from torch.optim import SGD, lr_scheduler 11 | import torch.multiprocessing as mp 12 | import torch.distributed as dist 13 | from torch.backends import cudnn 14 | import torchvision 15 | from torchvision.models.detection import fasterrcnn_resnet50_fpn 16 | 17 | from opts import parse_opts 18 | from model import (generate_model, load_pretrained_model, make_data_parallel, 19 | get_fine_tuning_parameters) 20 | from strg import STRG 21 | from mean import get_mean_std 22 | from spatial_transforms import (Compose, Normalize, Resize, CenterCrop, 23 | CornerCrop, MultiScaleCornerCrop, 24 | RandomResizedCrop, RandomHorizontalFlip, 25 | ToTensor, ScaleValue, ColorJitter, 26 | PickFirstChannels) 27 | from temporal_transforms import (LoopPadding, TemporalRandomCrop, 28 | TemporalCenterCrop, TemporalEvenCrop, 29 | SlidingWindow, TemporalSubsampling) 30 | from temporal_transforms import Compose as TemporalCompose 31 | from dataset import get_training_data, get_validation_data, get_inference_data 32 | from utils import Logger, worker_init_fn, get_lr 33 | from training import train_epoch 34 | from validation import val_epoch 35 | import inference 36 | 37 | from rpn import RPN 38 | 39 | 40 | def json_serial(obj): 41 | if isinstance(obj, Path): 42 | return str(obj) 43 | 44 | 45 | def get_opt(): 46 | opt = parse_opts() 47 | 48 | if opt.root_path is not None: 49 | opt.video_path = opt.root_path / opt.video_path 50 | opt.annotation_path = opt.root_path / opt.annotation_path 51 | opt.result_path = opt.root_path / opt.result_path 52 | if opt.resume_path is not None: 53 | opt.resume_path = opt.root_path / opt.resume_path 54 | if opt.pretrain_path is not None: 55 | opt.pretrain_path = opt.root_path / opt.pretrain_path 56 | 57 | if opt.pretrain_path is not None: 58 | opt.n_finetune_classes = opt.n_classes 59 | opt.n_classes = opt.n_pretrain_classes 60 | 61 | if opt.output_topk <= 0: 62 | opt.output_topk = opt.n_classes 63 | 64 | if opt.inference_batch_size == 0: 65 | opt.inference_batch_size = opt.batch_size 66 | 67 | opt.arch = '{}-{}'.format(opt.model, opt.model_depth) 68 | opt.begin_epoch = 1 69 | opt.mean, opt.std = get_mean_std(opt.value_scale, dataset=opt.mean_dataset) 70 | opt.n_input_channels = 3 71 | if opt.input_type == 'flow': 72 | opt.n_input_channels = 2 73 | opt.mean = opt.mean[:2] 74 | opt.std = opt.std[:2] 75 | 76 | if opt.distributed: 77 | opt.dist_rank = 0 #int(os.environ["OMPI_COMM_WORLD_RANK"]) 78 | 79 | if opt.dist_rank == 0: 80 | print(opt) 81 | with (opt.result_path / 'opts.json').open('w') as opt_file: 82 | json.dump(vars(opt), opt_file, default=json_serial) 83 | else: 84 | print(opt) 85 | with (opt.result_path / 'opts.json').open('w') as opt_file: 86 | json.dump(vars(opt), opt_file, default=json_serial) 87 | 88 | return opt 89 | 90 | 91 | def resume_model(resume_path, arch, model): 92 | print('loading checkpoint {} model'.format(resume_path)) 93 | checkpoint = torch.load(resume_path, map_location='cpu') 94 | assert arch == checkpoint['arch'] 95 | 96 | if hasattr(model, 'module'): 97 | model.module.load_state_dict(checkpoint['state_dict']) 98 | else: 99 | 
model.load_state_dict(checkpoint['state_dict']) 100 | 101 | return model 102 | 103 | 104 | def resume_train_utils(resume_path, begin_epoch, optimizer, scheduler): 105 | print('loading checkpoint {} train utils'.format(resume_path)) 106 | checkpoint = torch.load(resume_path, map_location='cpu') 107 | 108 | begin_epoch = checkpoint['epoch'] + 1 109 | if optimizer is not None and 'optimizer' in checkpoint: 110 | optimizer.load_state_dict(checkpoint['optimizer']) 111 | if scheduler is not None and 'scheduler' in checkpoint: 112 | scheduler.load_state_dict(checkpoint['scheduler']) 113 | 114 | return begin_epoch, optimizer, scheduler 115 | 116 | 117 | def get_normalize_method(mean, std, no_mean_norm, no_std_norm): 118 | if no_mean_norm: 119 | if no_std_norm: 120 | return Normalize([0, 0, 0], [1, 1, 1]) 121 | else: 122 | return Normalize([0, 0, 0], std) 123 | else: 124 | if no_std_norm: 125 | return Normalize(mean, [1, 1, 1]) 126 | else: 127 | return Normalize(mean, std) 128 | 129 | 130 | def get_train_utils(opt, model_parameters): 131 | assert opt.train_crop in ['random', 'corner', 'center'] 132 | spatial_transform = [] 133 | if opt.train_crop == 'random': 134 | spatial_transform.append( 135 | RandomResizedCrop( 136 | opt.sample_size, (opt.train_crop_min_scale, 1.0), 137 | (opt.train_crop_min_ratio, 1.0 / opt.train_crop_min_ratio))) 138 | elif opt.train_crop == 'corner': 139 | scales = [1.0] 140 | scale_step = 1 / (2**(1 / 4)) 141 | for _ in range(1, 5): 142 | scales.append(scales[-1] * scale_step) 143 | spatial_transform.append(MultiScaleCornerCrop(opt.sample_size, scales)) 144 | elif opt.train_crop == 'center': 145 | spatial_transform.append(Resize(opt.sample_size)) 146 | spatial_transform.append(CenterCrop(opt.sample_size)) 147 | normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm, 148 | opt.no_std_norm) 149 | if not opt.no_hflip: 150 | spatial_transform.append(RandomHorizontalFlip()) 151 | if opt.colorjitter: 152 | spatial_transform.append(ColorJitter()) 153 | spatial_transform.append(ToTensor()) 154 | if opt.input_type == 'flow': 155 | spatial_transform.append(PickFirstChannels(n=2)) 156 | spatial_transform.append(ScaleValue(opt.value_scale)) 157 | spatial_transform.append(normalize) 158 | spatial_transform = Compose(spatial_transform) 159 | 160 | assert opt.train_t_crop in ['random', 'center'] 161 | temporal_transform = [] 162 | if opt.sample_t_stride > 1: 163 | temporal_transform.append(TemporalSubsampling(opt.sample_t_stride)) 164 | if opt.train_t_crop == 'random': 165 | temporal_transform.append(TemporalRandomCrop(opt.sample_duration)) 166 | elif opt.train_t_crop == 'center': 167 | temporal_transform.append(TemporalCenterCrop(opt.sample_duration)) 168 | temporal_transform = TemporalCompose(temporal_transform) 169 | 170 | train_data = get_training_data(opt.video_path, opt.annotation_path, 171 | opt.dataset, opt.input_type, opt.file_type, 172 | spatial_transform, temporal_transform) 173 | if opt.distributed: 174 | train_sampler = torch.utils.data.distributed.DistributedSampler( 175 | train_data) 176 | else: 177 | train_sampler = None 178 | train_loader = torch.utils.data.DataLoader(train_data, 179 | batch_size=opt.batch_size, 180 | shuffle=(train_sampler is None), 181 | num_workers=opt.n_threads, 182 | pin_memory=True, 183 | sampler=train_sampler, 184 | worker_init_fn=worker_init_fn) 185 | 186 | if opt.is_master_node: 187 | train_logger = Logger(opt.result_path / 'train.log', 188 | ['epoch', 'loss', 'acc', 'lr']) 189 | train_batch_logger = Logger( 190 | opt.result_path / 
'train_batch.log', 191 | ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr']) 192 | else: 193 | train_logger = None 194 | train_batch_logger = None 195 | 196 | if opt.nesterov: 197 | dampening = 0 198 | else: 199 | dampening = opt.dampening 200 | optimizer = SGD(model_parameters, 201 | lr=opt.learning_rate, 202 | momentum=opt.momentum, 203 | dampening=dampening, 204 | weight_decay=opt.weight_decay, 205 | nesterov=opt.nesterov) 206 | 207 | assert opt.lr_scheduler in ['plateau', 'multistep'] 208 | assert not (opt.lr_scheduler == 'plateau' and opt.no_val) 209 | if opt.lr_scheduler == 'plateau': 210 | scheduler = lr_scheduler.ReduceLROnPlateau( 211 | optimizer, 'min', patience=opt.plateau_patience) 212 | else: 213 | scheduler = lr_scheduler.MultiStepLR(optimizer, 214 | opt.multistep_milestones) 215 | 216 | return (train_loader, train_sampler, train_logger, train_batch_logger, 217 | optimizer, scheduler) 218 | 219 | 220 | def get_val_utils(opt): 221 | normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm, 222 | opt.no_std_norm) 223 | spatial_transform = [ 224 | Resize(opt.sample_size), 225 | CenterCrop(opt.sample_size), 226 | ToTensor() 227 | ] 228 | if opt.input_type == 'flow': 229 | spatial_transform.append(PickFirstChannels(n=2)) 230 | spatial_transform.extend([ScaleValue(opt.value_scale), normalize]) 231 | spatial_transform = Compose(spatial_transform) 232 | 233 | temporal_transform = [] 234 | if opt.sample_t_stride > 1: 235 | temporal_transform.append(TemporalSubsampling(opt.sample_t_stride)) 236 | temporal_transform.append( 237 | TemporalEvenCrop(opt.sample_duration, opt.n_val_samples)) 238 | temporal_transform = TemporalCompose(temporal_transform) 239 | 240 | val_data, collate_fn = get_validation_data(opt.video_path, 241 | opt.annotation_path, opt.dataset, 242 | opt.input_type, opt.file_type, 243 | spatial_transform, 244 | temporal_transform) 245 | if opt.distributed: 246 | val_sampler = torch.utils.data.distributed.DistributedSampler( 247 | val_data, shuffle=False) 248 | else: 249 | val_sampler = None 250 | val_loader = torch.utils.data.DataLoader(val_data, 251 | # batch_size=opt.batch_size, 252 | batch_size=(opt.batch_size // 253 | opt.n_val_samples), 254 | shuffle=False, 255 | num_workers=opt.n_threads, 256 | pin_memory=True, 257 | sampler=val_sampler, 258 | worker_init_fn=worker_init_fn, 259 | collate_fn=collate_fn) 260 | 261 | if opt.is_master_node: 262 | val_logger = Logger(opt.result_path / 'val.log', 263 | ['epoch', 'loss', 'acc']) 264 | else: 265 | val_logger = None 266 | 267 | return val_loader, val_logger 268 | 269 | 270 | def get_inference_utils(opt): 271 | assert opt.inference_crop in ['center', 'nocrop'] 272 | 273 | normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm, 274 | opt.no_std_norm) 275 | 276 | spatial_transform = [Resize(opt.sample_size)] 277 | if opt.inference_crop == 'center': 278 | spatial_transform.append(CenterCrop(opt.sample_size)) 279 | spatial_transform.append(ToTensor()) 280 | if opt.input_type == 'flow': 281 | spatial_transform.append(PickFirstChannels(n=2)) 282 | spatial_transform.extend([ScaleValue(opt.value_scale), normalize]) 283 | spatial_transform = Compose(spatial_transform) 284 | 285 | temporal_transform = [] 286 | if opt.sample_t_stride > 1: 287 | temporal_transform.append(TemporalSubsampling(opt.sample_t_stride)) 288 | temporal_transform.append( 289 | SlidingWindow(opt.sample_duration, opt.inference_stride)) 290 | temporal_transform = TemporalCompose(temporal_transform) 291 | 292 | inference_data, collate_fn = 
get_inference_data( 293 | opt.video_path, opt.annotation_path, opt.dataset, opt.input_type, 294 | opt.file_type, opt.inference_subset, spatial_transform, 295 | temporal_transform) 296 | 297 | inference_loader = torch.utils.data.DataLoader( 298 | inference_data, 299 | batch_size=opt.inference_batch_size, 300 | shuffle=False, 301 | num_workers=opt.n_threads, 302 | pin_memory=True, 303 | worker_init_fn=worker_init_fn, 304 | collate_fn=collate_fn) 305 | 306 | return inference_loader, inference_data.class_names 307 | 308 | 309 | def save_checkpoint(save_file_path, epoch, arch, model, optimizer, scheduler): 310 | if hasattr(model, 'module'): 311 | model_state_dict = model.module.state_dict() 312 | else: 313 | model_state_dict = model.state_dict() 314 | save_states = { 315 | 'epoch': epoch, 316 | 'arch': arch, 317 | 'state_dict': model_state_dict, 318 | 'optimizer': optimizer.state_dict(), 319 | 'scheduler': scheduler.state_dict() 320 | } 321 | torch.save(save_states, save_file_path) 322 | 323 | 324 | def main_worker(index, opt): 325 | random.seed(opt.manual_seed) 326 | np.random.seed(opt.manual_seed) 327 | torch.manual_seed(opt.manual_seed) 328 | 329 | if index >= 0 and opt.device.type == 'cuda': 330 | # opt.device = torch.device(f'cuda:{index}') 331 | opt.device = torch.device('cuda:{}'.format(index)) 332 | 333 | if opt.distributed: 334 | opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index 335 | dist.init_process_group(backend='nccl', 336 | init_method=opt.dist_url, 337 | world_size=opt.world_size, 338 | rank=opt.dist_rank) 339 | opt.batch_size = int(opt.batch_size / opt.ngpus_per_node) 340 | opt.n_threads = int( 341 | (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node) 342 | opt.is_master_node = not opt.distributed or opt.dist_rank == 0 343 | 344 | model = generate_model(opt) 345 | if opt.batchnorm_sync: 346 | assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.' 
347 | model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) 348 | if opt.pretrain_path: 349 | model = load_pretrained_model(model, opt.pretrain_path, opt.model, 350 | opt.n_finetune_classes, opt.strg) 351 | 352 | if opt.strg: 353 | model = STRG(model, nclass=opt.n_classes, nrois=opt.nrois) 354 | rpn = RPN(nrois=opt.nrois) 355 | rpn = make_data_parallel(rpn, opt.distributed, opt.device) 356 | else: 357 | rpn = None 358 | 359 | if opt.resume_path is not None: 360 | model = resume_model(opt.resume_path, opt.arch, model) 361 | 362 | model = make_data_parallel(model, opt.distributed, opt.device) 363 | 364 | # if opt.pretrain_path: 365 | # parameters = get_fine_tuning_parameters(model, opt.ft_begin_module) 366 | # else: 367 | parameters = model.parameters() 368 | 369 | if opt.is_master_node: 370 | print(model) 371 | 372 | criterion = CrossEntropyLoss().to(opt.device) 373 | 374 | if not opt.no_train: 375 | (train_loader, train_sampler, train_logger, train_batch_logger, 376 | optimizer, scheduler) = get_train_utils(opt, parameters) 377 | if opt.resume_path is not None: 378 | opt.begin_epoch, optimizer, scheduler = resume_train_utils( 379 | opt.resume_path, opt.begin_epoch, optimizer, scheduler) 380 | if opt.overwrite_milestones: 381 | scheduler.milestones = opt.multistep_milestones 382 | if not opt.no_val: 383 | val_loader, val_logger = get_val_utils(opt) 384 | 385 | if opt.tensorboard and opt.is_master_node: 386 | #from torch.utils.tensorboard import SummaryWriter 387 | from tensorboardX import SummaryWriter 388 | if opt.begin_epoch == 1: 389 | tb_writer = SummaryWriter(log_dir=opt.result_path) 390 | else: 391 | tb_writer = SummaryWriter(log_dir=opt.result_path, 392 | purge_step=opt.begin_epoch) 393 | else: 394 | tb_writer = None 395 | 396 | if opt.wandb: 397 | name = str(opt.result_path) 398 | wandb.init( 399 | project='strg', 400 | name=name, 401 | config=opt, 402 | dir= name, 403 | # resume=str(opt.resume_path) != '', 404 | sync_tensorboard=True) 405 | 406 | 407 | 408 | prev_val_loss = None 409 | for i in range(opt.begin_epoch, opt.n_epochs + 1): 410 | if not opt.no_train: 411 | if opt.distributed: 412 | train_sampler.set_epoch(i) 413 | current_lr = get_lr(optimizer) 414 | train_epoch(i, train_loader, model, criterion, optimizer, 415 | opt.device, current_lr, train_logger, 416 | train_batch_logger, tb_writer, opt.distributed,rpn=rpn, 417 | det_interval=opt.det_interval, nrois=opt.nrois) 418 | 419 | if i % opt.checkpoint == 0 and opt.is_master_node: 420 | save_file_path = opt.result_path / 'save_{}.pth'.format(i) 421 | save_checkpoint(save_file_path, i, opt.arch, model, optimizer, 422 | scheduler) 423 | 424 | if not opt.no_val: 425 | prev_val_loss = val_epoch(i, val_loader, model, criterion, 426 | opt.device, val_logger, tb_writer, 427 | opt.distributed, rpn=rpn, 428 | det_interval=opt.det_interval, nrois=opt.nrois) 429 | 430 | if not opt.no_train and opt.lr_scheduler == 'multistep': 431 | scheduler.step() 432 | elif not opt.no_train and opt.lr_scheduler == 'plateau': 433 | scheduler.step(prev_val_loss) 434 | 435 | if opt.inference: 436 | inference_loader, inference_class_names = get_inference_utils(opt) 437 | inference_result_path = opt.result_path / '{}.json'.format( 438 | opt.inference_subset) 439 | 440 | inference.inference(inference_loader, model, inference_result_path, 441 | inference_class_names, opt.inference_no_average, 442 | opt.output_topk) 443 | 444 | 445 | if __name__ == '__main__': 446 | opt = get_opt() 447 | 448 | opt.device = torch.device('cpu' if opt.no_cuda else 'cuda') 
449 | if not opt.no_cuda: 450 | cudnn.benchmark = True 451 | if opt.accimage: 452 | torchvision.set_image_backend('accimage') 453 | 454 | opt.ngpus_per_node = torch.cuda.device_count() 455 | if opt.distributed: 456 | opt.world_size = opt.ngpus_per_node * opt.world_size 457 | mp.spawn(main_worker, nprocs=opt.ngpus_per_node, args=(opt,)) 458 | else: 459 | main_worker(-1, opt) 460 | --------------------------------------------------------------------------------
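Note: the following is a minimal sketch, not a file in this repository, of how a checkpoint written by save_checkpoint() in main.py can be reloaded for offline use. The script name and the checkpoint path are illustrative only, and it assumes the defaults from parse_opts(), plus the two fields derived below, give generate_model() everything it needs.

# checkpoint_demo.py -- hypothetical helper, not part of the repository.
# Rebuilds the model from the parsed options and restores weights the same
# way resume_model() in main.py does.
import torch

from opts import parse_opts
from model import generate_model

opt = parse_opts()
# get_opt() in main.py derives these fields before building the model;
# they are mirrored here because parse_opts() alone does not set them.
opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
opt.n_input_channels = 2 if opt.input_type == 'flow' else 3

model = generate_model(opt)

# save_checkpoint() writes files named save_<epoch>.pth into opt.result_path;
# the concrete path below is only an example.
checkpoint = torch.load('results/save_200.pth', map_location='cpu')
assert opt.arch == checkpoint['arch']  # same guard as resume_model()

# save_checkpoint() stores the state dict without the DataParallel
# 'module.' prefix, so it loads directly into the bare model.
model.load_state_dict(checkpoint['state_dict'])
model.eval()
print('restored weights from epoch', checkpoint['epoch'])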