├── imgs ├── crnn.png ├── visualization.png └── crnn_twostream.png ├── requirements.txt ├── frame_video_level_cnn ├── resnet101_video │ ├── plot_video1.png │ ├── plot_video2.png │ ├── plot_video3.png │ ├── plot_video4.png │ ├── Data.py │ ├── Testplot.py │ └── Test.py ├── README.md ├── resnet101_frame │ ├── TrainConfig.py │ ├── Data.py │ ├── Model.py │ ├── Test.py │ └── Train.py ├── vgg16bn_frame │ ├── TrainConfig.py │ ├── Data.py │ ├── Model.py │ ├── Test.py │ └── Train.py └── data_prepare.py ├── conf ├── cnn_flow.py ├── lstm_flow.py └── lstm_rgb.py ├── test_combined.py ├── test_cnn_flow.py ├── test_lstm_flow.py ├── test_lstm_rgb.py ├── data.py ├── README.md ├── train_cnn_flow.py ├── train_lstm_flow.py ├── train_lstm_rgb.py └── networks.py /imgs/crnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/imgs/crnn.png -------------------------------------------------------------------------------- /imgs/visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/imgs/visualization.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | tqdm 3 | torchvision==0.7.0 4 | torch==1.6.0 5 | Pillow 6 | scikit_learn 7 | -------------------------------------------------------------------------------- /imgs/crnn_twostream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/imgs/crnn_twostream.png -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/plot_video1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/frame_video_level_cnn/resnet101_video/plot_video1.png -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/plot_video2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/frame_video_level_cnn/resnet101_video/plot_video2.png -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/plot_video3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/frame_video_level_cnn/resnet101_video/plot_video3.png -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/plot_video4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/frame_video_level_cnn/resnet101_video/plot_video4.png -------------------------------------------------------------------------------- /frame_video_level_cnn/README.md: -------------------------------------------------------------------------------- 1 | # Traffic Accident Detection via Deep Learning 2 | 3 | 4 | ## ResNet101, VGG16_BN 5 | 6 | Code in folder `resnet101_frame`: A fixed ResNet101 Conv2d CNN + 2 MLP layers 7 | 8 | Code in folder `vgg16bn_frame`: 
A fixed VGG16_bn Conv2d CNN + 2 MLP layers 9 | 10 | To run the training code: 11 | ``` 12 | python Train.py 13 | ``` 14 | 15 | To run the test code: 16 | ``` 17 | python Test.py 18 | ``` 19 | 20 | ## Video-level ResNet101 21 | 22 | Code in folder `resnet101_video` 23 | 24 | ### Video-level Prediction 25 | No training is needed; just run the test script: 26 | ``` 27 | python Test.py 28 | ``` 29 | 30 | ### Visualization 31 | This program generates and saves video-level plots of 4 video clips. Selected videos should be saved in folders 1, 2, 3, and 4 before running this script. 32 | 33 | ``` 34 | python Testplot.py 35 | ``` -------------------------------------------------------------------------------- /conf/cnn_flow.py: -------------------------------------------------------------------------------- 1 | configs = dict() 2 | configs['net'] = dict() 3 | 4 | # whether to resume 5 | configs['resume'] = False 6 | configs['ckpt_path'] = 'exp/xx/m_epochxx.pt' 7 | configs['new_lr'] = 1e-4 # if resume is True and new_lr is not None, then use this new learning rate 8 | 9 | # for network 10 | configs['net']['cnn_type'] = 'resnet101' # 'resnet50', 'resnet101', 'resnet152' 11 | configs['net']['n_frames'] = 10 # number of frames 12 | 13 | # for training and validation 14 | configs['lr'] = 1e-3 15 | configs['weight_decay'] = 1e-6 16 | configs['n_epochs'] = 100 17 | 18 | configs['save_dir'] = 'exp' 19 | configs['device'] = 'cuda' 20 | configs['n_workers'] = 4 21 | configs['image_size'] = 224 # for resnet 22 | configs['train_batch_size'] = 64 23 | configs['data_root'] = '/home/ubuntu/data_flow' # path to folders of optical flow images 24 | configs['train_list'] = 'dataset/train.txt' 25 | configs['apply_val'] = True 26 | configs['val_batch_size'] = 32 27 | configs['val_list'] = 'dataset/val.txt' 28 | configs['val_display_interval'] = 5 29 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_frame/TrainConfig.py: -------------------------------------------------------------------------------- 1 | configs = dict() 2 | configs['net'] = dict() 3 | 4 | # whether to resume 5 | configs['resume'] = False 6 | configs['ckpt_path'] = 'exp/xxxx/m_epochxx.pt' 7 | configs['new_lr'] = 1e-3 # if resume is True and new_lr is not None, then use this new learning rate 8 | 9 | # for network 10 | configs['net']['cnn_type'] = 'resnet101' # 'alexnet' 11 | configs['net']['hidden_sizes'] = [256, 128] 12 | configs['net']['batchnorms'] = [True, True, True] 13 | configs['net']['dropouts'] = [0.3, 0.2, 0.1] # the first is dropout for input 14 | 15 | # for training and validation 16 | configs['lr'] = 1e-2 17 | configs['weight_decay'] = 1e-6 18 | configs['n_epochs'] = 100 19 | 20 | configs['save_dir'] = 'exp' 21 | configs['device'] = 'cuda' 22 | configs['n_workers'] = 4 23 | configs['train_batch_size'] = 64 #1024 24 | configs['image_size'] = 224 # for resnet 25 | configs['data_root'] = '/home/ubuntu/project/data' 26 | configs['train_list'] = 'trainlist_reduced.txt' 27 | configs['apply_val'] = True 28 | configs['val_batch_size'] = 128 29 | configs['val_list'] = 'val.txt' 30 | configs['val_display_interval'] = 5 31 | -------------------------------------------------------------------------------- /frame_video_level_cnn/vgg16bn_frame/TrainConfig.py: -------------------------------------------------------------------------------- 1 | configs = dict() 2 | configs['net'] = dict() 3 | 4 | # whether to resume 5 | configs['resume'] = False 6 | configs['ckpt_path'] = 'exp/xxxx/m_epochxx.pt' 7 |
configs['new_lr'] = 1e-3 # if resume is True and new_lr is not None, then use this new learning rate 8 | 9 | # for network 10 | configs['net']['cnn_type'] = 'vgg16_bn' # 'alexnet' 11 | configs['net']['hidden_sizes'] = [256, 128] 12 | configs['net']['batchnorms'] = [True, True, True] 13 | configs['net']['dropouts'] = [0.3, 0.2, 0.1] # the first is dropout for input 14 | 15 | # for training and validation 16 | configs['lr'] = 1e-2 17 | configs['weight_decay'] = 1e-6 18 | configs['n_epochs'] = 100 19 | 20 | configs['save_dir'] = 'exp' 21 | configs['device'] = 'cuda' 22 | configs['n_workers'] = 4 23 | configs['train_batch_size'] = 64 #1024 24 | configs['image_size'] = 224 # for resnet 25 | configs['data_root'] = '/home/ubuntu/project/data' 26 | configs['train_list'] = 'trainlist_reduced.txt' 27 | configs['apply_val'] = True 28 | configs['val_batch_size'] = 128 29 | configs['val_list'] = 'val.txt' 30 | configs['val_display_interval'] = 5 31 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_frame/Data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import torch 4 | import numpy as np 5 | from PIL import Image 6 | from tqdm import tqdm 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class VideoFrameDataset(Dataset): 11 | def __init__(self, root, video_list_path, transform): 12 | self.transform = transform 13 | 14 | with open(video_list_path, 'r') as fp: 15 | valid_videos = [line.rstrip().split()[0].split('/')[-1] for line in fp.readlines()] 16 | 17 | all_jpgs = sorted(glob.glob(root.rstrip('/') + '/*/*.jpg')) 18 | valid_jpgs = [] 19 | for name in all_jpgs: 20 | if name.split('/')[-2] in valid_videos: 21 | valid_jpgs.append(name) 22 | 23 | self.images = valid_jpgs 24 | self.labels = [int(n.split('/')[-2].split('-')[-1] == '1') for n in valid_jpgs] 25 | 26 | tqdm.write(f'There are {len(self.images)} images, {sum(self.labels)} are anomaly.') 27 | 28 | def __len__(self): 29 | return len(self.images) 30 | 31 | def __getitem__(self, index): 32 | img = self.transform(Image.open(self.images[index])) 33 | return img, self.labels[index] 34 | -------------------------------------------------------------------------------- /frame_video_level_cnn/vgg16bn_frame/Data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import torch 4 | import numpy as np 5 | from PIL import Image 6 | from tqdm import tqdm 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class VideoFrameDataset(Dataset): 11 | def __init__(self, root, video_list_path, transform): 12 | self.transform = transform 13 | 14 | with open(video_list_path, 'r') as fp: 15 | valid_videos = [line.rstrip().split()[0].split('/')[-1] for line in fp.readlines()] 16 | 17 | all_jpgs = sorted(glob.glob(root.rstrip('/') + '/*/*.jpg')) 18 | valid_jpgs = [] 19 | for name in all_jpgs: 20 | if name.split('/')[-2] in valid_videos: 21 | valid_jpgs.append(name) 22 | 23 | self.images = valid_jpgs 24 | self.labels = [int(n.split('/')[-2].split('-')[-1] == '1') for n in valid_jpgs] 25 | 26 | tqdm.write(f'There are {len(self.images)} images, {sum(self.labels)} are anomaly.') 27 | 28 | def __len__(self): 29 | return len(self.images) 30 | 31 | def __getitem__(self, index): 32 | img = self.transform(Image.open(self.images[index])) 33 | return img, self.labels[index] 34 | -------------------------------------------------------------------------------- /conf/lstm_flow.py: 
-------------------------------------------------------------------------------- 1 | configs = dict() 2 | configs['net'] = dict() 3 | 4 | # whether to resume 5 | configs['resume'] = False 6 | configs['ckpt_path'] = 'exp/xx/m_epochxx.pt' 7 | configs['new_lr'] = 1e-5 # if resume is True and new_lr is not None, then use this new learning rate 8 | 9 | # for network 10 | configs['net']['n_frames'] = 10 11 | configs['net']['cnn_type'] = 'resnet101' # 'resnet50', 'resnet101', 'resnet152' 12 | configs['net']['cnn_emb_dim'] = 256 13 | configs['net']['cnn_dropout'] = 0.5 14 | configs['net']['num_rnn_layers'] = 1 15 | configs['net']['rnn_bidir'] = False # bidirectional or unidirectional 16 | configs['net']['rnn_hidden_size'] = 256 17 | configs['net']['rnn_dropout'] = 0.3 18 | 19 | # for training and validation 20 | configs['lr'] = 1e-4 21 | configs['weight_decay'] = 1e-6 22 | configs['n_epochs'] = 100 23 | 24 | configs['save_dir'] = 'exp' 25 | configs['device'] = 'cuda' 26 | configs['n_workers'] = 4 27 | configs['image_size'] = 224 # for resnet 28 | configs['data_root'] = '/home/ubuntu/data_flow' # path to folders of optical flows 29 | configs['train_batch_size'] = 8 30 | configs['train_list'] = 'dataset/train.txt' 31 | configs['apply_val'] = True 32 | configs['val_batch_size'] = 8 33 | configs['val_list'] = 'dataset/val.txt' 34 | configs['val_display_interval'] = 5 35 | -------------------------------------------------------------------------------- /conf/lstm_rgb.py: -------------------------------------------------------------------------------- 1 | configs = dict() 2 | configs['net'] = dict() 3 | 4 | # whether to resume 5 | configs['resume'] = False 6 | configs['ckpt_path'] = 'exp/xx/m_epochxx.pt' 7 | configs['new_lr'] = 2e-5 # if resume is True and new_lr is not None, then use this new learning rate 8 | 9 | # for network 10 | configs['net']['cnn_type'] = 'resnet101' # 'resnet50', 'resnet101', 'resnet152' 11 | configs['net']['cnn_emb_dim'] = 512 12 | configs['net']['cnn_dropout'] = 0.5 13 | configs['net']['cnn_finetune'] = False # whether to fine-tune the pre-trained CNN 14 | configs['net']['num_rnn_layers'] = 1 15 | configs['net']['rnn_hidden_size'] = 256 16 | configs['net']['rnn_dropout'] = 0.3 17 | configs['net']['rnn_bidir'] = True # bidirectional or unidirectional 18 | 19 | # for training and validation 20 | configs['lr'] = 2e-4 21 | configs['weight_decay'] = 1e-6 22 | configs['n_epochs'] = 100 23 | 24 | configs['save_dir'] = 'exp' 25 | configs['device'] = 'cuda' 26 | configs['n_workers'] = 4 27 | configs['train_batch_size'] = 64 28 | configs['train_n_frames'] = 15 # max number of frames per video during training 29 | configs['image_size'] = 224 # for resnet 30 | configs['data_root'] = '/home/ubuntu/data' # path to folders of rgb frames 31 | configs['train_list'] = 'dataset/train.txt' 32 | configs['apply_val'] = True 33 | configs['val_batch_size'] = 32 34 | configs['val_n_frames'] = 30 35 | configs['val_list'] = 'dataset/val.txt' 36 | configs['val_display_interval'] = 5 37 | -------------------------------------------------------------------------------- /frame_video_level_cnn/vgg16bn_frame/Model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.models as models 4 | 5 | 6 | class FrameClassifier(nn.Module): 7 | '''2D CNN feature extractor based on pre-trained models. 
8 | ''' 9 | def __init__(self, fc_sizes, batchnorms, dropouts): 10 | super(FrameClassifier, self).__init__() 11 | 12 | cnn = models.vgg16_bn(pretrained=True) 13 | 14 | modules = list(cnn.children())[:-1] # remove the last FC layer 15 | self.cnn = nn.Sequential(*modules) 16 | in_features = cnn.classifier[0].in_features # 4096 17 | 18 | fc_layers = [nn.Dropout(dropouts[0])] # input dropout 19 | dropouts = dropouts[1:] 20 | for hidden_size, batchnorm, drop_p in zip(fc_sizes, batchnorms, dropouts): 21 | fc_layers.append(nn.Linear(in_features, hidden_size)) 22 | in_features = hidden_size 23 | if batchnorm: 24 | fc_layers.append(nn.BatchNorm1d(hidden_size)) 25 | fc_layers.append(nn.LeakyReLU(inplace=True)) 26 | fc_layers.append(nn.Dropout(p=drop_p)) 27 | fc_layers.append(nn.Linear(in_features, 1)) # binary classification 28 | fc_layers.append(nn.Sigmoid()) 29 | 30 | self.classifier = nn.Sequential(*fc_layers) 31 | 32 | def forward(self, x): 33 | with torch.no_grad(): 34 | out = self.cnn(x) 35 | out = out.reshape(out.shape[0], -1) 36 | 37 | prob = self.classifier(out) # (N, 1) 38 | return prob 39 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_frame/Model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.models as models 4 | 5 | 6 | class FrameClassifier(nn.Module): 7 | '''2D CNN feature extractor based on pre-trained models. 8 | ''' 9 | def __init__(self, fc_sizes, batchnorms, dropouts): 10 | super(FrameClassifier, self).__init__() 11 | 12 | cnn = models.resnet101(pretrained=True) 13 | 14 | modules = list(cnn.children())[:-1] # remove the last FC layer 15 | self.cnn = nn.Sequential(*modules) 16 | in_features = cnn.fc.in_features # 2048 17 | 18 | fc_layers = [nn.Dropout(dropouts[0])] # input dropout 19 | dropouts = dropouts[1:] 20 | for hidden_size, batchnorm, drop_p in zip(fc_sizes, batchnorms, dropouts): 21 | fc_layers.append(nn.Linear(in_features, hidden_size)) 22 | in_features = hidden_size 23 | if batchnorm: 24 | fc_layers.append(nn.BatchNorm1d(hidden_size)) 25 | fc_layers.append(nn.LeakyReLU(inplace=True)) 26 | fc_layers.append(nn.Dropout(p=drop_p)) 27 | fc_layers.append(nn.Linear(in_features, 1)) # binary classification 28 | fc_layers.append(nn.Sigmoid()) 29 | 30 | self.classifier = nn.Sequential(*fc_layers) 31 | 32 | def forward(self, x): 33 | with torch.no_grad(): 34 | out = self.cnn(x) 35 | out = out.reshape(out.shape[0], -1) # (N, 2048) 36 | 37 | prob = self.classifier(out) # (N, 1) 38 | return prob 39 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/Data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import torch 4 | import numpy as np 5 | from PIL import Image 6 | from tqdm import tqdm 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class VideoFrameDataset(Dataset): 11 | def __init__(self, root, video_list_path, transform): 12 | self.transform = transform 13 | 14 | with open(video_list_path, 'r') as fp: 15 | valid_videos = [line.rstrip().split()[0].split('/')[-1] for line in fp.readlines()] 16 | 17 | all_jpgs = sorted(glob.glob(root.rstrip('/') + '/*/*.jpg')) 18 | valid_jpgs = [] 19 | for name in all_jpgs: 20 | if name.split('/')[-2] in valid_videos: 21 | valid_jpgs.append(name) 22 | 23 | self.images = valid_jpgs 24 | self.labels = [int(n.split('/')[-2].split('-')[-1] 
== '1') for n in valid_jpgs] 25 | 26 | tqdm.write(f'There are {len(self.images)} images, {sum(self.labels)} are anomaly.') 27 | 28 | def __len__(self): 29 | return len(self.images) 30 | 31 | def __getitem__(self, index): 32 | img = self.transform(Image.open(self.images[index])) 33 | return img, self.labels[index] 34 | 35 | 36 | 37 | def loadImages(root, folder, batch, transform): 38 | valid_jpgs = sorted(glob.glob(root.rstrip('/') + '/' + str(folder) + '/*.jpg')) 39 | labels = [int(n.split('/')[-2].split('-')[-1] == '1') for n in valid_jpgs] 40 | 41 | valid_jpgs = [transform(Image.open(item)) for item in valid_jpgs] 42 | 43 | if len(valid_jpgs) > batch: 44 | valid_jpgs = valid_jpgs[:batch] 45 | elif len(valid_jpgs) < batch: 46 | for i in range(batch - len(valid_jpgs)): 47 | valid_jpgs.append(valid_jpgs[-1]) 48 | 49 | return torch.stack(valid_jpgs, dim=0), torch.tensor(labels) 50 | -------------------------------------------------------------------------------- /test_combined.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | from sklearn.metrics import roc_auc_score 5 | 6 | 7 | def main(args): 8 | """Combine video-level predictions from two streams using weighted average fusion. 9 | Notes: 10 | combined_prob = rgb_weight * rgb_prob + (1. - rgb_weight) * flow_prob 11 | """ 12 | rgb_results = np.load(args.rgb_file) 13 | flow_results = np.load(args.flow_file) 14 | rgb_w = float(args.rgb_weight) 15 | assert 0. <= rgb_w <= 1. 16 | 17 | y_true = [] 18 | y_pred_rgb = [] 19 | y_pred_flow = [] 20 | for (name1, label_str1, target1, prob1), (name2, label_str2, target2, prob2) in zip(rgb_results, flow_results): 21 | assert name1 == name2 22 | assert label_str1 == label_str2 23 | assert target1 == target2 24 | 25 | y_true.append(int(target1)) 26 | y_pred_rgb.append(float(prob1)) 27 | y_pred_flow.append(float(prob2)) 28 | 29 | y_true = np.array(y_true, dtype=np.int) 30 | y_pred_rgb = np.array(y_pred_rgb, dtype=np.float) 31 | y_pred_flow = np.array(y_pred_flow, dtype=np.float) 32 | y_pred = y_pred_rgb * rgb_w + y_pred_flow * (1. 
- rgb_w) 33 | 34 | auc_rgb = roc_auc_score(y_true, y_pred_rgb) 35 | auc_flow = roc_auc_score(y_true, y_pred_flow) 36 | auc = roc_auc_score(y_true, y_pred) 37 | 38 | acc_rgb = ((y_pred_rgb >= 0.5) == y_true).sum() / y_true.shape[0] 39 | acc_flow = ((y_pred_flow >= 0.5) == y_true).sum() / y_true.shape[0] 40 | acc = ((y_pred >= 0.5) == y_true).sum() / y_true.shape[0] 41 | 42 | print(f'=============== AUC ===============') 43 | print(f'== RGB: {auc_rgb:.5f}') 44 | print(f'== Flow: {auc_flow:.5f}') 45 | print(f'== Both: {auc:.5f}') 46 | print(f'============= Accuracy ==============') 47 | print(f'== RGB: {acc_rgb*100:.3f}%') 48 | print(f'== Flow: {acc_flow*100:.3f}%') 49 | print(f'== Both: {acc*100:.3f}%') 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument( 55 | '--rgb_file', 56 | type=str, 57 | help='path to the npy file of RGB frames' 58 | ) 59 | parser.add_argument( 60 | '--flow_file', 61 | type=str, 62 | help='path to the npy file of optical flow images' 63 | ) 64 | parser.add_argument( 65 | '--rgb_weight', 66 | default=0.5, 67 | type=float, 68 | help='weight of RGB predictions (0.0 to 1.0)' 69 | ) 70 | args = parser.parse_args() 71 | print(args) 72 | 73 | main(args) 74 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/Testplot.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from sklearn.metrics import roc_auc_score 9 | from Data import VideoFrameDataset, loadImages 10 | from Model import FrameClassifier 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | def test(ckpt_path, data_root, data_list, batch_size, num_workers, device, display_interval): 15 | ''' 16 | Generates and saves the plots of 4 test videos 17 | ''' 18 | ckpt = torch.load(ckpt_path) 19 | net_configs = ckpt['net_configs'] 20 | print(f'Load ckpt from {ckpt_path}') 21 | 22 | val_transform = transforms.Compose([ 23 | transforms.Resize([224, 224]), 24 | transforms.ToTensor(), 25 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 26 | ]) 27 | 28 | model = FrameClassifier( 29 | #cnn_type=net_configs['cnn_type'], 30 | fc_sizes=net_configs['hidden_sizes'], 31 | batchnorms=net_configs['batchnorms'], 32 | dropouts=net_configs['dropouts'] 33 | ) 34 | model.to(device) 35 | model.load_state_dict(ckpt['state_dict']) 36 | 37 | valid_videos = [1, 2, 3, 4] # folder names 38 | 39 | model.eval() 40 | with torch.no_grad(): 41 | n_correct = 0 42 | n_samples = 0 43 | predicted_scores = [] 44 | target_scores = [] 45 | for i in range(len(valid_videos)): 46 | images, labels = loadImages(data_root, valid_videos[i], batch_size, val_transform) 47 | images = images.to(dtype=torch.float32, device=device) 48 | labels = labels.to(dtype=torch.float32, device=device) 49 | 50 | prob = model(images) 51 | 52 | np_prob = prob.squeeze().detach().cpu().numpy() 53 | np_target = labels.cpu().to(dtype=torch.int).numpy() 54 | 55 | plt.plot(np.unique(np_prob)) 56 | plt.ylabel('Anomaly Probability') 57 | plt.xlabel('Time') 58 | l = plt.axvline(x=int(len(np_prob)/2), linewidth=190, color='#FF5647', alpha=0.4) 59 | plt.grid(True) 60 | plt.savefig('plot' + str(i) + '.png') 61 | plt.clf() 62 | 63 | 64 | if __name__ == '__main__': 65 | test( 66 |
ckpt_path='exp/20201201-09:14:02/m_epoch07.pt', 67 | data_root='/home/ubuntu/project/data/plot', 68 | data_list='test.txt', 69 | batch_size=150, 70 | num_workers=4, 71 | device='cuda', 72 | display_interval=1 73 | ) 74 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/Test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from sklearn.metrics import roc_auc_score 9 | from Data import VideoFrameDataset, loadImages 10 | from Model import FrameClassifier 11 | 12 | 13 | def test(ckpt_path, data_root, data_list, batch_size, num_workers, device, display_interval): 14 | ''' 15 | Video-level prediction 16 | ''' 17 | ckpt = torch.load(ckpt_path) 18 | net_configs = ckpt['net_configs'] 19 | print(f'Load ckpt from {ckpt_path}') 20 | 21 | val_transform = transforms.Compose([ 22 | transforms.Resize([224, 224]), 23 | transforms.ToTensor(), 24 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 25 | ]) 26 | 27 | valid_videos = [] 28 | with open(data_list, 'r') as fp: 29 | valid_videos = [line.rstrip().split()[0].split('/')[-1] for line in fp.readlines()] 30 | 31 | model = FrameClassifier( 32 | #cnn_type=net_configs['cnn_type'], 33 | fc_sizes=net_configs['hidden_sizes'], 34 | batchnorms=net_configs['batchnorms'], 35 | dropouts=net_configs['dropouts'] 36 | ) 37 | model.to(device) 38 | model.load_state_dict(ckpt['state_dict']) 39 | 40 | model.eval() 41 | with torch.no_grad(): 42 | n_correct = 0 # count of correctly classified videos 43 | predicted_scores, target_scores = [], [] 44 | for i in range(len(valid_videos)): 45 | images, labels = loadImages(data_root, valid_videos[i], batch_size, val_transform) 46 | images = images.to(dtype=torch.float32, device=device) 47 | labels = labels.to(dtype=torch.float32, device=device) 48 | 49 | prob = model(images) 50 | 51 | np_prob = prob.squeeze().detach().cpu().numpy() 52 | np_target = labels.cpu().to(dtype=torch.int).numpy() 53 | 54 | predicted_avg_score = np.average(np_prob) 55 | target_avg_score = np.average(np_target) 56 | 57 | if (predicted_avg_score >= 0.5 and target_avg_score == 1.0) or (predicted_avg_score <= 0.5 and target_avg_score == 0.0): 58 | n_correct += 1 59 | 60 | predicted_scores.append(predicted_avg_score) 61 | target_scores.append(target_avg_score) 62 | 63 | print('=' * 80) 64 | print(f'[info] Test Acc = {n_correct / len(valid_videos) * 100:.4f}%') 65 | 66 | auc = roc_auc_score(target_scores, predicted_scores) 67 | print(f'[info] Test AUC = {auc:.5f}') 68 | 69 | 70 | if __name__ == '__main__': 71 | test( 72 | ckpt_path='exp/20201201-09:14:02/m_epoch07.pt', 73 | data_root='/home/ubuntu/project/data', 74 | data_list='test.txt', 75 | batch_size=30, 76 | num_workers=4, 77 | device='cuda', 78 | display_interval=1 79 | ) 80 | -------------------------------------------------------------------------------- /frame_video_level_cnn/vgg16bn_frame/Test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from sklearn.metrics import roc_auc_score 9 | from Data import VideoFrameDataset 10 | from Model import FrameClassifier 11 | 12 | 13 | def
test(ckpt_path, data_root, data_list, batch_size, num_workers, device, display_interval): 14 | ckpt = torch.load(ckpt_path) 15 | net_configs = ckpt['net_configs'] 16 | print(f'Load ckpt from {ckpt_path}') 17 | 18 | val_transform = transforms.Compose([ 19 | transforms.Resize([224, 224]), 20 | transforms.ToTensor(), 21 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 22 | ]) 23 | val_dataset = VideoFrameDataset( 24 | root=data_root, 25 | video_list_path=data_list, 26 | transform=val_transform 27 | ) 28 | val_loader = DataLoader( 29 | dataset=val_dataset, 30 | batch_size=batch_size, 31 | shuffle=False, 32 | num_workers=num_workers, 33 | pin_memory=True 34 | ) 35 | 36 | model = FrameClassifier( 37 | #cnn_type=net_configs['cnn_type'], 38 | fc_sizes=net_configs['hidden_sizes'], 39 | batchnorms=net_configs['batchnorms'], 40 | dropouts=net_configs['dropouts'] 41 | ) 42 | model.to(device) 43 | model.load_state_dict(ckpt['state_dict']) 44 | 45 | model.eval() 46 | with torch.no_grad(): 47 | n_correct = 0 48 | n_samples = 0 49 | predicted_scores = [] 50 | target_scores = [] 51 | for step, (images, labels) in enumerate(tqdm(val_loader)): 52 | images = images.to(dtype=torch.float32, device=device) # (N, C, H, W) 53 | labels = labels.to(dtype=torch.float32, device=device) # (N,) 54 | 55 | # forward 56 | prob = model(images) # (N, 1) after sigmoid 57 | 58 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 59 | n_samples += labels.shape[0] 60 | 61 | predicted_scores.append(prob.squeeze().detach().cpu().numpy()) 62 | target_scores.append(labels.cpu().to(dtype=torch.int).numpy()) 63 | 64 | # display 65 | if (step + 1) % display_interval == 0 and step < len(val_loader) - 1: 66 | tqdm.write('-' * 40) 67 | tqdm.write(f'[info] Probs: {prob.squeeze().detach().cpu().numpy()[:10]}') 68 | tqdm.write(f'[info] Label: {labels.cpu().numpy()[:10]}') 69 | 70 | print('=' * 80) 71 | print(f'[info] Test Acc = {n_correct / n_samples * 100:.4f}%') 72 | 73 | predicted_scores = np.concatenate(predicted_scores, axis=0) 74 | target_scores = np.concatenate(target_scores, axis=0) 75 | 76 | auc = roc_auc_score(target_scores, predicted_scores) 77 | print(f'[info] Test AUC = {auc:.5f}') 78 | 79 | 80 | if __name__ == '__main__': 81 | test( 82 | ckpt_path='exp/20201130-16:26:59/m_epoch02.pt', 83 | data_root='/home/ubuntu/project/data', 84 | data_list='test.txt', 85 | batch_size=128, 86 | num_workers=4, 87 | device='cuda', 88 | display_interval=1 89 | ) 90 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_frame/Test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from sklearn.metrics import roc_auc_score 9 | from Data import VideoFrameDataset 10 | from Model import FrameClassifier 11 | 12 | 13 | def test(ckpt_path, data_root, data_list, batch_size, num_workers, device, display_interval): 14 | ckpt = torch.load(ckpt_path) 15 | net_configs = ckpt['net_configs'] 16 | print(f'Load ckpt from {ckpt_path}') 17 | 18 | val_transform = transforms.Compose([ 19 | transforms.Resize([224, 224]), 20 | transforms.ToTensor(), 21 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 22 | ]) 23 | val_dataset = VideoFrameDataset( 24 | root=data_root, 25 | video_list_path=data_list, 
26 | transform=val_transform 27 | ) 28 | val_loader = DataLoader( 29 | dataset=val_dataset, 30 | batch_size=batch_size, 31 | shuffle=False, 32 | num_workers=num_workers, 33 | pin_memory=True 34 | ) 35 | 36 | model = FrameClassifier( 37 | #cnn_type=net_configs['cnn_type'], 38 | fc_sizes=net_configs['hidden_sizes'], 39 | batchnorms=net_configs['batchnorms'], 40 | dropouts=net_configs['dropouts'] 41 | ) 42 | model.to(device) 43 | model.load_state_dict(ckpt['state_dict']) 44 | 45 | model.eval() 46 | with torch.no_grad(): 47 | n_correct = 0 48 | n_samples = 0 49 | predicted_scores = [] 50 | target_scores = [] 51 | for step, (images, labels) in enumerate(tqdm(val_loader)): 52 | images = images.to(dtype=torch.float32, device=device) # (N, C, H, W) 53 | labels = labels.to(dtype=torch.float32, device=device) # (N,) 54 | 55 | # forward 56 | prob = model(images) # (N, 1) after sigmoid 57 | 58 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 59 | n_samples += labels.shape[0] 60 | 61 | predicted_scores.append(prob.squeeze().detach().cpu().numpy()) 62 | target_scores.append(labels.cpu().to(dtype=torch.int).numpy()) 63 | 64 | # display 65 | if (step + 1) % display_interval == 0 and step < len(val_loader) - 1: 66 | tqdm.write('-' * 40) 67 | tqdm.write(f'[info] Probs: {prob.squeeze().detach().cpu().numpy()[:10]}') 68 | tqdm.write(f'[info] Label: {labels.cpu().numpy()[:10]}') 69 | 70 | print('=' * 80) 71 | print(f'[info] Test Acc = {n_correct / n_samples * 100:.4f}%') 72 | 73 | predicted_scores = np.concatenate(predicted_scores, axis=0) 74 | target_scores = np.concatenate(target_scores, axis=0) 75 | 76 | auc = roc_auc_score(target_scores, predicted_scores) 77 | print(f'[info] Test AUC = {auc:.5f}') 78 | 79 | 80 | if __name__ == '__main__': 81 | test( 82 | ckpt_path='exp/20201201-09:14:02/m_epoch02.pt', 83 | data_root='/home/ubuntu/project/data', 84 | data_list='test.txt', 85 | batch_size=128, 86 | num_workers=4, 87 | device='cuda', 88 | display_interval=1 89 | ) 90 | -------------------------------------------------------------------------------- /frame_video_level_cnn/vgg16bn_frame/Train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from sklearn.metrics import roc_auc_score 9 | from Data import VideoFrameDataset 10 | from Model import FrameClassifier 11 | 12 | 13 | def test(ckpt_path, data_root, data_list, batch_size, num_workers, device, display_interval): 14 | ckpt = torch.load(ckpt_path) 15 | net_configs = ckpt['net_configs'] 16 | print(f'Load ckpt from {ckpt_path}') 17 | 18 | val_transform = transforms.Compose([ 19 | transforms.Resize([224, 224]), 20 | transforms.ToTensor(), 21 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 22 | ]) 23 | val_dataset = VideoFrameDataset( 24 | root=data_root, 25 | video_list_path=data_list, 26 | transform=val_transform 27 | ) 28 | val_loader = DataLoader( 29 | dataset=val_dataset, 30 | batch_size=batch_size, 31 | shuffle=False, 32 | num_workers=num_workers, 33 | pin_memory=True 34 | ) 35 | 36 | model = FrameClassifier( 37 | #cnn_type=net_configs['cnn_type'], 38 | fc_sizes=net_configs['hidden_sizes'], 39 | batchnorms=net_configs['batchnorms'], 40 | dropouts=net_configs['dropouts'] 41 | ) 42 | model.to(device) 43 | model.load_state_dict(ckpt['state_dict']) 44 | 45 | model.eval() 46 
| with torch.no_grad(): 47 | n_correct = 0 48 | n_samples = 0 49 | predicted_scores = [] 50 | target_scores = [] 51 | for step, (images, labels) in enumerate(tqdm(val_loader)): 52 | images = images.to(dtype=torch.float32, device=device) # (N, C, H, W) 53 | labels = labels.to(dtype=torch.float32, device=device) # (N,) 54 | 55 | # forward 56 | prob = model(images) # (N, 1) after sigmoid 57 | 58 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 59 | n_samples += labels.shape[0] 60 | 61 | predicted_scores.append(prob.squeeze().detach().cpu().numpy()) 62 | target_scores.append(labels.cpu().to(dtype=torch.int).numpy()) 63 | 64 | # display 65 | if (step + 1) % display_interval == 0 and step < len(val_loader) - 1: 66 | tqdm.write('-' * 40) 67 | tqdm.write(f'[info] Probs: {prob.squeeze().detach().cpu().numpy()[:10]}') 68 | tqdm.write(f'[info] Label: {labels.cpu().numpy()[:10]}') 69 | 70 | print('=' * 80) 71 | print(f'[info] Test Acc = {n_correct / n_samples * 100:.4f}%') 72 | 73 | predicted_scores = np.concatenate(predicted_scores, axis=0) 74 | target_scores = np.concatenate(target_scores, axis=0) 75 | 76 | auc = roc_auc_score(target_scores, predicted_scores) 77 | print(f'[info] Test AUC = {auc:.5f}') 78 | 79 | 80 | if __name__ == '__main__': 81 | test( 82 | ckpt_path='exp/20201130-16:26:59/m_epoch02.pt', 83 | data_root='/home/ubuntu/project/data', 84 | data_list='test.txt', 85 | batch_size=128, 86 | num_workers=4, 87 | device='cuda', 88 | display_interval=1 89 | ) 90 | -------------------------------------------------------------------------------- /frame_video_level_cnn/data_prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | 5 | 6 | if __name__ == "__main__": 7 | input_root = 'frames' 8 | output_root = 'data' 9 | train_json = 'metadata_train.json' 10 | test_json = 'metadata_val.json' 11 | 12 | video_clips = os.listdir(input_root) 13 | 14 | with open(train_json, 'r') as fp: 15 | train_info = json.load(fp) 16 | with open(test_json, 'r') as fp: 17 | test_info = json.load(fp) 18 | 19 | train_txt = [] 20 | test_txt = [] 21 | 22 | invalid = [] 23 | 24 | for vid in video_clips: 25 | if vid in train_info.keys(): 26 | start = train_info[vid]['anomaly_start'] 27 | end = train_info[vid]['anomaly_end'] 28 | n_frames = train_info[vid]['num_frames'] 29 | data_split = 'train' 30 | elif vid in test_info.keys(): 31 | start = test_info[vid]['anomaly_start'] 32 | end = test_info[vid]['anomaly_end'] 33 | n_frames = test_info[vid]['num_frames'] 34 | data_split = 'test' 35 | else: 36 | raise RuntimeError(f'invalid video clip {vid}') 37 | 38 | jpgs = os.listdir(os.path.join(input_root, vid)) 39 | 40 | if len(jpgs) != n_frames: 41 | invalid.append(vid) 42 | print(f'{vid} has {len(jpgs)} frames, which is different from {n_frames}') 43 | else: 44 | jpgs.sort() 45 | 46 | for i in range(start-1): 47 | if not os.path.exists(os.path.join(output_root, f'{vid}-0')): 48 | os.makedirs(os.path.join(output_root, f'{vid}-0')) 49 | shutil.copy(os.path.join(input_root, vid, jpgs[i]), os.path.join(output_root, f'{vid}-0')) 50 | if data_split == 'train': 51 | train_txt.append(f'data/{vid}-0 normal\n') 52 | elif data_split == 'test': 53 | test_txt.append(f'data/{vid}-0 normal\n') 54 | 55 | for i in range(start-1, end): 56 | if not os.path.exists(os.path.join(output_root, f'{vid}-1')): 57 | os.makedirs(os.path.join(output_root, f'{vid}-1')) 58 | shutil.copy(os.path.join(input_root, vid, jpgs[i]), os.path.join(output_root, 
f'{vid}-1')) 59 | if data_split == 'train': 60 | train_txt.append(f'data/{vid}-1 anomaly\n') 61 | elif data_split == 'test': 62 | test_txt.append(f'data/{vid}-1 anomaly\n') 63 | 64 | for i in range(end, len(jpgs)): 65 | if not os.path.exists(os.path.join(output_root, f'{vid}-2')): 66 | os.makedirs(os.path.join(output_root, f'{vid}-2')) 67 | shutil.copy(os.path.join(input_root, vid, jpgs[i]), os.path.join(output_root, f'{vid}-2')) 68 | if data_split == 'train': 69 | train_txt.append(f'data/{vid}-2 normal\n') 70 | elif data_split == 'test': 71 | test_txt.append(f'data/{vid}-2 normal\n') 72 | 73 | with open('train_list.txt', 'w') as fp: 74 | for line in train_txt: 75 | fp.write(line) 76 | print(f'train: {len(train_txt)}') 77 | 78 | with open('test_list.txt', 'w') as fp: 79 | for line in test_txt: 80 | fp.write(line) 81 | print(f'test: {len(test_txt)}') 82 | -------------------------------------------------------------------------------- /test_cnn_flow.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from tqdm import tqdm 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | from torchvision import transforms 8 | from torch.utils.data import DataLoader 9 | from sklearn.metrics import roc_auc_score 10 | from data import OpticalFlowDataset 11 | 12 | 13 | DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 14 | 15 | 16 | def main(args): 17 | """Test the CNN model on optical flow images and save results as a numpy file. 18 | """ 19 | ckpt = torch.load(args.ckpt) 20 | 21 | transform = transforms.Compose([ 22 | transforms.Resize([args.image_size, args.image_size]), 23 | transforms.ToTensor(), 24 | ]) 25 | dataset = OpticalFlowDataset( 26 | root=args.data_root, 27 | video_list_path=args.data_list, 28 | n_frames=ckpt['net_configs']['n_frames'], 29 | transform=transform, 30 | is_train=False 31 | ) 32 | dataloader = DataLoader( 33 | dataset=dataset, 34 | batch_size=args.batch_size, 35 | shuffle=False, 36 | num_workers=args.n_workers, 37 | collate_fn=dataset.collate_fn, 38 | pin_memory=True 39 | ) 40 | 41 | assert ckpt['net_configs']['cnn_type'] in ['resnet50', 'resnet101', 'resnet152'] 42 | if ckpt['net_configs']['cnn_type'] == 'resnet50': 43 | from networks import resnet50 as ResNet 44 | elif ckpt['net_configs']['cnn_type'] == 'resnet101': 45 | from networks import resnet101 as ResNet 46 | elif ckpt['net_configs']['cnn_type'] == 'resnet152': 47 | from networks import resnet152 as ResNet 48 | 49 | model = ResNet(pretrained=True, channel=ckpt['net_configs']['n_frames'] * 2) 50 | model.to(DEVICE) 51 | model.load_state_dict(ckpt['state_dict']) 52 | model.eval() 53 | print(f'[info] Loaded model from {args.ckpt}') 54 | 55 | with torch.no_grad(): 56 | y_true = [] # 1 for anomaly, 0 for normal 57 | y_pred = [] # prob of anomaly 58 | for step, (images, labels) in enumerate(tqdm(dataloader)): 59 | images = images.to(dtype=torch.float32, device=DEVICE) # (N, C=20, H, W) 60 | 61 | # forward 62 | out = model(images) # (N, 1), logits before sigmoid 63 | prob = torch.sigmoid(out.squeeze(-1)) # (N,) 64 | 65 | y_true.append(labels.to(dtype=torch.int32).cpu().numpy()) 66 | y_pred.append(prob.cpu().numpy()) 67 | 68 | y_true = np.concatenate(y_true) 69 | y_pred = np.concatenate(y_pred) 70 | 71 | try: 72 | auc = roc_auc_score(y_true, y_pred) 73 | acc = (y_true == (y_pred >= 0.5)).sum() / y_true.shape[0] 74 | print(f'[info] Video-Level AUC = {auc:.5f}, ACC = {acc*100:.2f}%') 75 | except: 76 | print('[warning] Failed to 
compute AUC and ACC.') 77 | 78 | try: 79 | with open(args.data_list, 'r') as fp: 80 | rows = [ln.rstrip().split() for ln in fp.readlines()] 81 | 82 | out_file = os.path.join(os.path.dirname(args.ckpt), 'results.npy') 83 | results = [] 84 | for (name, label_str), target, prob in zip(rows, y_true, y_pred): 85 | assert (label_str == 'anomaly') == target 86 | results.append([name, label_str, target, prob]) 87 | np.save(out_file, results) 88 | except: 89 | print('[warning] Failed to save output file.') 90 | 91 | 92 | if __name__ == '__main__': 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument( 95 | '--ckpt', 96 | default='', 97 | type=str, 98 | help='path to the model checkpoint' 99 | ) 100 | parser.add_argument( 101 | '--data_root', 102 | default='/home/ubuntu/data_flow', 103 | type=str, 104 | help='root directory of optical flow images' 105 | ) 106 | parser.add_argument( 107 | '--data_list', 108 | default='dataset/test.txt', 109 | type=str, 110 | help='path to the list of test videos' 111 | ) 112 | parser.add_argument( 113 | '--image_size', 114 | default=224, 115 | type=int, 116 | help='height and width of the input image (default 224 for ResNets)' 117 | ) 118 | parser.add_argument( 119 | '--batch_size', 120 | default=32, 121 | type=int, 122 | help='batch size for test' 123 | ) 124 | parser.add_argument( 125 | '--n_workers', 126 | default=4, 127 | type=int, 128 | help='number of workers for dataloader' 129 | ) 130 | args = parser.parse_args() 131 | print(args) 132 | 133 | main(args) 134 | -------------------------------------------------------------------------------- /test_lstm_flow.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from tqdm import tqdm 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | from torchvision import transforms 8 | from torch.utils.data import DataLoader 9 | from sklearn.metrics import roc_auc_score 10 | from data import OpticalFlowDataset 11 | from networks import CRNNOpticalFlow 12 | 13 | 14 | DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 15 | 16 | 17 | def main(args): 18 | """Test the CRNN model on optical flow maps and save results as a numpy file. 
19 | """ 20 | ckpt = torch.load(args.ckpt) 21 | 22 | transform = transforms.Compose([ 23 | transforms.Resize([args.image_size, args.image_size]), 24 | transforms.ToTensor(), 25 | ]) 26 | dataset = OpticalFlowDataset( 27 | root=args.data_root, 28 | video_list_path=args.data_list, 29 | n_frames=args.n_frames if args.n_frames > 0 else ckpt['net_configs']['n_frames'], 30 | transform=transform, 31 | is_train=False 32 | ) 33 | dataloader = DataLoader( 34 | dataset=dataset, 35 | batch_size=args.batch_size, 36 | shuffle=False, 37 | num_workers=args.n_workers, 38 | collate_fn=dataset.collate_fn, 39 | pin_memory=True 40 | ) 41 | 42 | model = CRNNOpticalFlow( 43 | cnn_dropout=ckpt['net_configs']['cnn_dropout'], 44 | cnn_emb_dim=ckpt['net_configs']['cnn_emb_dim'], 45 | cnn_type=ckpt['net_configs']['cnn_type'], 46 | rnn_hidden_size=ckpt['net_configs']['rnn_hidden_size'], 47 | rnn_dropout=ckpt['net_configs']['rnn_dropout'], 48 | num_rnn_layers=ckpt['net_configs']['num_rnn_layers'], 49 | rnn_bidir=ckpt['net_configs']['rnn_bidir'] 50 | ) 51 | model.to(DEVICE) 52 | model.load_state_dict(ckpt['state_dict']) 53 | model.eval() 54 | print(f'[info] Loaded model from {args.ckpt}') 55 | 56 | with torch.no_grad(): 57 | y_true = [] # 1 for anomaly, 0 for normal 58 | y_pred = [] # prob of anomaly 59 | for step, (images, labels) in enumerate(tqdm(dataloader)): 60 | images = images.to(dtype=torch.float32, device=DEVICE) # (N, T * 2, H, W) 61 | 62 | # forward 63 | out = model(images) # (N, T), probs after sigmoid 64 | out = torch.mean(out, dim=-1) # (N,) 65 | 66 | y_true.append(labels.to(dtype=torch.int32).cpu().numpy()) 67 | y_pred.append(out.cpu().numpy()) 68 | 69 | y_true = np.concatenate(y_true) 70 | y_pred = np.concatenate(y_pred) 71 | 72 | try: 73 | auc = roc_auc_score(y_true, y_pred) 74 | acc = (y_true == (y_pred >= 0.5)).sum() / y_true.shape[0] 75 | print(f'[info] Video-Level AUC = {auc:.5f}, ACC = {acc*100:.3f}%') 76 | except: 77 | print('[warning] Failed to compute AUC and ACC.') 78 | 79 | try: 80 | with open(args.data_list, 'r') as fp: 81 | rows = [ln.rstrip().split() for ln in fp.readlines()] 82 | 83 | out_file = os.path.join(os.path.dirname(args.ckpt), 'results.npy') 84 | results = [] 85 | for (name, label_str), target, prob in zip(rows, y_true, y_pred): 86 | assert (label_str == 'anomaly') == target 87 | results.append([name, label_str, target, prob]) 88 | np.save(out_file, results) 89 | except: 90 | print('[warning] Failed to save output file.') 91 | 92 | 93 | if __name__ == '__main__': 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument( 96 | '--ckpt', 97 | default='', 98 | type=str, 99 | help='path to the model checkpoint' 100 | ) 101 | parser.add_argument( 102 | '--data_root', 103 | default='/home/ubuntu/data_flow', 104 | type=str, 105 | help='root directory of optical flow maps' 106 | ) 107 | parser.add_argument( 108 | '--data_list', 109 | default='dataset/test.txt', 110 | type=str, 111 | help='path to the list of test videos' 112 | ) 113 | parser.add_argument( 114 | '--n_frames', 115 | default=0, 116 | type=int, 117 | help='number of frames for each video clip' 118 | ) 119 | parser.add_argument( 120 | '--image_size', 121 | default=224, 122 | type=int, 123 | help='height and width of the input image (default 224 for ResNets)' 124 | ) 125 | parser.add_argument( 126 | '--batch_size', 127 | default=8, 128 | type=int, 129 | help='batch size for test' 130 | ) 131 | parser.add_argument( 132 | '--n_workers', 133 | default=4, 134 | type=int, 135 | help='number of workers for dataloader' 136 | ) 
137 | args = parser.parse_args() 138 | print(args) 139 | 140 | main(args) 141 | -------------------------------------------------------------------------------- /test_lstm_rgb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from tqdm import tqdm 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | from torchvision import transforms 8 | from torch.utils.data import DataLoader 9 | from sklearn.metrics import roc_auc_score 10 | from data import RGBFrameDataset 11 | from networks import CRNNClassifier 12 | 13 | 14 | DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 15 | 16 | 17 | def main(args): 18 | """Test the CRNN model on RGB frames and save results as a numpy file. 19 | """ 20 | transform = transforms.Compose([ 21 | transforms.Resize([args.image_size, args.image_size]), 22 | transforms.ToTensor(), 23 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 24 | ]) 25 | dataset = RGBFrameDataset( 26 | root=args.data_root, 27 | video_list_path=args.data_list, 28 | n_frames=args.n_frames, 29 | transform=transform, 30 | is_train=False 31 | ) 32 | dataloader = DataLoader( 33 | dataset=dataset, 34 | batch_size=args.batch_size, 35 | shuffle=False, 36 | num_workers=args.n_workers, 37 | collate_fn=dataset.collate_fn, 38 | pin_memory=True 39 | ) 40 | 41 | ckpt = torch.load(args.ckpt) 42 | model = CRNNClassifier( 43 | cnn_dropout=ckpt['net_configs']['cnn_dropout'], 44 | cnn_emb_dim=ckpt['net_configs']['cnn_emb_dim'], 45 | cnn_type=ckpt['net_configs']['cnn_type'], 46 | cnn_finetune=ckpt['net_configs']['cnn_finetune'], 47 | rnn_hidden_size=ckpt['net_configs']['rnn_hidden_size'], 48 | rnn_dropout=ckpt['net_configs']['rnn_dropout'], 49 | num_rnn_layers=ckpt['net_configs']['num_rnn_layers'], 50 | rnn_bidir=ckpt['net_configs']['rnn_bidir'] 51 | ) 52 | model.to(DEVICE) 53 | model.load_state_dict(ckpt['state_dict']) 54 | model.eval() 55 | print(f'[info] Loaded model from {args.ckpt}') 56 | 57 | with torch.no_grad(): 58 | y_true = [] # 1 for anomaly, 0 for normal 59 | y_pred = [] # prob of anomaly 60 | for step, (videos, labels) in enumerate(tqdm(dataloader)): 61 | videos = videos.to(dtype=torch.float32, device=DEVICE) # (N, T, C, H, W) 62 | 63 | # forward 64 | prob = model(videos) # (N, T) after sigmoid 65 | prob = torch.mean(prob, dim=-1) # (N,) 66 | 67 | y_true.append(labels.to(dtype=torch.int32).cpu().numpy()) 68 | y_pred.append(prob.cpu().numpy()) 69 | 70 | y_true = np.concatenate(y_true) 71 | y_pred = np.concatenate(y_pred) 72 | 73 | try: 74 | auc = roc_auc_score(y_true, y_pred) 75 | acc = (y_true == (y_pred >= 0.5)).sum() / y_true.shape[0] 76 | print(f'[info] Video-Level AUC = {auc:.5f}, ACC = {acc*100:.3f}%') 77 | except: 78 | print('[warning] Failed to compute AUC and ACC.') 79 | 80 | try: 81 | with open(args.data_list, 'r') as fp: 82 | rows = [ln.rstrip().split() for ln in fp.readlines()] 83 | 84 | out_file = os.path.join(os.path.dirname(args.ckpt), 'results.npy') 85 | results = [] 86 | for (name, label_str), target, prob in zip(rows, y_true, y_pred): 87 | assert (label_str == 'anomaly') == target 88 | results.append([name, label_str, target, prob]) 89 | np.save(out_file, results) 90 | except: 91 | print('[warning] Failed to save output file.') 92 | 93 | 94 | if __name__ == '__main__': 95 | parser = argparse.ArgumentParser() 96 | parser.add_argument( 97 | '--ckpt', 98 | default='', 99 | type=str, 100 | help='path to the model checkpoint' 101 | ) 102 | parser.add_argument( 
103 | '--data_root', 104 | default='/home/ubuntu/data', 105 | type=str, 106 | help='root directory of RGB frames' 107 | ) 108 | parser.add_argument( 109 | '--data_list', 110 | default='dataset/test.txt', 111 | type=str, 112 | help='path to the list of test videos' 113 | ) 114 | parser.add_argument( 115 | '--n_frames', 116 | default=30, 117 | type=int, 118 | help='number of frames for each video clip' 119 | ) 120 | parser.add_argument( 121 | '--image_size', 122 | default=224, 123 | type=int, 124 | help='height and width of the input image (default 224 for ResNets)' 125 | ) 126 | parser.add_argument( 127 | '--batch_size', 128 | default=32, 129 | type=int, 130 | help='batch size for test' 131 | ) 132 | parser.add_argument( 133 | '--n_workers', 134 | default=4, 135 | type=int, 136 | help='number of workers for dataloader' 137 | ) 138 | args = parser.parse_args() 139 | print(args) 140 | 141 | main(args) 142 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | from tqdm import tqdm 4 | import numpy as np 5 | import torch 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class RGBFrameDataset(Dataset): 10 | """Dataset class for 3-channel RGB images. 11 | """ 12 | def __init__(self, root, video_list_path, n_frames, transform, is_train): 13 | super(RGBFrameDataset, self).__init__() 14 | self.root = root 15 | self.n_frames = n_frames 16 | self.transform = transform 17 | self.is_train = is_train 18 | 19 | with open(video_list_path, 'r') as fp: 20 | self.lines = [line.rstrip() for line in fp.readlines()] 21 | tqdm.write(f'[info] There are {len(self.lines)} videos in {video_list_path}') 22 | 23 | def __len__(self): 24 | return len(self.lines) 25 | 26 | def __getitem__(self, index): 27 | """ 28 | :param index: int 29 | :return result: (n_frames, C=3, H, W), label: 1 for anomaly, 0 for normal 30 | """ 31 | line = self.lines[index] 32 | label = line.split()[1] == 'anomaly' # anomaly: 1, normal: 0 33 | folder = os.path.join(self.root, line.split()[0]) 34 | jpg_list = os.listdir(folder) 35 | jpg_list.sort() # must sort to retain the order 36 | 37 | if len(jpg_list) > self.n_frames: # there are enough frames 38 | if self.is_train: 39 | start = np.random.randint(0, len(jpg_list) - self.n_frames) 40 | else: 41 | start = 0 42 | jpg_list = jpg_list[start:start+self.n_frames] 43 | elif len(jpg_list) < self.n_frames: # frames are not enough 44 | jpg_list += [jpg_list[-1]] * (self.n_frames - len(jpg_list)) # repeat the last frame 45 | 46 | assert len(jpg_list) == self.n_frames 47 | 48 | frames = [] 49 | for jpg in jpg_list: 50 | image = Image.open(os.path.join(folder, jpg)) 51 | image = self.transform(image) # torch.Tensor, (C=3, H, W), range [0., 1.] 52 | frames.append(image) 53 | frames = torch.stack(frames, dim=0) # (n_frames, C=3, H, W) 54 | 55 | return frames, label 56 | 57 | def collate_fn(self, batch): 58 | videos = torch.stack([b[0] for b in batch], dim=0) # (batch_size, n_frames, C=3, H, W) 59 | labels = torch.tensor([b[1] for b in batch], dtype=torch.long) # (batch_size,) 60 | 61 | return videos, labels 62 | 63 | 64 | class OpticalFlowDataset(Dataset): 65 | """Dataset class for stacked optical flow. Each optical flow image has two components, i.e. x and y. 
66 | """ 67 | def __init__(self, root, video_list_path, n_frames, transform, is_train): 68 | super(OpticalFlowDataset, self).__init__() 69 | 70 | self.root = root 71 | self.n_frames = n_frames 72 | self.transform = transform 73 | self.is_train = is_train 74 | 75 | with open(video_list_path, 'r') as fp: 76 | self.lines = [line.rstrip() for line in fp.readlines()] 77 | tqdm.write(f'[info] There are {len(self.lines)} videos in {video_list_path}') 78 | 79 | def __len__(self): 80 | return len(self.lines) 81 | 82 | def __getitem__(self, index): 83 | """ 84 | :param index: int 85 | :return frames: torch.FloatTensor, (n_frames * 2, H, W); label: 1 for anomaly, 0 for normal 86 | """ 87 | line = self.lines[index] 88 | label = line.split()[1] == 'anomaly' # anomaly: 1, normal: 0 89 | folder = os.path.join(self.root, line.split()[0]) 90 | jpg_list = os.listdir(folder) 91 | jpg_list.sort() # must sort to retain the order 92 | 93 | if len(jpg_list) > self.n_frames: # there are enough frames 94 | if self.is_train: 95 | start = np.random.randint(0, len(jpg_list) - self.n_frames) 96 | else: 97 | start = 0 98 | jpg_list = jpg_list[start:start+self.n_frames] 99 | elif len(jpg_list) < self.n_frames: # frames are not enough 100 | jpg_list += [jpg_list[-1]] * (self.n_frames - len(jpg_list)) # repeat the last frame 101 | 102 | assert len(jpg_list) == self.n_frames 103 | 104 | frames = [] 105 | for jpg in jpg_list: 106 | image = Image.open(os.path.join(folder, jpg)) # (H, W, 3), channel 0: horizontal, channel 1: vertical 107 | image = self.transform(image) # torch.FloatTensor, (3, H, W), range [0., 1.] 108 | image = image[:-1, :, :] # (2, H, W) 109 | frames.append(image) 110 | frames = torch.cat(frames, dim=0) # (n_frames * 2, H, W) 111 | assert frames.shape[0] == 2 * self.n_frames 112 | 113 | return frames, label 114 | 115 | def collate_fn(self, batch): 116 | videos = torch.stack([b[0] for b in batch], dim=0) # (batch_size, n_frames * 2, H, W) 117 | labels = torch.tensor([b[1] for b in batch], dtype=torch.long) # (batch_size,) 118 | 119 | return videos, labels 120 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_frame/Train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import shutil 4 | from tqdm import tqdm, trange 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import torch.nn.functional as F 9 | from torch.utils.data import DataLoader 10 | from torchvision import transforms 11 | from Data import VideoFrameDataset 12 | from Model import FrameClassifier 13 | from TrainConfig import configs 14 | 15 | 16 | class Trainer(object): 17 | def __init__(self, configs): 18 | self.configs = configs 19 | self.device = torch.device(configs['device']) 20 | 21 | train_transform = transforms.Compose([ 22 | transforms.RandomAffine(degrees=10, translate=[0.1, 0.1], scale=[0.9, 1.1]), 23 | transforms.Resize([configs['image_size'], configs['image_size']]), 24 | transforms.ToTensor(), 25 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 26 | ]) 27 | train_dataset = VideoFrameDataset( 28 | root=configs['data_root'], 29 | video_list_path=configs['train_list'], 30 | transform=train_transform 31 | ) 32 | self.train_loader = DataLoader( 33 | dataset=train_dataset, 34 | batch_size=configs['train_batch_size'], 35 | shuffle=True, 36 | num_workers=configs['n_workers'], 37 | pin_memory=True 38 | ) 39 | 40 | self.criterion = nn.BCELoss() 41 | 42 | 
ckpt = None 43 | if configs['resume']: 44 | ckpt = torch.load(configs['ckpt_path']) 45 | configs['net'] = ckpt['net_configs'] 46 | self.configs = configs 47 | 48 | self.model = FrameClassifier( 49 | #cnn_type=configs['net']['cnn_type'], 50 | fc_sizes=configs['net']['hidden_sizes'], 51 | batchnorms=configs['net']['batchnorms'], 52 | dropouts=configs['net']['dropouts'] 53 | ) 54 | self.model.to(self.device) 55 | self.optimizer = optim.Adam(self.model.parameters(), lr=configs['lr'], 56 | weight_decay=configs['weight_decay']) 57 | 58 | # if configs['pre_train']: 59 | # pre_ckpt = torch.load(configs['pre_train_path']) 60 | # self.model.load_state_dict(pre_ckpt['state_dict'], strict=True) 61 | # print('============= load pre-trained model ==============') 62 | 63 | if configs['resume']: 64 | self.model.load_state_dict(ckpt['state_dict']) 65 | self.optimizer.load_state_dict(ckpt['optimizer']) 66 | if configs['new_lr'] is not None: 67 | for param_group in self.optimizer.param_groups: 68 | param_group['lr'] = configs['new_lr'] 69 | 70 | if configs['apply_val']: 71 | val_transform = transforms.Compose([ 72 | transforms.Resize([configs['image_size'], configs['image_size']]), 73 | transforms.ToTensor(), 74 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 75 | ]) 76 | val_dataset = VideoFrameDataset( 77 | root=configs['data_root'], 78 | video_list_path=configs['val_list'], 79 | transform=val_transform 80 | ) 81 | self.val_loader = DataLoader( 82 | dataset=val_dataset, 83 | batch_size=configs['val_batch_size'], 84 | shuffle=False, 85 | num_workers=configs['n_workers'], 86 | pin_memory=True 87 | ) 88 | self.display_interval = configs['val_display_interval'] 89 | 90 | self.save_dir = os.path.join(configs['save_dir'], time.strftime('%Y%m%d-%H:%M:%S', time.localtime())) 91 | os.makedirs(self.save_dir) 92 | self.log_file = os.path.join(self.save_dir, 'log_train.txt') 93 | self.copyscripts(os.path.join(self.save_dir, 'backup_scripts')) 94 | 95 | self.writelog(self.configs) 96 | self.writelog('=' * 80) 97 | 98 | def copyscripts(self, dest_path): 99 | """ 100 | Save python scripts. 101 | Ignore directories such as '__pycache__' and '.idea'. 
102 | """ 103 | shutil.copytree('.', dest_path, ignore=shutil.ignore_patterns('_*', '.*', self.configs['save_dir'])) 104 | 105 | def writelog(self, results): 106 | if not isinstance(results, str): 107 | results = str(results) 108 | with open(self.log_file, 'a') as fp: 109 | fp.write(results + '\n') 110 | 111 | def savemodel(self, save_name): 112 | torch.save( 113 | { 114 | 'state_dict': self.model.state_dict(), 115 | 'optimizer': self.optimizer.state_dict(), 116 | 'net_configs': self.configs['net'] 117 | }, 118 | save_name 119 | ) 120 | tqdm.write(f'[Info] Trained model has been saved as {save_name}') 121 | 122 | def train(self): 123 | for epoch in trange(self.configs['n_epochs']): 124 | tqdm.write('=' * 20 + f'Epoch {epoch + 1} starts' + '=' * 20) 125 | average_loss, accuracy = self.train_epoch(epoch) 126 | log_str = f'Epoch [{epoch + 1:02d}/{self.configs["n_epochs"]}] Train Loss = {average_loss:.5f} ' \ 127 | f'Train ACC = {accuracy*100:.2f}%' 128 | self.savemodel(os.path.join(self.save_dir, f'm_epoch{epoch + 1:02d}.pt')) 129 | 130 | if self.configs['apply_val']: 131 | with torch.no_grad(): 132 | val_loss, val_acc = self.val_epoch(epoch) 133 | log_str += f' Val Loss = {val_loss:.5f} Val ACC = {val_acc * 100:.2f}%' 134 | 135 | self.writelog(log_str) 136 | tqdm.write(log_str) 137 | 138 | def train_epoch(self, epoch): 139 | self.model.train() 140 | 141 | total_loss = 0. 142 | n_correct = 0 143 | n_samples = 0 144 | for step, (images, labels) in enumerate(tqdm(self.train_loader)): 145 | images = images.to(dtype=torch.float32, device=self.device) # (N, C, H, W) 146 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 147 | 148 | # forward & backward 149 | prob = self.model(images) # (N, 1) after sigmoid 150 | 151 | loss = self.criterion(prob, labels.reshape(labels.shape[0], 1)) 152 | 153 | self.optimizer.zero_grad() 154 | loss.backward() 155 | self.optimizer.step() 156 | 157 | total_loss += loss.item() * labels.shape[0] 158 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 159 | n_samples += labels.shape[0] 160 | 161 | return total_loss / n_samples, n_correct / n_samples 162 | 163 | def val_epoch(self, epoch): 164 | self.model.eval() 165 | 166 | total_loss = 0. 167 | n_correct = 0 168 | n_samples = 0 169 | for step, (images, labels) in enumerate(tqdm(self.val_loader)): 170 | images = images.to(dtype=torch.float32, device=self.device) # (N, C, H, W) 171 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 172 | 173 | # forward 174 | prob = self.model(images) # (N, 1) after sigmoid 175 | loss = self.criterion(prob, labels.reshape(labels.shape[0], 1)) 176 | 177 | total_loss += loss.item() * labels.shape[0] 178 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 179 | n_samples += labels.shape[0] 180 | 181 | # display 182 | if (step + 1) % self.display_interval == 0: 183 | tqdm.write('-' * 40) 184 | tqdm.write(f'[info] Probs: {prob.detach().cpu().numpy()}') 185 | tqdm.write(f'[info] Label: {labels.cpu().numpy()}') 186 | return total_loss / n_samples, n_correct / n_samples 187 | 188 | 189 | if __name__ == '__main__': 190 | trainer = Trainer(configs) 191 | trainer.train() 192 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Traffic Accident Detection via Deep Learning 2 | 3 | This repository contains the code of our IDL course project in Fall 2020. 
4 | 5 | NOTE: this repo is only for the course project and will not be maintained after this semester. 6 | 7 | ## Contributors 8 | 9 | **Yifan Peng** (@pyf98) and **Amine Bellamkaddem** (@amine-bellamkaddem) are the two contributors to this repository. 10 | 11 | **Yifan Peng** is the main contributor of this project. He implemented the vast majority of our models. Specifically, he provided an initial version for `frame_video_level_cnn`, which was further developed by Amine. All the other scripts in this repository were completed by Yifan Peng. 12 | 13 | **Amine Bellamkaddem** developed `frame_video_level_cnn` based on the initial version by Yifan and explored more architectures such as VGG16. He also provided the current visualization plots. Please see the folder `frame_video_level_cnn` for more information. 14 | 15 | Details about the division of work are in our final report. 16 | 17 | ## Introduction 18 | 19 | Detecting anomalous events such as road accidents in natural driving scenes is a challenging task. The majority of previous studies focus on fixed cameras with static backgrounds. In this project, we design **a CRNN-based two-stream method using both RGB frames and optical flow to detect traffic accidents in first-person dash-cam videos**. Our hypotheses are that motion features can improve the detection performance and that CRNN-based approaches are better at modeling temporal relationships than conventional CNN-based approaches. Results show that the motion stream outperforms the spatial-temporal stream, and that the fusion of the two streams can further improve our model's performance. 20 | 21 | ![two-stream](imgs/crnn_twostream.png "CRNN-based two-stream method for traffic accident detection") 22 | 23 | ## Requirements 24 | 25 | Our models are implemented using PyTorch. Required packages are listed in `requirements.txt`. 26 | 27 | ``` 28 | numpy 29 | tqdm 30 | torchvision==0.7.0 31 | torch==1.6.0 32 | Pillow 33 | scikit_learn 34 | ``` 35 | 36 | To install these packages, run 37 | 38 | ``` 39 | pip install -r requirements.txt 40 | ``` 41 | 42 | All models can be trained on a single NVIDIA Tesla T4 GPU using the default configuration. 43 | 44 | ## Dataset 45 | 46 | We employ a recently introduced traffic anomaly dataset called [Detection of Traffic Anomaly](https://github.com/MoonBlvd/Detection-of-Traffic-Anomaly "Detection of Traffic Anomaly Dataset") (DoTA). DoTA contains 4,677 dash-cam videos collected from YouTube channels. These ego-centric driving videos are from different countries and under different weather and lighting conditions. 47 | 48 | **Note that due to issues with YouTube, some videos are no longer available. We have collected 4,085 videos in total.** Most videos in DoTA can be separated into three temporal partitions: precursor, anomaly window, and post-anomaly. We label the first part (i.e. precursor) as *normal* or *non-accident*, and the second part (i.e. anomaly window) as *anomaly* or *accident*, but we do not use the third part. Details of our dataset are shown below. 49 | 50 | Dataset | Training | Validation | Test 51 | :---: | :---: | :---: | :---: 52 | \#video clips | 5,700 | 801 | 1,657 53 | \#frames | 208,649 | 29,997 | 58,778 54 | 55 | Dataset classes are defined in `data.py`. 56 | 57 | ## Models 58 | 59 | Models are defined in `networks.py`. 60 | 61 | ### Spatial-Temporal Stream 62 | 63 | The spatial-temporal stream takes RGB frames as input, which contain appearance information. 
To extract frame-level features from an input video, an ImageNet pre-trained ResNet is applied. To capture high-level (temporal) information, three architectures are employed: a multi-layer perceptron (MLP), a unidirectional Long Short-Term Memory (LSTM), and a bidirectional LSTM (BiLSTM). The MLP doesn't consider temporal dependencies, which leads to degraded performance. 64 | 65 | * ResNet + MLP: The code is in `frame_video_level_cnn`. Models are trained on individual frames and evaluated by frame-level and video-level metrics. 66 | 67 | * ResNet + LSTM: The network configuration is in `conf/lstm_rgb.py`. Note that `configs['net']['rnn_bidir']` should be set to `False` for a unidirectional LSTM. To train a model, run `python train_lstm_rgb.py`. To evaluate a trained model, run `python test_lstm_rgb.py --ckpt path/to/checkpoint`. Please refer to `test_lstm_rgb.py` for other options such as `n_frames` and `batch_size`. 68 | 69 | * ResNet + BiLSTM: Similar to the second model, the network configuration is in `conf/lstm_rgb.py`. Note that `configs['net']['rnn_bidir']` should be set to `True` for a bidirectional LSTM. To train a model, run `python train_lstm_rgb.py`. To evaluate a trained model, run `python test_lstm_rgb.py --ckpt path/to/checkpoint`. Please refer to `test_lstm_rgb.py` for other options such as `n_frames` and `batch_size`. 70 | 71 | After running the test script, predictions will be saved as a `.npy` file in the same folder as the model checkpoint. The saved file can be used in the fusion section. 72 | 73 | ![crnn](imgs/crnn.png "Convolutional Recurrent Neural Network (CRNN)") 74 | 75 | 76 | ### Motion Stream 77 | 78 | The motion stream takes dense optical flow as input, which represents motion features. Our results have demonstrated that motion features are better for accident detection in dash-cam videos with dynamic backgrounds. We utilize a recently proposed deep learning-based algorithm ([RAFT](https://github.com/princeton-vl/RAFT)) to estimate optical flow and save the optical flow maps as jpg images. Note that each optical flow map has only two channels (horizontal and vertical) instead of three, so the last color channel of the image is set to zero. 79 | 80 | Here we compare three architectures: ResNet-based Conv2d, ResNet with LSTM, and ResNet with BiLSTM. Results show that LSTMs have a better capacity for modeling temporal relationships within a video clip, achieving higher AUC and accuracy. 81 | 82 | * ResNet-based Conv2d: Different from the CNN-MLP for 3-channel RGB frames, this model takes a stack of optical flow maps as input at each step, which can be treated as a single multi-channel image. The ResNet is initialized with pre-trained weights, but the first convolutional layer needs additional processing (3-channel -> multi-channel). The configuration file is `conf/cnn_flow.py`, which contains all the hyperparameters for training. To train a model from scratch, run `python train_cnn_flow.py`. To resume training, first set the `configs['resume']` variable in `conf/cnn_flow.py` and then run `python train_cnn_flow.py`. To evaluate a trained model, run `python test_cnn_flow.py --ckpt path/to/checkpoint`. Other options such as `data_root`, `data_list` and `batch_size` can also be changed. 83 | 84 | * ResNet + LSTM: The network configuration is in `conf/lstm_flow.py`. For a unidirectional LSTM, set `configs['net']['rnn_bidir'] = False`. To train a model, run `python train_lstm_flow.py`. To evaluate a trained model using the test set, run `python test_lstm_flow.py --ckpt path/to/checkpoint`. 
85 | 86 | * ResNet + BiLSTM: The network configuration is in `conf/lstm_flow.py`. For a bidirectional LSTM, set `configs['net']['rnn_bidir'] = True`. To train a model, run `python train_lstm_flow.py`. To evaluate a trained model using the test set, run `python test_lstm_flow.py --ckpt path/to/checkpoint`. 87 | 88 | After running the test script, results will be saved as a `.npy` file, which can be used in the fusion section. 89 | 90 | 91 | ### Fusion of Two Streams 92 | 93 | There are many fusion strategies that can be used to merge two predictions. We adopt a simple but effective strategy, namely weighted average fusion after the last activation. The main advantage of this approach is that we don't need to retrain our models and there is only one hyperparameter, i.e. the weight assigned to the RGB-stream prediction. 94 | 95 | ``` 96 | fusion_pred = rgb_weight * rgb_pred + (1. - rgb_weight) * flow_pred 97 | ``` 98 | 99 | To calculate the fusion of the two streams, run this command: 100 | 101 | ``` 102 | python test_combined.py --rgb_file path/to/rgb/results --flow_file path/to/optical/flow/results --rgb_weight your_preferred_weight 103 | ``` 104 | 105 | It will automatically calculate the video-level AUC and accuracy, which will be displayed in the terminal. 106 | 107 | ## Results 108 | 109 | Our evaluation metrics are video-level AUC and accuracy (ACC). Please refer to our final report for the details. 110 | 111 | The following figure visualizes our results. The red region represents the anomaly window. Our model is able to predict high scores for those frames. 112 | 113 | ![vis](imgs/visualization.png "Visualization of our results") 114 | 115 | -------------------------------------------------------------------------------- /train_cnn_flow.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import shutil 4 | from tqdm import tqdm, trange 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import torch.nn.functional as F 9 | from torch.utils.data import DataLoader 10 | from torchvision import transforms 11 | from data import OpticalFlowDataset 12 | from conf.cnn_flow import configs 13 | 14 | 15 | class Trainer(object): 16 | def __init__(self, configs): 17 | self.configs = configs 18 | self.device = torch.device(configs['device']) 19 | 20 | ckpt = None 21 | if configs['resume']: 22 | ckpt = torch.load(configs['ckpt_path']) 23 | configs['net'] = ckpt['net_configs'] 24 | self.configs = configs 25 | 26 | # for pre-trained models 27 | train_transform = transforms.Compose([ 28 | transforms.Resize([configs['image_size'], configs['image_size']]), 29 | transforms.ToTensor(), 30 | ]) 31 | train_dataset = OpticalFlowDataset( 32 | root=configs['data_root'], 33 | video_list_path=configs['train_list'], 34 | n_frames=configs['net']['n_frames'], 35 | transform=train_transform, 36 | is_train=True 37 | ) 38 | self.train_loader = DataLoader( 39 | dataset=train_dataset, 40 | batch_size=configs['train_batch_size'], 41 | shuffle=True, 42 | num_workers=configs['n_workers'], 43 | collate_fn=train_dataset.collate_fn, 44 | pin_memory=True 45 | ) 46 | 47 | assert configs['net']['cnn_type'] in ['resnet50', 'resnet101', 'resnet152'] 48 | if configs['net']['cnn_type'] == 'resnet50': 49 | from networks import resnet50 as ResNet 50 | elif configs['net']['cnn_type'] == 'resnet101': 51 | from networks import resnet101 as ResNet 52 | elif configs['net']['cnn_type'] == 'resnet152': 53 | from networks import resnet152 as ResNet 54 | 55 | self.model = 
ResNet(pretrained=True, channel=configs['net']['n_frames'] * 2) 56 | self.model.to(self.device) 57 | self.criterion = nn.BCEWithLogitsLoss() 58 | self.optimizer = optim.Adam(self.model.parameters(), 59 | lr=configs['lr'], weight_decay=configs['weight_decay']) 60 | 61 | if configs['resume']: 62 | self.model.load_state_dict(ckpt['state_dict']) 63 | self.optimizer.load_state_dict(ckpt['optimizer']) 64 | tqdm.write(f"[info] Loaded checkpoint from {configs['ckpt_path']}") 65 | if configs['new_lr'] is not None: 66 | for param_group in self.optimizer.param_groups: 67 | param_group['lr'] = configs['new_lr'] 68 | tqdm.write(f"[info] Set new lr to {configs['new_lr']}") 69 | 70 | if configs['apply_val']: 71 | val_transform = transforms.Compose([ 72 | transforms.Resize([configs['image_size'], configs['image_size']]), 73 | transforms.ToTensor(), 74 | ]) 75 | val_dataset = OpticalFlowDataset( 76 | root=configs['data_root'], 77 | video_list_path=configs['val_list'], 78 | n_frames=configs['net']['n_frames'], 79 | transform=val_transform, 80 | is_train=False 81 | ) 82 | self.val_loader = DataLoader( 83 | dataset=val_dataset, 84 | batch_size=configs['val_batch_size'], 85 | shuffle=False, 86 | num_workers=configs['n_workers'], 87 | collate_fn=val_dataset.collate_fn, 88 | pin_memory=True 89 | ) 90 | self.display_interval = configs['val_display_interval'] 91 | 92 | self.save_dir = os.path.join(configs['save_dir'], time.strftime('flow-cnn_%Y%m%d-%H%M%S', time.localtime())) 93 | os.makedirs(self.save_dir) 94 | self.log_file = os.path.join(self.save_dir, 'log_train.txt') 95 | self.copyscripts(os.path.join(self.save_dir, 'backup_scripts')) 96 | 97 | self.writelog(self.configs) 98 | self.writelog('=' * 80) 99 | 100 | def copyscripts(self, dest_path): 101 | """ 102 | Save python scripts. 103 | Ignore directories such as '__pycache__' and '.idea'. 104 | """ 105 | shutil.copytree('.', dest_path, ignore=shutil.ignore_patterns('_*', '.*', self.configs['save_dir'])) 106 | 107 | def writelog(self, results): 108 | if not isinstance(results, str): 109 | results = str(results) 110 | with open(self.log_file, 'a') as fp: 111 | fp.write(results + '\n') 112 | 113 | def savemodel(self, save_name): 114 | torch.save( 115 | { 116 | 'state_dict': self.model.state_dict(), 117 | 'optimizer': self.optimizer.state_dict(), 118 | 'net_configs': self.configs['net'] 119 | }, 120 | save_name 121 | ) 122 | tqdm.write(f'[Info] Trained model has been saved as {save_name}') 123 | 124 | def train(self): 125 | for epoch in trange(self.configs['n_epochs']): 126 | tqdm.write('=' * 20 + f'Epoch {epoch + 1} starts' + '=' * 20) 127 | average_loss, accuracy = self.train_epoch(epoch) 128 | log_str = f'Epoch [{epoch + 1:02d}/{self.configs["n_epochs"]}] Train Loss = {average_loss:.5f} ' \ 129 | f'Train ACC = {accuracy*100:.2f}%' 130 | self.savemodel(os.path.join(self.save_dir, f'm_epoch{epoch + 1:02d}.pt')) 131 | 132 | if self.configs['apply_val']: 133 | with torch.no_grad(): 134 | val_loss, val_acc = self.val_epoch(epoch) 135 | log_str += f' Val Loss = {val_loss:.5f} Val ACC = {val_acc * 100:.2f}%' 136 | 137 | self.writelog(log_str) 138 | tqdm.write(log_str) 139 | 140 | def train_epoch(self, epoch): 141 | self.model.train() 142 | 143 | total_loss = 0. 
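# Note: the flow ResNet outputs raw logits here; nn.BCEWithLogitsLoss applies the sigmoid internally, so predictions below are thresholded at logit 0 (equivalent to probability 0.5).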
144 | n_correct = 0 145 | n_samples = 0 146 | for step, (images, labels) in enumerate(tqdm(self.train_loader)): 147 | images = images.to(dtype=torch.float32, device=self.device) # (N, C=20, H, W) 148 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 149 | 150 | # forward & backward 151 | out = self.model(images) # (N, 1), logits before sigmoid 152 | 153 | tqdm.write(str(images.shape)) 154 | tqdm.write(str(labels.shape)) 155 | tqdm.write(str(torch.sigmoid(out[:4, 0]))) 156 | tqdm.write(str(labels[:4])) 157 | 158 | loss = self.criterion(out, labels.unsqueeze(-1)) 159 | 160 | tqdm.write(str(loss.item()) + '\n') 161 | 162 | self.optimizer.zero_grad() 163 | loss.backward() 164 | self.optimizer.step() 165 | 166 | total_loss += loss.item() * labels.shape[0] 167 | n_correct += ((out.detach().squeeze(1) >= 0.) == labels).sum().item() 168 | n_samples += labels.shape[0] 169 | 170 | return total_loss / n_samples, n_correct / n_samples 171 | 172 | def val_epoch(self, epoch): 173 | self.model.eval() 174 | 175 | total_loss = 0. 176 | n_correct = 0 177 | n_samples = 0 178 | for step, (images, labels) in enumerate(tqdm(self.val_loader)): 179 | images = images.to(dtype=torch.float32, device=self.device) # (N, C=20, H, W) 180 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 181 | 182 | # forward 183 | out = self.model(images) # (N, 1), logits before sigmoid 184 | loss = self.criterion(out, labels.unsqueeze(-1)) 185 | 186 | total_loss += loss.item() * labels.shape[0] 187 | n_correct += ((out.detach().squeeze(1) >= 0.) == labels).sum().item() 188 | n_samples += labels.shape[0] 189 | 190 | # display 191 | if (step + 1) % self.display_interval == 0: 192 | try: 193 | print('-' * 40) 194 | print(f'[info] Probs: {torch.sigmoid(out).squeeze(-1).cpu().numpy()}') 195 | print(f'[info] Label: {labels.cpu().numpy()}') 196 | except: 197 | print(f'[warning] Failed to display validation result.') 198 | 199 | return total_loss / n_samples, n_correct / n_samples 200 | 201 | 202 | if __name__ == '__main__': 203 | trainer = Trainer(configs) 204 | trainer.train() 205 | -------------------------------------------------------------------------------- /train_lstm_flow.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convolutional Recurrent Neural Networks (CRNN) for optical flow maps. 3 | Each input map has horizontal and vertical components. 
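The stacked maps of shape (N, T * 2, H, W) are encoded frame by frame with a flow ResNet and classified per time step by a (Bi)LSTM; frame-level probabilities are averaged into a single video-level score.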
4 | """ 5 | 6 | import os 7 | import time 8 | import shutil 9 | from tqdm import tqdm, trange 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | import torch.nn.functional as F 14 | from torch.utils.data import DataLoader 15 | from torchvision import transforms 16 | from data import OpticalFlowDataset 17 | from networks import CRNNOpticalFlow 18 | from conf.lstm_flow import configs 19 | 20 | 21 | class Trainer(object): 22 | def __init__(self, configs): 23 | self.configs = configs 24 | self.device = torch.device(configs['device']) 25 | 26 | ckpt = None 27 | if configs['resume']: 28 | ckpt = torch.load(configs['ckpt_path']) 29 | configs['net'] = ckpt['net_configs'] 30 | self.configs = configs 31 | 32 | train_transform = transforms.Compose([ 33 | transforms.Resize([configs['image_size'], configs['image_size']]), 34 | transforms.ToTensor(), 35 | ]) 36 | train_dataset = OpticalFlowDataset( 37 | root=configs['data_root'], 38 | video_list_path=configs['train_list'], 39 | n_frames=configs['net']['n_frames'], 40 | transform=train_transform, 41 | is_train=True 42 | ) 43 | self.train_loader = DataLoader( 44 | dataset=train_dataset, 45 | batch_size=configs['train_batch_size'], 46 | shuffle=True, 47 | num_workers=configs['n_workers'], 48 | collate_fn=train_dataset.collate_fn, 49 | pin_memory=True 50 | ) 51 | 52 | self.criterion = nn.BCELoss() 53 | 54 | self.model = CRNNOpticalFlow( 55 | cnn_dropout=configs['net']['cnn_dropout'], 56 | cnn_emb_dim=configs['net']['cnn_emb_dim'], 57 | cnn_type=configs['net']['cnn_type'], 58 | rnn_hidden_size=configs['net']['rnn_hidden_size'], 59 | rnn_dropout=configs['net']['rnn_dropout'], 60 | num_rnn_layers=configs['net']['num_rnn_layers'], 61 | rnn_bidir=configs['net']['rnn_bidir'] 62 | ) 63 | self.model.to(self.device) 64 | self.optimizer = optim.Adam(self.model.parameters(), 65 | lr=configs['lr'], weight_decay=configs['weight_decay']) 66 | 67 | if configs['resume']: 68 | self.model.load_state_dict(ckpt['state_dict']) 69 | self.optimizer.load_state_dict(ckpt['optimizer']) 70 | if configs['new_lr'] is not None: 71 | for param_group in self.optimizer.param_groups: 72 | param_group['lr'] = configs['new_lr'] 73 | 74 | if configs['apply_val']: 75 | val_transform = transforms.Compose([ 76 | transforms.Resize([configs['image_size'], configs['image_size']]), 77 | transforms.ToTensor(), 78 | ]) 79 | val_dataset = OpticalFlowDataset( 80 | root=configs['data_root'], 81 | video_list_path=configs['val_list'], 82 | n_frames=configs['net']['n_frames'], 83 | transform=val_transform, 84 | is_train=False 85 | ) 86 | self.val_loader = DataLoader( 87 | dataset=val_dataset, 88 | batch_size=configs['val_batch_size'], 89 | shuffle=False, 90 | num_workers=configs['n_workers'], 91 | collate_fn=val_dataset.collate_fn, 92 | pin_memory=True 93 | ) 94 | self.display_interval = configs['val_display_interval'] 95 | 96 | self.save_dir = os.path.join(configs['save_dir'], time.strftime('flow-crnn_%Y%m%d-%H%M%S', time.localtime())) 97 | os.makedirs(self.save_dir) 98 | self.log_file = os.path.join(self.save_dir, 'log_train.txt') 99 | self.copyscripts(os.path.join(self.save_dir, 'backup_scripts')) 100 | 101 | self.writelog(self.configs) 102 | self.writelog('=' * 80) 103 | print(self.model) 104 | 105 | def copyscripts(self, dest_path): 106 | """ 107 | Save python scripts. 108 | Ignore directories such as '__pycache__' and '.idea'. 
109 | """ 110 | shutil.copytree('.', dest_path, ignore=shutil.ignore_patterns('_*', '.*', self.configs['save_dir'])) 111 | 112 | def writelog(self, results): 113 | if not isinstance(results, str): 114 | results = str(results) 115 | with open(self.log_file, 'a') as fp: 116 | fp.write(results + '\n') 117 | 118 | def savemodel(self, save_name): 119 | torch.save( 120 | { 121 | 'state_dict': self.model.state_dict(), 122 | 'optimizer': self.optimizer.state_dict(), 123 | 'net_configs': self.configs['net'] 124 | }, 125 | save_name 126 | ) 127 | tqdm.write(f'[Info] Trained model has been saved as {save_name}') 128 | 129 | def train(self): 130 | for epoch in trange(self.configs['n_epochs']): 131 | tqdm.write('=' * 20 + f'Epoch {epoch + 1} starts' + '=' * 20) 132 | average_loss, accuracy = self.train_epoch(epoch) 133 | log_str = f'Epoch [{epoch + 1:02d}/{self.configs["n_epochs"]}] Train Loss = {average_loss:.5f} ' \ 134 | f'Train ACC = {accuracy*100:.3f}%' 135 | self.savemodel(os.path.join(self.save_dir, f'm_epoch{epoch + 1:02d}.pt')) 136 | 137 | if self.configs['apply_val']: 138 | with torch.no_grad(): 139 | val_loss, val_acc = self.val_epoch(epoch) 140 | log_str += f' Val Loss = {val_loss:.5f} Val ACC = {val_acc * 100:.3f}%' 141 | 142 | self.writelog(log_str) 143 | tqdm.write(log_str) 144 | 145 | def train_epoch(self, epoch): 146 | self.model.train() 147 | 148 | total_loss = 0. 149 | n_correct = 0 150 | n_samples = 0 151 | for step, (images, labels) in enumerate(tqdm(self.train_loader)): 152 | images = images.to(dtype=torch.float32, device=self.device) # (N, T * 2, H, W) 153 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 154 | 155 | # forward & backward 156 | out = self.model(images) # (N, T), probs after sigmoid 157 | out = torch.mean(out, dim=-1, keepdim=True) # (N, 1) 158 | 159 | tqdm.write(str(images.shape)) 160 | tqdm.write(str(labels.shape)) 161 | tqdm.write(str(out[:4, 0])) 162 | tqdm.write(str(labels[:4])) 163 | 164 | loss = self.criterion(out, labels.unsqueeze(-1)) 165 | 166 | tqdm.write(str(loss.item()) + '\n') 167 | 168 | self.optimizer.zero_grad() 169 | loss.backward() 170 | 171 | grad_norm = nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) 172 | self.optimizer.step() 173 | 174 | total_loss += loss.item() * labels.shape[0] 175 | n_correct += ((out.detach().squeeze(1) >= 0.5) == labels).sum().item() 176 | n_samples += labels.shape[0] 177 | 178 | return total_loss / n_samples, n_correct / n_samples 179 | 180 | def val_epoch(self, epoch): 181 | self.model.eval() 182 | 183 | total_loss = 0. 
184 | n_correct = 0 185 | n_samples = 0 186 | for step, (images, labels) in enumerate(tqdm(self.val_loader)): 187 | images = images.to(dtype=torch.float32, device=self.device) # (N, T * 2, H, W) 188 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 189 | 190 | # forward 191 | out = self.model(images) # (N, T), probs after sigmoid 192 | out = torch.mean(out, dim=-1, keepdim=True) # (N, 1) 193 | 194 | loss = self.criterion(out, labels.unsqueeze(-1)) 195 | 196 | total_loss += loss.item() * labels.shape[0] 197 | n_correct += ((out.detach().squeeze(1) >= 0.5) == labels).sum().item() 198 | n_samples += labels.shape[0] 199 | 200 | # display 201 | if (step + 1) % self.display_interval == 0: 202 | try: 203 | tqdm.write('-' * 40) 204 | tqdm.write(f'[info] Probs: {out.squeeze(-1).cpu().numpy()}') 205 | tqdm.write(f'[info] Label: {labels.cpu().numpy()}') 206 | except: 207 | tqdm.write(f'[warning] Failed to display validation result.') 208 | 209 | return total_loss / n_samples, n_correct / n_samples 210 | 211 | 212 | if __name__ == '__main__': 213 | trainer = Trainer(configs) 214 | trainer.train() 215 | -------------------------------------------------------------------------------- /train_lstm_rgb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import shutil 4 | from tqdm import tqdm, trange 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import torch.nn.functional as F 9 | from torch.utils.data import DataLoader 10 | from torchvision import transforms 11 | from data import RGBFrameDataset 12 | from networks import CRNNClassifier 13 | from conf.lstm_rgb import configs 14 | 15 | 16 | class Trainer(object): 17 | def __init__(self, configs): 18 | self.configs = configs 19 | self.device = torch.device(configs['device']) 20 | 21 | # for pre-trained models 22 | train_transform = transforms.Compose([ 23 | transforms.Resize([configs['image_size'], configs['image_size']]), 24 | transforms.ToTensor(), 25 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 26 | ]) 27 | train_dataset = RGBFrameDataset( 28 | root=configs['data_root'], 29 | video_list_path=configs['train_list'], 30 | n_frames=configs['train_n_frames'], 31 | transform=train_transform, 32 | is_train=True 33 | ) 34 | self.train_loader = DataLoader( 35 | dataset=train_dataset, 36 | batch_size=configs['train_batch_size'], 37 | shuffle=True, 38 | num_workers=configs['n_workers'], 39 | collate_fn=train_dataset.collate_fn, 40 | pin_memory=True 41 | ) 42 | 43 | self.criterion = nn.BCELoss() 44 | 45 | ckpt = None 46 | if configs['resume']: 47 | ckpt = torch.load(configs['ckpt_path']) 48 | configs['net'] = ckpt['net_configs'] 49 | self.configs = configs 50 | 51 | self.model = CRNNClassifier( 52 | cnn_dropout=configs['net']['cnn_dropout'], 53 | cnn_emb_dim=configs['net']['cnn_emb_dim'], 54 | cnn_type=configs['net']['cnn_type'], 55 | cnn_finetune=configs['net']['cnn_finetune'], 56 | rnn_hidden_size=configs['net']['rnn_hidden_size'], 57 | rnn_dropout=configs['net']['rnn_dropout'], 58 | num_rnn_layers=configs['net']['num_rnn_layers'], 59 | rnn_bidir=configs['net']['rnn_bidir'] 60 | ) 61 | self.model.to(self.device) 62 | self.optimizer = optim.Adam(self.model.parameters(), lr=configs['lr'], weight_decay=configs['weight_decay']) 63 | 64 | # if configs['pre_train']: 65 | # pre_ckpt = torch.load(configs['pre_train_path']) 66 | # self.model.load_state_dict(pre_ckpt['state_dict'], strict=True) 67 | # print('============= 
load pre-trained model ==============') 68 | 69 | if configs['resume']: 70 | self.model.load_state_dict(ckpt['state_dict']) 71 | self.optimizer.load_state_dict(ckpt['optimizer']) 72 | if configs['new_lr'] is not None: 73 | for param_group in self.optimizer.param_groups: 74 | param_group['lr'] = configs['new_lr'] 75 | 76 | if configs['apply_val']: 77 | val_transform = transforms.Compose([ 78 | transforms.Resize([configs['image_size'], configs['image_size']]), 79 | transforms.ToTensor(), 80 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 81 | ]) 82 | val_dataset = RGBFrameDataset( 83 | root=configs['data_root'], 84 | video_list_path=configs['val_list'], 85 | n_frames=configs['val_n_frames'], 86 | transform=val_transform, 87 | is_train=False 88 | ) 89 | self.val_loader = DataLoader( 90 | dataset=val_dataset, 91 | batch_size=configs['val_batch_size'], 92 | shuffle=False, 93 | num_workers=configs['n_workers'], 94 | collate_fn=val_dataset.collate_fn, 95 | pin_memory=True 96 | ) 97 | self.display_interval = configs['val_display_interval'] 98 | 99 | self.save_dir = os.path.join(configs['save_dir'], time.strftime('rgb_%Y%m%d-%H%M%S', time.localtime())) 100 | os.makedirs(self.save_dir) 101 | self.log_file = os.path.join(self.save_dir, 'log_train.txt') 102 | self.copyscripts(os.path.join(self.save_dir, 'backup_scripts')) 103 | 104 | self.writelog(self.configs) 105 | self.writelog('=' * 80) 106 | 107 | def copyscripts(self, dest_path): 108 | """ 109 | Save python scripts. 110 | Ignore directories such as '__pycache__' and '.idea'. 111 | """ 112 | shutil.copytree('.', dest_path, ignore=shutil.ignore_patterns('_*', '.*', self.configs['save_dir'])) 113 | 114 | def writelog(self, results): 115 | if not isinstance(results, str): 116 | results = str(results) 117 | with open(self.log_file, 'a') as fp: 118 | fp.write(results + '\n') 119 | 120 | def savemodel(self, save_name): 121 | torch.save( 122 | { 123 | 'state_dict': self.model.state_dict(), 124 | 'optimizer': self.optimizer.state_dict(), 125 | 'net_configs': self.configs['net'] 126 | }, 127 | save_name 128 | ) 129 | tqdm.write(f'[Info] Trained model has been saved as {save_name}') 130 | 131 | def train(self): 132 | for epoch in trange(self.configs['n_epochs']): 133 | tqdm.write('=' * 20 + f'Epoch {epoch + 1} starts' + '=' * 20) 134 | average_loss, accuracy = self.train_epoch(epoch) 135 | log_str = f'Epoch [{epoch + 1:02d}/{self.configs["n_epochs"]}] Train Loss = {average_loss:.5f} ' \ 136 | f'Train ACC = {accuracy*100:.2f}%' 137 | self.savemodel(os.path.join(self.save_dir, f'm_epoch{epoch + 1:02d}.pt')) 138 | 139 | if self.configs['apply_val']: 140 | with torch.no_grad(): 141 | val_loss, val_acc = self.val_epoch(epoch) 142 | log_str += f' Val Loss = {val_loss:.5f} Val ACC = {val_acc * 100:.2f}%' 143 | 144 | self.writelog(log_str) 145 | tqdm.write(log_str) 146 | 147 | def train_epoch(self, epoch): 148 | self.model.train() 149 | 150 | total_loss = 0. 
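# The CRNN returns one probability per frame, shape (N, T); its mean over time is supervised with the single video-level label via BCELoss.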
151 | n_correct = 0 152 | n_samples = 0 153 | for step, (videos, labels) in enumerate(tqdm(self.train_loader)): 154 | videos = videos.to(dtype=torch.float32, device=self.device) # (N, T, C, H, W) 155 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 156 | 157 | # forward & backward 158 | prob = self.model(videos) # (N, T) after sigmoid 159 | 160 | tqdm.write(str(videos.shape)) 161 | tqdm.write(str(labels.shape)) 162 | tqdm.write(str(prob[:2])) 163 | tqdm.write(str(labels[:2])) 164 | 165 | prob = torch.mean(prob, dim=-1, keepdim=True) 166 | loss = self.criterion(prob, labels.unsqueeze(-1)) 167 | 168 | tqdm.write(str(loss.item()) + '\n') 169 | 170 | self.optimizer.zero_grad() 171 | loss.backward() 172 | self.optimizer.step() 173 | 174 | total_loss += loss.item() * labels.shape[0] 175 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 176 | n_samples += labels.shape[0] 177 | 178 | return total_loss / n_samples, n_correct / n_samples 179 | 180 | def val_epoch(self, epoch): 181 | self.model.eval() 182 | 183 | total_loss = 0. 184 | n_correct = 0 185 | n_samples = 0 186 | for step, (videos, labels) in enumerate(tqdm(self.val_loader)): 187 | videos = videos.to(dtype=torch.float32, device=self.device) # (N, T, C, H, W) 188 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 189 | 190 | # forward 191 | prob = self.model(videos) # (N, T) after sigmoid 192 | prob = torch.mean(prob, dim=-1, keepdim=True) # (N, 1) 193 | loss = self.criterion(prob, labels.unsqueeze(-1)) 194 | 195 | total_loss += loss.item() * labels.shape[0] 196 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 197 | n_samples += labels.shape[0] 198 | 199 | # display 200 | if (step + 1) % self.display_interval == 0: 201 | try: 202 | print('-' * 40) 203 | print(f'[info] Probs: {prob.squeeze(-1).cpu().numpy()}') 204 | print(f'[info] Label: {labels.cpu().numpy()}') 205 | except: 206 | print(f'[warning] Failed to display validation result.') 207 | 208 | return total_loss / n_samples, n_correct / n_samples 209 | 210 | 211 | if __name__ == '__main__': 212 | trainer = Trainer(configs) 213 | trainer.train() 214 | -------------------------------------------------------------------------------- /networks.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torchvision.models as models 5 | import torch.utils.model_zoo as model_zoo 6 | 7 | 8 | ####################################### 9 | # CRNN Model for RGB Frames # 10 | ####################################### 11 | 12 | class CNNEncoder(nn.Module): 13 | '''2D CNN feature extractor based on pre-trained models. 
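Each input frame is passed through an ImageNet pre-trained ResNet (frozen unless finetune=True) and projected to an embedding of size embedding_dim.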
14 | ''' 15 | def __init__(self, dropout, embedding_dim, cnn_type, finetune): 16 | super(CNNEncoder, self).__init__() 17 | 18 | self.finetune = finetune # if True, the CNN will also be trained 19 | 20 | assert cnn_type in ['resnet50', 'resnet101', 'resnet152'], f'invalid cnn type: {cnn_type}' 21 | # Note: for the following 3 types of ResNet, the output dim is 2048 22 | if cnn_type == 'resnet50': 23 | cnn = models.resnet50(pretrained=True) 24 | elif cnn_type == 'resnet101': 25 | cnn = models.resnet101(pretrained=True) 26 | else: 27 | cnn = models.resnet152(pretrained=True) 28 | 29 | modules = list(cnn.children())[:-1] # remove the last FC layer 30 | self.cnn = nn.Sequential(*modules) 31 | cnn_out_dim = cnn.fc.in_features # 2048 32 | 33 | self.proj = nn.Sequential( 34 | nn.Dropout(p=dropout), 35 | nn.Linear(cnn_out_dim, embedding_dim) 36 | ) 37 | 38 | def forward(self, x_seq): 39 | ''' 40 | :param x_seq: (N, T, C, H, W) 41 | :return (N, T, emb_dim) 42 | ''' 43 | 44 | feature_seq = [] 45 | for t in range(x_seq.shape[1]): 46 | if self.finetune: 47 | x = self.cnn(x_seq[:, t, :, :, :]) 48 | x = x.reshape(x.shape[0], -1) # (N, cnn_out_dim) 49 | else: 50 | with torch.no_grad(): # pre-trained model is fixed 51 | x = self.cnn(x_seq[:, t, :, :, :]) 52 | x = x.reshape(x.shape[0], -1) # (N, cnn_out_dim) 53 | 54 | x = self.proj(x) # (N, emb_dim) 55 | feature_seq.append(x) 56 | 57 | feature_seq = torch.stack(feature_seq, dim=0) # (T, N, emb_dim) 58 | feature_seq = feature_seq.transpose(0, 1) # (N, T, emb_dim) 59 | 60 | return feature_seq 61 | 62 | 63 | class RNNDecoder(nn.Module): 64 | def __init__(self, input_size, hidden_size, dropout, num_layers, bidirectional): 65 | super(RNNDecoder, self).__init__() 66 | 67 | self.rnn = nn.LSTM( 68 | input_size=input_size, 69 | hidden_size=hidden_size, 70 | num_layers=num_layers, 71 | batch_first=True, 72 | dropout=dropout, 73 | bidirectional=bidirectional 74 | ) 75 | nn.init.xavier_normal_(self.rnn.all_weights[0][0]) 76 | nn.init.xavier_normal_(self.rnn.all_weights[0][1]) 77 | # For bidirectional RNNs 78 | # nn.init.xavier_normal_(self.rnn.all_weights[1][0]) 79 | # nn.init.xavier_normal_(self.rnn.all_weights[1][1]) 80 | 81 | # binary classifier 82 | self.classifier = nn.Sequential( 83 | nn.Dropout(p=dropout), 84 | nn.Linear(hidden_size*2 if bidirectional else hidden_size, 1), 85 | nn.Sigmoid() 86 | ) 87 | 88 | def forward(self, feature_seq): 89 | ''' 90 | :param feature_seq: (N, T, dim) 91 | :return out: (N, T), probability after sigmoid 92 | ''' 93 | 94 | self.rnn.flatten_parameters() # for DataParallel 95 | 96 | out, _ = self.rnn(feature_seq) # (N, T, hidden_size) 97 | out = self.classifier(out).squeeze(-1) # (N, T), after sigmoid 98 | 99 | return out 100 | 101 | 102 | class CRNNClassifier(nn.Module): 103 | def __init__(self, cnn_dropout, cnn_emb_dim, cnn_type, cnn_finetune, 104 | rnn_hidden_size, rnn_dropout, num_rnn_layers, rnn_bidir): 105 | super(CRNNClassifier, self).__init__() 106 | 107 | self.cnn_enc = CNNEncoder(cnn_dropout, cnn_emb_dim, cnn_type, cnn_finetune) 108 | self.rnn_dec = RNNDecoder(cnn_emb_dim, rnn_hidden_size, rnn_dropout, num_rnn_layers, rnn_bidir) 109 | 110 | def forward(self, x_seq): 111 | ''' 112 | :param x_seq: (N, T, C, H, W) 113 | :return: prob of anomaly after sigmoid, (N, T) 114 | ''' 115 | 116 | feature_seq = self.cnn_enc(x_seq) # (N, T, emb_dim) 117 | prob = self.rnn_dec(feature_seq) # (N, T), probability after sigmoid 118 | 119 | return prob 120 | 121 | 122 | ################################## 123 | # CNN for Optical FLow # 124 | 
################################## 125 | """This part is similar to the temporal/motion stream in two-stream methods. 126 | Here, only three types of ResNet are supported. 127 | 128 | Ref: 129 | https://pytorch.org/docs/stable/torchvision/models.html?highlight=resnet 130 | https://github.com/jeffreyyihuang/two-stream-action-recognition 131 | """ 132 | 133 | model_urls = { 134 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 135 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 136 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth' 137 | } 138 | 139 | class Bottleneck(nn.Module): 140 | expansion = 4 141 | 142 | def __init__(self, inplanes, planes, stride=1, downsample=None): 143 | super(Bottleneck, self).__init__() 144 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 145 | self.bn1 = nn.BatchNorm2d(planes) 146 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 147 | self.bn2 = nn.BatchNorm2d(planes) 148 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 149 | self.bn3 = nn.BatchNorm2d(planes * 4) 150 | self.relu = nn.ReLU(inplace=True) 151 | self.downsample = downsample 152 | self.stride = stride 153 | 154 | def forward(self, x): 155 | residual = x 156 | 157 | out = self.conv1(x) 158 | out = self.bn1(out) 159 | out = self.relu(out) 160 | 161 | out = self.conv2(out) 162 | out = self.bn2(out) 163 | out = self.relu(out) 164 | 165 | out = self.conv3(out) 166 | out = self.bn3(out) 167 | 168 | if self.downsample is not None: 169 | residual = self.downsample(x) 170 | 171 | out += residual 172 | out = self.relu(out) 173 | 174 | return out 175 | 176 | 177 | class ResNet(nn.Module): 178 | def __init__(self, block, layers, nb_classes=1, channel=20): 179 | self.inplanes = 64 180 | super(ResNet, self).__init__() 181 | self.conv1_custom = nn.Conv2d(channel, 64, kernel_size=7, stride=2, padding=3, bias=False) 182 | self.bn1 = nn.BatchNorm2d(64) 183 | self.relu = nn.ReLU(inplace=True) 184 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 185 | self.layer1 = self._make_layer(block, 64, layers[0]) 186 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 187 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 188 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 189 | self.avgpool = nn.AvgPool2d(7) 190 | self.fc_custom = nn.Linear(512 * block.expansion, nb_classes) 191 | for m in self.modules(): 192 | if isinstance(m, nn.Conv2d): 193 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 194 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 195 | elif isinstance(m, nn.BatchNorm2d): 196 | m.weight.data.fill_(1) 197 | m.bias.data.zero_() 198 | 199 | def _make_layer(self, block, planes, blocks, stride=1): 200 | downsample = None 201 | if stride != 1 or self.inplanes != planes * block.expansion: 202 | downsample = nn.Sequential( 203 | nn.Conv2d(self.inplanes, planes * block.expansion, 204 | kernel_size=1, stride=stride, bias=False), 205 | nn.BatchNorm2d(planes * block.expansion), 206 | ) 207 | 208 | layers = [block(self.inplanes, planes, stride, downsample)] 209 | self.inplanes = planes * block.expansion 210 | for i in range(1, blocks): 211 | layers.append(block(self.inplanes, planes)) 212 | 213 | return nn.Sequential(*layers) 214 | 215 | def forward(self, x): 216 | x = self.conv1_custom(x) 217 | x = self.bn1(x) 218 | x = self.relu(x) 219 | x = self.maxpool(x) 220 | 221 | x = self.layer1(x) 222 | x = self.layer2(x) 223 | x = self.layer3(x) 224 | x = self.layer4(x) 225 | 226 | x = self.avgpool(x) 227 | x = x.view(x.size(0), -1) 228 | out = self.fc_custom(x) 229 | return out 230 | 231 | def extract_feature_vector(self, x): 232 | """Extract a feature vector from the input image. 233 | Args: 234 | x (torch.Tensor): (N, C, H, W) 235 | Returns: 236 | out (torch.Tensor): (N, 2048) 237 | """ 238 | x = self.conv1_custom(x) 239 | x = self.bn1(x) 240 | x = self.relu(x) 241 | x = self.maxpool(x) 242 | 243 | x = self.layer1(x) 244 | x = self.layer2(x) 245 | x = self.layer3(x) 246 | x = self.layer4(x) 247 | 248 | x = self.avgpool(x) 249 | out = x.view(x.size(0), -1) 250 | return out 251 | 252 | 253 | def resnet50(pretrained=True, channel=20): 254 | model = ResNet(Bottleneck, [3, 4, 6, 3], nb_classes=1, channel=channel) 255 | if pretrained: 256 | pretrain_dict = model_zoo.load_url(model_urls['resnet50']) # modify pretrain code 257 | model_dict = model.state_dict() 258 | model_dict = weight_transform(model_dict, pretrain_dict, channel) 259 | model.load_state_dict(model_dict) 260 | return model 261 | 262 | 263 | def resnet101(pretrained=True, channel=20): 264 | model = ResNet(Bottleneck, [3, 4, 23, 3], nb_classes=1, channel=channel) 265 | if pretrained: 266 | pretrain_dict = model_zoo.load_url(model_urls['resnet101']) # modify pretrain code 267 | model_dict = model.state_dict() 268 | model_dict = weight_transform(model_dict, pretrain_dict, channel) 269 | model.load_state_dict(model_dict) 270 | return model 271 | 272 | 273 | def resnet152(pretrained=True, channel=20): 274 | model = ResNet(Bottleneck, [3, 8, 36, 3], nb_classes=1, channel=channel) 275 | if pretrained: 276 | pretrain_dict = model_zoo.load_url(model_urls['resnet152']) # modify pretrain code 277 | model_dict = model.state_dict() 278 | model_dict = weight_transform(model_dict, pretrain_dict, channel) 279 | model.load_state_dict(model_dict) 280 | return model 281 | 282 | 283 | def cross_modality_pretrain(conv1_weight, channel): 284 | """Transforms the original 3 channel weight to "channel" channels 285 | """ 286 | S=0 287 | for i in range(3): 288 | S += conv1_weight[:, i, :, :] 289 | avg = S / 3. 
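# Cross-modality initialization: the RGB filters of conv1 are averaged over the input channel, and the average is replicated across all `channel` flow channels.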
290 | new_conv1_weight = torch.FloatTensor(64, channel, 7, 7) 291 | for i in range(channel): 292 | new_conv1_weight[:, i, :, :] = avg.data 293 | return new_conv1_weight 294 | 295 | 296 | def weight_transform(model_dict, pretrain_dict, channel): 297 | weight_dict = {k:v for k, v in pretrain_dict.items() if k in model_dict} 298 | w3 = pretrain_dict['conv1.weight'] 299 | if channel == 3: 300 | wt = w3 301 | else: 302 | wt = cross_modality_pretrain(w3, channel) 303 | 304 | weight_dict['conv1_custom.weight'] = wt 305 | model_dict.update(weight_dict) 306 | return model_dict 307 | 308 | 309 | ################################### 310 | # CRNN for Optical Flow # 311 | ################################### 312 | 313 | class CRNNOpticalFlow(nn.Module): 314 | def __init__(self, cnn_dropout, cnn_emb_dim, cnn_type, 315 | rnn_hidden_size, rnn_dropout, num_rnn_layers, rnn_bidir): 316 | super(CRNNOpticalFlow, self).__init__() 317 | 318 | assert cnn_type in ['resnet50', 'resnet101', 'resnet152'] 319 | if cnn_type == 'resnet50': 320 | self.cnn = resnet50(pretrained=True, channel=2) # only 2 channels, i.e. x and y 321 | elif cnn_type == 'resnet101': 322 | self.cnn = resnet101(pretrained=True, channel=2) 323 | elif cnn_type == 'resnet152': 324 | self.cnn = resnet152(pretrained=True, channel=2) 325 | 326 | self.cnn.fc_custom = None # here, we don't need the last linear layer 327 | 328 | self.embed = nn.Sequential( 329 | nn.Dropout(p=cnn_dropout), 330 | nn.Linear(2048, cnn_emb_dim), 331 | nn.Dropout(p=rnn_dropout) 332 | ) 333 | 334 | self.rnn = RNNDecoder(cnn_emb_dim, rnn_hidden_size, rnn_dropout, num_rnn_layers, rnn_bidir) 335 | 336 | def forward(self, x): 337 | ''' 338 | Args: 339 | x (torch.Tensor): (N, T * 2, H, W) 340 | Returns: 341 | prob: probability of anomaly after sigmoid, (N, T) 342 | ''' 343 | 344 | N, _, H, W = x.shape 345 | x = x.reshape(N, -1, 2, H, W).reshape(-1, 2, H, W) 346 | x = self.cnn.extract_feature_vector(x) # (N * T, 2048) 347 | x = x.reshape(N, -1, 2048) 348 | 349 | x = self.embed(x) # (N, T, emb_dim) 350 | 351 | prob = self.rnn(x) # (N, T), probability after sigmoid 352 | return prob 353 | 354 | 355 | if __name__ == '__main__': 356 | model = resnet101(pretrained=True, channel=20) 357 | print(model) 358 | --------------------------------------------------------------------------------
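For reference, the weighted-average fusion and the video-level metrics described in the README can be sketched as follows. This is a minimal illustration, assuming the two `.npy` prediction files hold per-video anomaly probabilities aligned with a ground-truth label array; the file paths, the label array, and `rgb_weight = 0.4` are placeholders, and `test_combined.py` remains the actual entry point.

```
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score

# Per-video anomaly probabilities from the two streams and ground-truth labels
# (1 = anomaly, 0 = normal); all paths below are placeholders.
rgb_pred = np.load('path/to/rgb_results.npy')
flow_pred = np.load('path/to/flow_results.npy')
labels = np.load('path/to/labels.npy')

rgb_weight = 0.4  # the single fusion hyperparameter

# Weighted average after the last activation, as in the README.
fusion_pred = rgb_weight * rgb_pred + (1. - rgb_weight) * flow_pred

print('Video-level AUC:', roc_auc_score(labels, fusion_pred))
print('Video-level ACC:', accuracy_score(labels, (fusion_pred >= 0.5).astype(int)))
```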