├── imgs ├── crnn.png ├── visualization.png └── crnn_twostream.png ├── requirements.txt ├── frame_video_level_cnn ├── resnet101_video │ ├── plot_video1.png │ ├── plot_video2.png │ ├── plot_video3.png │ ├── plot_video4.png │ ├── Data.py │ ├── Testplot.py │ └── Test.py ├── README.md ├── resnet101_frame │ ├── TrainConfig.py │ ├── Data.py │ ├── Model.py │ ├── Test.py │ └── Train.py ├── vgg16bn_frame │ ├── TrainConfig.py │ ├── Data.py │ ├── Model.py │ ├── Test.py │ └── Train.py └── data_prepare.py ├── conf ├── cnn_flow.py ├── lstm_flow.py └── lstm_rgb.py ├── test_combined.py ├── test_cnn_flow.py ├── test_lstm_flow.py ├── test_lstm_rgb.py ├── data.py ├── README.md ├── train_cnn_flow.py ├── train_lstm_flow.py ├── train_lstm_rgb.py └── networks.py /imgs/crnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/imgs/crnn.png -------------------------------------------------------------------------------- /imgs/visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/imgs/visualization.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | tqdm 3 | torchvision==0.7.0 4 | torch==1.6.0 5 | Pillow 6 | scikit_learn 7 | -------------------------------------------------------------------------------- /imgs/crnn_twostream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/imgs/crnn_twostream.png -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/plot_video1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/frame_video_level_cnn/resnet101_video/plot_video1.png -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/plot_video2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/frame_video_level_cnn/resnet101_video/plot_video2.png -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/plot_video3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/frame_video_level_cnn/resnet101_video/plot_video3.png -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/plot_video4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyf98/Traffic-Accident-Detection/HEAD/frame_video_level_cnn/resnet101_video/plot_video4.png -------------------------------------------------------------------------------- /frame_video_level_cnn/README.md: -------------------------------------------------------------------------------- 1 | # Traffic Accident Detection via Deep Learning 2 | 3 | 4 | ## ResNet101, VGG16_BN 5 | 6 | Code in folder `resnet101_frame`: A fixed ResNet101 Conv2d CNN + 2 MLP layers 7 | 8 | Code in folder `vgg16bn_frame`: 
A fixed VGG16_bn Conv2d CNN + 2 MLP layers 9 | 10 | To run the training code: 11 | ``` 12 | python Train.py 13 | ``` 14 | 15 | To run the test code: 16 | ``` 17 | python Test.py 18 | ``` 19 | 20 | ## Video-level ResNet101 21 | 22 | Code in folder `resnet101_video` 23 | 24 | ### Video-level Prediction 25 | No training is needed; just run the test script: 26 | ``` 27 | python Test.py 28 | ``` 29 | 30 | ### Visualization 31 | This program generates and saves video-level plots of 4 video clips. Selected videos should be saved in folders 1, 2, 3, and 4 before running this script. 32 | 33 | ``` 34 | python Testplot.py 35 | ``` -------------------------------------------------------------------------------- /conf/cnn_flow.py: -------------------------------------------------------------------------------- 1 | configs = dict() 2 | configs['net'] = dict() 3 | 4 | # whether to resume 5 | configs['resume'] = False 6 | configs['ckpt_path'] = 'exp/xx/m_epochxx.pt' 7 | configs['new_lr'] = 1e-4 # if resume is True and new_lr is not None, then use this new learning rate 8 | 9 | # for network 10 | configs['net']['cnn_type'] = 'resnet101' # 'resnet50', 'resnet101', 'resnet152' 11 | configs['net']['n_frames'] = 10 # number of frames 12 | 13 | # for training and validation 14 | configs['lr'] = 1e-3 15 | configs['weight_decay'] = 1e-6 16 | configs['n_epochs'] = 100 17 | 18 | configs['save_dir'] = 'exp' 19 | configs['device'] = 'cuda' 20 | configs['n_workers'] = 4 21 | configs['image_size'] = 224 # for resnet 22 | configs['train_batch_size'] = 64 23 | configs['data_root'] = '/home/ubuntu/data_flow' # path to folders of optical flow images 24 | configs['train_list'] = 'dataset/train.txt' 25 | configs['apply_val'] = True 26 | configs['val_batch_size'] = 32 27 | configs['val_list'] = 'dataset/val.txt' 28 | configs['val_display_interval'] = 5 29 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_frame/TrainConfig.py: -------------------------------------------------------------------------------- 1 | configs = dict() 2 | configs['net'] = dict() 3 | 4 | # whether to resume 5 | configs['resume'] = False 6 | configs['ckpt_path'] = 'exp/xxxx/m_epochxx.pt' 7 | configs['new_lr'] = 1e-3 # if resume is True and new_lr is not None, then use this new learning rate 8 | 9 | # for network 10 | configs['net']['cnn_type'] = 'resnet101' # 'alexnet' 11 | configs['net']['hidden_sizes'] = [256, 128] 12 | configs['net']['batchnorms'] = [True, True, True] 13 | configs['net']['dropouts'] = [0.3, 0.2, 0.1] # the first is dropout for input 14 | 15 | # for training and validation 16 | configs['lr'] = 1e-2 17 | configs['weight_decay'] = 1e-6 18 | configs['n_epochs'] = 100 19 | 20 | configs['save_dir'] = 'exp' 21 | configs['device'] = 'cuda' 22 | configs['n_workers'] = 4 23 | configs['train_batch_size'] = 64 #1024 24 | configs['image_size'] = 224 # for resnet 25 | configs['data_root'] = '/home/ubuntu/project/data' 26 | configs['train_list'] = 'trainlist_reduced.txt' 27 | configs['apply_val'] = True 28 | configs['val_batch_size'] = 128 29 | configs['val_list'] = 'val.txt' 30 | configs['val_display_interval'] = 5 31 | -------------------------------------------------------------------------------- /frame_video_level_cnn/vgg16bn_frame/TrainConfig.py: -------------------------------------------------------------------------------- 1 | configs = dict() 2 | configs['net'] = dict() 3 | 4 | # whether to resume 5 | configs['resume'] = False 6 | configs['ckpt_path'] = 'exp/xxxx/m_epochxx.pt' 7 |
configs['new_lr'] = 1e-3 # if resume is True and new_lr is not None, then use this new learning rate 8 | 9 | # for network 10 | configs['net']['cnn_type'] = 'vgg16_bn' # 'alexnet' 11 | configs['net']['hidden_sizes'] = [256, 128] 12 | configs['net']['batchnorms'] = [True, True, True] 13 | configs['net']['dropouts'] = [0.3, 0.2, 0.1] # the first is dropout for input 14 | 15 | # for training and validation 16 | configs['lr'] = 1e-2 17 | configs['weight_decay'] = 1e-6 18 | configs['n_epochs'] = 100 19 | 20 | configs['save_dir'] = 'exp' 21 | configs['device'] = 'cuda' 22 | configs['n_workers'] = 4 23 | configs['train_batch_size'] = 64 #1024 24 | configs['image_size'] = 224 # for resnet 25 | configs['data_root'] = '/home/ubuntu/project/data' 26 | configs['train_list'] = 'trainlist_reduced.txt' 27 | configs['apply_val'] = True 28 | configs['val_batch_size'] = 128 29 | configs['val_list'] = 'val.txt' 30 | configs['val_display_interval'] = 5 31 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_frame/Data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import torch 4 | import numpy as np 5 | from PIL import Image 6 | from tqdm import tqdm 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class VideoFrameDataset(Dataset): 11 | def __init__(self, root, video_list_path, transform): 12 | self.transform = transform 13 | 14 | with open(video_list_path, 'r') as fp: 15 | valid_videos = [line.rstrip().split()[0].split('/')[-1] for line in fp.readlines()] 16 | 17 | all_jpgs = sorted(glob.glob(root.rstrip('/') + '/*/*.jpg')) 18 | valid_jpgs = [] 19 | for name in all_jpgs: 20 | if name.split('/')[-2] in valid_videos: 21 | valid_jpgs.append(name) 22 | 23 | self.images = valid_jpgs 24 | self.labels = [int(n.split('/')[-2].split('-')[-1] == '1') for n in valid_jpgs] 25 | 26 | tqdm.write(f'There are {len(self.images)} images, {sum(self.labels)} are anomaly.') 27 | 28 | def __len__(self): 29 | return len(self.images) 30 | 31 | def __getitem__(self, index): 32 | img = self.transform(Image.open(self.images[index])) 33 | return img, self.labels[index] 34 | -------------------------------------------------------------------------------- /frame_video_level_cnn/vgg16bn_frame/Data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import torch 4 | import numpy as np 5 | from PIL import Image 6 | from tqdm import tqdm 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class VideoFrameDataset(Dataset): 11 | def __init__(self, root, video_list_path, transform): 12 | self.transform = transform 13 | 14 | with open(video_list_path, 'r') as fp: 15 | valid_videos = [line.rstrip().split()[0].split('/')[-1] for line in fp.readlines()] 16 | 17 | all_jpgs = sorted(glob.glob(root.rstrip('/') + '/*/*.jpg')) 18 | valid_jpgs = [] 19 | for name in all_jpgs: 20 | if name.split('/')[-2] in valid_videos: 21 | valid_jpgs.append(name) 22 | 23 | self.images = valid_jpgs 24 | self.labels = [int(n.split('/')[-2].split('-')[-1] == '1') for n in valid_jpgs] 25 | 26 | tqdm.write(f'There are {len(self.images)} images, {sum(self.labels)} are anomaly.') 27 | 28 | def __len__(self): 29 | return len(self.images) 30 | 31 | def __getitem__(self, index): 32 | img = self.transform(Image.open(self.images[index])) 33 | return img, self.labels[index] 34 | -------------------------------------------------------------------------------- /conf/lstm_flow.py: 
-------------------------------------------------------------------------------- 1 | configs = dict() 2 | configs['net'] = dict() 3 | 4 | # whether to resume 5 | configs['resume'] = False 6 | configs['ckpt_path'] = 'exp/xx/m_epochxx.pt' 7 | configs['new_lr'] = 1e-5 # if resume is True and new_lr is not None, then use this new learning rate 8 | 9 | # for network 10 | configs['net']['n_frames'] = 10 11 | configs['net']['cnn_type'] = 'resnet101' # 'resnet50', 'resnet101', 'resnet152' 12 | configs['net']['cnn_emb_dim'] = 256 13 | configs['net']['cnn_dropout'] = 0.5 14 | configs['net']['num_rnn_layers'] = 1 15 | configs['net']['rnn_bidir'] = False # bidirectional or unidirectional 16 | configs['net']['rnn_hidden_size'] = 256 17 | configs['net']['rnn_dropout'] = 0.3 18 | 19 | # for training and validation 20 | configs['lr'] = 1e-4 21 | configs['weight_decay'] = 1e-6 22 | configs['n_epochs'] = 100 23 | 24 | configs['save_dir'] = 'exp' 25 | configs['device'] = 'cuda' 26 | configs['n_workers'] = 4 27 | configs['image_size'] = 224 # for resnet 28 | configs['data_root'] = '/home/ubuntu/data_flow' # path to folders of optical flows 29 | configs['train_batch_size'] = 8 30 | configs['train_list'] = 'dataset/train.txt' 31 | configs['apply_val'] = True 32 | configs['val_batch_size'] = 8 33 | configs['val_list'] = 'dataset/val.txt' 34 | configs['val_display_interval'] = 5 35 | -------------------------------------------------------------------------------- /conf/lstm_rgb.py: -------------------------------------------------------------------------------- 1 | configs = dict() 2 | configs['net'] = dict() 3 | 4 | # whether to resume 5 | configs['resume'] = False 6 | configs['ckpt_path'] = 'exp/xx/m_epochxx.pt' 7 | configs['new_lr'] = 2e-5 # if resume is True and new_lr is not None, then use this new learning rate 8 | 9 | # for network 10 | configs['net']['cnn_type'] = 'resnet101' # 'resnet50', 'resnet101', 'resnet152' 11 | configs['net']['cnn_emb_dim'] = 512 12 | configs['net']['cnn_dropout'] = 0.5 13 | configs['net']['cnn_finetune'] = False # whether to fine-tune the pre-trained CNN 14 | configs['net']['num_rnn_layers'] = 1 15 | configs['net']['rnn_hidden_size'] = 256 16 | configs['net']['rnn_dropout'] = 0.3 17 | configs['net']['rnn_bidir'] = True # bidirectional or unidirectional 18 | 19 | # for training and validation 20 | configs['lr'] = 2e-4 21 | configs['weight_decay'] = 1e-6 22 | configs['n_epochs'] = 100 23 | 24 | configs['save_dir'] = 'exp' 25 | configs['device'] = 'cuda' 26 | configs['n_workers'] = 4 27 | configs['train_batch_size'] = 64 28 | configs['train_n_frames'] = 15 # max number of frames per video during training 29 | configs['image_size'] = 224 # for resnet 30 | configs['data_root'] = '/home/ubuntu/data' # path to folders of rgb frames 31 | configs['train_list'] = 'dataset/train.txt' 32 | configs['apply_val'] = True 33 | configs['val_batch_size'] = 32 34 | configs['val_n_frames'] = 30 35 | configs['val_list'] = 'dataset/val.txt' 36 | configs['val_display_interval'] = 5 37 | -------------------------------------------------------------------------------- /frame_video_level_cnn/vgg16bn_frame/Model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.models as models 4 | 5 | 6 | class FrameClassifier(nn.Module): 7 | '''2D CNN feature extractor based on pre-trained models. 
8 | ''' 9 | def __init__(self, fc_sizes, batchnorms, dropouts): 10 | super(FrameClassifier, self).__init__() 11 | 12 | cnn = models.vgg16_bn(pretrained=True) 13 | 14 | modules = list(cnn.children())[:-1] # remove the last FC layer 15 | self.cnn = nn.Sequential(*modules) 16 | in_features = cnn.classifier[0].in_features # 4096 17 | 18 | fc_layers = [nn.Dropout(dropouts[0])] # input dropout 19 | dropouts = dropouts[1:] 20 | for hidden_size, batchnorm, drop_p in zip(fc_sizes, batchnorms, dropouts): 21 | fc_layers.append(nn.Linear(in_features, hidden_size)) 22 | in_features = hidden_size 23 | if batchnorm: 24 | fc_layers.append(nn.BatchNorm1d(hidden_size)) 25 | fc_layers.append(nn.LeakyReLU(inplace=True)) 26 | fc_layers.append(nn.Dropout(p=drop_p)) 27 | fc_layers.append(nn.Linear(in_features, 1)) # binary classification 28 | fc_layers.append(nn.Sigmoid()) 29 | 30 | self.classifier = nn.Sequential(*fc_layers) 31 | 32 | def forward(self, x): 33 | with torch.no_grad(): 34 | out = self.cnn(x) 35 | out = out.reshape(out.shape[0], -1) 36 | 37 | prob = self.classifier(out) # (N, 1) 38 | return prob 39 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_frame/Model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.models as models 4 | 5 | 6 | class FrameClassifier(nn.Module): 7 | '''2D CNN feature extractor based on pre-trained models. 8 | ''' 9 | def __init__(self, fc_sizes, batchnorms, dropouts): 10 | super(FrameClassifier, self).__init__() 11 | 12 | cnn = models.resnet101(pretrained=True) 13 | 14 | modules = list(cnn.children())[:-1] # remove the last FC layer 15 | self.cnn = nn.Sequential(*modules) 16 | in_features = cnn.fc.in_features # 2048 17 | 18 | fc_layers = [nn.Dropout(dropouts[0])] # input dropout 19 | dropouts = dropouts[1:] 20 | for hidden_size, batchnorm, drop_p in zip(fc_sizes, batchnorms, dropouts): 21 | fc_layers.append(nn.Linear(in_features, hidden_size)) 22 | in_features = hidden_size 23 | if batchnorm: 24 | fc_layers.append(nn.BatchNorm1d(hidden_size)) 25 | fc_layers.append(nn.LeakyReLU(inplace=True)) 26 | fc_layers.append(nn.Dropout(p=drop_p)) 27 | fc_layers.append(nn.Linear(in_features, 1)) # binary classification 28 | fc_layers.append(nn.Sigmoid()) 29 | 30 | self.classifier = nn.Sequential(*fc_layers) 31 | 32 | def forward(self, x): 33 | with torch.no_grad(): 34 | out = self.cnn(x) 35 | out = out.reshape(out.shape[0], -1) # (N, 2048) 36 | 37 | prob = self.classifier(out) # (N, 1) 38 | return prob 39 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/Data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import torch 4 | import numpy as np 5 | from PIL import Image 6 | from tqdm import tqdm 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class VideoFrameDataset(Dataset): 11 | def __init__(self, root, video_list_path, transform): 12 | self.transform = transform 13 | 14 | with open(video_list_path, 'r') as fp: 15 | valid_videos = [line.rstrip().split()[0].split('/')[-1] for line in fp.readlines()] 16 | 17 | all_jpgs = sorted(glob.glob(root.rstrip('/') + '/*/*.jpg')) 18 | valid_jpgs = [] 19 | for name in all_jpgs: 20 | if name.split('/')[-2] in valid_videos: 21 | valid_jpgs.append(name) 22 | 23 | self.images = valid_jpgs 24 | self.labels = [int(n.split('/')[-2].split('-')[-1] 
== '1') for n in valid_jpgs] 25 | 26 | tqdm.write(f'There are {len(self.images)} images, {sum(self.labels)} are anomaly.') 27 | 28 | def __len__(self): 29 | return len(self.images) 30 | 31 | def __getitem__(self, index): 32 | img = self.transform(Image.open(self.images[index])) 33 | return img, self.labels[index] 34 | 35 | 36 | 37 | def loadImages(root, folder, batch, transform): 38 | valid_jpgs = sorted(glob.glob(root.rstrip('/') + '/' + str(folder) + '/*.jpg')) 39 | labels = [int(n.split('/')[-2].split('-')[-1] == '1') for n in valid_jpgs] 40 | 41 | valid_jpgs = [transform(Image.open(item)) for item in valid_jpgs] 42 | 43 | if len(valid_jpgs) > batch: 44 | valid_jpgs = valid_jpgs[:batch] 45 | elif len(valid_jpgs) < batch: 46 | for i in range(batch - len(valid_jpgs)): 47 | valid_jpgs.append(valid_jpgs[-1]) 48 | 49 | return torch.stack(valid_jpgs, dim=0), torch.tensor(labels) 50 | -------------------------------------------------------------------------------- /test_combined.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | from sklearn.metrics import roc_auc_score 5 | 6 | 7 | def main(args): 8 | """Combine video-level predictions from two streams using weighted average fusion. 9 | Notes: 10 | combined_prob = rgb_weight * rgb_prob + (1. - rgb_weight) * flow_prob 11 | """ 12 | rgb_results = np.load(args.rgb_file) 13 | flow_results = np.load(args.flow_file) 14 | rgb_w = float(args.rgb_weight) 15 | assert 0. <= rgb_w <= 1. 16 | 17 | y_true = [] 18 | y_pred_rgb = [] 19 | y_pred_flow = [] 20 | for (name1, label_str1, target1, prob1), (name2, label_str2, target2, prob2) in zip(rgb_results, flow_results): 21 | assert name1 == name2 22 | assert label_str1 == label_str2 23 | assert target1 == target2 24 | 25 | y_true.append(int(target1)) 26 | y_pred_rgb.append(float(prob1)) 27 | y_pred_flow.append(float(prob2)) 28 | 29 | y_true = np.array(y_true, dtype=np.int) 30 | y_pred_rgb = np.array(y_pred_rgb, dtype=np.float) 31 | y_pred_flow = np.array(y_pred_flow, dtype=np.float) 32 | y_pred = y_pred_rgb * rgb_w + y_pred_flow * (1. 
- rgb_w) 33 | 34 | auc_rgb = roc_auc_score(y_true, y_pred_rgb) 35 | auc_flow = roc_auc_score(y_true, y_pred_flow) 36 | auc = roc_auc_score(y_true, y_pred) 37 | 38 | acc_rgb = ((y_pred_rgb >= 0.5) == y_true).sum() / y_true.shape[0] 39 | acc_flow = ((y_pred_flow >= 0.5) == y_true).sum() / y_true.shape[0] 40 | acc = ((y_pred >= 0.5) == y_true).sum() / y_true.shape[0] 41 | 42 | print(f'=============== AUC ===============') 43 | print(f'== RGB: {auc_rgb:.5f}') 44 | print(f'== Flow: {auc_flow:.5f}') 45 | print(f'== Both: {auc:.5f}') 46 | print(f'============= Accuracy ==============') 47 | print(f'== RGB: {acc_rgb*100:.3f}%') 48 | print(f'== Flow: {acc_flow*100:.3f}%') 49 | print(f'== Both: {acc*100:.3f}%') 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument( 55 | '--rgb_file', 56 | type=str, 57 | help='path to the npy file of RGB frames' 58 | ) 59 | parser.add_argument( 60 | '--flow_file', 61 | type=str, 62 | help='path to the npy file of optical flow images' 63 | ) 64 | parser.add_argument( 65 | '--rgb_weight', 66 | default=0.5, 67 | type=float, 68 | help='weight of RGB predictions (0.0 to 1.0)' 69 | ) 70 | args = parser.parse_args() 71 | print(args) 72 | 73 | main(args) 74 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/Testplot.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from sklearn.metrics import roc_auc_score 9 | from Data import VideoFrameDataset, loadImages 10 | from Model import FrameClassifier 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | def test(ckpt_path, data_root, data_list, batch_size, num_workers, device, display_interval): 15 | ''' 16 | Generates and saves the plots of 4 test videos 17 | ''' 18 | ckpt = torch.load(ckpt_path) 19 | net_configs = ckpt['net_configs'] 20 | print(f'Load ckpt from {ckpt_path}') 21 | 22 | val_transform = transforms.Compose([ 23 | transforms.Resize([224, 224]), 24 | transforms.ToTensor(), 25 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 26 | ]) 27 | 28 | model = FrameClassifier( 29 | #cnn_type=net_configs['cnn_type'], 30 | fc_sizes=net_configs['hidden_sizes'], 31 | batchnorms=net_configs['batchnorms'], 32 | dropouts=net_configs['dropouts'] 33 | ) 34 | model.to(device) 35 | model.load_state_dict(ckpt['state_dict']) 36 | 37 | valid_videos = [1, 2, 3, 4] # folder names 38 | 39 | model.eval() 40 | with torch.no_grad(): 41 | n_correct = 0 42 | n_samples = 0 43 | predicted_scores = [] 44 | target_scores = [] 45 | for i in range(len(valid_videos)): 46 | images, labels = loadImages(data_root, valid_videos[i], batch_size, val_transform) 47 | images = images.to(dtype=torch.float32, device=device) 48 | labels = labels.to(dtype=torch.float32, device=device) 49 | 50 | prob = model(images) 51 | 52 | np_prob = prob.squeeze().detach().cpu().numpy() 53 | np_target = labels.cpu().to(dtype=torch.int).numpy() 54 | 55 | plt.plot(np.unique(np_prob)) 56 | plt.ylabel('Anomaly Probability') 57 | plt.xlabel('Time') 58 | l = plt.axvline(x=int(len(np_prob)/2), linewidth=190, color='#FF5647', alpha=0.4) 59 | plt.grid(True) 60 | plt.savefig('plot' + str(i) + '.png') 61 | plt.clf() 62 | 63 | 64 | if __name__ == '__main__': 65 | test( 66 |
ckpt_path='exp/20201201-09:14:02/m_epoch07.pt', 67 | data_root='/home/ubuntu/project/data/plot', 68 | data_list='test.txt', 69 | batch_size=150, 70 | num_workers=4, 71 | device='cuda', 72 | display_interval=1 73 | ) 74 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_video/Test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from sklearn.metrics import roc_auc_score 9 | from Data import VideoFrameDataset, loadImages 10 | from Model import FrameClassifier 11 | 12 | 13 | def test(ckpt_path, data_root, data_list, batch_size, num_workers, device, display_interval): 14 | ''' 15 | Video-level prediction 16 | ''' 17 | ckpt = torch.load(ckpt_path) 18 | net_configs = ckpt['net_configs'] 19 | print(f'Load ckpt from {ckpt_path}') 20 | 21 | val_transform = transforms.Compose([ 22 | transforms.Resize([224, 224]), 23 | transforms.ToTensor(), 24 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 25 | ]) 26 | 27 | valid_videos = [] 28 | with open(data_list, 'r') as fp: 29 | valid_videos = [line.rstrip().split()[0].split('/')[-1] for line in fp.readlines()] 30 | 31 | model = FrameClassifier( 32 | #cnn_type=net_configs['cnn_type'], 33 | fc_sizes=net_configs['hidden_sizes'], 34 | batchnorms=net_configs['batchnorms'], 35 | dropouts=net_configs['dropouts'] 36 | ) 37 | model.to(device) 38 | model.load_state_dict(ckpt['state_dict']) 39 | 40 | model.eval() 41 | with torch.no_grad(): 42 | n_correct = 0 # count of correctly classified videos 43 | predicted_scores, target_scores = [], [] 44 | for i in range(len(valid_videos)): 45 | images, labels = loadImages(data_root, valid_videos[i], batch_size, val_transform) 46 | images = images.to(dtype=torch.float32, device=device) 47 | labels = labels.to(dtype=torch.float32, device=device) 48 | 49 | prob = model(images) 50 | 51 | np_prob = prob.squeeze().detach().cpu().numpy() 52 | np_target = labels.cpu().to(dtype=torch.int).numpy() 53 | 54 | predicted_avg_score = np.average(np_prob) 55 | target_avg_score = np.average(np_target) 56 | 57 | if (predicted_avg_score >= 0.5 and target_avg_score == 1.0) or (predicted_avg_score <= 0.5 and target_avg_score == 0.0): 58 | n_correct += 1 59 | 60 | predicted_scores.append(predicted_avg_score) 61 | target_scores.append(target_avg_score) 62 | 63 | print('=' * 80) 64 | print(f'[info] Test Acc = {n_correct / len(valid_videos) * 100:.4f}%') 65 | 66 | auc = roc_auc_score(target_scores, predicted_scores) 67 | print(f'[info] Test AUC = {auc:.5f}') 68 | 69 | 70 | if __name__ == '__main__': 71 | test( 72 | ckpt_path='exp/20201201-09:14:02/m_epoch07.pt', 73 | data_root='/home/ubuntu/project/data', 74 | data_list='test.txt', 75 | batch_size=30, 76 | num_workers=4, 77 | device='cuda', 78 | display_interval=1 79 | ) 80 | -------------------------------------------------------------------------------- /frame_video_level_cnn/vgg16bn_frame/Test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from sklearn.metrics import roc_auc_score 9 | from Data import VideoFrameDataset 10 | from Model import FrameClassifier 11 | 12 | 13 | def
test(ckpt_path, data_root, data_list, batch_size, num_workers, device, display_interval): 14 | ckpt = torch.load(ckpt_path) 15 | net_configs = ckpt['net_configs'] 16 | print(f'Load ckpt from {ckpt_path}') 17 | 18 | val_transform = transforms.Compose([ 19 | transforms.Resize([224, 224]), 20 | transforms.ToTensor(), 21 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 22 | ]) 23 | val_dataset = VideoFrameDataset( 24 | root=data_root, 25 | video_list_path=data_list, 26 | transform=val_transform 27 | ) 28 | val_loader = DataLoader( 29 | dataset=val_dataset, 30 | batch_size=batch_size, 31 | shuffle=False, 32 | num_workers=num_workers, 33 | pin_memory=True 34 | ) 35 | 36 | model = FrameClassifier( 37 | #cnn_type=net_configs['cnn_type'], 38 | fc_sizes=net_configs['hidden_sizes'], 39 | batchnorms=net_configs['batchnorms'], 40 | dropouts=net_configs['dropouts'] 41 | ) 42 | model.to(device) 43 | model.load_state_dict(ckpt['state_dict']) 44 | 45 | model.eval() 46 | with torch.no_grad(): 47 | n_correct = 0 48 | n_samples = 0 49 | predicted_scores = [] 50 | target_scores = [] 51 | for step, (images, labels) in enumerate(tqdm(val_loader)): 52 | images = images.to(dtype=torch.float32, device=device) # (N, C, H, W) 53 | labels = labels.to(dtype=torch.float32, device=device) # (N,) 54 | 55 | # forward 56 | prob = model(images) # (N, 1) after sigmoid 57 | 58 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 59 | n_samples += labels.shape[0] 60 | 61 | predicted_scores.append(prob.squeeze().detach().cpu().numpy()) 62 | target_scores.append(labels.cpu().to(dtype=torch.int).numpy()) 63 | 64 | # display 65 | if (step + 1) % display_interval == 0 and step < len(val_loader) - 1: 66 | tqdm.write('-' * 40) 67 | tqdm.write(f'[info] Probs: {prob.squeeze().detach().cpu().numpy()[:10]}') 68 | tqdm.write(f'[info] Label: {labels.cpu().numpy()[:10]}') 69 | 70 | print('=' * 80) 71 | print(f'[info] Test Acc = {n_correct / n_samples * 100:.4f}%') 72 | 73 | predicted_scores = np.concatenate(predicted_scores, axis=0) 74 | target_scores = np.concatenate(target_scores, axis=0) 75 | 76 | auc = roc_auc_score(target_scores, predicted_scores) 77 | print(f'[info] Test AUC = {auc:.5f}') 78 | 79 | 80 | if __name__ == '__main__': 81 | test( 82 | ckpt_path='exp/20201130-16:26:59/m_epoch02.pt', 83 | data_root='/home/ubuntu/project/data', 84 | data_list='test.txt', 85 | batch_size=128, 86 | num_workers=4, 87 | device='cuda', 88 | display_interval=1 89 | ) 90 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_frame/Test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from sklearn.metrics import roc_auc_score 9 | from Data import VideoFrameDataset 10 | from Model import FrameClassifier 11 | 12 | 13 | def test(ckpt_path, data_root, data_list, batch_size, num_workers, device, display_interval): 14 | ckpt = torch.load(ckpt_path) 15 | net_configs = ckpt['net_configs'] 16 | print(f'Load ckpt from {ckpt_path}') 17 | 18 | val_transform = transforms.Compose([ 19 | transforms.Resize([224, 224]), 20 | transforms.ToTensor(), 21 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 22 | ]) 23 | val_dataset = VideoFrameDataset( 24 | root=data_root, 25 | video_list_path=data_list, 
26 | transform=val_transform 27 | ) 28 | val_loader = DataLoader( 29 | dataset=val_dataset, 30 | batch_size=batch_size, 31 | shuffle=False, 32 | num_workers=num_workers, 33 | pin_memory=True 34 | ) 35 | 36 | model = FrameClassifier( 37 | #cnn_type=net_configs['cnn_type'], 38 | fc_sizes=net_configs['hidden_sizes'], 39 | batchnorms=net_configs['batchnorms'], 40 | dropouts=net_configs['dropouts'] 41 | ) 42 | model.to(device) 43 | model.load_state_dict(ckpt['state_dict']) 44 | 45 | model.eval() 46 | with torch.no_grad(): 47 | n_correct = 0 48 | n_samples = 0 49 | predicted_scores = [] 50 | target_scores = [] 51 | for step, (images, labels) in enumerate(tqdm(val_loader)): 52 | images = images.to(dtype=torch.float32, device=device) # (N, C, H, W) 53 | labels = labels.to(dtype=torch.float32, device=device) # (N,) 54 | 55 | # forward 56 | prob = model(images) # (N, 1) after sigmoid 57 | 58 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 59 | n_samples += labels.shape[0] 60 | 61 | predicted_scores.append(prob.squeeze().detach().cpu().numpy()) 62 | target_scores.append(labels.cpu().to(dtype=torch.int).numpy()) 63 | 64 | # display 65 | if (step + 1) % display_interval == 0 and step < len(val_loader) - 1: 66 | tqdm.write('-' * 40) 67 | tqdm.write(f'[info] Probs: {prob.squeeze().detach().cpu().numpy()[:10]}') 68 | tqdm.write(f'[info] Label: {labels.cpu().numpy()[:10]}') 69 | 70 | print('=' * 80) 71 | print(f'[info] Test Acc = {n_correct / n_samples * 100:.4f}%') 72 | 73 | predicted_scores = np.concatenate(predicted_scores, axis=0) 74 | target_scores = np.concatenate(target_scores, axis=0) 75 | 76 | auc = roc_auc_score(target_scores, predicted_scores) 77 | print(f'[info] Test AUC = {auc:.5f}') 78 | 79 | 80 | if __name__ == '__main__': 81 | test( 82 | ckpt_path='exp/20201201-09:14:02/m_epoch02.pt', 83 | data_root='/home/ubuntu/project/data', 84 | data_list='test.txt', 85 | batch_size=128, 86 | num_workers=4, 87 | device='cuda', 88 | display_interval=1 89 | ) 90 | -------------------------------------------------------------------------------- /frame_video_level_cnn/vgg16bn_frame/Train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from sklearn.metrics import roc_auc_score 9 | from Data import VideoFrameDataset 10 | from Model import FrameClassifier 11 | 12 | 13 | def test(ckpt_path, data_root, data_list, batch_size, num_workers, device, display_interval): 14 | ckpt = torch.load(ckpt_path) 15 | net_configs = ckpt['net_configs'] 16 | print(f'Load ckpt from {ckpt_path}') 17 | 18 | val_transform = transforms.Compose([ 19 | transforms.Resize([224, 224]), 20 | transforms.ToTensor(), 21 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 22 | ]) 23 | val_dataset = VideoFrameDataset( 24 | root=data_root, 25 | video_list_path=data_list, 26 | transform=val_transform 27 | ) 28 | val_loader = DataLoader( 29 | dataset=val_dataset, 30 | batch_size=batch_size, 31 | shuffle=False, 32 | num_workers=num_workers, 33 | pin_memory=True 34 | ) 35 | 36 | model = FrameClassifier( 37 | #cnn_type=net_configs['cnn_type'], 38 | fc_sizes=net_configs['hidden_sizes'], 39 | batchnorms=net_configs['batchnorms'], 40 | dropouts=net_configs['dropouts'] 41 | ) 42 | model.to(device) 43 | model.load_state_dict(ckpt['state_dict']) 44 | 45 | model.eval() 46 
| with torch.no_grad(): 47 | n_correct = 0 48 | n_samples = 0 49 | predicted_scores = [] 50 | target_scores = [] 51 | for step, (images, labels) in enumerate(tqdm(val_loader)): 52 | images = images.to(dtype=torch.float32, device=device) # (N, C, H, W) 53 | labels = labels.to(dtype=torch.float32, device=device) # (N,) 54 | 55 | # forward 56 | prob = model(images) # (N, 1) after sigmoid 57 | 58 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 59 | n_samples += labels.shape[0] 60 | 61 | predicted_scores.append(prob.squeeze().detach().cpu().numpy()) 62 | target_scores.append(labels.cpu().to(dtype=torch.int).numpy()) 63 | 64 | # display 65 | if (step + 1) % display_interval == 0 and step < len(val_loader) - 1: 66 | tqdm.write('-' * 40) 67 | tqdm.write(f'[info] Probs: {prob.squeeze().detach().cpu().numpy()[:10]}') 68 | tqdm.write(f'[info] Label: {labels.cpu().numpy()[:10]}') 69 | 70 | print('=' * 80) 71 | print(f'[info] Test Acc = {n_correct / n_samples * 100:.4f}%') 72 | 73 | predicted_scores = np.concatenate(predicted_scores, axis=0) 74 | target_scores = np.concatenate(target_scores, axis=0) 75 | 76 | auc = roc_auc_score(target_scores, predicted_scores) 77 | print(f'[info] Test AUC = {auc:.5f}') 78 | 79 | 80 | if __name__ == '__main__': 81 | test( 82 | ckpt_path='exp/20201130-16:26:59/m_epoch02.pt', 83 | data_root='/home/ubuntu/project/data', 84 | data_list='test.txt', 85 | batch_size=128, 86 | num_workers=4, 87 | device='cuda', 88 | display_interval=1 89 | ) 90 | -------------------------------------------------------------------------------- /frame_video_level_cnn/data_prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | 5 | 6 | if __name__ == "__main__": 7 | input_root = 'frames' 8 | output_root = 'data' 9 | train_json = 'metadata_train.json' 10 | test_json = 'metadata_val.json' 11 | 12 | video_clips = os.listdir(input_root) 13 | 14 | with open(train_json, 'r') as fp: 15 | train_info = json.load(fp) 16 | with open(test_json, 'r') as fp: 17 | test_info = json.load(fp) 18 | 19 | train_txt = [] 20 | test_txt = [] 21 | 22 | invalid = [] 23 | 24 | for vid in video_clips: 25 | if vid in train_info.keys(): 26 | start = train_info[vid]['anomaly_start'] 27 | end = train_info[vid]['anomaly_end'] 28 | n_frames = train_info[vid]['num_frames'] 29 | data_split = 'train' 30 | elif vid in test_info.keys(): 31 | start = test_info[vid]['anomaly_start'] 32 | end = test_info[vid]['anomaly_end'] 33 | n_frames = test_info[vid]['num_frames'] 34 | data_split = 'test' 35 | else: 36 | raise RuntimeError(f'invalid video clip {vid}') 37 | 38 | jpgs = os.listdir(os.path.join(input_root, vid)) 39 | 40 | if len(jpgs) != n_frames: 41 | invalid.append(vid) 42 | print(f'{vid} has {len(jpgs)} frames, which is different from {n_frames}') 43 | else: 44 | jpgs.sort() 45 | 46 | for i in range(start-1): 47 | if not os.path.exists(os.path.join(output_root, f'{vid}-0')): 48 | os.makedirs(os.path.join(output_root, f'{vid}-0')) 49 | shutil.copy(os.path.join(input_root, vid, jpgs[i]), os.path.join(output_root, f'{vid}-0')) 50 | if data_split == 'train': 51 | train_txt.append(f'data/{vid}-0 normal\n') 52 | elif data_split == 'test': 53 | test_txt.append(f'data/{vid}-0 normal\n') 54 | 55 | for i in range(start-1, end): 56 | if not os.path.exists(os.path.join(output_root, f'{vid}-1')): 57 | os.makedirs(os.path.join(output_root, f'{vid}-1')) 58 | shutil.copy(os.path.join(input_root, vid, jpgs[i]), os.path.join(output_root, 
f'{vid}-1')) 59 | if data_split == 'train': 60 | train_txt.append(f'data/{vid}-1 anomaly\n') 61 | elif data_split == 'test': 62 | test_txt.append(f'data/{vid}-1 anomaly\n') 63 | 64 | for i in range(end, len(jpgs)): 65 | if not os.path.exists(os.path.join(output_root, f'{vid}-2')): 66 | os.makedirs(os.path.join(output_root, f'{vid}-2')) 67 | shutil.copy(os.path.join(input_root, vid, jpgs[i]), os.path.join(output_root, f'{vid}-2')) 68 | if data_split == 'train': 69 | train_txt.append(f'data/{vid}-2 normal\n') 70 | elif data_split == 'test': 71 | test_txt.append(f'data/{vid}-2 normal\n') 72 | 73 | with open('train_list.txt', 'w') as fp: 74 | for line in train_txt: 75 | fp.write(line) 76 | print(f'train: {len(train_txt)}') 77 | 78 | with open('test_list.txt', 'w') as fp: 79 | for line in test_txt: 80 | fp.write(line) 81 | print(f'test: {len(test_txt)}') 82 | -------------------------------------------------------------------------------- /test_cnn_flow.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from tqdm import tqdm 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | from torchvision import transforms 8 | from torch.utils.data import DataLoader 9 | from sklearn.metrics import roc_auc_score 10 | from data import OpticalFlowDataset 11 | 12 | 13 | DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 14 | 15 | 16 | def main(args): 17 | """Test the CNN model on optical flow images and save results as a numpy file. 18 | """ 19 | ckpt = torch.load(args.ckpt) 20 | 21 | transform = transforms.Compose([ 22 | transforms.Resize([args.image_size, args.image_size]), 23 | transforms.ToTensor(), 24 | ]) 25 | dataset = OpticalFlowDataset( 26 | root=args.data_root, 27 | video_list_path=args.data_list, 28 | n_frames=ckpt['net_configs']['n_frames'], 29 | transform=transform, 30 | is_train=False 31 | ) 32 | dataloader = DataLoader( 33 | dataset=dataset, 34 | batch_size=args.batch_size, 35 | shuffle=False, 36 | num_workers=args.n_workers, 37 | collate_fn=dataset.collate_fn, 38 | pin_memory=True 39 | ) 40 | 41 | assert ckpt['net_configs']['cnn_type'] in ['resnet50', 'resnet101', 'resnet152'] 42 | if ckpt['net_configs']['cnn_type'] == 'resnet50': 43 | from networks import resnet50 as ResNet 44 | elif ckpt['net_configs']['cnn_type'] == 'resnet101': 45 | from networks import resnet101 as ResNet 46 | elif ckpt['net_configs']['cnn_type'] == 'resnet152': 47 | from networks import resnet152 as ResNet 48 | 49 | model = ResNet(pretrained=True, channel=ckpt['net_configs']['n_frames'] * 2) 50 | model.to(DEVICE) 51 | model.load_state_dict(ckpt['state_dict']) 52 | model.eval() 53 | print(f'[info] Loaded model from {args.ckpt}') 54 | 55 | with torch.no_grad(): 56 | y_true = [] # 1 for anomaly, 0 for normal 57 | y_pred = [] # prob of anomaly 58 | for step, (images, labels) in enumerate(tqdm(dataloader)): 59 | images = images.to(dtype=torch.float32, device=DEVICE) # (N, C=20, H, W) 60 | 61 | # forward 62 | out = model(images) # (N, 1), logits before sigmoid 63 | prob = torch.sigmoid(out.squeeze(-1)) # (N,) 64 | 65 | y_true.append(labels.to(dtype=torch.int32).cpu().numpy()) 66 | y_pred.append(prob.cpu().numpy()) 67 | 68 | y_true = np.concatenate(y_true) 69 | y_pred = np.concatenate(y_pred) 70 | 71 | try: 72 | auc = roc_auc_score(y_true, y_pred) 73 | acc = (y_true == (y_pred >= 0.5)).sum() / y_true.shape[0] 74 | print(f'[info] Video-Level AUC = {auc:.5f}, ACC = {acc*100:.2f}%') 75 | except: 76 | print('[warning] Failed to 
compute AUC and ACC.') 77 | 78 | try: 79 | with open(args.data_list, 'r') as fp: 80 | rows = [ln.rstrip().split() for ln in fp.readlines()] 81 | 82 | out_file = os.path.join(os.path.dirname(args.ckpt), 'results.npy') 83 | results = [] 84 | for (name, label_str), target, prob in zip(rows, y_true, y_pred): 85 | assert (label_str == 'anomaly') == target 86 | results.append([name, label_str, target, prob]) 87 | np.save(out_file, results) 88 | except: 89 | print('[warning] Failed to save output file.') 90 | 91 | 92 | if __name__ == '__main__': 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument( 95 | '--ckpt', 96 | default='', 97 | type=str, 98 | help='path to the model checkpoint' 99 | ) 100 | parser.add_argument( 101 | '--data_root', 102 | default='/home/ubuntu/data_flow', 103 | type=str, 104 | help='root directory of optical flow images' 105 | ) 106 | parser.add_argument( 107 | '--data_list', 108 | default='dataset/test.txt', 109 | type=str, 110 | help='path to the list of test videos' 111 | ) 112 | parser.add_argument( 113 | '--image_size', 114 | default=224, 115 | type=int, 116 | help='height and width of the input image (default 224 for ResNets)' 117 | ) 118 | parser.add_argument( 119 | '--batch_size', 120 | default=32, 121 | type=int, 122 | help='batch size for test' 123 | ) 124 | parser.add_argument( 125 | '--n_workers', 126 | default=4, 127 | type=int, 128 | help='number of workers for dataloader' 129 | ) 130 | args = parser.parse_args() 131 | print(args) 132 | 133 | main(args) 134 | -------------------------------------------------------------------------------- /test_lstm_flow.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from tqdm import tqdm 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | from torchvision import transforms 8 | from torch.utils.data import DataLoader 9 | from sklearn.metrics import roc_auc_score 10 | from data import OpticalFlowDataset 11 | from networks import CRNNOpticalFlow 12 | 13 | 14 | DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 15 | 16 | 17 | def main(args): 18 | """Test the CRNN model on optical flow maps and save results as a numpy file. 
19 | """ 20 | ckpt = torch.load(args.ckpt) 21 | 22 | transform = transforms.Compose([ 23 | transforms.Resize([args.image_size, args.image_size]), 24 | transforms.ToTensor(), 25 | ]) 26 | dataset = OpticalFlowDataset( 27 | root=args.data_root, 28 | video_list_path=args.data_list, 29 | n_frames=args.n_frames if args.n_frames > 0 else ckpt['net_configs']['n_frames'], 30 | transform=transform, 31 | is_train=False 32 | ) 33 | dataloader = DataLoader( 34 | dataset=dataset, 35 | batch_size=args.batch_size, 36 | shuffle=False, 37 | num_workers=args.n_workers, 38 | collate_fn=dataset.collate_fn, 39 | pin_memory=True 40 | ) 41 | 42 | model = CRNNOpticalFlow( 43 | cnn_dropout=ckpt['net_configs']['cnn_dropout'], 44 | cnn_emb_dim=ckpt['net_configs']['cnn_emb_dim'], 45 | cnn_type=ckpt['net_configs']['cnn_type'], 46 | rnn_hidden_size=ckpt['net_configs']['rnn_hidden_size'], 47 | rnn_dropout=ckpt['net_configs']['rnn_dropout'], 48 | num_rnn_layers=ckpt['net_configs']['num_rnn_layers'], 49 | rnn_bidir=ckpt['net_configs']['rnn_bidir'] 50 | ) 51 | model.to(DEVICE) 52 | model.load_state_dict(ckpt['state_dict']) 53 | model.eval() 54 | print(f'[info] Loaded model from {args.ckpt}') 55 | 56 | with torch.no_grad(): 57 | y_true = [] # 1 for anomaly, 0 for normal 58 | y_pred = [] # prob of anomaly 59 | for step, (images, labels) in enumerate(tqdm(dataloader)): 60 | images = images.to(dtype=torch.float32, device=DEVICE) # (N, T * 2, H, W) 61 | 62 | # forward 63 | out = model(images) # (N, T), probs after sigmoid 64 | out = torch.mean(out, dim=-1) # (N,) 65 | 66 | y_true.append(labels.to(dtype=torch.int32).cpu().numpy()) 67 | y_pred.append(out.cpu().numpy()) 68 | 69 | y_true = np.concatenate(y_true) 70 | y_pred = np.concatenate(y_pred) 71 | 72 | try: 73 | auc = roc_auc_score(y_true, y_pred) 74 | acc = (y_true == (y_pred >= 0.5)).sum() / y_true.shape[0] 75 | print(f'[info] Video-Level AUC = {auc:.5f}, ACC = {acc*100:.3f}%') 76 | except: 77 | print('[warning] Failed to compute AUC and ACC.') 78 | 79 | try: 80 | with open(args.data_list, 'r') as fp: 81 | rows = [ln.rstrip().split() for ln in fp.readlines()] 82 | 83 | out_file = os.path.join(os.path.dirname(args.ckpt), 'results.npy') 84 | results = [] 85 | for (name, label_str), target, prob in zip(rows, y_true, y_pred): 86 | assert (label_str == 'anomaly') == target 87 | results.append([name, label_str, target, prob]) 88 | np.save(out_file, results) 89 | except: 90 | print('[warning] Failed to save output file.') 91 | 92 | 93 | if __name__ == '__main__': 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument( 96 | '--ckpt', 97 | default='', 98 | type=str, 99 | help='path to the model checkpoint' 100 | ) 101 | parser.add_argument( 102 | '--data_root', 103 | default='/home/ubuntu/data_flow', 104 | type=str, 105 | help='root directory of optical flow maps' 106 | ) 107 | parser.add_argument( 108 | '--data_list', 109 | default='dataset/test.txt', 110 | type=str, 111 | help='path to the list of test videos' 112 | ) 113 | parser.add_argument( 114 | '--n_frames', 115 | default=0, 116 | type=int, 117 | help='number of frames for each video clip' 118 | ) 119 | parser.add_argument( 120 | '--image_size', 121 | default=224, 122 | type=int, 123 | help='height and width of the input image (default 224 for ResNets)' 124 | ) 125 | parser.add_argument( 126 | '--batch_size', 127 | default=8, 128 | type=int, 129 | help='batch size for test' 130 | ) 131 | parser.add_argument( 132 | '--n_workers', 133 | default=4, 134 | type=int, 135 | help='number of workers for dataloader' 136 | ) 
137 | args = parser.parse_args() 138 | print(args) 139 | 140 | main(args) 141 | -------------------------------------------------------------------------------- /test_lstm_rgb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from tqdm import tqdm 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | from torchvision import transforms 8 | from torch.utils.data import DataLoader 9 | from sklearn.metrics import roc_auc_score 10 | from data import RGBFrameDataset 11 | from networks import CRNNClassifier 12 | 13 | 14 | DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 15 | 16 | 17 | def main(args): 18 | """Test the CRNN model on RGB frames and save results as a numpy file. 19 | """ 20 | transform = transforms.Compose([ 21 | transforms.Resize([args.image_size, args.image_size]), 22 | transforms.ToTensor(), 23 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 24 | ]) 25 | dataset = RGBFrameDataset( 26 | root=args.data_root, 27 | video_list_path=args.data_list, 28 | n_frames=args.n_frames, 29 | transform=transform, 30 | is_train=False 31 | ) 32 | dataloader = DataLoader( 33 | dataset=dataset, 34 | batch_size=args.batch_size, 35 | shuffle=False, 36 | num_workers=args.n_workers, 37 | collate_fn=dataset.collate_fn, 38 | pin_memory=True 39 | ) 40 | 41 | ckpt = torch.load(args.ckpt) 42 | model = CRNNClassifier( 43 | cnn_dropout=ckpt['net_configs']['cnn_dropout'], 44 | cnn_emb_dim=ckpt['net_configs']['cnn_emb_dim'], 45 | cnn_type=ckpt['net_configs']['cnn_type'], 46 | cnn_finetune=ckpt['net_configs']['cnn_finetune'], 47 | rnn_hidden_size=ckpt['net_configs']['rnn_hidden_size'], 48 | rnn_dropout=ckpt['net_configs']['rnn_dropout'], 49 | num_rnn_layers=ckpt['net_configs']['num_rnn_layers'], 50 | rnn_bidir=ckpt['net_configs']['rnn_bidir'] 51 | ) 52 | model.to(DEVICE) 53 | model.load_state_dict(ckpt['state_dict']) 54 | model.eval() 55 | print(f'[info] Loaded model from {args.ckpt}') 56 | 57 | with torch.no_grad(): 58 | y_true = [] # 1 for anomaly, 0 for normal 59 | y_pred = [] # prob of anomaly 60 | for step, (videos, labels) in enumerate(tqdm(dataloader)): 61 | videos = videos.to(dtype=torch.float32, device=DEVICE) # (N, T, C, H, W) 62 | 63 | # forward 64 | prob = model(videos) # (N, T) after sigmoid 65 | prob = torch.mean(prob, dim=-1) # (N,) 66 | 67 | y_true.append(labels.to(dtype=torch.int32).cpu().numpy()) 68 | y_pred.append(prob.cpu().numpy()) 69 | 70 | y_true = np.concatenate(y_true) 71 | y_pred = np.concatenate(y_pred) 72 | 73 | try: 74 | auc = roc_auc_score(y_true, y_pred) 75 | acc = (y_true == (y_pred >= 0.5)).sum() / y_true.shape[0] 76 | print(f'[info] Video-Level AUC = {auc:.5f}, ACC = {acc*100:.3f}%') 77 | except: 78 | print('[warning] Failed to compute AUC and ACC.') 79 | 80 | try: 81 | with open(args.data_list, 'r') as fp: 82 | rows = [ln.rstrip().split() for ln in fp.readlines()] 83 | 84 | out_file = os.path.join(os.path.dirname(args.ckpt), 'results.npy') 85 | results = [] 86 | for (name, label_str), target, prob in zip(rows, y_true, y_pred): 87 | assert (label_str == 'anomaly') == target 88 | results.append([name, label_str, target, prob]) 89 | np.save(out_file, results) 90 | except: 91 | print('[warning] Failed to save output file.') 92 | 93 | 94 | if __name__ == '__main__': 95 | parser = argparse.ArgumentParser() 96 | parser.add_argument( 97 | '--ckpt', 98 | default='', 99 | type=str, 100 | help='path to the model checkpoint' 101 | ) 102 | parser.add_argument( 
103 | '--data_root', 104 | default='/home/ubuntu/data', 105 | type=str, 106 | help='root directory of RGB frames' 107 | ) 108 | parser.add_argument( 109 | '--data_list', 110 | default='dataset/test.txt', 111 | type=str, 112 | help='path to the list of test videos' 113 | ) 114 | parser.add_argument( 115 | '--n_frames', 116 | default=30, 117 | type=int, 118 | help='number of frames for each video clip' 119 | ) 120 | parser.add_argument( 121 | '--image_size', 122 | default=224, 123 | type=int, 124 | help='height and width of the input image (default 224 for ResNets)' 125 | ) 126 | parser.add_argument( 127 | '--batch_size', 128 | default=32, 129 | type=int, 130 | help='batch size for test' 131 | ) 132 | parser.add_argument( 133 | '--n_workers', 134 | default=4, 135 | type=int, 136 | help='number of workers for dataloader' 137 | ) 138 | args = parser.parse_args() 139 | print(args) 140 | 141 | main(args) 142 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | from tqdm import tqdm 4 | import numpy as np 5 | import torch 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class RGBFrameDataset(Dataset): 10 | """Dataset class for 3-channel RGB images. 11 | """ 12 | def __init__(self, root, video_list_path, n_frames, transform, is_train): 13 | super(RGBFrameDataset, self).__init__() 14 | self.root = root 15 | self.n_frames = n_frames 16 | self.transform = transform 17 | self.is_train = is_train 18 | 19 | with open(video_list_path, 'r') as fp: 20 | self.lines = [line.rstrip() for line in fp.readlines()] 21 | tqdm.write(f'[info] There are {len(self.lines)} videos in {video_list_path}') 22 | 23 | def __len__(self): 24 | return len(self.lines) 25 | 26 | def __getitem__(self, index): 27 | """ 28 | :param index: int 29 | :return result: (n_frames, C=3, H, W), label: 1 for anomaly, 0 for normal 30 | """ 31 | line = self.lines[index] 32 | label = line.split()[1] == 'anomaly' # anomaly: 1, normal: 0 33 | folder = os.path.join(self.root, line.split()[0]) 34 | jpg_list = os.listdir(folder) 35 | jpg_list.sort() # must sort to retain the order 36 | 37 | if len(jpg_list) > self.n_frames: # there are enough frames 38 | if self.is_train: 39 | start = np.random.randint(0, len(jpg_list) - self.n_frames) 40 | else: 41 | start = 0 42 | jpg_list = jpg_list[start:start+self.n_frames] 43 | elif len(jpg_list) < self.n_frames: # frames are not enough 44 | jpg_list += [jpg_list[-1]] * (self.n_frames - len(jpg_list)) # repeat the last frame 45 | 46 | assert len(jpg_list) == self.n_frames 47 | 48 | frames = [] 49 | for jpg in jpg_list: 50 | image = Image.open(os.path.join(folder, jpg)) 51 | image = self.transform(image) # torch.Tensor, (C=3, H, W), range [0., 1.] 52 | frames.append(image) 53 | frames = torch.stack(frames, dim=0) # (n_frames, C=3, H, W) 54 | 55 | return frames, label 56 | 57 | def collate_fn(self, batch): 58 | videos = torch.stack([b[0] for b in batch], dim=0) # (batch_size, n_frames, C=3, H, W) 59 | labels = torch.tensor([b[1] for b in batch], dtype=torch.long) # (batch_size,) 60 | 61 | return videos, labels 62 | 63 | 64 | class OpticalFlowDataset(Dataset): 65 | """Dataset class for stacked optical flow. Each optical flow image has two components, i.e. x and y. 
66 | """ 67 | def __init__(self, root, video_list_path, n_frames, transform, is_train): 68 | super(OpticalFlowDataset, self).__init__() 69 | 70 | self.root = root 71 | self.n_frames = n_frames 72 | self.transform = transform 73 | self.is_train = is_train 74 | 75 | with open(video_list_path, 'r') as fp: 76 | self.lines = [line.rstrip() for line in fp.readlines()] 77 | tqdm.write(f'[info] There are {len(self.lines)} videos in {video_list_path}') 78 | 79 | def __len__(self): 80 | return len(self.lines) 81 | 82 | def __getitem__(self, index): 83 | """ 84 | :param index: int 85 | :return frames: torch.FloatTensor, (n_frames * 2, H, W); label: 1 for anomaly, 0 for normal 86 | """ 87 | line = self.lines[index] 88 | label = line.split()[1] == 'anomaly' # anomaly: 1, normal: 0 89 | folder = os.path.join(self.root, line.split()[0]) 90 | jpg_list = os.listdir(folder) 91 | jpg_list.sort() # must sort to retain the order 92 | 93 | if len(jpg_list) > self.n_frames: # there are enough frames 94 | if self.is_train: 95 | start = np.random.randint(0, len(jpg_list) - self.n_frames) 96 | else: 97 | start = 0 98 | jpg_list = jpg_list[start:start+self.n_frames] 99 | elif len(jpg_list) < self.n_frames: # frames are not enough 100 | jpg_list += [jpg_list[-1]] * (self.n_frames - len(jpg_list)) # repeat the last frame 101 | 102 | assert len(jpg_list) == self.n_frames 103 | 104 | frames = [] 105 | for jpg in jpg_list: 106 | image = Image.open(os.path.join(folder, jpg)) # (H, W, 3), channel 0: horizontal, channel 1: vertical 107 | image = self.transform(image) # torch.FloatTensor, (3, H, W), range [0., 1.] 108 | image = image[:-1, :, :] # (2, H, W) 109 | frames.append(image) 110 | frames = torch.cat(frames, dim=0) # (n_frames * 2, H, W) 111 | assert frames.shape[0] == 2 * self.n_frames 112 | 113 | return frames, label 114 | 115 | def collate_fn(self, batch): 116 | videos = torch.stack([b[0] for b in batch], dim=0) # (batch_size, n_frames * 2, H, W) 117 | labels = torch.tensor([b[1] for b in batch], dtype=torch.long) # (batch_size,) 118 | 119 | return videos, labels 120 | -------------------------------------------------------------------------------- /frame_video_level_cnn/resnet101_frame/Train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import shutil 4 | from tqdm import tqdm, trange 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import torch.nn.functional as F 9 | from torch.utils.data import DataLoader 10 | from torchvision import transforms 11 | from Data import VideoFrameDataset 12 | from Model import FrameClassifier 13 | from TrainConfig import configs 14 | 15 | 16 | class Trainer(object): 17 | def __init__(self, configs): 18 | self.configs = configs 19 | self.device = torch.device(configs['device']) 20 | 21 | train_transform = transforms.Compose([ 22 | transforms.RandomAffine(degrees=10, translate=[0.1, 0.1], scale=[0.9, 1.1]), 23 | transforms.Resize([configs['image_size'], configs['image_size']]), 24 | transforms.ToTensor(), 25 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 26 | ]) 27 | train_dataset = VideoFrameDataset( 28 | root=configs['data_root'], 29 | video_list_path=configs['train_list'], 30 | transform=train_transform 31 | ) 32 | self.train_loader = DataLoader( 33 | dataset=train_dataset, 34 | batch_size=configs['train_batch_size'], 35 | shuffle=True, 36 | num_workers=configs['n_workers'], 37 | pin_memory=True 38 | ) 39 | 40 | self.criterion = nn.BCELoss() 41 | 42 | 
ckpt = None 43 | if configs['resume']: 44 | ckpt = torch.load(configs['ckpt_path']) 45 | configs['net'] = ckpt['net_configs'] 46 | self.configs = configs 47 | 48 | self.model = FrameClassifier( 49 | #cnn_type=configs['net']['cnn_type'], 50 | fc_sizes=configs['net']['hidden_sizes'], 51 | batchnorms=configs['net']['batchnorms'], 52 | dropouts=configs['net']['dropouts'] 53 | ) 54 | self.model.to(self.device) 55 | self.optimizer = optim.Adam(self.model.parameters(), lr=configs['lr'], 56 | weight_decay=configs['weight_decay']) 57 | 58 | # if configs['pre_train']: 59 | # pre_ckpt = torch.load(configs['pre_train_path']) 60 | # self.model.load_state_dict(pre_ckpt['state_dict'], strict=True) 61 | # print('============= load pre-trained model ==============') 62 | 63 | if configs['resume']: 64 | self.model.load_state_dict(ckpt['state_dict']) 65 | self.optimizer.load_state_dict(ckpt['optimizer']) 66 | if configs['new_lr'] is not None: 67 | for param_group in self.optimizer.param_groups: 68 | param_group['lr'] = configs['new_lr'] 69 | 70 | if configs['apply_val']: 71 | val_transform = transforms.Compose([ 72 | transforms.Resize([configs['image_size'], configs['image_size']]), 73 | transforms.ToTensor(), 74 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 75 | ]) 76 | val_dataset = VideoFrameDataset( 77 | root=configs['data_root'], 78 | video_list_path=configs['val_list'], 79 | transform=val_transform 80 | ) 81 | self.val_loader = DataLoader( 82 | dataset=val_dataset, 83 | batch_size=configs['val_batch_size'], 84 | shuffle=False, 85 | num_workers=configs['n_workers'], 86 | pin_memory=True 87 | ) 88 | self.display_interval = configs['val_display_interval'] 89 | 90 | self.save_dir = os.path.join(configs['save_dir'], time.strftime('%Y%m%d-%H:%M:%S', time.localtime())) 91 | os.makedirs(self.save_dir) 92 | self.log_file = os.path.join(self.save_dir, 'log_train.txt') 93 | self.copyscripts(os.path.join(self.save_dir, 'backup_scripts')) 94 | 95 | self.writelog(self.configs) 96 | self.writelog('=' * 80) 97 | 98 | def copyscripts(self, dest_path): 99 | """ 100 | Save python scripts. 101 | Ignore directories such as '__pycache__' and '.idea'. 
102 | """ 103 | shutil.copytree('.', dest_path, ignore=shutil.ignore_patterns('_*', '.*', self.configs['save_dir'])) 104 | 105 | def writelog(self, results): 106 | if not isinstance(results, str): 107 | results = str(results) 108 | with open(self.log_file, 'a') as fp: 109 | fp.write(results + '\n') 110 | 111 | def savemodel(self, save_name): 112 | torch.save( 113 | { 114 | 'state_dict': self.model.state_dict(), 115 | 'optimizer': self.optimizer.state_dict(), 116 | 'net_configs': self.configs['net'] 117 | }, 118 | save_name 119 | ) 120 | tqdm.write(f'[Info] Trained model has been saved as {save_name}') 121 | 122 | def train(self): 123 | for epoch in trange(self.configs['n_epochs']): 124 | tqdm.write('=' * 20 + f'Epoch {epoch + 1} starts' + '=' * 20) 125 | average_loss, accuracy = self.train_epoch(epoch) 126 | log_str = f'Epoch [{epoch + 1:02d}/{self.configs["n_epochs"]}] Train Loss = {average_loss:.5f} ' \ 127 | f'Train ACC = {accuracy*100:.2f}%' 128 | self.savemodel(os.path.join(self.save_dir, f'm_epoch{epoch + 1:02d}.pt')) 129 | 130 | if self.configs['apply_val']: 131 | with torch.no_grad(): 132 | val_loss, val_acc = self.val_epoch(epoch) 133 | log_str += f' Val Loss = {val_loss:.5f} Val ACC = {val_acc * 100:.2f}%' 134 | 135 | self.writelog(log_str) 136 | tqdm.write(log_str) 137 | 138 | def train_epoch(self, epoch): 139 | self.model.train() 140 | 141 | total_loss = 0. 142 | n_correct = 0 143 | n_samples = 0 144 | for step, (images, labels) in enumerate(tqdm(self.train_loader)): 145 | images = images.to(dtype=torch.float32, device=self.device) # (N, C, H, W) 146 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 147 | 148 | # forward & backward 149 | prob = self.model(images) # (N, 1) after sigmoid 150 | 151 | loss = self.criterion(prob, labels.reshape(labels.shape[0], 1)) 152 | 153 | self.optimizer.zero_grad() 154 | loss.backward() 155 | self.optimizer.step() 156 | 157 | total_loss += loss.item() * labels.shape[0] 158 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 159 | n_samples += labels.shape[0] 160 | 161 | return total_loss / n_samples, n_correct / n_samples 162 | 163 | def val_epoch(self, epoch): 164 | self.model.eval() 165 | 166 | total_loss = 0. 167 | n_correct = 0 168 | n_samples = 0 169 | for step, (images, labels) in enumerate(tqdm(self.val_loader)): 170 | images = images.to(dtype=torch.float32, device=self.device) # (N, C, H, W) 171 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 172 | 173 | # forward 174 | prob = self.model(images) # (N, 1) after sigmoid 175 | loss = self.criterion(prob, labels.reshape(labels.shape[0], 1)) 176 | 177 | total_loss += loss.item() * labels.shape[0] 178 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 179 | n_samples += labels.shape[0] 180 | 181 | # display 182 | if (step + 1) % self.display_interval == 0: 183 | tqdm.write('-' * 40) 184 | tqdm.write(f'[info] Probs: {prob.detach().cpu().numpy()}') 185 | tqdm.write(f'[info] Label: {labels.cpu().numpy()}') 186 | return total_loss / n_samples, n_correct / n_samples 187 | 188 | 189 | if __name__ == '__main__': 190 | trainer = Trainer(configs) 191 | trainer.train() 192 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Traffic Accident Detection via Deep Learning 2 | 3 | This repository contains the code of our IDL course project in Fall 2020. 
4 | 5 | NOTE: this repo is only for the course project and will not be maintained after this semester. 6 | 7 | ## Contributors 8 | 9 | **Yifan Peng** (@pyf98) and **Amine Bellamkaddem** (@amine-bellamkaddem) are the two contributors to this repository. 10 | 11 | **Yifan Peng** is the main contributor of this project. He implemented the vast majority of our models. Specifically, he provided an initial version for `frame_video_level_cnn`, which was further developed by Amine. All the other scripts in this repository were completed by Yifan Peng. 12 | 13 | **Amine Bellamkaddem** developed `frame_video_level_cnn` based on the initial version by Yifan and explored more architectures such as VGG16. He also provided the current visualization plots. Please see the folder `frame_video_level_cnn` for more information. 14 | 15 | Details about the division of work are in our final report. 16 | 17 | ## Introduction 18 | 19 | Detecting anomalous events such as road accidents in natural driving scenes is a challenging task. The majority of previous studies focus on fixed cameras with static backgrounds. In this project, we design **a CRNN-based two-stream method using both RGB frames and optical flow to detect traffic accidents in first-person dash-cam videos**. Our hypotheses are that motion features can improve the detection performance and that CRNN-based approaches are better at modeling temporal relationships than conventional CNN-based approaches. Results show that the motion stream outperforms the spatial-temporal stream, and that the fusion of the two streams can further improve our model's performance. 20 | 21 | ![two-stream](imgs/crnn_twostream.png "CRNN-based two-stream method for traffic accident detection") 22 | 23 | ## Requirements 24 | 25 | Our models are implemented using PyTorch. Required packages are listed in `requirements.txt`. 26 | 27 | ``` 28 | numpy 29 | tqdm 30 | torchvision==0.7.0 31 | torch==1.6.0 32 | Pillow 33 | scikit_learn 34 | ``` 35 | 36 | To install these packages, run 37 | 38 | ``` 39 | pip install -r requirements.txt 40 | ``` 41 | 42 | All models can be trained on a single NVIDIA Tesla T4 GPU using the default configuration. 43 | 44 | ## Dataset 45 | 46 | We employ a recently introduced traffic anomaly dataset called [Detection of Traffic Anomaly](https://github.com/MoonBlvd/Detection-of-Traffic-Anomaly "Detection of Traffic Anomaly Dataset") (DoTA). DoTA contains 4,677 dash-cam videos collected from YouTube channels. These ego-centric driving videos are from different countries and under different weather and lighting conditions. 47 | 48 | **Note that due to issues with YouTube, some videos are no longer available. We have collected 4,085 videos in total.** Most videos in DoTA can be separated into three temporal partitions: precursor, anomaly window, and post-anomaly. We label the first part (i.e. precursor) as *normal* or *non-accident*, and the second part (i.e. anomaly window) as *anomaly* or *accident*, but we do not use the third part. Details of our dataset are shown below. 49 | 50 | Dataset | Training | Validation | Test 51 | :---: | :---: | :---: | :---: 52 | \#video clips | 5,700 | 801 | 1,657 53 | \#frames | 208,649 | 29,997 | 58,778 54 | 55 | Dataset classes are defined in `data.py`. 56 | 57 | ## Models 58 | 59 | Models are defined in `networks.py`. 60 | 61 | ### Spatial-Temporal Stream 62 | 63 | The spatial-temporal stream takes RGB frames as input, which contain appearance information. 
To extract frame-level features from an input video, an ImageNet pre-trained ResNet is applied. To capture high-level (temporal) information, three architectures are employed: a multi-layer perceptron (MLP), a unidirectional Long Short-Term Memory (LSTM), and a bidirectional LSTM (BiLSTM). The MLP doesn't consider temporal dependencies, which leads to degraded performance. 64 | 65 | * ResNet + MLP: The code is in `frame_video_level_cnn`. Models are trained on individual frames and evaluated by frame-level and video-level metrics. 66 | 67 | * ResNet + LSTM: The network configuration is in `conf/lstm_rgb.py`. Note that `configs['net']['rnn_bidir']` should be set to `False` for a unidirectional LSTM. To train a model, run `python train_lstm_rgb.py`. To evaluate a trained model, run `python test_lstm_rgb.py --ckpt path/to/checkpoint`. Please refer to `test_lstm_rgb.py` for other options such as `n_frames` and `batch_size`. 68 | 69 | * ResNet + BiLSTM: Similar to the second model, the network configuration is in `conf/lstm_rgb.py`. Note that `configs['net']['rnn_bidir']` should be set to `True` for a bidirectional LSTM. To train a model, run `python train_lstm_rgb.py`. To evaluate a trained model, run `python test_lstm_rgb.py --ckpt path/to/checkpoint`. Please refer to `test_lstm_rgb.py` for other options such as `n_frames` and `batch_size`. 70 | 71 | After running the test script, predictions will be saved as a `.npy` file in the same folder as the model checkpoint. The saved file can be used in the fusion section. 72 | 73 | ![crnn](imgs/crnn.png "Convolutional Recurrent Neural Network (CRNN)") 74 | 75 | 76 | ### Motion Stream 77 | 78 | The motion stream takes dense optical flow as input, which represents motion features. Our results have demonstrated that motion features are better for accident detection in dash-cam videos with dynamic backgrounds. We utilize a recently proposed deep learning-based algorithm ([RAFT](https://github.com/princeton-vl/RAFT)) to estimate optical flow and save the optical flow maps as jpg images. Note that each optical flow map has only two channels (horizontal and vertical) instead of three, so the last color channel of the image is set to zero. 79 | 80 | Here we compare three architectures: ResNet-based Conv2d, ResNet with LSTM, and ResNet with BiLSTM. Results show that LSTMs have a better capacity for modeling temporal relationships within a video clip, achieving higher AUC and accuracy. 81 | 82 | * ResNet-based Conv2d: Different from the CNN-MLP for 3-channel RGB frames, this model takes a stack of optical flow maps as input at each step, which can be treated as a single multi-channel image. The ResNet is initialized with pre-trained weights, but the first convolutional layer needs additional processing (3-channel -> multi-channel). The configuration file is `conf/cnn_flow.py`, which contains all the hyperparameters for training. To train a model from scratch, run `python train_cnn_flow.py`. To resume training, first set the `configs['resume']` variable in `conf/cnn_flow.py` and then run `python train_cnn_flow.py`. To evaluate a trained model, run `python test_cnn_flow.py --ckpt path/to/checkpoint`. Other options such as `data_root`, `data_list` and `batch_size` can also be changed. 83 | 84 | * ResNet + LSTM: The network configuration is in `conf/lstm_flow.py`. For a unidirectional LSTM, set `configs['net']['rnn_bidir'] = False`. To train a model, run `python train_lstm_flow.py`. To evaluate a trained model using the test set, run `python test_lstm_flow.py --ckpt path/to/checkpoint`. 
85 | 86 | * ResNet + BiLSTM: The network configuration is in `conf/lstm_flow.py`. For a bidirectional LSTM, set `configs['net']['rnn_bidir'] = True`. To train a model, run `python train_lstm_flow.py`. To evaluate a trained model using the test set, run `python test_lstm_flow.py --ckpt path/to/checkpoint`. 87 | 88 | After running the test script, results will be saved as a `.npy` file, which can be used in the fusion section. 89 | 90 | 91 | ### Fusion of Two Streams 92 | 93 | There are many fusion strategies that can be used to merge two predictions. We adopt a simple but effective strategy, namely weighted average fusion after the last activation. The main advantage of this approach is that we don't need to retrain our models and there is only one hyperparameter, i.e. the weight assigned to the RGB-stream prediction. 94 | 95 | ``` 96 | fusion_pred = rgb_weight * rgb_pred + (1. - rgb_weight) * flow_pred 97 | ``` 98 | 99 | To calculate the fusion of the two streams, run this command: 100 | 101 | ``` 102 | python test_combined.py --rgb_file path/to/rgb/results --flow_file path/to/optical/flow/results --rgb_weight your_preferred_weight 103 | ``` 104 | 105 | It will automatically calculate the video-level AUC and accuracy, which will be displayed in the terminal. 106 | 107 | ## Results 108 | 109 | Our evaluation metrics are video-level AUC and accuracy (ACC). Please refer to our final report for the details. 110 | 111 | The following figure visualizes our results. The red region represents the anomaly window. Our model is able to predict high scores for those frames. 112 | 113 | ![vis](imgs/visualization.png "Visualization of our results") 114 | 115 | -------------------------------------------------------------------------------- /train_cnn_flow.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import shutil 4 | from tqdm import tqdm, trange 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import torch.nn.functional as F 9 | from torch.utils.data import DataLoader 10 | from torchvision import transforms 11 | from data import OpticalFlowDataset 12 | from conf.cnn_flow import configs 13 | 14 | 15 | class Trainer(object): 16 | def __init__(self, configs): 17 | self.configs = configs 18 | self.device = torch.device(configs['device']) 19 | 20 | ckpt = None 21 | if configs['resume']: 22 | ckpt = torch.load(configs['ckpt_path']) 23 | configs['net'] = ckpt['net_configs'] 24 | self.configs = configs 25 | 26 | # for pre-trained models 27 | train_transform = transforms.Compose([ 28 | transforms.Resize([configs['image_size'], configs['image_size']]), 29 | transforms.ToTensor(), 30 | ]) 31 | train_dataset = OpticalFlowDataset( 32 | root=configs['data_root'], 33 | video_list_path=configs['train_list'], 34 | n_frames=configs['net']['n_frames'], 35 | transform=train_transform, 36 | is_train=True 37 | ) 38 | self.train_loader = DataLoader( 39 | dataset=train_dataset, 40 | batch_size=configs['train_batch_size'], 41 | shuffle=True, 42 | num_workers=configs['n_workers'], 43 | collate_fn=train_dataset.collate_fn, 44 | pin_memory=True 45 | ) 46 | 47 | assert configs['net']['cnn_type'] in ['resnet50', 'resnet101', 'resnet152'] 48 | if configs['net']['cnn_type'] == 'resnet50': 49 | from networks import resnet50 as ResNet 50 | elif configs['net']['cnn_type'] == 'resnet101': 51 | from networks import resnet101 as ResNet 52 | elif configs['net']['cnn_type'] == 'resnet152': 53 | from networks import resnet152 as ResNet 54 | 55 | self.model = 
ResNet(pretrained=True, channel=configs['net']['n_frames'] * 2) 56 | self.model.to(self.device) 57 | self.criterion = nn.BCEWithLogitsLoss() 58 | self.optimizer = optim.Adam(self.model.parameters(), 59 | lr=configs['lr'], weight_decay=configs['weight_decay']) 60 | 61 | if configs['resume']: 62 | self.model.load_state_dict(ckpt['state_dict']) 63 | self.optimizer.load_state_dict(ckpt['optimizer']) 64 | tqdm.write(f"[info] Loaded checkpoint from {configs['ckpt_path']}") 65 | if configs['new_lr'] is not None: 66 | for param_group in self.optimizer.param_groups: 67 | param_group['lr'] = configs['new_lr'] 68 | tqdm.write(f"[info] Set new lr to {configs['new_lr']}") 69 | 70 | if configs['apply_val']: 71 | val_transform = transforms.Compose([ 72 | transforms.Resize([configs['image_size'], configs['image_size']]), 73 | transforms.ToTensor(), 74 | ]) 75 | val_dataset = OpticalFlowDataset( 76 | root=configs['data_root'], 77 | video_list_path=configs['val_list'], 78 | n_frames=configs['net']['n_frames'], 79 | transform=val_transform, 80 | is_train=False 81 | ) 82 | self.val_loader = DataLoader( 83 | dataset=val_dataset, 84 | batch_size=configs['val_batch_size'], 85 | shuffle=False, 86 | num_workers=configs['n_workers'], 87 | collate_fn=val_dataset.collate_fn, 88 | pin_memory=True 89 | ) 90 | self.display_interval = configs['val_display_interval'] 91 | 92 | self.save_dir = os.path.join(configs['save_dir'], time.strftime('flow-cnn_%Y%m%d-%H%M%S', time.localtime())) 93 | os.makedirs(self.save_dir) 94 | self.log_file = os.path.join(self.save_dir, 'log_train.txt') 95 | self.copyscripts(os.path.join(self.save_dir, 'backup_scripts')) 96 | 97 | self.writelog(self.configs) 98 | self.writelog('=' * 80) 99 | 100 | def copyscripts(self, dest_path): 101 | """ 102 | Save python scripts. 103 | Ignore directories such as '__pycache__' and '.idea'. 104 | """ 105 | shutil.copytree('.', dest_path, ignore=shutil.ignore_patterns('_*', '.*', self.configs['save_dir'])) 106 | 107 | def writelog(self, results): 108 | if not isinstance(results, str): 109 | results = str(results) 110 | with open(self.log_file, 'a') as fp: 111 | fp.write(results + '\n') 112 | 113 | def savemodel(self, save_name): 114 | torch.save( 115 | { 116 | 'state_dict': self.model.state_dict(), 117 | 'optimizer': self.optimizer.state_dict(), 118 | 'net_configs': self.configs['net'] 119 | }, 120 | save_name 121 | ) 122 | tqdm.write(f'[Info] Trained model has been saved as {save_name}') 123 | 124 | def train(self): 125 | for epoch in trange(self.configs['n_epochs']): 126 | tqdm.write('=' * 20 + f'Epoch {epoch + 1} starts' + '=' * 20) 127 | average_loss, accuracy = self.train_epoch(epoch) 128 | log_str = f'Epoch [{epoch + 1:02d}/{self.configs["n_epochs"]}] Train Loss = {average_loss:.5f} ' \ 129 | f'Train ACC = {accuracy*100:.2f}%' 130 | self.savemodel(os.path.join(self.save_dir, f'm_epoch{epoch + 1:02d}.pt')) 131 | 132 | if self.configs['apply_val']: 133 | with torch.no_grad(): 134 | val_loss, val_acc = self.val_epoch(epoch) 135 | log_str += f' Val Loss = {val_loss:.5f} Val ACC = {val_acc * 100:.2f}%' 136 | 137 | self.writelog(log_str) 138 | tqdm.write(log_str) 139 | 140 | def train_epoch(self, epoch): 141 | self.model.train() 142 | 143 | total_loss = 0. 
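# Note: the flow ResNet outputs raw logits here; nn.BCEWithLogitsLoss applies the sigmoid internally, so predictions below are thresholded at logit 0 (equivalent to probability 0.5).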
144 | n_correct = 0 145 | n_samples = 0 146 | for step, (images, labels) in enumerate(tqdm(self.train_loader)): 147 | images = images.to(dtype=torch.float32, device=self.device) # (N, C=20, H, W) 148 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 149 | 150 | # forward & backward 151 | out = self.model(images) # (N, 1), logits before sigmoid 152 | 153 | tqdm.write(str(images.shape)) 154 | tqdm.write(str(labels.shape)) 155 | tqdm.write(str(torch.sigmoid(out[:4, 0]))) 156 | tqdm.write(str(labels[:4])) 157 | 158 | loss = self.criterion(out, labels.unsqueeze(-1)) 159 | 160 | tqdm.write(str(loss.item()) + '\n') 161 | 162 | self.optimizer.zero_grad() 163 | loss.backward() 164 | self.optimizer.step() 165 | 166 | total_loss += loss.item() * labels.shape[0] 167 | n_correct += ((out.detach().squeeze(1) >= 0.) == labels).sum().item() 168 | n_samples += labels.shape[0] 169 | 170 | return total_loss / n_samples, n_correct / n_samples 171 | 172 | def val_epoch(self, epoch): 173 | self.model.eval() 174 | 175 | total_loss = 0. 176 | n_correct = 0 177 | n_samples = 0 178 | for step, (images, labels) in enumerate(tqdm(self.val_loader)): 179 | images = images.to(dtype=torch.float32, device=self.device) # (N, C=20, H, W) 180 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 181 | 182 | # forward 183 | out = self.model(images) # (N, 1), logits before sigmoid 184 | loss = self.criterion(out, labels.unsqueeze(-1)) 185 | 186 | total_loss += loss.item() * labels.shape[0] 187 | n_correct += ((out.detach().squeeze(1) >= 0.) == labels).sum().item() 188 | n_samples += labels.shape[0] 189 | 190 | # display 191 | if (step + 1) % self.display_interval == 0: 192 | try: 193 | print('-' * 40) 194 | print(f'[info] Probs: {torch.sigmoid(out).squeeze(-1).cpu().numpy()}') 195 | print(f'[info] Label: {labels.cpu().numpy()}') 196 | except: 197 | print(f'[warning] Failed to display validation result.') 198 | 199 | return total_loss / n_samples, n_correct / n_samples 200 | 201 | 202 | if __name__ == '__main__': 203 | trainer = Trainer(configs) 204 | trainer.train() 205 | -------------------------------------------------------------------------------- /train_lstm_flow.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convolutional Recurrent Neural Networks (CRNN) for optical flow maps. 3 | Each input map has horizontal and vertical components. 
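The stacked maps of shape (N, T * 2, H, W) are encoded frame by frame with a flow ResNet and classified per time step by a (Bi)LSTM; frame-level probabilities are averaged into a single video-level score.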
4 | """ 5 | 6 | import os 7 | import time 8 | import shutil 9 | from tqdm import tqdm, trange 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | import torch.nn.functional as F 14 | from torch.utils.data import DataLoader 15 | from torchvision import transforms 16 | from data import OpticalFlowDataset 17 | from networks import CRNNOpticalFlow 18 | from conf.lstm_flow import configs 19 | 20 | 21 | class Trainer(object): 22 | def __init__(self, configs): 23 | self.configs = configs 24 | self.device = torch.device(configs['device']) 25 | 26 | ckpt = None 27 | if configs['resume']: 28 | ckpt = torch.load(configs['ckpt_path']) 29 | configs['net'] = ckpt['net_configs'] 30 | self.configs = configs 31 | 32 | train_transform = transforms.Compose([ 33 | transforms.Resize([configs['image_size'], configs['image_size']]), 34 | transforms.ToTensor(), 35 | ]) 36 | train_dataset = OpticalFlowDataset( 37 | root=configs['data_root'], 38 | video_list_path=configs['train_list'], 39 | n_frames=configs['net']['n_frames'], 40 | transform=train_transform, 41 | is_train=True 42 | ) 43 | self.train_loader = DataLoader( 44 | dataset=train_dataset, 45 | batch_size=configs['train_batch_size'], 46 | shuffle=True, 47 | num_workers=configs['n_workers'], 48 | collate_fn=train_dataset.collate_fn, 49 | pin_memory=True 50 | ) 51 | 52 | self.criterion = nn.BCELoss() 53 | 54 | self.model = CRNNOpticalFlow( 55 | cnn_dropout=configs['net']['cnn_dropout'], 56 | cnn_emb_dim=configs['net']['cnn_emb_dim'], 57 | cnn_type=configs['net']['cnn_type'], 58 | rnn_hidden_size=configs['net']['rnn_hidden_size'], 59 | rnn_dropout=configs['net']['rnn_dropout'], 60 | num_rnn_layers=configs['net']['num_rnn_layers'], 61 | rnn_bidir=configs['net']['rnn_bidir'] 62 | ) 63 | self.model.to(self.device) 64 | self.optimizer = optim.Adam(self.model.parameters(), 65 | lr=configs['lr'], weight_decay=configs['weight_decay']) 66 | 67 | if configs['resume']: 68 | self.model.load_state_dict(ckpt['state_dict']) 69 | self.optimizer.load_state_dict(ckpt['optimizer']) 70 | if configs['new_lr'] is not None: 71 | for param_group in self.optimizer.param_groups: 72 | param_group['lr'] = configs['new_lr'] 73 | 74 | if configs['apply_val']: 75 | val_transform = transforms.Compose([ 76 | transforms.Resize([configs['image_size'], configs['image_size']]), 77 | transforms.ToTensor(), 78 | ]) 79 | val_dataset = OpticalFlowDataset( 80 | root=configs['data_root'], 81 | video_list_path=configs['val_list'], 82 | n_frames=configs['net']['n_frames'], 83 | transform=val_transform, 84 | is_train=False 85 | ) 86 | self.val_loader = DataLoader( 87 | dataset=val_dataset, 88 | batch_size=configs['val_batch_size'], 89 | shuffle=False, 90 | num_workers=configs['n_workers'], 91 | collate_fn=val_dataset.collate_fn, 92 | pin_memory=True 93 | ) 94 | self.display_interval = configs['val_display_interval'] 95 | 96 | self.save_dir = os.path.join(configs['save_dir'], time.strftime('flow-crnn_%Y%m%d-%H%M%S', time.localtime())) 97 | os.makedirs(self.save_dir) 98 | self.log_file = os.path.join(self.save_dir, 'log_train.txt') 99 | self.copyscripts(os.path.join(self.save_dir, 'backup_scripts')) 100 | 101 | self.writelog(self.configs) 102 | self.writelog('=' * 80) 103 | print(self.model) 104 | 105 | def copyscripts(self, dest_path): 106 | """ 107 | Save python scripts. 108 | Ignore directories such as '__pycache__' and '.idea'. 
109 | """ 110 | shutil.copytree('.', dest_path, ignore=shutil.ignore_patterns('_*', '.*', self.configs['save_dir'])) 111 | 112 | def writelog(self, results): 113 | if not isinstance(results, str): 114 | results = str(results) 115 | with open(self.log_file, 'a') as fp: 116 | fp.write(results + '\n') 117 | 118 | def savemodel(self, save_name): 119 | torch.save( 120 | { 121 | 'state_dict': self.model.state_dict(), 122 | 'optimizer': self.optimizer.state_dict(), 123 | 'net_configs': self.configs['net'] 124 | }, 125 | save_name 126 | ) 127 | tqdm.write(f'[Info] Trained model has been saved as {save_name}') 128 | 129 | def train(self): 130 | for epoch in trange(self.configs['n_epochs']): 131 | tqdm.write('=' * 20 + f'Epoch {epoch + 1} starts' + '=' * 20) 132 | average_loss, accuracy = self.train_epoch(epoch) 133 | log_str = f'Epoch [{epoch + 1:02d}/{self.configs["n_epochs"]}] Train Loss = {average_loss:.5f} ' \ 134 | f'Train ACC = {accuracy*100:.3f}%' 135 | self.savemodel(os.path.join(self.save_dir, f'm_epoch{epoch + 1:02d}.pt')) 136 | 137 | if self.configs['apply_val']: 138 | with torch.no_grad(): 139 | val_loss, val_acc = self.val_epoch(epoch) 140 | log_str += f' Val Loss = {val_loss:.5f} Val ACC = {val_acc * 100:.3f}%' 141 | 142 | self.writelog(log_str) 143 | tqdm.write(log_str) 144 | 145 | def train_epoch(self, epoch): 146 | self.model.train() 147 | 148 | total_loss = 0. 149 | n_correct = 0 150 | n_samples = 0 151 | for step, (images, labels) in enumerate(tqdm(self.train_loader)): 152 | images = images.to(dtype=torch.float32, device=self.device) # (N, T * 2, H, W) 153 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 154 | 155 | # forward & backward 156 | out = self.model(images) # (N, T), probs after sigmoid 157 | out = torch.mean(out, dim=-1, keepdim=True) # (N, 1) 158 | 159 | tqdm.write(str(images.shape)) 160 | tqdm.write(str(labels.shape)) 161 | tqdm.write(str(out[:4, 0])) 162 | tqdm.write(str(labels[:4])) 163 | 164 | loss = self.criterion(out, labels.unsqueeze(-1)) 165 | 166 | tqdm.write(str(loss.item()) + '\n') 167 | 168 | self.optimizer.zero_grad() 169 | loss.backward() 170 | 171 | grad_norm = nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) 172 | self.optimizer.step() 173 | 174 | total_loss += loss.item() * labels.shape[0] 175 | n_correct += ((out.detach().squeeze(1) >= 0.5) == labels).sum().item() 176 | n_samples += labels.shape[0] 177 | 178 | return total_loss / n_samples, n_correct / n_samples 179 | 180 | def val_epoch(self, epoch): 181 | self.model.eval() 182 | 183 | total_loss = 0. 
184 | n_correct = 0 185 | n_samples = 0 186 | for step, (images, labels) in enumerate(tqdm(self.val_loader)): 187 | images = images.to(dtype=torch.float32, device=self.device) # (N, T * 2, H, W) 188 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 189 | 190 | # forward 191 | out = self.model(images) # (N, T), probs after sigmoid 192 | out = torch.mean(out, dim=-1, keepdim=True) # (N, 1) 193 | 194 | loss = self.criterion(out, labels.unsqueeze(-1)) 195 | 196 | total_loss += loss.item() * labels.shape[0] 197 | n_correct += ((out.detach().squeeze(1) >= 0.5) == labels).sum().item() 198 | n_samples += labels.shape[0] 199 | 200 | # display 201 | if (step + 1) % self.display_interval == 0: 202 | try: 203 | tqdm.write('-' * 40) 204 | tqdm.write(f'[info] Probs: {out.squeeze(-1).cpu().numpy()}') 205 | tqdm.write(f'[info] Label: {labels.cpu().numpy()}') 206 | except: 207 | tqdm.write(f'[warning] Failed to display validation result.') 208 | 209 | return total_loss / n_samples, n_correct / n_samples 210 | 211 | 212 | if __name__ == '__main__': 213 | trainer = Trainer(configs) 214 | trainer.train() 215 | -------------------------------------------------------------------------------- /train_lstm_rgb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import shutil 4 | from tqdm import tqdm, trange 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import torch.nn.functional as F 9 | from torch.utils.data import DataLoader 10 | from torchvision import transforms 11 | from data import RGBFrameDataset 12 | from networks import CRNNClassifier 13 | from conf.lstm_rgb import configs 14 | 15 | 16 | class Trainer(object): 17 | def __init__(self, configs): 18 | self.configs = configs 19 | self.device = torch.device(configs['device']) 20 | 21 | # for pre-trained models 22 | train_transform = transforms.Compose([ 23 | transforms.Resize([configs['image_size'], configs['image_size']]), 24 | transforms.ToTensor(), 25 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 26 | ]) 27 | train_dataset = RGBFrameDataset( 28 | root=configs['data_root'], 29 | video_list_path=configs['train_list'], 30 | n_frames=configs['train_n_frames'], 31 | transform=train_transform, 32 | is_train=True 33 | ) 34 | self.train_loader = DataLoader( 35 | dataset=train_dataset, 36 | batch_size=configs['train_batch_size'], 37 | shuffle=True, 38 | num_workers=configs['n_workers'], 39 | collate_fn=train_dataset.collate_fn, 40 | pin_memory=True 41 | ) 42 | 43 | self.criterion = nn.BCELoss() 44 | 45 | ckpt = None 46 | if configs['resume']: 47 | ckpt = torch.load(configs['ckpt_path']) 48 | configs['net'] = ckpt['net_configs'] 49 | self.configs = configs 50 | 51 | self.model = CRNNClassifier( 52 | cnn_dropout=configs['net']['cnn_dropout'], 53 | cnn_emb_dim=configs['net']['cnn_emb_dim'], 54 | cnn_type=configs['net']['cnn_type'], 55 | cnn_finetune=configs['net']['cnn_finetune'], 56 | rnn_hidden_size=configs['net']['rnn_hidden_size'], 57 | rnn_dropout=configs['net']['rnn_dropout'], 58 | num_rnn_layers=configs['net']['num_rnn_layers'], 59 | rnn_bidir=configs['net']['rnn_bidir'] 60 | ) 61 | self.model.to(self.device) 62 | self.optimizer = optim.Adam(self.model.parameters(), lr=configs['lr'], weight_decay=configs['weight_decay']) 63 | 64 | # if configs['pre_train']: 65 | # pre_ckpt = torch.load(configs['pre_train_path']) 66 | # self.model.load_state_dict(pre_ckpt['state_dict'], strict=True) 67 | # print('============= 
load pre-trained model ==============') 68 | 69 | if configs['resume']: 70 | self.model.load_state_dict(ckpt['state_dict']) 71 | self.optimizer.load_state_dict(ckpt['optimizer']) 72 | if configs['new_lr'] is not None: 73 | for param_group in self.optimizer.param_groups: 74 | param_group['lr'] = configs['new_lr'] 75 | 76 | if configs['apply_val']: 77 | val_transform = transforms.Compose([ 78 | transforms.Resize([configs['image_size'], configs['image_size']]), 79 | transforms.ToTensor(), 80 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 81 | ]) 82 | val_dataset = RGBFrameDataset( 83 | root=configs['data_root'], 84 | video_list_path=configs['val_list'], 85 | n_frames=configs['val_n_frames'], 86 | transform=val_transform, 87 | is_train=False 88 | ) 89 | self.val_loader = DataLoader( 90 | dataset=val_dataset, 91 | batch_size=configs['val_batch_size'], 92 | shuffle=False, 93 | num_workers=configs['n_workers'], 94 | collate_fn=val_dataset.collate_fn, 95 | pin_memory=True 96 | ) 97 | self.display_interval = configs['val_display_interval'] 98 | 99 | self.save_dir = os.path.join(configs['save_dir'], time.strftime('rgb_%Y%m%d-%H%M%S', time.localtime())) 100 | os.makedirs(self.save_dir) 101 | self.log_file = os.path.join(self.save_dir, 'log_train.txt') 102 | self.copyscripts(os.path.join(self.save_dir, 'backup_scripts')) 103 | 104 | self.writelog(self.configs) 105 | self.writelog('=' * 80) 106 | 107 | def copyscripts(self, dest_path): 108 | """ 109 | Save python scripts. 110 | Ignore directories such as '__pycache__' and '.idea'. 111 | """ 112 | shutil.copytree('.', dest_path, ignore=shutil.ignore_patterns('_*', '.*', self.configs['save_dir'])) 113 | 114 | def writelog(self, results): 115 | if not isinstance(results, str): 116 | results = str(results) 117 | with open(self.log_file, 'a') as fp: 118 | fp.write(results + '\n') 119 | 120 | def savemodel(self, save_name): 121 | torch.save( 122 | { 123 | 'state_dict': self.model.state_dict(), 124 | 'optimizer': self.optimizer.state_dict(), 125 | 'net_configs': self.configs['net'] 126 | }, 127 | save_name 128 | ) 129 | tqdm.write(f'[Info] Trained model has been saved as {save_name}') 130 | 131 | def train(self): 132 | for epoch in trange(self.configs['n_epochs']): 133 | tqdm.write('=' * 20 + f'Epoch {epoch + 1} starts' + '=' * 20) 134 | average_loss, accuracy = self.train_epoch(epoch) 135 | log_str = f'Epoch [{epoch + 1:02d}/{self.configs["n_epochs"]}] Train Loss = {average_loss:.5f} ' \ 136 | f'Train ACC = {accuracy*100:.2f}%' 137 | self.savemodel(os.path.join(self.save_dir, f'm_epoch{epoch + 1:02d}.pt')) 138 | 139 | if self.configs['apply_val']: 140 | with torch.no_grad(): 141 | val_loss, val_acc = self.val_epoch(epoch) 142 | log_str += f' Val Loss = {val_loss:.5f} Val ACC = {val_acc * 100:.2f}%' 143 | 144 | self.writelog(log_str) 145 | tqdm.write(log_str) 146 | 147 | def train_epoch(self, epoch): 148 | self.model.train() 149 | 150 | total_loss = 0. 
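# The CRNN returns one probability per frame, shape (N, T); its mean over time is supervised with the single video-level label via BCELoss.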
151 | n_correct = 0 152 | n_samples = 0 153 | for step, (videos, labels) in enumerate(tqdm(self.train_loader)): 154 | videos = videos.to(dtype=torch.float32, device=self.device) # (N, T, C, H, W) 155 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 156 | 157 | # forward & backward 158 | prob = self.model(videos) # (N, T) after sigmoid 159 | 160 | tqdm.write(str(videos.shape)) 161 | tqdm.write(str(labels.shape)) 162 | tqdm.write(str(prob[:2])) 163 | tqdm.write(str(labels[:2])) 164 | 165 | prob = torch.mean(prob, dim=-1, keepdim=True) 166 | loss = self.criterion(prob, labels.unsqueeze(-1)) 167 | 168 | tqdm.write(str(loss.item()) + '\n') 169 | 170 | self.optimizer.zero_grad() 171 | loss.backward() 172 | self.optimizer.step() 173 | 174 | total_loss += loss.item() * labels.shape[0] 175 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 176 | n_samples += labels.shape[0] 177 | 178 | return total_loss / n_samples, n_correct / n_samples 179 | 180 | def val_epoch(self, epoch): 181 | self.model.eval() 182 | 183 | total_loss = 0. 184 | n_correct = 0 185 | n_samples = 0 186 | for step, (videos, labels) in enumerate(tqdm(self.val_loader)): 187 | videos = videos.to(dtype=torch.float32, device=self.device) # (N, T, C, H, W) 188 | labels = labels.to(dtype=torch.float32, device=self.device) # (N,) 189 | 190 | # forward 191 | prob = self.model(videos) # (N, T) after sigmoid 192 | prob = torch.mean(prob, dim=-1, keepdim=True) # (N, 1) 193 | loss = self.criterion(prob, labels.unsqueeze(-1)) 194 | 195 | total_loss += loss.item() * labels.shape[0] 196 | n_correct += ((prob.detach().squeeze(1) >= 0.5) == labels).sum().item() 197 | n_samples += labels.shape[0] 198 | 199 | # display 200 | if (step + 1) % self.display_interval == 0: 201 | try: 202 | print('-' * 40) 203 | print(f'[info] Probs: {prob.squeeze(-1).cpu().numpy()}') 204 | print(f'[info] Label: {labels.cpu().numpy()}') 205 | except: 206 | print(f'[warning] Failed to display validation result.') 207 | 208 | return total_loss / n_samples, n_correct / n_samples 209 | 210 | 211 | if __name__ == '__main__': 212 | trainer = Trainer(configs) 213 | trainer.train() 214 | -------------------------------------------------------------------------------- /networks.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torchvision.models as models 5 | import torch.utils.model_zoo as model_zoo 6 | 7 | 8 | ####################################### 9 | # CRNN Model for RGB Frames # 10 | ####################################### 11 | 12 | class CNNEncoder(nn.Module): 13 | '''2D CNN feature extractor based on pre-trained models. 
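Each input frame is passed through an ImageNet pre-trained ResNet (frozen unless finetune=True) and projected to an embedding of size embedding_dim.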
14 | ''' 15 | def __init__(self, dropout, embedding_dim, cnn_type, finetune): 16 | super(CNNEncoder, self).__init__() 17 | 18 | self.finetune = finetune # if True, the CNN will also be trained 19 | 20 | assert cnn_type in ['resnet50', 'resnet101', 'resnet152'], f'invalid cnn type: {cnn_type}' 21 | # Note: for the following 3 types of ResNet, the output dim is 2048 22 | if cnn_type == 'resnet50': 23 | cnn = models.resnet50(pretrained=True) 24 | elif cnn_type == 'resnet101': 25 | cnn = models.resnet101(pretrained=True) 26 | else: 27 | cnn = models.resnet152(pretrained=True) 28 | 29 | modules = list(cnn.children())[:-1] # remove the last FC layer 30 | self.cnn = nn.Sequential(*modules) 31 | cnn_out_dim = cnn.fc.in_features # 2048 32 | 33 | self.proj = nn.Sequential( 34 | nn.Dropout(p=dropout), 35 | nn.Linear(cnn_out_dim, embedding_dim) 36 | ) 37 | 38 | def forward(self, x_seq): 39 | ''' 40 | :param x_seq: (N, T, C, H, W) 41 | :return (N, T, emb_dim) 42 | ''' 43 | 44 | feature_seq = [] 45 | for t in range(x_seq.shape[1]): 46 | if self.finetune: 47 | x = self.cnn(x_seq[:, t, :, :, :]) 48 | x = x.reshape(x.shape[0], -1) # (N, cnn_out_dim) 49 | else: 50 | with torch.no_grad(): # pre-trained model is fixed 51 | x = self.cnn(x_seq[:, t, :, :, :]) 52 | x = x.reshape(x.shape[0], -1) # (N, cnn_out_dim) 53 | 54 | x = self.proj(x) # (N, emb_dim) 55 | feature_seq.append(x) 56 | 57 | feature_seq = torch.stack(feature_seq, dim=0) # (T, N, emb_dim) 58 | feature_seq = feature_seq.transpose(0, 1) # (N, T, emb_dim) 59 | 60 | return feature_seq 61 | 62 | 63 | class RNNDecoder(nn.Module): 64 | def __init__(self, input_size, hidden_size, dropout, num_layers, bidirectional): 65 | super(RNNDecoder, self).__init__() 66 | 67 | self.rnn = nn.LSTM( 68 | input_size=input_size, 69 | hidden_size=hidden_size, 70 | num_layers=num_layers, 71 | batch_first=True, 72 | dropout=dropout, 73 | bidirectional=bidirectional 74 | ) 75 | nn.init.xavier_normal_(self.rnn.all_weights[0][0]) 76 | nn.init.xavier_normal_(self.rnn.all_weights[0][1]) 77 | # For bidirectional RNNs 78 | # nn.init.xavier_normal_(self.rnn.all_weights[1][0]) 79 | # nn.init.xavier_normal_(self.rnn.all_weights[1][1]) 80 | 81 | # binary classifier 82 | self.classifier = nn.Sequential( 83 | nn.Dropout(p=dropout), 84 | nn.Linear(hidden_size*2 if bidirectional else hidden_size, 1), 85 | nn.Sigmoid() 86 | ) 87 | 88 | def forward(self, feature_seq): 89 | ''' 90 | :param feature_seq: (N, T, dim) 91 | :return out: (N, T), probability after sigmoid 92 | ''' 93 | 94 | self.rnn.flatten_parameters() # for DataParallel 95 | 96 | out, _ = self.rnn(feature_seq) # (N, T, hidden_size) 97 | out = self.classifier(out).squeeze(-1) # (N, T), after sigmoid 98 | 99 | return out 100 | 101 | 102 | class CRNNClassifier(nn.Module): 103 | def __init__(self, cnn_dropout, cnn_emb_dim, cnn_type, cnn_finetune, 104 | rnn_hidden_size, rnn_dropout, num_rnn_layers, rnn_bidir): 105 | super(CRNNClassifier, self).__init__() 106 | 107 | self.cnn_enc = CNNEncoder(cnn_dropout, cnn_emb_dim, cnn_type, cnn_finetune) 108 | self.rnn_dec = RNNDecoder(cnn_emb_dim, rnn_hidden_size, rnn_dropout, num_rnn_layers, rnn_bidir) 109 | 110 | def forward(self, x_seq): 111 | ''' 112 | :param x_seq: (N, T, C, H, W) 113 | :return: prob of anomaly after sigmoid, (N, T) 114 | ''' 115 | 116 | feature_seq = self.cnn_enc(x_seq) # (N, T, emb_dim) 117 | prob = self.rnn_dec(feature_seq) # (N, T), probability after sigmoid 118 | 119 | return prob 120 | 121 | 122 | ################################## 123 | # CNN for Optical FLow # 124 | 
################################## 125 | """This part is similar to the temporal/motion stream in two-stream methods. 126 | Here, only three types of ResNet are supported. 127 | 128 | Ref: 129 | https://pytorch.org/docs/stable/torchvision/models.html?highlight=resnet 130 | https://github.com/jeffreyyihuang/two-stream-action-recognition 131 | """ 132 | 133 | model_urls = { 134 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 135 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 136 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth' 137 | } 138 | 139 | class Bottleneck(nn.Module): 140 | expansion = 4 141 | 142 | def __init__(self, inplanes, planes, stride=1, downsample=None): 143 | super(Bottleneck, self).__init__() 144 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 145 | self.bn1 = nn.BatchNorm2d(planes) 146 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 147 | self.bn2 = nn.BatchNorm2d(planes) 148 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 149 | self.bn3 = nn.BatchNorm2d(planes * 4) 150 | self.relu = nn.ReLU(inplace=True) 151 | self.downsample = downsample 152 | self.stride = stride 153 | 154 | def forward(self, x): 155 | residual = x 156 | 157 | out = self.conv1(x) 158 | out = self.bn1(out) 159 | out = self.relu(out) 160 | 161 | out = self.conv2(out) 162 | out = self.bn2(out) 163 | out = self.relu(out) 164 | 165 | out = self.conv3(out) 166 | out = self.bn3(out) 167 | 168 | if self.downsample is not None: 169 | residual = self.downsample(x) 170 | 171 | out += residual 172 | out = self.relu(out) 173 | 174 | return out 175 | 176 | 177 | class ResNet(nn.Module): 178 | def __init__(self, block, layers, nb_classes=1, channel=20): 179 | self.inplanes = 64 180 | super(ResNet, self).__init__() 181 | self.conv1_custom = nn.Conv2d(channel, 64, kernel_size=7, stride=2, padding=3, bias=False) 182 | self.bn1 = nn.BatchNorm2d(64) 183 | self.relu = nn.ReLU(inplace=True) 184 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 185 | self.layer1 = self._make_layer(block, 64, layers[0]) 186 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 187 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 188 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 189 | self.avgpool = nn.AvgPool2d(7) 190 | self.fc_custom = nn.Linear(512 * block.expansion, nb_classes) 191 | for m in self.modules(): 192 | if isinstance(m, nn.Conv2d): 193 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 194 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 195 | elif isinstance(m, nn.BatchNorm2d): 196 | m.weight.data.fill_(1) 197 | m.bias.data.zero_() 198 | 199 | def _make_layer(self, block, planes, blocks, stride=1): 200 | downsample = None 201 | if stride != 1 or self.inplanes != planes * block.expansion: 202 | downsample = nn.Sequential( 203 | nn.Conv2d(self.inplanes, planes * block.expansion, 204 | kernel_size=1, stride=stride, bias=False), 205 | nn.BatchNorm2d(planes * block.expansion), 206 | ) 207 | 208 | layers = [block(self.inplanes, planes, stride, downsample)] 209 | self.inplanes = planes * block.expansion 210 | for i in range(1, blocks): 211 | layers.append(block(self.inplanes, planes)) 212 | 213 | return nn.Sequential(*layers) 214 | 215 | def forward(self, x): 216 | x = self.conv1_custom(x) 217 | x = self.bn1(x) 218 | x = self.relu(x) 219 | x = self.maxpool(x) 220 | 221 | x = self.layer1(x) 222 | x = self.layer2(x) 223 | x = self.layer3(x) 224 | x = self.layer4(x) 225 | 226 | x = self.avgpool(x) 227 | x = x.view(x.size(0), -1) 228 | out = self.fc_custom(x) 229 | return out 230 | 231 | def extract_feature_vector(self, x): 232 | """Extract a feature vector from the input image. 233 | Args: 234 | x (torch.Tensor): (N, C, H, W) 235 | Returns: 236 | out (torch.Tensor): (N, 2048) 237 | """ 238 | x = self.conv1_custom(x) 239 | x = self.bn1(x) 240 | x = self.relu(x) 241 | x = self.maxpool(x) 242 | 243 | x = self.layer1(x) 244 | x = self.layer2(x) 245 | x = self.layer3(x) 246 | x = self.layer4(x) 247 | 248 | x = self.avgpool(x) 249 | out = x.view(x.size(0), -1) 250 | return out 251 | 252 | 253 | def resnet50(pretrained=True, channel=20): 254 | model = ResNet(Bottleneck, [3, 4, 6, 3], nb_classes=1, channel=channel) 255 | if pretrained: 256 | pretrain_dict = model_zoo.load_url(model_urls['resnet50']) # modify pretrain code 257 | model_dict = model.state_dict() 258 | model_dict = weight_transform(model_dict, pretrain_dict, channel) 259 | model.load_state_dict(model_dict) 260 | return model 261 | 262 | 263 | def resnet101(pretrained=True, channel=20): 264 | model = ResNet(Bottleneck, [3, 4, 23, 3], nb_classes=1, channel=channel) 265 | if pretrained: 266 | pretrain_dict = model_zoo.load_url(model_urls['resnet101']) # modify pretrain code 267 | model_dict = model.state_dict() 268 | model_dict = weight_transform(model_dict, pretrain_dict, channel) 269 | model.load_state_dict(model_dict) 270 | return model 271 | 272 | 273 | def resnet152(pretrained=True, channel=20): 274 | model = ResNet(Bottleneck, [3, 8, 36, 3], nb_classes=1, channel=channel) 275 | if pretrained: 276 | pretrain_dict = model_zoo.load_url(model_urls['resnet152']) # modify pretrain code 277 | model_dict = model.state_dict() 278 | model_dict = weight_transform(model_dict, pretrain_dict, channel) 279 | model.load_state_dict(model_dict) 280 | return model 281 | 282 | 283 | def cross_modality_pretrain(conv1_weight, channel): 284 | """Transforms the original 3 channel weight to "channel" channels 285 | """ 286 | S=0 287 | for i in range(3): 288 | S += conv1_weight[:, i, :, :] 289 | avg = S / 3. 
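# Cross-modality initialization: the RGB filters of conv1 are averaged over the input channel, and the average is replicated across all `channel` flow channels.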
290 | new_conv1_weight = torch.FloatTensor(64, channel, 7, 7) 291 | for i in range(channel): 292 | new_conv1_weight[:, i, :, :] = avg.data 293 | return new_conv1_weight 294 | 295 | 296 | def weight_transform(model_dict, pretrain_dict, channel): 297 | weight_dict = {k:v for k, v in pretrain_dict.items() if k in model_dict} 298 | w3 = pretrain_dict['conv1.weight'] 299 | if channel == 3: 300 | wt = w3 301 | else: 302 | wt = cross_modality_pretrain(w3, channel) 303 | 304 | weight_dict['conv1_custom.weight'] = wt 305 | model_dict.update(weight_dict) 306 | return model_dict 307 | 308 | 309 | ################################### 310 | # CRNN for Optical Flow # 311 | ################################### 312 | 313 | class CRNNOpticalFlow(nn.Module): 314 | def __init__(self, cnn_dropout, cnn_emb_dim, cnn_type, 315 | rnn_hidden_size, rnn_dropout, num_rnn_layers, rnn_bidir): 316 | super(CRNNOpticalFlow, self).__init__() 317 | 318 | assert cnn_type in ['resnet50', 'resnet101', 'resnet152'] 319 | if cnn_type == 'resnet50': 320 | self.cnn = resnet50(pretrained=True, channel=2) # only 2 channels, i.e. x and y 321 | elif cnn_type == 'resnet101': 322 | self.cnn = resnet101(pretrained=True, channel=2) 323 | elif cnn_type == 'resnet152': 324 | self.cnn = resnet152(pretrained=True, channel=2) 325 | 326 | self.cnn.fc_custom = None # here, we don't need the last linear layer 327 | 328 | self.embed = nn.Sequential( 329 | nn.Dropout(p=cnn_dropout), 330 | nn.Linear(2048, cnn_emb_dim), 331 | nn.Dropout(p=rnn_dropout) 332 | ) 333 | 334 | self.rnn = RNNDecoder(cnn_emb_dim, rnn_hidden_size, rnn_dropout, num_rnn_layers, rnn_bidir) 335 | 336 | def forward(self, x): 337 | ''' 338 | Args: 339 | x (torch.Tensor): (N, T * 2, H, W) 340 | Returns: 341 | prob: probability of anomaly after sigmoid, (N, T) 342 | ''' 343 | 344 | N, _, H, W = x.shape 345 | x = x.reshape(N, -1, 2, H, W).reshape(-1, 2, H, W) 346 | x = self.cnn.extract_feature_vector(x) # (N * T, 2048) 347 | x = x.reshape(N, -1, 2048) 348 | 349 | x = self.embed(x) # (N, T, emb_dim) 350 | 351 | prob = self.rnn(x) # (N, T), probability after sigmoid 352 | return prob 353 | 354 | 355 | if __name__ == '__main__': 356 | model = resnet101(pretrained=True, channel=20) 357 | print(model) 358 | --------------------------------------------------------------------------------
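For reference, the weighted-average fusion and the video-level metrics described in the README can be sketched as follows. This is a minimal illustration, assuming the two `.npy` prediction files hold per-video anomaly probabilities aligned with a ground-truth label array; the file paths, the label array, and `rgb_weight = 0.4` are placeholders, and `test_combined.py` remains the actual entry point.

```
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score

# Per-video anomaly probabilities from the two streams and ground-truth labels
# (1 = anomaly, 0 = normal); all paths below are placeholders.
rgb_pred = np.load('path/to/rgb_results.npy')
flow_pred = np.load('path/to/flow_results.npy')
labels = np.load('path/to/labels.npy')

rgb_weight = 0.4  # the single fusion hyperparameter

# Weighted average after the last activation, as in the README.
fusion_pred = rgb_weight * rgb_pred + (1. - rgb_weight) * flow_pred

print('Video-level AUC:', roc_auc_score(labels, fusion_pred))
print('Video-level ACC:', accuracy_score(labels, (fusion_pred >= 0.5).astype(int)))
```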