├── LoadUCF101DataByTorch.py
├── README.md
├── config.py
├── model.py
└── train.py

/LoadUCF101DataByTorch.py:
--------------------------------------------------------------------------------
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torchvision.transforms._transforms_video as v_transform
import torch
import config


# video transforms: convert the clip to a float tensor, normalize it per channel,
# then apply a random horizontal flip and a random 112x112 crop
transform = transforms.Compose([
    v_transform.ToTensorVideo(),
    v_transform.NormalizeVideo(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]),
    v_transform.RandomHorizontalFlipVideo(),
    v_transform.RandomCropVideo(112),
])


def custom_collate(batch):
    # UCF101 yields (video, audio, label); drop the audio stream before batching
    filtered_batch = []
    for video, _, label in batch:
        filtered_batch.append((video, label))
    return torch.utils.data.dataloader.default_collate(filtered_batch)


trainset = datasets.UCF101(
    root='data/UCF101/UCF-101',
    annotation_path='data/UCF101TrainTestSplits-RecognitionTask/ucfTrainTestlist',
    frames_per_clip=config.seq_length,
    num_workers=0,
    transform=transform,
    step_between_clips=2
)

trainset_loader = DataLoader(
    trainset,
    batch_size=config.BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    collate_fn=custom_collate
)


testset = datasets.UCF101(
    root='data/UCF101/UCF-101',
    annotation_path='data/UCF101TrainTestSplits-RecognitionTask/ucfTrainTestlist',
    frames_per_clip=config.seq_length,
    num_workers=4,
    train=False,
    transform=transform,
    step_between_clips=2
)

testset_loader = DataLoader(
    testset,
    batch_size=config.BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    collate_fn=custom_collate
)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LRCN_PyTorch
This project includes the whole training process of LRCN on UCF101. Specifically, I use the PyTorch 1.7 **`VideoIO / Video Datasets Loading API / Video Transform`** to process the data. [More details: how to use Video Datasets, Video IO, Video Classification Models, and Video Transforms in PyTorch](https://blog.csdn.net/qq_36627158/article/details/113791050)

The LRCN paper: Long-term Recurrent Convolutional Networks for Visual Recognition and Description. [download](https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Donahue_Long-Term_Recurrent_Convolutional_2015_CVPR_paper.pdf)

&nbsp;

## Performance
Accuracy |
:-----------:|
62.43% (only 4 epochs)|

&nbsp;

## Training Environment
+ Ubuntu 16.04.7 LTS
+ CUDA 10.1
+ PyTorch 1.7.1
+ torchvision 0.8.2
+ numpy 1.19.2
+ pillow 8.1.0
+ python 3.8.5
+ av 8.0.3
+ matplotlib 3.3.4

&nbsp;

## Data Preparation
Original dataset: [UCF101](https://www.crcv.ucf.edu/data/UCF101.php)

After downloading the UCF101 dataset (**`UCF101.rar`** and **`UCF101TrainTestSplits-RecognitionTask.zip`**), extract each archive separately.
Then put them into a directory named **`data`**:
```
Project
│--- data
│------ UCF101
│------ UCF101TrainTestSplits-RecognitionTask
│--- other files
```

&nbsp;

## Train
Before training, make sure there is a directory named **`model`** in the project root; the checkpoint files are saved there.
```bash
python3 train.py
```
&nbsp;

## Problems
I recorded some problems and their solutions while writing the code. I'm really sorry that they are only written in Chinese!
Here is the [link](https://blog.csdn.net/qq_36627158/article/details/114026519).
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
# configuration of the LSTM
input_size = 4096    # dimension of the per-frame CNN feature
hidden_size = 1024
num_of_layers = 1
seq_length = 16      # number of frames per clip


# configuration of the dataset
classNum = 101


# configuration of the training process
BATCH_SIZE = 256
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch
import torchvision.models as models
import config


class LRCN(nn.Module):
    def __init__(self):
        super(LRCN, self).__init__()

        # define the CNN part: a pretrained AlexNet
        self.featureExtractor = models.alexnet(pretrained=True)
        # keep the classifier only up to fc6, so each frame is encoded as a 4096-d feature
        # (fc7 and the final classification layer are removed)
        self.featureExtractor.classifier = nn.Sequential(*list(self.featureExtractor.classifier.children())[:-5])

        # define the LSTM part
        # (with num_of_layers=1 the dropout argument has no effect; PyTorch only applies it between stacked layers)
        self.lstm = nn.LSTM(input_size=config.input_size, hidden_size=config.hidden_size, num_layers=config.num_of_layers, dropout=0.9, batch_first=True)
        # define a linear layer that maps each hidden state to class scores
        self.linearLayer = nn.Linear(config.hidden_size, config.classNum)


    def forward(self, video_clip):
        # video clip's dimension: [B, C, T, H, W]

        # frameFeatures' dimension: [B, T, CNN output dimension (4096)]
        # it is used to store every frame's feature
        frameFeatures = torch.empty(size=(video_clip.size()[0], video_clip.size()[2], config.input_size), device=video_clip.device)

        # run the CNN on each frame independently
        for t in range(0, video_clip.size()[2]):
            frame = video_clip[:, :, t, :, :]
            frame_feature = self.featureExtractor(frame)
            frameFeatures[:, t, :] = frame_feature

        # x is the output of the LSTM: (batch, seq_len, hidden_size)
        x, _ = self.lstm(frameFeatures)

        # input's dimension: (batch_size, seq_length, hidden_size)
        # output's dimension: (batch_size, seq_length, classNum)
        x = self.linearLayer(x)

        # average the frame-wise class scores over time
        # output's dimension: (batch, classNum)
        x = torch.mean(x, dim=1)

        return x


if __name__ == '__main__':
    model = LRCN()
    frames = torch.rand(config.BATCH_SIZE, 3, config.seq_length, 227, 227)
    output = model(frames)

    print(output.size())
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
from LoadUCF101DataByTorch import trainset_loader, testset_loader
from model import LRCN
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt


# training hyperparameters
EPOCH = 10
LEARNING_RATE = 0.003
MOMENTUM = 0.9
GAMMA = 0.5
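# GAMMA and STEP_SIZE configure the optional StepLR schedule (commented out below):
# the learning rate would be multiplied by GAMMA every STEP_SIZE epochs.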
STEP_SIZE = 1


# run on the GPU if one is available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)


model = LRCN().to(device)


optimizer = torch.optim.SGD(
    model.parameters(),
    lr=LEARNING_RATE,
    momentum=MOMENTUM
)
# scheduler = torch.optim.lr_scheduler.StepLR(
#     optimizer,
#     step_size=STEP_SIZE,
#     gamma=GAMMA
# )


def save_checkpoint(path, model, optimizer):
    # store both the model and the optimizer state so training can be resumed
    state = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(state, path)


def train(epoch):
    iteration = 0
    loss_plt = []

    for i in range(epoch):
        model.train()
        # print('current lr', scheduler.get_last_lr())

        for index, data in enumerate(trainset_loader):
            video_clips, label = data

            video_clips = video_clips.to(device)
            label = label.to(device)

            optimizer.zero_grad()

            output = model(video_clips)

            loss = F.cross_entropy(output, label)

            loss_plt.append(loss.item())

            loss.backward()
            optimizer.step()

            iteration += 1

            print("Epoch:", i, "/", epoch-1, "\tIteration:", index, "/", len(trainset_loader)-1, "\tLoss: " + str(loss.item()))
            with open('log.txt', 'a') as f:
                f.write("Epoch: " + str(i) + "/" + str(epoch-1) + "\tIteration: " + str(index) + "/" + str(len(trainset_loader)-1) + "\tLoss: " + str(loss.item()) + "\n")

            # save a checkpoint after every iteration
            save_checkpoint('model/checkpoint-%i.pth' % iteration, model, optimizer)

        # evaluate on the test split after every epoch
        test(i)

        # scheduler.step()

    save_checkpoint('model/checkpoint-%i.pth' % iteration, model, optimizer)

    # plot the training loss curve
    plt.figure()
    plt.plot(loss_plt)
    plt.title('Loss')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.show()


def test(i_epoch):
    model.eval()

    correct = 0

    with torch.no_grad():
        for index, data in enumerate(testset_loader):
            video_clips, label = data

            video_clips = video_clips.to(device)
            label = label.to(device)

            output = model(video_clips)

            # count the clips whose top-1 prediction matches the label
            max_value, max_index = output.max(1, keepdim=True)
            correct += max_index.eq(label.view_as(max_index)).sum().item()

    accuracy = correct * 100.0 / len(testset_loader.dataset)
    print("Accuracy: " + str(accuracy))
    with open('log.txt', 'a') as f:
        f.write("Epoch " + str(i_epoch) + "'s Accuracy: " + str(accuracy) + "\n")


if __name__ == '__main__':
    train(EPOCH)
--------------------------------------------------------------------------------
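train.py only shows how checkpoints are written. As a minimal sketch of the reverse direction, restoring a saved state for evaluation or further training, something like the following should work; the checkpoint filename is only an example, use whichever `model/checkpoint-*.pth` file you actually have:

```python
# Minimal sketch (not part of the repository): reload a checkpoint written by save_checkpoint().
import torch
from model import LRCN

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LRCN().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.003, momentum=0.9)

# save_checkpoint() stores a dict with 'model' and 'optimizer' entries
state = torch.load('model/checkpoint-1000.pth', map_location=device)  # example filename
model.load_state_dict(state['model'])
optimizer.load_state_dict(state['optimizer'])

model.eval()  # switch to evaluation mode before running test()
```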