├── LoadUCF101DataByTorch.py
├── README.md
├── config.py
├── model.py
└── train.py

/LoadUCF101DataByTorch.py:
--------------------------------------------------------------------------------
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torchvision.transforms._transforms_video as v_transform
import torch
import config


# video transforms: convert the clip to a float tensor, normalize it per channel,
# then apply a random horizontal flip and a random 112x112 crop
transform = transforms.Compose([
    v_transform.ToTensorVideo(),
    v_transform.NormalizeVideo(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]),
    v_transform.RandomHorizontalFlipVideo(),
    v_transform.RandomCropVideo(112),
])


def custom_collate(batch):
    # UCF101 yields (video, audio, label); drop the audio stream before batching
    filtered_batch = []
    for video, _, label in batch:
        filtered_batch.append((video, label))
    return torch.utils.data.dataloader.default_collate(filtered_batch)


trainset = datasets.UCF101(
    root='data/UCF101/UCF-101',
    annotation_path='data/UCF101TrainTestSplits-RecognitionTask/ucfTrainTestlist',
    frames_per_clip=config.seq_length,
    num_workers=0,
    transform=transform,
    step_between_clips=2
)

trainset_loader = DataLoader(
    trainset,
    batch_size=config.BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    collate_fn=custom_collate
)


testset = datasets.UCF101(
    root='data/UCF101/UCF-101',
    annotation_path='data/UCF101TrainTestSplits-RecognitionTask/ucfTrainTestlist',
    frames_per_clip=config.seq_length,
    num_workers=4,
    train=False,
    transform=transform,
    step_between_clips=2
)

testset_loader = DataLoader(
    testset,
    batch_size=config.BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    collate_fn=custom_collate
)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LRCN_PyTorch
This project includes the whole training process of LRCN on UCF101. Specifically, I use the PyTorch 1.7 **`VideoIO / Video Datasets Loading API / Video Transform`** to process the data. [More details: how to use Video Datasets, Video IO, Video Classification Models, and Video Transforms in PyTorch](https://blog.csdn.net/qq_36627158/article/details/113791050)

The LRCN paper: Long-term Recurrent Convolutional Networks for Visual Recognition and Description. [download](https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Donahue_Long-Term_Recurrent_Convolutional_2015_CVPR_paper.pdf)

&nbsp;

## Performance
Accuracy |
:-----------:|
62.43% (only 4 epochs)|

&nbsp;

## Training Environment
+ Ubuntu 16.04.7 LTS
+ CUDA 10.1
+ PyTorch 1.7.1
+ torchvision 0.8.2
+ numpy 1.19.2
+ pillow 8.1.0
+ python 3.8.5
+ av 8.0.3
+ matplotlib 3.3.4

&nbsp;

## Data Preparation
Original dataset: [UCF101](https://www.crcv.ucf.edu/data/UCF101.php)

After downloading the UCF101 dataset (**`UCF101.rar`** and **`UCF101TrainTestSplits-RecognitionTask.zip`**), extract each archive separately.
Then put them into a directory named **`data`**:
```
Project
│--- data
│------ UCF101
│------ UCF101TrainTestSplits-RecognitionTask
│--- other files
```

&nbsp;

## Train
Before training, make sure there is a directory named **`model`** in the project root; the checkpoint files are saved there.
```bash
python3 train.py
```
&nbsp;

## Problems
I recorded some problems and their solutions while writing the code. I'm really sorry that they are only written in Chinese!
Here is the [link](https://blog.csdn.net/qq_36627158/article/details/114026519).
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
# configuration of the LSTM
input_size = 4096    # dimension of the per-frame CNN feature
hidden_size = 1024
num_of_layers = 1
seq_length = 16      # number of frames per clip


# configuration of the dataset
classNum = 101


# configuration of the training process
BATCH_SIZE = 256
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch
import torchvision.models as models
import config


class LRCN(nn.Module):
    def __init__(self):
        super(LRCN, self).__init__()

        # define the CNN part: a pretrained AlexNet
        self.featureExtractor = models.alexnet(pretrained=True)
        # keep the classifier only up to fc6, so each frame is encoded as a 4096-d feature
        # (fc7 and the final classification layer are removed)
        self.featureExtractor.classifier = nn.Sequential(*list(self.featureExtractor.classifier.children())[:-5])

        # define the LSTM part
        # (with num_of_layers=1 the dropout argument has no effect; PyTorch only applies it between stacked layers)
        self.lstm = nn.LSTM(input_size=config.input_size, hidden_size=config.hidden_size, num_layers=config.num_of_layers, dropout=0.9, batch_first=True)
        # define a linear layer that maps each hidden state to class scores
        self.linearLayer = nn.Linear(config.hidden_size, config.classNum)


    def forward(self, video_clip):
        # video clip's dimension: [B, C, T, H, W]

        # frameFeatures' dimension: [B, T, CNN output dimension (4096)]
        # it is used to store every frame's feature
        frameFeatures = torch.empty(size=(video_clip.size()[0], video_clip.size()[2], config.input_size), device=video_clip.device)

        # run the CNN on each frame independently
        for t in range(0, video_clip.size()[2]):
            frame = video_clip[:, :, t, :, :]
            frame_feature = self.featureExtractor(frame)
            frameFeatures[:, t, :] = frame_feature

        # x is the output of the LSTM: (batch, seq_len, hidden_size)
        x, _ = self.lstm(frameFeatures)

        # input's dimension: (batch_size, seq_length, hidden_size)
        # output's dimension: (batch_size, seq_length, classNum)
        x = self.linearLayer(x)

        # average the frame-wise class scores over time
        # output's dimension: (batch, classNum)
        x = torch.mean(x, dim=1)

        return x


if __name__ == '__main__':
    model = LRCN()
    frames = torch.rand(config.BATCH_SIZE, 3, config.seq_length, 227, 227)
    output = model(frames)

    print(output.size())
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
from LoadUCF101DataByTorch import trainset_loader, testset_loader
from model import LRCN
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt


# training hyperparameters
EPOCH = 10
LEARNING_RATE = 0.003
MOMENTUM = 0.9
GAMMA = 0.5
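# GAMMA and STEP_SIZE configure the optional StepLR schedule (commented out below):
# the learning rate would be multiplied by GAMMA every STEP_SIZE epochs.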
STEP_SIZE = 1


# run on the GPU if one is available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)


model = LRCN().to(device)


optimizer = torch.optim.SGD(
    model.parameters(),
    lr=LEARNING_RATE,
    momentum=MOMENTUM
)
# scheduler = torch.optim.lr_scheduler.StepLR(
#     optimizer,
#     step_size=STEP_SIZE,
#     gamma=GAMMA
# )


def save_checkpoint(path, model, optimizer):
    # store both the model and the optimizer state so training can be resumed
    state = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(state, path)


def train(epoch):
    iteration = 0
    loss_plt = []

    for i in range(epoch):
        model.train()
        # print('current lr', scheduler.get_last_lr())

        for index, data in enumerate(trainset_loader):
            video_clips, label = data

            video_clips = video_clips.to(device)
            label = label.to(device)

            optimizer.zero_grad()

            output = model(video_clips)

            loss = F.cross_entropy(output, label)

            loss_plt.append(loss.item())

            loss.backward()
            optimizer.step()

            iteration += 1

            print("Epoch:", i, "/", epoch-1, "\tIteration:", index, "/", len(trainset_loader)-1, "\tLoss: " + str(loss.item()))
            with open('log.txt', 'a') as f:
                f.write("Epoch: " + str(i) + "/" + str(epoch-1) + "\tIteration: " + str(index) + "/" + str(len(trainset_loader)-1) + "\tLoss: " + str(loss.item()) + "\n")

            # save a checkpoint after every iteration
            save_checkpoint('model/checkpoint-%i.pth' % iteration, model, optimizer)

        # evaluate on the test split after every epoch
        test(i)

        # scheduler.step()

    save_checkpoint('model/checkpoint-%i.pth' % iteration, model, optimizer)

    # plot the training loss curve
    plt.figure()
    plt.plot(loss_plt)
    plt.title('Loss')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.show()


def test(i_epoch):
    model.eval()

    correct = 0

    with torch.no_grad():
        for index, data in enumerate(testset_loader):
            video_clips, label = data

            video_clips = video_clips.to(device)
            label = label.to(device)

            output = model(video_clips)

            # count the clips whose top-1 prediction matches the label
            max_value, max_index = output.max(1, keepdim=True)
            correct += max_index.eq(label.view_as(max_index)).sum().item()

    accuracy = correct * 100.0 / len(testset_loader.dataset)
    print("Accuracy: " + str(accuracy))
    with open('log.txt', 'a') as f:
        f.write("Epoch " + str(i_epoch) + "'s Accuracy: " + str(accuracy) + "\n")


if __name__ == '__main__':
    train(EPOCH)
--------------------------------------------------------------------------------
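train.py only shows how checkpoints are written. As a minimal sketch of the reverse direction, restoring a saved state for evaluation or further training, something like the following should work; the checkpoint filename is only an example, use whichever `model/checkpoint-*.pth` file you actually have:

```python
# Minimal sketch (not part of the repository): reload a checkpoint written by save_checkpoint().
import torch
from model import LRCN

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LRCN().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.003, momentum=0.9)

# save_checkpoint() stores a dict with 'model' and 'optimizer' entries
state = torch.load('model/checkpoint-1000.pth', map_location=device)  # example filename
model.load_state_dict(state['model'])
optimizer.load_state_dict(state['optimizer'])

model.eval()  # switch to evaluation mode before running test()
```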