├── .gitignore ├── 3DCNN ├── ARTNet │ ├── ARTNet.py │ └── README.md ├── FstCN │ ├── FstCN.py │ └── README.md ├── I3D │ ├── I3D.py │ └── README.md ├── LTC │ ├── LTC.py │ └── README.md ├── P3D │ ├── P3D.py │ └── README.md ├── R21D_34 │ ├── R21D_34.py │ └── README.md ├── Res3D │ ├── README.md │ └── Res3D.py ├── S3D │ ├── Fast_S3D.py │ ├── README.md │ └── S3D_G.py └── c3d │ ├── README.md │ └── c3d.py ├── CNN+LSTM ├── ALSTM │ ├── ALSTM.py │ └── README.md ├── LRCNs │ ├── LRCNs.py │ └── README.md └── convpooling_LSTM │ ├── README.md │ └── convpooling_LSTM.py ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /3DCNN/ARTNet/ARTNet.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @project: ARTNet 3 | @author: Zhimeng Zhang 4 | ''' 5 | import torch.nn as nn 6 | import torch 7 | 8 | class SMART_block(nn.Module): 9 | 10 | def __init__(self, in_channel,out_channel,kernel_size=(3,3,3), stride=(1,1,1), padding=(1,1,1)): 11 | super(SMART_block, self).__init__() 12 | 13 | self.appearance_conv=nn.Conv3d(in_channel, out_channel, kernel_size=(1,kernel_size[1],kernel_size[2]),stride= stride,padding=(0, padding[1], padding[2]),bias=False) 14 | self.appearance_bn=nn.BatchNorm3d(out_channel) 15 | 16 | self.relation_conv=nn.Conv3d(in_channel, out_channel,kernel_size=kernel_size, stride=stride, padding=padding, bias=False) 17 | self.relation_bn1=nn.BatchNorm3d(out_channel) 18 | self.relation_pooling=nn.Conv3d(out_channel,out_channel//2,kernel_size=1,stride=1,groups=out_channel//2,bias=False) 19 | nn.init.constant_(self.relation_pooling.weight,0.5) 20 | self.relation_pooling.weight.requires_grad=False 21 | self.relation_bn2 = 
nn.BatchNorm3d(out_channel//2) 22 | 23 | self.reduce=nn.Conv3d(out_channel+out_channel//2,out_channel,kernel_size=1,bias=False) 24 | self.reduce_bn=nn.BatchNorm3d(out_channel) 25 | 26 | self.relu = nn.ReLU() 27 | if in_channel != out_channel or stride[0] != 1 or stride[1] != 1: 28 | self.down_sample = nn.Sequential(nn.Conv3d(in_channel, out_channel, kernel_size=1, 29 | stride=stride, 30 | bias=False), 31 | nn.BatchNorm3d(out_channel)) 32 | else: 33 | self.down_sample = None 34 | 35 | def forward(self, x): 36 | appearance=x 37 | relation=x 38 | appearance=self.appearance_conv(appearance) 39 | appearance=self.appearance_bn(appearance) 40 | relation=self.relation_conv(relation) 41 | relation=self.relation_bn1(relation) 42 | relation=torch.pow(relation,2) 43 | relation=self.relation_pooling(relation) 44 | relation=self.relation_bn2(relation) 45 | stream=self.relu(torch.cat([appearance,relation],1)) 46 | stream=self.reduce(stream) 47 | stream=self.reduce_bn(stream) 48 | if self.down_sample is not None: 49 | x=self.down_sample(x) 50 | 51 | return self.relu(stream+x) 52 | 53 | 54 | class ARTNet(nn.Module): 55 | # Input size: 16x112x112 56 | def __init__(self, num_class): 57 | super(ARTNet, self).__init__() 58 | 59 | self.conv1=SMART_block(3,64,kernel_size=(3,7,7),stride=(2,2,2),padding=(1,3,3)) 60 | self.conv2=nn.Sequential(SMART_block(64,64), 61 | SMART_block(64, 64)) 62 | self.conv3=nn.Sequential(SMART_block(64,128,stride=(2,2,2)), 63 | SMART_block(128, 128)) 64 | self.conv4 = nn.Sequential(SMART_block(128, 256, stride=(2,2,2)), 65 | SMART_block(256, 256)) 66 | self.conv5 = nn.Sequential(SMART_block(256, 512, stride=(2,2,2)), 67 | SMART_block(512, 512)) 68 | self.avg_pool=nn.AvgPool3d(kernel_size=(1,7,7)) 69 | self.linear=nn.Linear(512,num_class) 70 | 71 | def forward(self, x): 72 | x=self.conv1(x) 73 | x=self.conv2(x) 74 | x=self.conv3(x) 75 | x=self.conv4(x) 76 | x=self.conv5(x) 77 | x=self.avg_pool(x) 78 | return self.linear(x.view(x.size(0),-1)) -------------------------------------------------------------------------------- /3DCNN/ARTNet/README.md: -------------------------------------------------------------------------------- 1 | # Appearance-and-Relation Networks for Video Classification 2 | This paper can be downloaded [here](http://openaccess.thecvf.com/content_cvpr_2018/papers/Wang_Appearance-and-Relation_Networks_for_CVPR_2018_paper.pdf). 3 | 4 | 5 | ## Note 6 | I reproduce the model based on the original caffe code [here](https://github.com/wanglimin/ARTNet.git). 
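## Usage example
A minimal smoke test, assuming `ARTNet.py` is on your `PYTHONPATH`; the 16x112x112 clip size follows the comment in the code, and the class count is only an example:

```python
import torch
from ARTNet import ARTNet

model = ARTNet(num_class=101)           # 101 classes is an example (e.g. UCF101)
clip = torch.randn(2, 3, 16, 112, 112)  # (batch, channel, time, height, width)
logits = model(clip)
print(logits.shape)                     # expected: torch.Size([2, 101])
```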
7 | 8 | 9 | -------------------------------------------------------------------------------- /3DCNN/FstCN/FstCN.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @project: FstCN 3 | @author: MRzzm 4 | @E-mail: zhangzhimeng1@gmail.com 5 | @github: https://github.com/MRzzm/action-recognition-models-pytorch.git 6 | ''' 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import init 11 | 12 | class TCL(nn.Module): 13 | def __init__(self, in_channels,init_weights): 14 | super(TCL, self).__init__() 15 | self.branch1=nn.Sequential(nn.Conv3d(in_channels,32,kernel_size=(3,1,1),stride=(1,1,1),padding=(1,0,0)), 16 | nn.ReLU(True), 17 | nn.MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1)) 18 | ) 19 | self.branch2=nn.Sequential(nn.Conv3d(in_channels,32,kernel_size=(5,1,1),stride=(1,1,1),padding=(2,0,0)), 20 | nn.ReLU(True), 21 | nn.MaxPool3d(kernel_size=(2,1,1),stride=(2,1,1)) 22 | ) 23 | if init_weights: 24 | self._initialize_weights() 25 | 26 | def forward(self, x): 27 | res1=self.branch1(x) 28 | res2=self.branch2(x) 29 | return torch.cat([res1,res2],1) 30 | 31 | def _initialize_weights(self): 32 | for m in self.modules(): 33 | if isinstance(m, nn.Sequential): 34 | for n in m: 35 | if isinstance(n,nn.Conv3d): 36 | init.xavier_uniform_(n.weight) 37 | init.constant_(n.bias, 0) 38 | 39 | 40 | 41 | # input_size: 16x204x204 42 | class FstCN(nn.Module): 43 | def __init__(self, num_class, init_weights=True): 44 | super(FstCN, self).__init__() 45 | 46 | self.SCL1 = nn.Sequential(nn.Conv3d(3, 96, kernel_size=(1,7,7), stride=(1,2,2),padding=(0,3,3)), 47 | nn.ReLU(True), 48 | nn.MaxPool3d((1,3,3),stride=(1,2,2))) 49 | self.SCL2=nn.Sequential(nn.Conv3d(96, 256, kernel_size=(1,5,5), stride=(1,2,2),padding=(0,2,2)), 50 | nn.ReLU(True), 51 | nn.MaxPool3d((1,3,3),stride=(1,2,2))) 52 | self.SCL3 = nn.Sequential(nn.Conv3d(256, 512, kernel_size=(1,3,3),stride=(1,1,1),padding=(0,1,1)), 53 | nn.ReLU(True) 54 | ) 55 | self.SCL4 = nn.Sequential(nn.Conv3d(512, 512, kernel_size=(1,3,3),stride=(1,1,1),padding=(0,1,1)), 56 | nn.ReLU(True) 57 | ) 58 | 59 | self.Parallel_temporal = nn.Sequential( nn.Conv3d(512,128,kernel_size=(1,3,3),stride=(1,1,1),padding=(0,1,1)), 60 | nn.MaxPool3d((1,3,3),stride=(1,3,3)), 61 | TCL(in_channels=128,init_weights=init_weights) 62 | ) 63 | self.Parallel_spatial = nn.Sequential( nn.Conv2d(512,128,kernel_size=(3,3),stride=(1,1),padding=(1,1)), 64 | nn.MaxPool2d((3, 3), stride=(3, 3)) 65 | ) 66 | self.tem_fc=nn.Sequential(nn.Linear(8192, 4096), 67 | nn.Dropout(), 68 | nn.Linear(4096, 2048)) 69 | self.spa_fc = nn.Sequential(nn.Linear(2048, 4096), 70 | nn.Dropout(), 71 | nn.Linear(4096, 2048)) 72 | self.fc=nn.Linear(4096,2048) 73 | self.out=nn.Linear(2048,num_class) 74 | 75 | if init_weights: 76 | self._initialize_weights() 77 | 78 | def forward(self,clip,clip_diff): 79 | clip_all=torch.cat([clip,clip_diff],2) 80 | clip_len=clip.size(2) 81 | clip_all = self.SCL1(clip_all) 82 | clip_all = self.SCL2(clip_all) 83 | clip_all = self.SCL3(clip_all) 84 | clip_all = self.SCL4(clip_all) 85 | clip=clip_all[:,:,:clip_len,:,:] 86 | clip_diff=clip_all[:,:,clip_len:,:,:] 87 | clip=torch.squeeze(clip[:,:,clip.size(2)//2,:,:]) 88 | clip = self.Parallel_spatial(clip) 89 | clip=self.spa_fc(clip.view(clip.size(0),-1)) 90 | clip_diff = self.Parallel_temporal(clip_diff) 91 | clip_diff=self.tem_fc(clip_diff.view(clip_diff.size(0),-1)) 92 | res = torch.cat([clip,clip_diff],1) 93 | res=self.fc(res) 94 | res=self.out(res) 95 | return res 96 | 97 | def 
_initialize_weights(self): 98 | for m in self.modules(): 99 | if isinstance(m, nn.Sequential): 100 | for n in m: 101 | if isinstance(n,nn.Conv3d): 102 | init.xavier_uniform_(n.weight) 103 | if n.bias is not None: 104 | init.constant_(n.bias, 0) 105 | elif isinstance(n,nn.Conv2d): 106 | init.xavier_uniform_(n.weight) 107 | if n.bias is not None: 108 | init.constant_(n.bias, 0) 109 | elif isinstance(m, nn.Linear): 110 | init.xavier_uniform_(m.weight) 111 | init.constant_(m.bias, 0) 112 | -------------------------------------------------------------------------------- /3DCNN/FstCN/README.md: -------------------------------------------------------------------------------- 1 | # Human Action Recognition using Factorized Spatio-Temporal Convolutional Networks 2 | This paper can be downloaded [here](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Sun_Human_Action_Recognition_ICCV_2015_paper.pdf). 3 | 4 | ## Detailed introduction of the paper 5 | I introduced the paper in detail in my [blog](https://blog.csdn.net/zzmshuai/article/details/84880257). 6 | 7 | 8 | -------------------------------------------------------------------------------- /3DCNN/I3D/I3D.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @project: I3D 3 | @author: Zhimeng Zhang 4 | @E-mail: zhangzhimeng1@gmail.com 5 | @github: https://github.com/MRzzm/action-recognition-models-pytorch.git 6 | ''' 7 | import torch 8 | import torch.nn as nn 9 | 10 | class BasicConv3d(nn.Module): 11 | def __init__(self, in_channel, out_channel, kernel_size, stride, padding=0): 12 | super(BasicConv3d, self).__init__() 13 | self.conv = nn.Conv3d(in_channel, out_channel, 14 | kernel_size=kernel_size, stride=stride, 15 | padding=padding) 16 | self.bn = nn.BatchNorm3d(out_channel, 17 | eps=0.001, # value found in tensorflow 18 | ) 19 | self.relu = nn.ReLU() 20 | 21 | def forward(self, x): 22 | x = self.conv(x) 23 | x = self.bn(x) 24 | x = self.relu(x) 25 | return x 26 | 27 | class Inception_block(nn.Module): 28 | 29 | def __init__(self,in_channel,out_channel): 30 | super(Inception_block, self).__init__() 31 | # out_channel=[1x1x1,3x3x3_reduce,3x3x3,3x3x3_reduce,3x3x3,pooling_reduce] 32 | 33 | self.branch1 = BasicConv3d(in_channel,out_channel[0], kernel_size=1, stride=1) 34 | self.branch2 = nn.Sequential( 35 | BasicConv3d(in_channel, out_channel[1], kernel_size=1, stride=1), 36 | BasicConv3d(out_channel[1], out_channel[2],kernel_size=3, stride=1, padding=1) 37 | ) 38 | self.branch3 = nn.Sequential( 39 | BasicConv3d(in_channel, out_channel[3], kernel_size=1, stride=1), 40 | BasicConv3d(out_channel[3], out_channel[4], kernel_size=3, stride=1, padding=1) 41 | ) 42 | self.branch4 = nn.Sequential( 43 | nn.MaxPool3d(kernel_size=3,stride=1,padding=1), 44 | BasicConv3d(in_channel, out_channel[5], kernel_size=1, stride=1), 45 | ) 46 | 47 | def forward(self, x): 48 | x1 = self.branch1(x) 49 | x2 = self.branch2(x) 50 | x3 = self.branch3(x) 51 | x4 = self.branch4(x) 52 | return torch.cat([x1,x2,x3,x4], 1) 53 | 54 | 55 | class I3D(nn.Module): 56 | # Input size: 64x224x224 57 | def __init__(self, num_class): 58 | super(I3D, self).__init__() 59 | 60 | self.conv1=BasicConv3d(3,64,kernel_size=7,stride=2,padding=3) 61 | self.pool1=nn.MaxPool3d(kernel_size=(1,3,3),stride=(1,2,2),padding=(0,1,1)) 62 | self.conv2=BasicConv3d(64,64,kernel_size=1,stride=1) 63 | self.conv3=BasicConv3d(64,192,kernel_size=3,stride=1,padding=1) 64 | self.pool2=nn.MaxPool3d(kernel_size=(1,3,3),stride=(1,2,2),padding=(0,1,1)) 65 | 
self.Inception1=nn.Sequential(Inception_block(192, [64,96,128,16,32,32]), 66 | Inception_block(256, [128, 128, 192, 32, 96, 64])) 67 | self.pool3=nn.MaxPool3d(kernel_size=(3,3,3),stride=(2,2,2),padding=(1,1,1)) 68 | self.Inception2=nn.Sequential(Inception_block(480,[192,96,208,16,48,64]), 69 | Inception_block(512, [160, 112, 224, 24, 64, 64]), 70 | Inception_block(512, [128, 128, 256, 24, 64, 64]), 71 | Inception_block(512, [112, 144, 288, 32, 64, 64]), 72 | Inception_block(528, [256, 160, 320, 32, 128, 128])) 73 | self.pool4=nn.MaxPool3d(kernel_size=(2,2,2),stride=2) 74 | self.Inception3=nn.Sequential(Inception_block(832,[256,160,320,32,128,128]), 75 | Inception_block(832, [384, 192, 384, 48, 128, 128])) 76 | self.avg_pool=nn.AvgPool3d(kernel_size=(8,7,7)) 77 | self.dropout = nn.Dropout(0.4) 78 | self.linear=nn.Linear(1024,num_class) 79 | 80 | def forward(self, x): 81 | x = self.conv1(x) 82 | x = self.pool1(x) 83 | x = self.conv2(x) 84 | x = self.conv3(x) 85 | x = self.pool2(x) 86 | x = self.Inception1(x) 87 | x = self.pool3(x) 88 | x = self.Inception2(x) 89 | x = self.pool4(x) 90 | x = self.Inception3(x) 91 | x = self.avg_pool(x) 92 | x = self.dropout(x.view(x.size(0),-1)) 93 | return self.linear(x) -------------------------------------------------------------------------------- /3DCNN/I3D/README.md: -------------------------------------------------------------------------------- 1 | # Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset 2 | This paper can be downloaded [here](https://arxiv.org/pdf/1705.07750.pdf). 3 | Due to time constraints, I only provide the code of I3D without pre-trained ImageNet parameters. 4 | 5 | ## Detailed introduction of the paper 6 | I introduced the paper in detail in my [blog](https://blog.csdn.net/zzmshuai/article/details/84936338). 
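## Usage example
A minimal sketch, assuming `I3D.py` is importable; the 64x224x224 clip size follows the comment in the code, and 400 classes (as in Kinetics) is only an example:

```python
import torch
from I3D import I3D

model = I3D(num_class=400)              # 400 classes is an example (Kinetics)
clip = torch.randn(2, 3, 64, 224, 224)  # (batch, channel, time, height, width)
logits = model(clip)
print(logits.shape)                     # expected: torch.Size([2, 400])
```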
7 | 8 | 9 | -------------------------------------------------------------------------------- /3DCNN/LTC/LTC.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @project: LTC 3 | @author: Zhimeng Zhang 4 | ''' 5 | 6 | import torch.nn as nn 7 | from torch.nn import init 8 | 9 | class LTC(nn.Module): 10 | # input size: 100x71x71 11 | def __init__(self, num_class, init_weights=True): 12 | super(LTC, self).__init__() 13 | 14 | self.conv1 = nn.Conv3d(3, 64, kernel_size=3, padding=1) 15 | self.conv2 = nn.Conv3d(64, 128, kernel_size=3, padding=1) 16 | self.conv3 = nn.Conv3d(128, 256, kernel_size=3, padding=1) 17 | self.conv4 = nn.Conv3d(256, 256, kernel_size=3, padding=1) 18 | self.conv5 = nn.Conv3d(256, 256, kernel_size=3, padding=1) 19 | 20 | self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)) 21 | self.pool2 = nn.MaxPool3d(kernel_size=2, stride=2) 22 | self.pool3 = nn.MaxPool3d(kernel_size=2, stride=2) 23 | self.pool4 = nn.MaxPool3d(kernel_size=2, stride=2) 24 | self.pool5 = nn.MaxPool3d(kernel_size=2, stride=2) 25 | 26 | self.fc1 = nn.Linear(6144, 2048) 27 | self.fc2 = nn.Linear(2048, 2048) 28 | self.out = nn.Linear(2048, num_class) 29 | 30 | self.relu = nn.ReLU() 31 | self.dropout=nn.Dropout(0.9) 32 | if init_weights: 33 | self._initialize_weights() 34 | 35 | def forward(self, x): 36 | 37 | x = self.conv1(x) 38 | x = self.relu(x) 39 | x = self.pool1(x) 40 | 41 | x = self.conv2(x) 42 | x = self.relu(x) 43 | x = self.pool2(x) 44 | 45 | x = self.conv3(x) 46 | x = self.relu(x) 47 | x = self.pool3(x) 48 | 49 | x = self.conv4(x) 50 | x = self.relu(x) 51 | x = self.pool4(x) 52 | 53 | x = self.conv5(x) 54 | x = self.relu(x) 55 | x = self.pool5(x) 56 | 57 | x = x.view(x.size(0), -1) 58 | x = self.relu(self.fc1(x)) 59 | x=self.dropout(x) 60 | x = self.relu(self.fc2(x)) 61 | x = self.dropout(x) 62 | res = self.out(x) 63 | 64 | return res 65 | 66 | def _initialize_weights(self): 67 | for m in self.modules(): 68 | if isinstance(m, nn.Conv3d): 69 | init.xavier_uniform_(m.weight) 70 | if m.bias is not None: 71 | init.constant_(m.bias, 0) 72 | elif isinstance(m, nn.Linear): 73 | init.xavier_uniform_(m.weight) 74 | init.constant_(m.bias, 0) 75 | -------------------------------------------------------------------------------- /3DCNN/LTC/README.md: -------------------------------------------------------------------------------- 1 | # Long-term Temporal Convolutions for Action Recognition 2 | This paper can be downloaded [here](https://arxiv.org/pdf/1604.04494.pdf). 3 | 4 | 5 | ## Detailed introduction of the paper 6 | I introduced the paper in detail in my [blog](https://blog.csdn.net/zzmshuai/article/details/85051850). 
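## Usage example
A minimal smoke test, assuming `LTC.py` is importable; the 100x71x71 clip size follows the comment in the code:

```python
import torch
from LTC import LTC

model = LTC(num_class=101)             # the class count is an example
clip = torch.randn(2, 3, 100, 71, 71)  # (batch, channel, time, height, width)
logits = model(clip)
print(logits.shape)                    # expected: torch.Size([2, 101])
```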
7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /3DCNN/P3D/P3D.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @project: P3D 3 | @author: Zhimeng Zhang 4 | ''' 5 | import torch.nn as nn 6 | 7 | class P3D_Block(nn.Module): 8 | 9 | def __init__(self, blockType, inplanes, planes, stride=1): 10 | super(P3D_Block, self).__init__() 11 | self.expansion = 4 12 | self.blockType=blockType 13 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 14 | self.bn1 = nn.BatchNorm3d(planes) 15 | if self.blockType=='A': 16 | self.conv2D = nn.Conv3d(planes, planes, kernel_size=(1,3,3), stride=(1,stride,stride), 17 | padding=(0,1,1), bias=False) 18 | self.conv1D = nn.Conv3d(planes, planes, kernel_size=(3,1,1), stride=(stride,1,1), 19 | padding=(1,0,0), bias=False) 20 | elif self.blockType == 'B': 21 | self.conv2D = nn.Conv3d(planes, planes, kernel_size=(1, 3, 3), stride=stride, 22 | padding=(0, 1, 1), bias=False) 23 | self.conv1D = nn.Conv3d(planes, planes, kernel_size=(3, 1, 1), stride=stride, 24 | padding=(1, 0, 0), bias=False) 25 | else: 26 | self.conv2D = nn.Conv3d(planes, planes, kernel_size=(1, 3, 3), stride=stride, 27 | padding=(0, 1, 1), bias=False) 28 | self.conv1D = nn.Conv3d(planes, planes, kernel_size=(3, 1, 1), stride=1, 29 | padding=(1, 0, 0), bias=False) 30 | self.bn2D = nn.BatchNorm3d(planes) 31 | self.bn1D = nn.BatchNorm3d(planes) 32 | self.conv3 = nn.Conv3d(planes, planes * self.expansion, kernel_size=1, bias=False) 33 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 34 | self.relu = nn.ReLU() 35 | self.stride = stride 36 | 37 | if self.stride != 1 or inplanes!= planes * self.expansion: 38 | self.downsample = nn.Sequential( 39 | nn.Conv3d(inplanes, planes * self.expansion, 40 | kernel_size=1, stride=stride, bias=False), 41 | nn.BatchNorm3d(planes * self.expansion), 42 | ) 43 | else: 44 | self.downsample=None 45 | 46 | 47 | def forward(self, x): 48 | x_branch = self.conv1(x) 49 | x_branch = self.bn1(x_branch) 50 | x_branch = self.relu(x_branch) 51 | 52 | if self.blockType=='A': 53 | x_branch = self.conv2D(x_branch) 54 | x_branch = self.bn2D(x_branch) 55 | x_branch = self.relu(x_branch) 56 | x_branch = self.conv1D(x_branch) 57 | x_branch = self.bn1D(x_branch) 58 | x_branch = self.relu(x_branch) 59 | elif self.blockType=='B': 60 | x_branch2D = self.conv2D(x_branch) 61 | x_branch2D = self.bn2D(x_branch2D) 62 | x_branch2D = self.relu(x_branch2D) 63 | x_branch1D = self.conv1D(x_branch) 64 | x_branch1D = self.bn1D(x_branch1D) 65 | x_branch=x_branch1D+x_branch2D 66 | x_branch=self.relu(x_branch) 67 | else: 68 | x_branch = self.conv2D(x_branch) 69 | x_branch = self.bn2D(x_branch) 70 | x_branch = self.relu(x_branch) 71 | x_branch1D = self.conv1D(x_branch) 72 | x_branch1D = self.bn1D(x_branch1D) 73 | x_branch=x_branch+x_branch1D 74 | x_branch=self.relu(x_branch) 75 | 76 | x_branch = self.conv3(x_branch) 77 | x_branch = self.bn3(x_branch) 78 | 79 | if self.downsample is not None: 80 | x = self.downsample(x) 81 | 82 | x =x+ x_branch 83 | x = self.relu(x) 84 | return x 85 | 86 | class P3D (nn.Module): 87 | # input size: 16 x 160 x 160 88 | def __init__(self, num_class): 89 | super(P3D, self).__init__() 90 | self.expansion = 4 91 | self.conv1 = nn.Conv3d(3, 64, kernel_size=(1,7,7), stride=(1,2,2), padding=(0,3,3), 92 | bias=False) 93 | self.bn1 = nn.BatchNorm3d(64) 94 | self.relu = nn.ReLU() 95 | self.maxpool = nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)) 96 | 
self.conv2 = nn.Sequential(P3D_Block('A',64,64,2), 97 | P3D_Block('B', 64 * self.expansion, 64), 98 | P3D_Block('C', 64 * self.expansion, 64)) 99 | self.conv3 = nn.Sequential(P3D_Block('A', 64 * self.expansion, 128, 2), 100 | P3D_Block('B', 128 * self.expansion, 128), 101 | P3D_Block('C', 128 * self.expansion, 128), 102 | P3D_Block('A', 128 * self.expansion, 128)) 103 | self.conv4 = nn.Sequential(P3D_Block('B', 128 * self.expansion, 256, 2), 104 | P3D_Block('C', 256 * self.expansion, 256), 105 | P3D_Block('A', 256 * self.expansion, 256), 106 | P3D_Block('B', 256 * self.expansion, 256), 107 | P3D_Block('C', 256 * self.expansion, 256), 108 | P3D_Block('A', 256 * self.expansion, 256)) 109 | self.conv5 = nn.Sequential(P3D_Block('B', 256 * self.expansion, 512, 2), 110 | P3D_Block('C', 512 * self.expansion, 512), 111 | P3D_Block('A', 512 * self.expansion, 512)) 112 | self.average_pool=nn.AvgPool3d((1,3,3)) 113 | self.fc=nn.Linear(512 * self.expansion,num_class) 114 | 115 | def forward(self, x): 116 | x=self.conv1(x) 117 | x=self.bn1(x) 118 | x=self.relu(x) 119 | x=self.maxpool(x) 120 | x=self.conv2(x) 121 | x=self.conv3(x) 122 | x=self.conv4(x) 123 | x=self.conv5(x) 124 | x=self.average_pool(x) 125 | x=x.view(x.size(0),-1) 126 | x = self.fc(x) 127 | return x -------------------------------------------------------------------------------- /3DCNN/P3D/README.md: -------------------------------------------------------------------------------- 1 | # Learning Spatio-Temporal Representation with Pseudo-3D Residual Networks 2 | This paper can be downloaded [here](http://openaccess.thecvf.com/content_ICCV_2017/papers/Qiu_Learning_Spatio-Temporal_Representation_ICCV_2017_paper.pdf). 3 | 4 | ## Note 5 | The structure of the network is not described in detail in this paper, so my code may not match the author's exactly. 6 | 7 | ## Detailed introduction of the paper 8 | I introduced the paper in detail in my [blog](https://blog.csdn.net/zzmshuai/article/details/85099886). 
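## Usage example
A minimal sketch, assuming `P3D.py` is importable; the 16x160x160 clip size follows the comment in the code:

```python
import torch
from P3D import P3D

model = P3D(num_class=101)              # the class count is an example
clip = torch.randn(2, 3, 16, 160, 160)  # (batch, channel, time, height, width)
logits = model(clip)
print(logits.shape)                     # expected: torch.Size([2, 101])
```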
9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /3DCNN/R21D_34/R21D_34.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @project: R21D_34 3 | @author: Zhimeng Zhang 4 | ''' 5 | import torch.nn as nn 6 | 7 | class Res21D_Block(nn.Module): 8 | def __init__(self, in_channel,out_channel, spatial_stride=1,temporal_stride=1): 9 | super(Res21D_Block, self).__init__() 10 | self.MidChannel1=int((27*in_channel*out_channel)/(9*in_channel+3*out_channel)) 11 | self.MidChannel2 = int((27 * out_channel * out_channel) / ( 12 * out_channel)) 12 | self.conv1_2D = nn.Conv3d(in_channel,self.MidChannel1 , kernel_size=(1, 3, 3), stride=(1, spatial_stride, spatial_stride), 13 | padding=(0, 1, 1)) 14 | self.bn1_2D = nn.BatchNorm3d(self.MidChannel1) 15 | self.conv1_1D=nn.Conv3d(self.MidChannel1, out_channel, kernel_size=(3, 1, 1), stride=(temporal_stride, 1, 1), 16 | padding=(1, 0, 0)) 17 | self.bn1_1D = nn.BatchNorm3d(out_channel) 18 | 19 | self.conv2_2D = nn.Conv3d(out_channel, self.MidChannel2, kernel_size=(1, 3, 3), stride=1, 20 | padding=(0, 1, 1)) 21 | self.bn2_2D = nn.BatchNorm3d(self.MidChannel2) 22 | self.conv2_1D = nn.Conv3d(self.MidChannel2, out_channel, kernel_size=(3, 1, 1), stride=1, 23 | padding=(1, 0, 0)) 24 | self.bn2_1D = nn.BatchNorm3d(out_channel) 25 | 26 | self.relu = nn.ReLU() 27 | if in_channel != out_channel or spatial_stride != 1 or temporal_stride != 1: 28 | self.down_sample=nn.Sequential(nn.Conv3d(in_channel, out_channel,kernel_size=1,stride=(temporal_stride, spatial_stride, spatial_stride),bias=False), 29 | nn.BatchNorm3d(out_channel)) 30 | else: 31 | self.down_sample=None 32 | 33 | def forward(self, x): 34 | 35 | x_branch = self.conv1_2D(x) 36 | x_branch=self.bn1_2D(x_branch) 37 | x_branch = self.relu(x_branch) 38 | x_branch=self.conv1_1D(x_branch) 39 | x_branch=self.bn1_1D(x_branch) 40 | x_branch = self.relu(x_branch) 41 | 42 | x_branch = self.conv2_2D(x_branch) 43 | x_branch = self.bn2_2D(x_branch) 44 | x_branch = self.relu(x_branch) 45 | x_branch = self.conv2_1D(x_branch) 46 | x_branch = self.bn2_1D(x_branch) 47 | 48 | if self.down_sample is not None: 49 | x=self.down_sample(x) 50 | return self.relu(x_branch+x) 51 | 52 | class Res21D(nn.Module): 53 | # Input size: 8 x 112 x 112 54 | def __init__(self, num_class): 55 | super(Res21D, self).__init__() 56 | 57 | self.conv1=nn.Conv3d(3,64,kernel_size=(3,7,7),stride=(1,2,2),padding=(1,3,3)) 58 | self.conv2=nn.Sequential(Res21D_Block(64, 64, spatial_stride=2), 59 | Res21D_Block(64, 64), 60 | Res21D_Block(64, 64)) 61 | self.conv3=nn.Sequential(Res21D_Block(64,128,spatial_stride=2,temporal_stride=2), 62 | Res21D_Block(128, 128), 63 | Res21D_Block(128, 128), 64 | Res21D_Block(128, 128),) 65 | self.conv4 = nn.Sequential(Res21D_Block(128, 256, spatial_stride=2,temporal_stride=2), 66 | Res21D_Block(256, 256), 67 | Res21D_Block(256, 256), 68 | Res21D_Block(256, 256), 69 | Res21D_Block(256, 256), 70 | Res21D_Block(256, 256)) 71 | self.conv5 = nn.Sequential(Res21D_Block(256, 512, spatial_stride=2,temporal_stride=2), 72 | Res21D_Block(512, 512), 73 | Res21D_Block(512, 512)) 74 | self.avg_pool=nn.AvgPool3d(kernel_size=(1,4,4)) 75 | self.linear=nn.Linear(512,num_class) 76 | 77 | def forward(self, x): 78 | x=self.conv1(x) 79 | x=self.conv2(x) 80 | x=self.conv3(x) 81 | x=self.conv4(x) 82 | x=self.conv5(x) 83 | x=self.avg_pool(x) 84 | return self.linear(x.view(x.size(0),-1)) 
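
# A minimal smoke test sketch; the 8x112x112 clip size follows the comment on the
# class above, and the class count is just an example.
if __name__ == '__main__':
    import torch
    model = Res21D(num_class=101)
    clip = torch.randn(2, 3, 8, 112, 112)  # (batch, channel, time, height, width)
    print(model(clip).shape)               # expected: torch.Size([2, 101])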
-------------------------------------------------------------------------------- /3DCNN/R21D_34/README.md: -------------------------------------------------------------------------------- 1 | # A Closer Look at Spatiotemporal Convolutions for Action Recognition 2 | This paper can be downloaded [here](http://openaccess.thecvf.com/content_cvpr_2018/CameraReady/2648.pdf). 3 | 4 | ## Detailed introduction of the paper 5 | I introduced the paper in detail in my [blog](https://blog.csdn.net/zzmshuai/article/details/85143711). 6 | 7 | 8 | -------------------------------------------------------------------------------- /3DCNN/Res3D/README.md: -------------------------------------------------------------------------------- 1 | # ConvNet Architecture Search for Spatiotemporal Feature Learning 2 | This paper can be downloaded [here](https://arxiv.org/pdf/1708.05038.pdf). 3 | 4 | ## Detailed introduction of the paper 5 | I introduced the paper in detail in my [blog](https://blog.csdn.net/zzmshuai/article/details/84962135). 6 | 7 | 8 | -------------------------------------------------------------------------------- /3DCNN/Res3D/Res3D.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @project: Res3D 3 | @author: Zhimeng Zhang 4 | @E-mail: zhangzhimeng1@gmail.com 5 | @github: https://github.com/MRzzm/action-recognition-models-pytorch.git 6 | ''' 7 | 8 | import torch.nn as nn 9 | import torch.nn.init as init 10 | 11 | class ResBlock(nn.Module): 12 | def __init__(self, in_channel,out_channel, spatial_stride=1,temporal_stride=1): 13 | super(ResBlock, self).__init__() 14 | 15 | self.conv1 = nn.Conv3d(in_channel, out_channel,kernel_size=(3,3,3),stride=(temporal_stride,spatial_stride,spatial_stride),padding=(1,1,1)) 16 | self.conv2 = nn.Conv3d(out_channel, out_channel,kernel_size=(3, 3, 3),stride=(1, 1, 1),padding=(1, 1, 1)) 17 | self.bn1 = nn.BatchNorm3d(out_channel) 18 | self.bn2 = nn.BatchNorm3d(out_channel) 19 | self.relu = nn.ReLU() 20 | if in_channel != out_channel or spatial_stride != 1 or temporal_stride != 1: 21 | self.down_sample=nn.Sequential(nn.Conv3d(in_channel, out_channel,kernel_size=1,stride=(temporal_stride,spatial_stride,spatial_stride),bias=False), 22 | nn.BatchNorm3d(out_channel)) 23 | else: 24 | self.down_sample=None 25 | 26 | def forward(self, x): 27 | x_branch = self.conv1(x) 28 | x_branch = self.bn1(x_branch) 29 | x_branch = self.relu(x_branch) 30 | x_branch = self.conv2(x_branch) 31 | x_branch = self.bn2(x_branch) 32 | if self.down_sample is not None: 33 | x=self.down_sample(x) 34 | return self.relu(x_branch+x) 35 | 36 | class Res3D(nn.Module): 37 | # Input size: 8x224x224 38 | def __init__(self, num_class): 39 | super(Res3D, self).__init__() 40 | 41 | self.conv1=nn.Conv3d(3,64,kernel_size=(3,7,7),stride=(1,2,2),padding=(1,3,3)) 42 | self.conv2=nn.Sequential(ResBlock(64,64,spatial_stride=2), 43 | ResBlock(64, 64)) 44 | self.conv3=nn.Sequential(ResBlock(64,128,spatial_stride=2,temporal_stride=2), 45 | ResBlock(128, 128)) 46 | self.conv4 = nn.Sequential(ResBlock(128, 256, spatial_stride=2,temporal_stride=2), 47 | ResBlock(256, 256)) 48 | self.conv5 = nn.Sequential(ResBlock(256, 512, spatial_stride=2,temporal_stride=2), 49 | ResBlock(512, 512)) 50 | self.avg_pool=nn.AvgPool3d(kernel_size=(1,7,7)) 51 | self.linear=nn.Linear(512,num_class) 52 | 53 | def forward(self, x): 54 | x=self.conv1(x) 55 | x=self.conv2(x) 56 | x=self.conv3(x) 57 | x=self.conv4(x) 58 | x=self.conv5(x) 59 | x=self.avg_pool(x) 60 | return self.linear(x.view(x.size(0),-1)) 
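
# A minimal smoke test sketch; the 8x224x224 clip size follows the comment on the
# class above, and the class count is just an example.
if __name__ == '__main__':
    import torch
    model = Res3D(num_class=101)
    clip = torch.randn(2, 3, 8, 224, 224)  # (batch, channel, time, height, width)
    print(model(clip).shape)               # expected: torch.Size([2, 101])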
61 | -------------------------------------------------------------------------------- /3DCNN/S3D/Fast_S3D.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @project: Fast_S3D 3 | @author: Zhimeng Zhang 4 | ''' 5 | import torch 6 | import torch.nn as nn 7 | 8 | class BasicConv3d(nn.Module): 9 | def __init__(self, in_channel, out_channel, kernel_size, stride, padding=(0,0,0)): 10 | super(BasicConv3d, self).__init__() 11 | self.conv = nn.Conv3d(in_channel, out_channel, 12 | kernel_size=kernel_size, stride=stride, 13 | padding=padding) 14 | self.bn = nn.BatchNorm3d(out_channel) 15 | self.relu = nn.ReLU() 16 | 17 | def forward(self, x): 18 | x = self.conv(x) 19 | x = self.bn(x) 20 | x = self.relu(x) 21 | return x 22 | 23 | class Inception_block(nn.Module): 24 | 25 | def __init__(self,in_channel,out_channel): 26 | super(Inception_block, self).__init__() 27 | # out_channel=[1x1x1,3x3x3_reduce,3x3x3,3x3x3_reduce,3x3x3,pooling_reduce] 28 | 29 | self.branch1 = BasicConv3d(in_channel,out_channel[0], kernel_size=(3,1,1), stride=1, padding=(1,0,0)) 30 | self.branch2 = nn.Sequential( 31 | BasicConv3d(in_channel, out_channel[1], kernel_size=1, stride=1), 32 | BasicConv3d(out_channel[1], out_channel[2],kernel_size=(1,3,3), stride=1, padding=(0,1,1)) 33 | ) 34 | self.branch3 = nn.Sequential( 35 | BasicConv3d(in_channel, out_channel[3], kernel_size=1, stride=1), 36 | BasicConv3d(out_channel[3], out_channel[4], kernel_size=(1, 3, 3), stride=1, padding= (0, 1, 1)) 37 | ) 38 | self.branch4 = nn.Sequential( 39 | nn.MaxPool3d(kernel_size=(1,3,3),stride=1,padding=(0,1,1)), 40 | BasicConv3d(in_channel, out_channel[5], kernel_size=(3,1,1), stride=1,padding=(1,0,0)) 41 | ) 42 | 43 | def forward(self, x): 44 | x1 = self.branch1(x) 45 | x2 = self.branch2(x) 46 | x3 = self.branch3(x) 47 | x4 = self.branch4(x) 48 | return torch.cat([x1,x2,x3,x4], 1) 49 | 50 | class S3D_block(nn.Module): 51 | 52 | def __init__(self,in_channel,out_channel): 53 | super(S3D_block, self).__init__() 54 | # out_channel=[1x1x1,3x3x3_reduce,3x3x3,3x3x3_reduce,3x3x3,pooling_reduce] 55 | 56 | self.branch1 = BasicConv3d(in_channel,out_channel[0], kernel_size=(3,1,1), stride=1,padding=(1,0,0)) 57 | self.branch2 = nn.Sequential( 58 | BasicConv3d(in_channel, out_channel[1], kernel_size=1, stride=1), 59 | BasicConv3d(out_channel[1], out_channel[1],kernel_size=(1,3,3), stride=1, padding=(0,1,1)), 60 | BasicConv3d(out_channel[1], out_channel[2], kernel_size=(3, 1, 1), stride=1, padding=(1, 0, 0)) 61 | ) 62 | self.branch3 = nn.Sequential( 63 | BasicConv3d(in_channel, out_channel[3], kernel_size=1, stride=1), 64 | BasicConv3d(out_channel[3], out_channel[3], kernel_size=(1, 3, 3), stride=1, padding= (0, 1, 1)), 65 | BasicConv3d(out_channel[3], out_channel[4], kernel_size=(3, 1, 1), stride=1, padding=(1, 0, 0)) 66 | ) 67 | self.branch4 = nn.Sequential( 68 | nn.MaxPool3d(kernel_size=3,stride=1,padding=1), 69 | BasicConv3d(in_channel, out_channel[5], kernel_size=(3,1,1), stride=1,padding=(1,0,0)) 70 | ) 71 | 72 | def forward(self, x): 73 | x1 = self.branch1(x) 74 | x2 = self.branch2(x) 75 | x3 = self.branch3(x) 76 | x4 = self.branch4(x) 77 | return torch.cat([x1,x2,x3,x4], 1) 78 | 79 | class fast_S3D(nn.Module): 80 | # Input size: 64x224x224 81 | def __init__(self, num_class): 82 | super(fast_S3D, self).__init__() 83 | 84 | self.conv1=BasicConv3d(3,64,kernel_size=(1,7,7),stride=2,padding=(0,3,3)) 85 | self.pool1=nn.MaxPool3d(kernel_size=(1,3,3),stride=(1,2,2),padding=(0,1,1)) 86 | 
self.conv2=BasicConv3d(64,64,kernel_size=1,stride=1) 87 | self.conv3=BasicConv3d(64,192,kernel_size=(1,3,3),stride=1,padding=(0,1,1)) 88 | self.pool2=nn.MaxPool3d(kernel_size=(1,3,3),stride=(1,2,2),padding=(0,1,1)) 89 | self.Inception1=nn.Sequential(Inception_block(192, [64,96,128,16,32,32]), 90 | Inception_block(256, [128, 128, 192, 32, 96, 64])) 91 | self.pool3=nn.MaxPool3d(kernel_size=3,stride=2,padding=1) 92 | self.Inception2=nn.Sequential(Inception_block(480,[192,96,208,16,48,64]), 93 | Inception_block(512, [160, 112, 224, 24, 64, 64]), 94 | Inception_block(512, [128, 128, 256, 24, 64, 64]), 95 | Inception_block(512, [112, 144, 288, 32, 64, 64]), 96 | Inception_block(528, [256, 160, 320, 32, 128, 128])) 97 | self.pool4=nn.MaxPool3d(kernel_size=2,stride=2) 98 | self.Inception3=nn.Sequential(S3D_block(832,[256,160,320,32,128,128]), 99 | S3D_block(832, [384, 192, 384, 48, 128, 128])) 100 | self.avg_pool=nn.AvgPool3d(kernel_size=(8,7,7)) 101 | self.dropout = nn.Dropout(0.4) 102 | self.linear=nn.Linear(1024,num_class) 103 | 104 | def forward(self, x): 105 | x = self.conv1(x) 106 | x = self.pool1(x) 107 | x = self.conv2(x) 108 | x = self.conv3(x) 109 | x = self.pool2(x) 110 | x = self.Inception1(x) 111 | x = self.pool3(x) 112 | x = self.Inception2(x) 113 | x = self.pool4(x) 114 | x = self.Inception3(x) 115 | x = self.avg_pool(x) 116 | x = self.dropout(x.view(x.size(0),-1)) 117 | return self.linear(x) -------------------------------------------------------------------------------- /3DCNN/S3D/README.md: -------------------------------------------------------------------------------- 1 | # Rethinking Spatiotemporal Feature Learning For Video Understanding 2 | This paper can be downloaded [here](http://chensun.me/files/xie_s3d.pdf). 3 | 4 | ## Detailed introduction of the paper 5 | I introduced the paper in detail in my [blog](https://blog.csdn.net/zzmshuai/article/details/85235239). 
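## Usage example
A minimal sketch for the faster variant, assuming `Fast_S3D.py` is importable; the 64x224x224 clip size follows the comment in the code. `S3D_G` from `S3D_G.py` is called the same way:

```python
import torch
from Fast_S3D import fast_S3D

model = fast_S3D(num_class=400)         # the class count is an example
clip = torch.randn(2, 3, 64, 224, 224)  # (batch, channel, time, height, width)
logits = model(clip)
print(logits.shape)                     # expected: torch.Size([2, 400])
```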
6 | 7 | 8 | -------------------------------------------------------------------------------- /3DCNN/S3D/S3D_G.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | 5 | class BasicConv3d(nn.Module): 6 | def __init__(self, in_channel, out_channel, kernel_size, stride, padding=(0, 0, 0)): 7 | super(BasicConv3d, self).__init__() 8 | self.conv = nn.Conv3d(in_channel, out_channel, 9 | kernel_size=kernel_size, stride=stride, 10 | padding=padding) 11 | self.bn = nn.BatchNorm3d(out_channel, 12 | eps=0.001, # value found in tensorflow 13 | ) 14 | self.relu = nn.ReLU() 15 | 16 | def forward(self, x): 17 | x = self.conv(x) 18 | x = self.bn(x) 19 | x = self.relu(x) 20 | return x 21 | 22 | class S3D_G_block(nn.Module): 23 | 24 | def __init__(self,in_channel,out_channel): 25 | super(S3D_G_block, self).__init__() 26 | # out_channel=[1x1x1,3x3x3_reduce,3x3x3,3x3x3_reduce,3x3x3,pooling_reduce] 27 | 28 | 29 | self.branch1 = BasicConv3d(in_channel,out_channel[0], kernel_size=(3,1,1), stride=1, padding=(1,0,0)) 30 | self.branch2 = nn.Sequential( 31 | BasicConv3d(in_channel, out_channel[1], kernel_size=1, stride=1), 32 | BasicConv3d(out_channel[1], out_channel[1],kernel_size=(1,3,3), stride=1, padding=(0,1,1)), 33 | BasicConv3d(out_channel[1], out_channel[2], kernel_size=(3, 1, 1), stride=1, padding=(1, 0, 0)) 34 | ) 35 | self.branch3 = nn.Sequential( 36 | BasicConv3d(in_channel, out_channel[3], kernel_size=1, stride=1), 37 | BasicConv3d(out_channel[3], out_channel[3], kernel_size=(1, 3, 3), stride=1, padding= (0, 1, 1)), 38 | BasicConv3d(out_channel[3], out_channel[4], kernel_size=(3, 1, 1), stride=1, padding=(1, 0, 0)) 39 | ) 40 | self.branch4 = nn.Sequential( 41 | nn.MaxPool3d(kernel_size=3,stride=1,padding=1), 42 | BasicConv3d(in_channel, out_channel[5], kernel_size=(3,1,1), stride=1,padding=(1,0,0)) 43 | ) 44 | self.squeeze = nn.AdaptiveAvgPool3d(1) 45 | # we replace the weight matrix with a cheap conv over the channel axis to reduce parameters (Conv3d, not Conv1d: the permuted input in forward is 5D) 46 | self.excitation = nn.Conv3d(1, 1, (3,1,1), stride=1,padding=(1,0,0)) 47 | self.sigmoid=nn.Sigmoid() 48 | def forward(self, x): 49 | x1 = self.branch1(x) 50 | x2 = self.branch2(x) 51 | x3 = self.branch3(x) 52 | x4 = self.branch4(x) 53 | x=torch.cat([x1,x2,x3,x4], 1) 54 | input = x 55 | x=self.squeeze(x) 56 | x=self.excitation(x.permute(0,2,1,3,4)) 57 | x=self.sigmoid(x) 58 | return x.permute(0,2,1,3,4)*input 59 | 60 | 61 | 62 | class S3D_G(nn.Module): 63 | # Input size: 64x224x224 64 | def __init__(self, num_class): 65 | super(S3D_G, self).__init__() 66 | 67 | self.conv1=BasicConv3d(3,64,kernel_size=7,stride=2,padding=3) 68 | self.pool1=nn.MaxPool3d(kernel_size=(1,3,3),stride=(1,2,2),padding=(0,1,1)) 69 | self.conv2=BasicConv3d(64,64,kernel_size=1,stride=1) 70 | self.conv3=BasicConv3d(64,192,kernel_size=3,stride=1,padding=1) 71 | self.pool2=nn.MaxPool3d(kernel_size=(1,3,3),stride=(1,2,2),padding=(0,1,1)) 72 | self.Inception1=nn.Sequential(S3D_G_block(192, [64,96,128,16,32,32]), 73 | S3D_G_block(256, [128, 128, 192, 32, 96, 64])) 74 | self.pool3=nn.MaxPool3d(kernel_size=(3,3,3),stride=(2,2,2),padding=(1,1,1)) 75 | self.Inception2=nn.Sequential(S3D_G_block(480,[192,96,208,16,48,64]), 76 | S3D_G_block(512, [160, 112, 224, 24, 64, 64]), 77 | S3D_G_block(512, [128, 128, 256, 24, 64, 64]), 78 | S3D_G_block(512, [112, 144, 288, 32, 64, 64]), 79 | S3D_G_block(528, [256, 160, 320, 32, 128, 128])) 80 | self.pool4=nn.MaxPool3d(kernel_size=(2,2,2),stride=2) 81 | 
self.Inception3=nn.Sequential(S3D_G_block(832,[256,160,320,32,128,128]), 82 | S3D_G_block(832, [384, 192, 384, 48, 128, 128])) 83 | self.avg_pool=nn.AvgPool3d(kernel_size=(8,7,7)) 84 | self.dropout = nn.Dropout(0.4) 85 | self.linear=nn.Linear(1024,num_class) 86 | 87 | def forward(self, x): 88 | x = self.conv1(x) 89 | x = self.pool1(x) 90 | x = self.conv2(x) 91 | x = self.conv3(x) 92 | x = self.pool2(x) 93 | x = self.Inception1(x) 94 | x = self.pool3(x) 95 | x = self.Inception2(x) 96 | x = self.pool4(x) 97 | x = self.Inception3(x) 98 | x = self.avg_pool(x) 99 | x = self.dropout(x.view(x.size(0),-1)) 100 | return self.linear(x) -------------------------------------------------------------------------------- /3DCNN/c3d/README.md: -------------------------------------------------------------------------------- 1 | # Learning Spatiotemporal Features with 3D Convolutional Networks 2 | This paper can be downloaded [here](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Tran_Learning_Spatiotemporal_Features_ICCV_2015_paper.pdf). 3 | 4 | ## Code 5 | We do not provide a pre-trained model on Sports-1M; if you need one, please go [here](https://github.com/DavideA/c3d-pytorch.git). 6 | 7 | ## Detailed introduction of the paper 8 | I introduced the paper in detail in my [blog](https://blog.csdn.net/zzmshuai/article/details/84866514#comments). 9 | 10 | 11 | -------------------------------------------------------------------------------- /3DCNN/c3d/c3d.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.nn import init 3 | class c3d(nn.Module): 4 | def __init__(self,num_class,init_weights=True): 5 | super(c3d, self).__init__() 6 | 7 | self.conv1a = nn.Conv3d(3, 64, kernel_size=3, padding=1) 8 | self.conv2a = nn.Conv3d(64, 128, kernel_size=3, padding=1) 9 | self.conv3a = nn.Conv3d(128, 256, kernel_size=3, padding=1) 10 | self.conv3b = nn.Conv3d(256, 256, kernel_size=3, padding=1) 11 | self.conv4a = nn.Conv3d(256, 512, kernel_size=3, padding=1) 12 | self.conv4b = nn.Conv3d(512, 512, kernel_size=3, padding=1) 13 | self.conv5a = nn.Conv3d(512, 512, kernel_size=3, padding=1) 14 | self.conv5b = nn.Conv3d(512, 512, kernel_size=3, padding=1) 15 | 16 | self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)) 17 | self.pool2 = nn.MaxPool3d(kernel_size=2, stride=2) 18 | self.pool3 = nn.MaxPool3d(kernel_size=2, stride=2) 19 | self.pool4 = nn.MaxPool3d(kernel_size=2, stride=2) 20 | self.pool5 = nn.MaxPool3d(kernel_size=2, stride=2)#, padding=(0, 1, 1) 21 | 22 | self.fc6 = nn.Linear(4608, 4096) 23 | self.fc7 = nn.Linear(4096, 4096) 24 | self.out = nn.Linear(4096, num_class) 25 | 26 | self.relu = nn.ReLU() 27 | self.softmax = nn.Softmax() 28 | if init_weights: 29 | self._initialize_weights() 30 | 31 | def forward(self, x): 32 | 33 | x = self.conv1a(x) 34 | x = self.relu(x) 35 | x = self.pool1(x) 36 | 37 | x = self.conv2a(x) 38 | x = self.relu(x) 39 | x = self.pool2(x) 40 | 41 | x = self.conv3a(x) 42 | x = self.relu(x) 43 | x = self.conv3b(x) 44 | x = self.relu(x) 45 | x = self.pool3(x) 46 | 47 | x = self.conv4a(x) 48 | x = self.relu(x) 49 | x = self.conv4b(x) 50 | x = self.relu(x) 51 | x = self.pool4(x) 52 | 53 | x = self.conv5a(x) 54 | x = self.relu(x) 55 | x = self.conv5b(x) 56 | x = self.relu(x) 57 | x = self.pool5(x) 58 | 59 | x = x.view(x.size(0), -1) 60 | x = self.fc6(x) 61 | x = self.relu(x) 62 | x = self.fc7(x) 63 | x = self.relu(x) 64 | res = self.out(x) 65 | # if you use CrossEntropyLoss, you don't need to add softmax in 
network 66 | # res = self.softmax(x) 67 | 68 | return res 69 | 70 | def _initialize_weights(self): 71 | for m in self.modules(): 72 | if isinstance(m, nn.Conv3d): 73 | init.xavier_uniform_(m.weight) 74 | if m.bias is not None: 75 | init.constant_(m.bias,0) 76 | elif isinstance(m, nn.Linear): 77 | init.xavier_uniform_(m.weight) 78 | init.constant_(m.bias, 0) 79 | -------------------------------------------------------------------------------- /CNN+LSTM/ALSTM/ALSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision 4 | 5 | class lstm_cell(nn.Module): 6 | def __init__(self, input_num, hidden_num): 7 | super(lstm_cell, self).__init__() 8 | 9 | self.input_num = input_num 10 | self.hidden_num = hidden_num 11 | 12 | self.Wxi = nn.Linear(self.input_num, self.hidden_num, bias=True) 13 | self.Whi = nn.Linear(self.hidden_num, self.hidden_num, bias=False) 14 | self.Wxf = nn.Linear(self.input_num, self.hidden_num, bias=True) 15 | self.Whf = nn.Linear(self.hidden_num, self.hidden_num, bias=False) 16 | self.Wxc = nn.Linear(self.input_num, self.hidden_num, bias=True) 17 | self.Whc = nn.Linear(self.hidden_num, self.hidden_num, bias=False) 18 | self.Wxo = nn.Linear(self.input_num, self.hidden_num, bias=True) 19 | self.Who = nn.Linear(self.hidden_num, self.hidden_num, bias=False) 20 | 21 | def forward(self, xt, ht_1, ct_1): 22 | it = torch.sigmoid(self.Wxi(xt) + self.Whi(ht_1)) 23 | ft = torch.sigmoid(self.Wxf(xt) + self.Whf(ht_1)) 24 | ot = torch.sigmoid(self.Wxo(xt) + self.Who(ht_1)) 25 | ct = ft * ct_1 + it * torch.tanh(self.Wxc(xt) + self.Whc(ht_1)) 26 | ht = ot * torch.tanh(ct) 27 | return ht, ct 28 | 29 | 30 | class ALSTM(nn.Module): 31 | 32 | def __init__(self, input_num, hidden_num, num_layers,out_num ): 33 | super(ALSTM, self).__init__() 34 | 35 | # Make sure that `hidden_num` are lists having len == num_layers 36 | hidden_num = self._extend_for_multilayer(hidden_num, num_layers) 37 | if not len(hidden_num) == num_layers: 38 | raise ValueError('The length of hidden_num is not consistent with num_layers.') 39 | 40 | self.input_num = input_num 41 | self.hidden_num = hidden_num 42 | self.num_layers = num_layers 43 | self.out_num = out_num 44 | 45 | cell_list = [] 46 | for i in range(0, self.num_layers): 47 | cur_input_num = self.input_num if i == 0 else self.hidden_num[i - 1] 48 | cell_list.append(lstm_cell(cur_input_num,self.hidden_num[i])) 49 | 50 | self.cell_list = nn.ModuleList(cell_list) 51 | self.conv=nn.Sequential(*list(torchvision.models.resnet101().children())[:-2]) 52 | self.Wha=nn.Linear(self.hidden_num[-1],49) 53 | self.fc=nn.Linear(self.hidden_num[-1],self.out_num) 54 | self.softmax=nn.Softmax(dim=1) 55 | 56 | def forward(self, x, hidden_state=None): 57 | #input model: batch x channel x time x height x width 58 | #input size: 30 x 224 x 224 59 | 60 | # init -1 time hidden units 61 | if hidden_state is not None: 62 | raise NotImplementedError() 63 | else: 64 | hidden_state = self._init_hidden(batch_size=x.size(0)) 65 | out_list=[] 66 | seq_len = x.size(2) 67 | 68 | for t in range(seq_len): 69 | output_t = [] 70 | for layer_idx in range(self.num_layers): 71 | if 0==t: 72 | ht_1, ct_1 = hidden_state[layer_idx][0],hidden_state[layer_idx][1] 73 | attention_h=hidden_state[-1][0] 74 | else: 75 | ht_1, ct_1 = hct_1[layer_idx][0],hct_1[layer_idx][1] 76 | if 0==layer_idx: 77 | feature_map=self.conv(x[:, :, t, :, :]) 78 | feature_map=feature_map.view(feature_map.size(0),feature_map.size(1),-1) 79 | 
attention_map=self.Wha(attention_h) 80 | attention_map=torch.unsqueeze(self.softmax(attention_map),1) 81 | attention_feature=attention_map*feature_map 82 | attention_feature=torch.sum(attention_feature,2) 83 | ht, ct = self.cell_list[layer_idx](attention_feature,ht_1, ct_1) 84 | output_t.append([ht,ct]) 85 | else: 86 | ht, ct = self.cell_list[layer_idx](output_t[layer_idx-1][0], ht_1, ct_1) 87 | output_t.append([ht,ct]) 88 | attention_h=output_t[-1][0] 89 | hct_1=output_t 90 | out_list.append(self.fc(output_t[-1][0])) 91 | 92 | 93 | return torch.stack(out_list,1) 94 | 95 | 96 | def _init_hidden(self, batch_size): 97 | init_states = [] 98 | for i in range(self.num_layers): 99 | init_states.append([torch.zeros(batch_size, self.hidden_num[i]),torch.zeros(batch_size, self.hidden_num[i])]) 100 | return init_states 101 | 102 | 103 | @staticmethod 104 | def _extend_for_multilayer(param, num_layers): 105 | if not isinstance(param, list): 106 | param = [param] * num_layers 107 | return param 108 | 109 | -------------------------------------------------------------------------------- /CNN+LSTM/ALSTM/README.md: -------------------------------------------------------------------------------- 1 | # Action Recognition Using Visual Attention 2 | This paper can be downloaded [here](https://arxiv.org/pdf/1511.04119.pdf). 3 | 4 | ## Detailed introduction of the paper 5 | I introduced the paper in detail in my [blog](https://blog.csdn.net/zzmshuai/article/details/86063410). 6 | 7 | ## Note 8 | The 2D CNN in my code is resnet101 instead of GoogLeNet. 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /CNN+LSTM/LRCNs/LRCNs.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision 4 | 5 | class lstm_cell(nn.Module): 6 | def __init__(self, input_num, hidden_num): 7 | super(lstm_cell, self).__init__() 8 | 9 | self.input_num = input_num 10 | self.hidden_num = hidden_num 11 | 12 | self.Wxi = nn.Linear(self.input_num, self.hidden_num, bias=True) 13 | self.Whi = nn.Linear(self.hidden_num, self.hidden_num, bias=False) 14 | self.Wxf = nn.Linear(self.input_num, self.hidden_num, bias=True) 15 | self.Whf = nn.Linear(self.hidden_num, self.hidden_num, bias=False) 16 | self.Wxc = nn.Linear(self.input_num, self.hidden_num, bias=True) 17 | self.Whc = nn.Linear(self.hidden_num, self.hidden_num, bias=False) 18 | self.Wxo = nn.Linear(self.input_num, self.hidden_num, bias=True) 19 | self.Who = nn.Linear(self.hidden_num, self.hidden_num, bias=False) 20 | 21 | def forward(self, xt, ht_1, ct_1): 22 | it = torch.sigmoid(self.Wxi(xt) + self.Whi(ht_1)) 23 | ft = torch.sigmoid(self.Wxf(xt) + self.Whf(ht_1)) 24 | ot = torch.sigmoid(self.Wxo(xt) + self.Who(ht_1)) 25 | ct = ft * ct_1 + it * torch.tanh(self.Wxc(xt) + self.Whc(ht_1)) 26 | ht = ot * torch.tanh(ct) 27 | return ht, ct 28 | 29 | 30 | class LRCNs(nn.Module): 31 | 32 | def __init__(self, input_num, hidden_num, num_layers,out_num ): 33 | super(LRCNs, self).__init__() 34 | 35 | # Make sure that `hidden_num` are lists having len == num_layers 36 | hidden_num = self._extend_for_multilayer(hidden_num, num_layers) 37 | if not len(hidden_num) == num_layers: 38 | raise ValueError('The length of hidden_num is not consistent with num_layers.') 39 | 40 | self.input_num = input_num 41 | self.hidden_num = hidden_num 42 | self.num_layers = num_layers 43 | self.out_num=out_num 44 | cell_list = [] 45 | for i in range(0, self.num_layers): 46 | 
cur_input_num = self.input_num if i == 0 else self.hidden_num[i - 1] 47 | cell_list.append(lstm_cell(input_num=cur_input_num,hidden_num=self.hidden_num[i])) 48 | 49 | self.cell_list = nn.ModuleList(cell_list) 50 | self.conv=nn.Sequential(*list(torchvision.models.resnet101().children())[:-1]) 51 | self.fc = nn.Linear(self.hidden_num[-1],self.out_num) 52 | 53 | def forward(self, x, hidden_state=None): 54 | #input size: batch x channel x time x height x width 55 | 56 | # init the -1 time hidden units 57 | if hidden_state is not None: 58 | raise NotImplementedError() 59 | else: 60 | hidden_state = self._init_hidden(batch_size=x.size(0)) 61 | 62 | seq_len = x.size(2) 63 | cur_layer_input = x 64 | 65 | for layer_idx in range(self.num_layers): 66 | h, c = hidden_state[layer_idx][0],hidden_state[layer_idx][1] 67 | output_inner = [] 68 | for t in range(seq_len): 69 | if layer_idx==0: 70 | cnn_feature=torch.squeeze(self.conv(cur_layer_input[:, :, t, :, :])) 71 | h, c = self.cell_list[layer_idx](cnn_feature,h, c) 72 | else: 73 | h, c = self.cell_list[layer_idx](cur_layer_input[:, t, :], h, c) 74 | 75 | if self.num_layers==layer_idx+1: 76 | output_inner.append(self.fc(h)) 77 | else: 78 | output_inner.append(h) 79 | layer_output = torch.stack(output_inner, dim=1) 80 | cur_layer_input = layer_output 81 | 82 | return layer_output 83 | 84 | 85 | def _init_hidden(self, batch_size): 86 | init_states = [] 87 | for i in range(self.num_layers): 88 | init_states.append([torch.zeros(batch_size, self.hidden_num[i]),torch.zeros(batch_size, self.hidden_num[i])]) 89 | return init_states 90 | 91 | 92 | @staticmethod 93 | def _extend_for_multilayer(param, num_layers): 94 | if not isinstance(param, list): 95 | param = [param] * num_layers 96 | return param 97 | 98 | -------------------------------------------------------------------------------- /CNN+LSTM/LRCNs/README.md: -------------------------------------------------------------------------------- 1 | # Long-Term Recurrent Convolutional Networks for Visual Recognition and Description 2 | This paper can be downloaded [here](https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Donahue_Long-Term_Recurrent_Convolutional_2015_CVPR_paper.pdf). 3 | 4 | ## Detailed introduction of the paper 5 | I introduced the paper in detail in my [blog](https://blog.csdn.net/zzmshuai/article/details/85989394). 6 | 7 | ## Note 8 | The 2D CNN in my code is resnet101 instead of GoogLeNet. 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /CNN+LSTM/convpooling_LSTM/README.md: -------------------------------------------------------------------------------- 1 | # Beyond Short Snippets: Deep Networks for Video Classification 2 | This paper can be downloaded [here](https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Ng_Beyond_Short_Snippets_2015_CVPR_paper.pdf). 3 | 4 | ## Detailed introduction of the paper 5 | I introduced the paper in detail in my [blog](https://blog.csdn.net/zzmshuai/article/details/85762257). 6 | 7 | ## Note 8 | The 2D CNN in my code is resnet101 instead of GoogLeNet. 
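## Usage example
A minimal sketch, assuming `convpooling_LSTM.py` is importable; the clip lengths follow the comments in the code (120 frames for `conv_pooling`, 30 for `cnn_lstm`), and the batch size should be greater than 1 because the per-frame features are squeezed:

```python
import torch
from convpooling_LSTM import conv_pooling, cnn_lstm

pool_model = conv_pooling(num_class=101)  # the class count is an example
clips = torch.randn(2, 3, 120, 224, 224)  # (batch, channel, time, height, width)
print(pool_model(clips).shape)            # expected: torch.Size([2, 101])

lstm_model = cnn_lstm(num_class=101)
out, hidden = lstm_model(torch.randn(2, 3, 30, 224, 224))
print(out.shape)                          # expected: torch.Size([2, 30, 101])
```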
9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /CNN+LSTM/convpooling_LSTM/convpooling_LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision 4 | 5 | class conv_pooling(nn.Module): 6 | # Input size: 120x224x224 7 | # The CNN structure is first trained from single frame, then the FC layers are fine-tuned from scratch. 8 | def __init__(self, num_class): 9 | super(conv_pooling, self).__init__() 10 | 11 | self.conv=nn.Sequential(* list(torchvision.models.resnet101().children())[:-2]) 12 | self.time_pooling=nn.MaxPool3d(kernel_size=(120,1,1)) 13 | self.average_pool=nn.AvgPool3d(kernel_size=(1,7,7)) 14 | self.linear1=nn.Linear(2048,2048) 15 | self.linear2=nn.Linear(2048, num_class) 16 | def forward(self, x): 17 | t_len=x.size(2) 18 | conv_out_list=[] 19 | for i in range(t_len): 20 | conv_out_list.append(self.conv(torch.squeeze(x[:,:,i,:,:]))) 21 | conv_out=self.time_pooling(torch.stack(conv_out_list,2)) 22 | conv_out = self.average_pool(conv_out) 23 | conv_out=self.linear1(conv_out.view(conv_out.size(0),-1)) 24 | conv_out=self.linear2(conv_out) 25 | return conv_out 26 | 27 | class cnn_lstm(nn.Module): 28 | # Input size: 30x224x224 29 | # The CNN structure is first trained from single frame, then the lstm is fine-tuned from scratch. 30 | def __init__(self, num_class): 31 | super(cnn_lstm, self).__init__() 32 | 33 | self.conv = nn.Sequential(*list(torchvision.models.resnet101().children())[:-1]) 34 | self.lstm = nn.LSTM(2048,512,5,batch_first=True) 35 | self.fc=nn.Linear(512,num_class) 36 | 37 | def forward(self, x): 38 | t_len = x.size(2) 39 | conv_out_list = [] 40 | for i in range(t_len): 41 | conv_out_list.append(self.conv(torch.squeeze(x[:, :, i, :, :]))) 42 | conv_out=torch.stack(conv_out_list,1) 43 | conv_out,hidden=self.lstm(conv_out.view(conv_out.size(0),conv_out.size(1),-1)) 44 | lstm_out=[] 45 | for j in range (conv_out.size(1)): 46 | lstm_out.append(self.fc(torch.squeeze(conv_out[:,j,:]))) 47 | return torch.stack(lstm_out,1),hidden -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 MRzzm 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # action-recognition-models-pytorch (update paused) 2 | **I'm working as an intern at a company now, so the project is suspended!** 3 | 4 | I'm trying to reproduce action recognition models with PyTorch to deepen my understanding of the papers. I follow the taxonomy of deep-learning-based action recognition models shown below. 5 | ![The taxonomy of deep learning based models](http://m.qpic.cn/psb?/V146Uaoq2KWgA7/.rlEuCIe*T1BTj3MN*HcI0UG7.LRuqX9G1nKxi7HBAQ!/b/dDcBAAAAAAAA&bo=tAY8AwAAAAARB70!&rf=viewer_4) 6 | --------------------------------------------------------------------------------