├── .gitignore ├── CSL_Continuous_Seq2Seq.py ├── CSL_Isolated_Conv3D.py ├── CSL_Isolated_ConvLSTM.py ├── CSL_Skeleton_GCN.py ├── CSL_Skeleton_RNN.py ├── README.md ├── dataset.py ├── log ├── 3dresnet18_100_acc.svg ├── 3dresnet18_100_loss.svg ├── 3dresnet34_100_acc.svg ├── 3dresnet34_100_loss.svg ├── skeleton_lstm_100_acc.svg └── skeleton_lstm_100_loss.svg ├── models ├── Attention.py ├── Conv3D.py ├── ConvLSTM.py ├── GCN.py ├── RNN.py └── Seq2Seq.py ├── test.py ├── tools.py ├── train.py └── validation.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # tensorboard 107 | runs/ 108 | 109 | # models 110 | *.pth -------------------------------------------------------------------------------- /CSL_Continuous_Seq2Seq.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from datetime import datetime 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | from torch.utils.data import DataLoader 9 | from torch.utils.tensorboard import SummaryWriter 10 | import torchvision.transforms as transforms 11 | from dataset import CSL_Continuous, CSL_Continuous_Char 12 | from models.Seq2Seq import Encoder, Decoder, Seq2Seq 13 | from train import train_seq2seq 14 | from validation import val_seq2seq 15 | 16 | # Path setting 17 | data_path = "/home/haodong/Data/CSL_Continuous/color" 18 | dict_path = "/home/haodong/Data/CSL_Continuous/dictionary.txt" 19 | corpus_path = "/home/haodong/Data/CSL_Continuous/corpus.txt" 20 | model_path = "/home/haodong/Data/seq2seq_models" 21 | log_path = "log/seq2seq_{:%Y-%m-%d_%H-%M-%S}.log".format(datetime.now()) 22 | sum_path = "runs/slr_seq2seq_{:%Y-%m-%d_%H-%M-%S}".format(datetime.now()) 23 | 24 | # Log to file & tensorboard writer 
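# A minimal guard sketch, assuming the script is run from the repository root:
# logging.FileHandler below and the torch.save calls at the end of each epoch do not
# create missing directories, so the "log" folder and model_path must already exist.
os.makedirs(os.path.dirname(log_path), exist_ok=True)
os.makedirs(model_path, exist_ok=True)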
25 | logging.basicConfig(level=logging.INFO, format='%(message)s', handlers=[logging.FileHandler(log_path), logging.StreamHandler()]) 26 | logger = logging.getLogger('SLR') 27 | logger.info('Logging to file...') 28 | writer = SummaryWriter(sum_path) 29 | 30 | # Use specific gpus 31 | os.environ["CUDA_VISIBLE_DEVICES"]="3" 32 | # Device setting 33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | 35 | # Hyperparams 36 | epochs = 100 37 | batch_size = 8 38 | learning_rate = 1e-4 39 | weight_decay = 1e-5 40 | sample_size = 128 41 | sample_duration = 48 42 | enc_hid_dim = 512 43 | emb_dim = 256 44 | dec_hid_dim = 512 45 | dropout = 0.5 46 | clip = 1 47 | log_interval = 100 48 | 49 | if __name__ == '__main__': 50 | # Load data 51 | transform = transforms.Compose([transforms.Resize([sample_size, sample_size]), 52 | transforms.ToTensor(), 53 | transforms.Normalize(mean=[0.5], std=[0.5])]) 54 | # train_set = CSL_Continuous(data_path=data_path, dict_path=dict_path, 55 | # corpus_path=corpus_path, frames=sample_duration, train=True, transform=transform) 56 | # val_set = CSL_Continuous(data_path=data_path, dict_path=dict_path, 57 | # corpus_path=corpus_path, frames=sample_duration, train=False, transform=transform) 58 | train_set = CSL_Continuous_Char(data_path=data_path, corpus_path=corpus_path, 59 | frames=sample_duration, train=True, transform=transform) 60 | val_set = CSL_Continuous_Char(data_path=data_path, corpus_path=corpus_path, 61 | frames=sample_duration, train=False, transform=transform) 62 | logger.info("Dataset samples: {}".format(len(train_set)+len(val_set))) 63 | train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True) 64 | val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True) 65 | # Create Model 66 | encoder = Encoder(lstm_hidden_size=enc_hid_dim, arch="resnet18").to(device) 67 | decoder = Decoder(output_dim=train_set.output_dim, emb_dim=emb_dim, enc_hid_dim=enc_hid_dim, dec_hid_dim=dec_hid_dim, dropout=dropout).to(device) 68 | model = Seq2Seq(encoder=encoder, decoder=decoder, device=device).to(device) 69 | # Run the model parallelly 70 | if torch.cuda.device_count() > 1: 71 | logger.info("Using {} GPUs".format(torch.cuda.device_count())) 72 | model = nn.DataParallel(model) 73 | # Create loss criterion & optimizer 74 | criterion = nn.CrossEntropyLoss() 75 | optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) 76 | 77 | # Start training 78 | logger.info("Training Started".center(60, '#')) 79 | for epoch in range(epochs): 80 | # Train the model 81 | train_seq2seq(model, criterion, optimizer, clip, train_loader, device, epoch, logger, log_interval, writer) 82 | 83 | # Validate the model 84 | val_seq2seq(model, criterion, val_loader, device, epoch, logger, writer) 85 | 86 | # Save model 87 | torch.save(model.state_dict(), os.path.join(model_path, "slr_seq2seq_epoch{:03d}.pth".format(epoch+1))) 88 | logger.info("Epoch {} Model Saved".format(epoch+1).center(60, '#')) 89 | 90 | logger.info("Training Finished".center(60, '#')) 91 | -------------------------------------------------------------------------------- /CSL_Isolated_Conv3D.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import logging 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | from torch.utils.data 
import DataLoader, random_split 10 | from torch.utils.tensorboard import SummaryWriter 11 | import torchvision.transforms as transforms 12 | from models.Conv3D import CNN3D, resnet18, resnet34, resnet50, resnet101, r2plus1d_18 13 | from dataset import CSL_Isolated 14 | from train import train_epoch 15 | from validation import val_epoch 16 | 17 | # Path setting 18 | data_path = "/home/haodong/Data/CSL_Isolated/color_video_125000" 19 | label_path = "/home/haodong/Data/CSL_Isolated/dictionary.txt" 20 | model_path = "/home/haodong/Data/cnn3d_models" 21 | log_path = "log/cnn3d_{:%Y-%m-%d_%H-%M-%S}.log".format(datetime.now()) 22 | sum_path = "runs/slr_cnn3d_{:%Y-%m-%d_%H-%M-%S}".format(datetime.now()) 23 | 24 | # Log to file & tensorboard writer 25 | logging.basicConfig(level=logging.INFO, format='%(message)s', handlers=[logging.FileHandler(log_path), logging.StreamHandler()]) 26 | logger = logging.getLogger('SLR') 27 | logger.info('Logging to file...') 28 | writer = SummaryWriter(sum_path) 29 | 30 | # Use specific gpus 31 | os.environ["CUDA_VISIBLE_DEVICES"]="2" 32 | # Device setting 33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | 35 | # Hyperparams 36 | num_classes = 100 37 | epochs = 100 38 | batch_size = 16 39 | learning_rate = 1e-5 40 | log_interval = 20 41 | sample_size = 128 42 | sample_duration = 16 43 | attention = False 44 | drop_p = 0.0 45 | hidden1, hidden2 = 512, 256 46 | 47 | # Train with 3DCNN 48 | if __name__ == '__main__': 49 | # Load data 50 | transform = transforms.Compose([transforms.Resize([sample_size, sample_size]), 51 | transforms.ToTensor(), 52 | transforms.Normalize(mean=[0.5], std=[0.5])]) 53 | train_set = CSL_Isolated(data_path=data_path, label_path=label_path, frames=sample_duration, 54 | num_classes=num_classes, train=True, transform=transform) 55 | val_set = CSL_Isolated(data_path=data_path, label_path=label_path, frames=sample_duration, 56 | num_classes=num_classes, train=False, transform=transform) 57 | logger.info("Dataset samples: {}".format(len(train_set)+len(val_set))) 58 | train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True) 59 | val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True) 60 | # Create model 61 | # model = CNN3D(sample_size=sample_size, sample_duration=sample_duration, drop_p=drop_p, 62 | # hidden1=hidden1, hidden2=hidden2, num_classes=num_classes).to(device) 63 | model = resnet18(pretrained=True, progress=True, sample_size=sample_size, sample_duration=sample_duration, 64 | attention=attention, num_classes=num_classes).to(device) 65 | # model = r2plus1d_18(pretrained=True, num_classes=num_classes).to(device) 66 | # Run the model parallelly 67 | if torch.cuda.device_count() > 1: 68 | logger.info("Using {} GPUs".format(torch.cuda.device_count())) 69 | model = nn.DataParallel(model) 70 | # Create loss criterion & optimizer 71 | criterion = nn.CrossEntropyLoss() 72 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 73 | 74 | # Start training 75 | logger.info("Training Started".center(60, '#')) 76 | for epoch in range(epochs): 77 | # Train the model 78 | train_epoch(model, criterion, optimizer, train_loader, device, epoch, logger, log_interval, writer) 79 | 80 | # Validate the model 81 | val_epoch(model, criterion, val_loader, device, epoch, logger, writer) 82 | 83 | # Save model 84 | torch.save(model.state_dict(), os.path.join(model_path, "slr_cnn3d_epoch{:03d}.pth".format(epoch+1))) 85 | logger.info("Epoch 
{} Model Saved".format(epoch+1).center(60, '#')) 86 | 87 | logger.info("Training Finished".center(60, '#')) 88 | -------------------------------------------------------------------------------- /CSL_Isolated_ConvLSTM.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import logging 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | from torch.utils.data import DataLoader, random_split 10 | from torch.utils.tensorboard import SummaryWriter 11 | import torchvision.transforms as transforms 12 | from models.ConvLSTM import CRNN, ResCRNN 13 | from dataset import CSL_Isolated 14 | from train import train_epoch 15 | from validation import val_epoch 16 | 17 | # Path setting 18 | data_path = "/home/haodong/Data/CSL_Isolated/color_video_125000" 19 | label_path = "/home/haodong/Data/CSL_Isolated/dictionary.txt" 20 | model_path = "/home/haodong/Data/cnnlstm_models" 21 | log_path = "log/cnnlstm_{:%Y-%m-%d_%H-%M-%S}.log".format(datetime.now()) 22 | sum_path = "runs/slr_cnnlstm_{:%Y-%m-%d_%H-%M-%S}".format(datetime.now()) 23 | 24 | # Log to file & tensorboard writer 25 | logging.basicConfig(level=logging.INFO, format='%(message)s', handlers=[logging.FileHandler(log_path), logging.StreamHandler()]) 26 | logger = logging.getLogger('SLR') 27 | logger.info('Logging to file...') 28 | writer = SummaryWriter(sum_path) 29 | 30 | # Use specific gpus 31 | os.environ["CUDA_VISIBLE_DEVICES"]="2" 32 | # Device setting 33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | 35 | # Hyperparams 36 | epochs = 200 37 | batch_size = 16 38 | learning_rate = 1e-4 39 | weight_decay = 1e-5 40 | log_interval = 20 41 | sample_size = 128 42 | sample_duration = 16 43 | num_classes = 100 44 | lstm_hidden_size = 512 45 | lstm_num_layers = 1 46 | attention = False 47 | 48 | # Train with Conv+LSTM 49 | if __name__ == '__main__': 50 | # Load data 51 | transform = transforms.Compose([transforms.Resize([sample_size, sample_size]), 52 | transforms.ToTensor(), 53 | transforms.Normalize(mean=[0.5], std=[0.5])]) 54 | train_set = CSL_Isolated(data_path=data_path, label_path=label_path, frames=sample_duration, 55 | num_classes=num_classes, train=True, transform=transform) 56 | val_set = CSL_Isolated(data_path=data_path, label_path=label_path, frames=sample_duration, 57 | num_classes=num_classes, train=False, transform=transform) 58 | logger.info("Dataset samples: {}".format(len(train_set)+len(val_set))) 59 | train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True) 60 | val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True) 61 | # Create model 62 | # model = CRNN(sample_size=sample_size, sample_duration=sample_duration, num_classes=num_classes, 63 | # lstm_hidden_size=lstm_hidden_size, lstm_num_layers=lstm_num_layers).to(device) 64 | model = ResCRNN(sample_size=sample_size, sample_duration=sample_duration, num_classes=num_classes, 65 | lstm_hidden_size=lstm_hidden_size, lstm_num_layers=lstm_num_layers, attention=attention).to(device) 66 | # Run the model parallelly 67 | if torch.cuda.device_count() > 1: 68 | logger.info("Using {} GPUs".format(torch.cuda.device_count())) 69 | model = nn.DataParallel(model) 70 | # Create loss criterion & optimizer 71 | criterion = nn.CrossEntropyLoss() 72 | optimizer = optim.Adam(model.parameters(), lr=learning_rate, 
weight_decay=weight_decay) 73 | 74 | # Start training 75 | logger.info("Training Started".center(60, '#')) 76 | for epoch in range(epochs): 77 | # Train the model 78 | train_epoch(model, criterion, optimizer, train_loader, device, epoch, logger, log_interval, writer) 79 | 80 | # Validate the model 81 | val_epoch(model, criterion, val_loader, device, epoch, logger, writer) 82 | 83 | # Save model 84 | torch.save(model.state_dict(), os.path.join(model_path, "slr_convlstm_epoch{:03d}.pth".format(epoch+1))) 85 | logger.info("Epoch {} Model Saved".format(epoch+1).center(60, '#')) 86 | 87 | logger.info("Training Finished".center(60, '#')) 88 | -------------------------------------------------------------------------------- /CSL_Skeleton_GCN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import logging 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | from torch.utils.data import DataLoader, random_split 10 | from torch.utils.tensorboard import SummaryWriter 11 | from models.GCN import GCN 12 | from dataset import CSL_Skeleton 13 | from train import train_epoch 14 | from validation import val_epoch 15 | 16 | # Path setting 17 | data_path = "/home/haodong/Data/CSL_Isolated_1/xf500_body_depth_txt" 18 | label_path = "/home/haodong/Data/CSL_Isolated_1/dictionary.txt" 19 | model_path = "/home/haodong/Data/gcn_models" 20 | log_path = "log/gcn_{:%Y-%m-%d_%H-%M-%S}.log".format(datetime.now()) 21 | sum_path = "runs/slr_gcn_{:%Y-%m-%d_%H-%M-%S}".format(datetime.now()) 22 | 23 | # Log to file & tensorboard writer 24 | logging.basicConfig(level=logging.INFO, format='%(message)s', handlers=[logging.FileHandler(log_path), logging.StreamHandler()]) 25 | logger = logging.getLogger('SLR') 26 | logger.info('Logging to file...') 27 | writer = SummaryWriter(sum_path) 28 | 29 | # Use specific gpus 30 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 31 | # Device setting 32 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 33 | 34 | # Hyperparams 35 | epochs = 200 36 | batch_size = 32 37 | learning_rate = 1e-5 38 | log_interval = 100 39 | num_classes = 500 40 | in_channels = 2 41 | sample_duration = 16 42 | selected_joints = None 43 | split_to_channels = True 44 | 45 | # Train with GCN 46 | if __name__ == '__main__': 47 | # Load data 48 | transform = None # TODO 49 | train_set = CSL_Skeleton(data_path=data_path, label_path=label_path, frames=sample_duration, num_classes=num_classes, 50 | selected_joints=selected_joints, split_to_channels=split_to_channels, train=True, transform=transform) 51 | val_set = CSL_Skeleton(data_path=data_path, label_path=label_path, frames=sample_duration, num_classes=num_classes, 52 | selected_joints=selected_joints, split_to_channels=split_to_channels, train=False, transform=transform) 53 | logger.info("Dataset samples: {}".format(len(train_set)+len(val_set))) 54 | train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True) 55 | val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True) 56 | # Create model 57 | model = GCN(in_channels=in_channels, num_class=num_classes, graph_args={'layout': 'ntu-rgb+d'}, 58 | edge_importance_weighting=True).to(device) 59 | # Run the model parallelly 60 | if torch.cuda.device_count() > 1: 61 | logger.info("Using {} GPUs".format(torch.cuda.device_count())) 62 | model = 
nn.DataParallel(model) 63 | # Create loss criterion & optimizer 64 | criterion = nn.CrossEntropyLoss() 65 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 66 | 67 | # Start training 68 | logger.info("Training Started".center(60, '#')) 69 | for epoch in range(epochs): 70 | # Train the model 71 | train_epoch(model, criterion, optimizer, train_loader, device, epoch, logger, log_interval, writer) 72 | 73 | # Validate the model 74 | val_epoch(model, criterion, val_loader, device, epoch, logger, writer) 75 | 76 | # Save model 77 | torch.save(model.state_dict(), os.path.join(model_path, "slr_gcn_epoch{:03d}.pth".format(epoch+1))) 78 | logger.info("Epoch {} Model Saved".format(epoch+1).center(60, '#')) 79 | 80 | logger.info("Training Finished".center(60, '#')) 81 | -------------------------------------------------------------------------------- /CSL_Skeleton_RNN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import logging 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | from torch.utils.data import DataLoader, random_split 10 | from torch.utils.tensorboard import SummaryWriter 11 | from models.RNN import LSTM, GRU 12 | from dataset import CSL_Skeleton 13 | from train import train_epoch 14 | from validation import val_epoch 15 | 16 | # Path setting 17 | data_path = "/home/haodong/Data/CSL_Isolated/xf500_body_depth_txt" 18 | label_path = "/home/haodong/Data/CSL_Isolated/dictionary.txt" 19 | model_path = "/home/haodong/Data/skeleton_models" 20 | log_path = "log/skeleton_{:%Y-%m-%d_%H-%M-%S}.log".format(datetime.now()) 21 | sum_path = "runs/slr_skeleton_{:%Y-%m-%d_%H-%M-%S}".format(datetime.now()) 22 | 23 | # Log to file & tensorboard writer 24 | logging.basicConfig(level=logging.INFO, format='%(message)s', handlers=[logging.FileHandler(log_path), logging.StreamHandler()]) 25 | logger = logging.getLogger('SLR') 26 | logger.info('Logging to file...') 27 | writer = SummaryWriter(sum_path) 28 | 29 | # Use specific gpus 30 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 31 | # Device setting 32 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 33 | 34 | # Hyperparams 35 | epochs = 500 36 | batch_size = 32 37 | learning_rate = 1e-5 38 | log_interval = 20 39 | num_classes = 100 40 | sample_duration = 16 41 | selected_joints = ['HANDLEFT', 'HANDRIGHT', 'ELBOWLEFT', 'ELBOWRIGHT'] 42 | input_size = len(selected_joints)*2 43 | hidden_size = 512 44 | num_layers = 1 45 | hidden1 = 512 46 | drop_p = 0.0 47 | 48 | # Train with Skeleton+RNN 49 | if __name__ == '__main__': 50 | # Load data 51 | transform = None # TODO 52 | train_set = CSL_Skeleton(data_path=data_path, label_path=label_path, frames=sample_duration, 53 | num_classes=num_classes, selected_joints=selected_joints, train=True, transform=transform) 54 | val_set = CSL_Skeleton(data_path=data_path, label_path=label_path, frames=sample_duration, 55 | num_classes=num_classes, selected_joints=selected_joints, train=False, transform=transform) 56 | logger.info("Dataset samples: {}".format(len(train_set)+len(val_set))) 57 | train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True) 58 | val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True) 59 | # Create model 60 | # model = LSTM(lstm_input_size=input_size, lstm_hidden_size=hidden_size, lstm_num_layers=num_layers, 61 | # 
num_classes=num_classes, hidden1=hidden1, drop_p=drop_p).to(device) 62 | model = GRU(gru_input_size=input_size, gru_hidden_size=hidden_size, gru_num_layers=num_layers, 63 | num_classes=num_classes, hidden1=hidden1, drop_p=drop_p).to(device) 64 | # Run the model parallelly 65 | if torch.cuda.device_count() > 1: 66 | logger.info("Using {} GPUs".format(torch.cuda.device_count())) 67 | model = nn.DataParallel(model) 68 | # Create loss criterion & optimizer 69 | criterion = nn.CrossEntropyLoss() 70 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 71 | 72 | # Start training 73 | logger.info("Training Started".center(60, '#')) 74 | for epoch in range(epochs): 75 | # Train the model 76 | train_epoch(model, criterion, optimizer, train_loader, device, epoch, logger, log_interval, writer) 77 | 78 | # Validate the model 79 | val_epoch(model, criterion, val_loader, device, epoch, logger, writer) 80 | 81 | # Save model 82 | torch.save(model.state_dict(), os.path.join(model_path, "slr_skeleton_epoch{:03d}.pth".format(epoch+1))) 83 | logger.info("Epoch {} Model Saved".format(epoch+1).center(60, '#')) 84 | 85 | logger.info("Training Finished".center(60, '#')) 86 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SLR 2 | isolated & continuous sign language recognition using CNN+LSTM/3D CNN/GCN/Encoder-Decoder 3 | 4 | ## Requirements 5 | 6 | - Download and extract **[CSL Dataset](http://home.ustc.edu.cn/~pjh/openresources/cslr-dataset-2015/index.html)** 7 | - Download and install **[PyTorch](https://pytorch.org/)** 8 | 9 | ## Isolated Sign Language Recognition 10 | 11 | ### CNN+LSTM 12 | 13 | 1. **four layers of Conv2d + one layer of LSTM** 14 | 15 | | Dataset | Classes | Samples | Best Test Acc | Best Test Loss | 16 | | ------------ | ------- | ------- | ------------- | -------------- | 17 | | CSL_Isolated | 100 | 25,000 | 82.08% | 0.734426 | 18 | | CSL_Isolated | 500 | 125,000 | 71.71% | 1.332122 | 19 | 20 | 2. **ResNet + one layer of LSTM** 21 | 22 | | Dataset | Classes | Samples | Best Test Acc | Best Test Loss | 23 | | ------------ | ------- | ------- | ------------- | -------------- | 24 | | CSL_Isolated | 100 | 25,000 | 93.54% | 0.245582 | 25 | | CSL_Isolated | 500 | 125,000 | 83.17% | 0.748759 | 26 | 27 | ### 3D CNN 28 | 29 | 1. **three layers of Conv3d** 30 | 31 | | Dataset | Classes | Samples | Best Test Acc | Best Test Loss | 32 | | ------------ | ------- | ------- | ------------- | -------------- | 33 | | CSL_Isolated | 100 | 25,000 | 58.86% | 1.560049 | 34 | | CSL_Isolated | 500 | 125,000 | 45.07% | 2.255563 | 35 | 36 | 2. **3D ResNet** 37 | 38 | | Method | Dataset | Classes | Samples | Best Test Acc | Best Test Loss | 39 | | --------- | ------------ | ------- | ------- | ------------- | -------------- | 40 | | ResNet18 | CSL_Isolated | 100 | 25,000 | 93.30% | 0.246169 | 41 | | ResNet18 | CSL_Isolated | 500 | 125,000 | 79.42% | 0.800490 | 42 | | ResNet34 | CSL_Isolated | 100 | 25,000 | 94.78% | 0.207592 | 43 | | ResNet34 | CSL_Isolated | 500 | 125,000 | 81.61% | 0.750424 | 44 | | ResNet50 | CSL_Isolated | 100 | 25,000 | 94.36% | 0.232631 | 45 | | ResNet50 | CSL_Isolated | 500 | 125,000 | 83.15% | 0.803212 | 46 | | ResNet101 | CSL_Isolated | 100 | 25,000 | 95.26% | 0.205430 | 47 | | ResNet101 | CSL_Isolated | 500 | 125,000 | 83.18% | 0.751727 | 48 | 49 | 3. 
**ResNet (2+1)D** 50 | 51 | | Dataset | Classes | Samples | Best Test Acc | Best Test Loss | 52 | | ------------ | ------- | ------- | ------------- | -------------- | 53 | | CSL_Isolated | 100 | 25,000 | 98.68% | 0.043099 | 54 | | CSL_Isolated | 500 | 125,000 | 94.85% | 0.234880 | 55 | 56 | ### GCN 57 | 58 | | Dataset | Classes | Samples | Best Test Acc | Best Test Loss | 59 | | ------------ | ------- | ------- | ------------- | -------------- | 60 | | CSL_Skeleton | 100 | 25,000 | 79.20% | 0.737053 | 61 | | CSL_Skeleton | 500 | 125,000 | 66.64% | 1.165872 | 62 | 63 | ### Skeleton+LSTM 64 | 65 | | Dataset | Classes | Samples | Best Test Acc | Best Test Loss | 66 | | ------------ | ------- | ------- | ------------- | -------------- | 67 | | CSL_Skeleton | 100 | 25,000 | 84.30% | 0.488253 | 68 | | CSL_Skeleton | 500 | 125,000 | 70.62% | 1.078730 | 69 | 70 | ## Continuous Sign Language Recognition 71 | 72 | ### Encoder-Decoder 73 | 74 | *Encoder is ResNet18+LSTM, and Decoder is LSTM* 75 | 76 | | Dataset | Sentences | Samples | Best Test Wer | Best Test Loss | 77 | | ------------------- | --------- | ------- | ------------- | -------------- | 78 | | CSL_Continuous | 100 | 25,000 | 1.01% | 0.034636 | 79 | | CSL_Continuous_Char | 100 | 25,000 | 1.19% | 0.049449 | 80 | 81 | ## References 82 | 83 | - [Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?](https://arxiv.org/pdf/1711.09577.pdf) 84 | 85 | - [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/pdf/1801.07455.pdf) 86 | - [A Closer Look at Spatiotemporal Convolutions for Action Recognition](https://arxiv.org/abs/1711.11248) 87 | - [SIGN LANGUAGE RECOGNITION WITH LONG SHORT-TERM MEMORY](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7532884) 88 | - https://github.com/HHTseng/video-classification 89 | - https://github.com/kenshohara/3D-ResNets-PyTorch 90 | 91 | - https://github.com/bentrevett/pytorch-seq2seq 92 | 93 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | import torch 4 | from torch.utils.data import Dataset 5 | import torchvision.transforms as transforms 6 | 7 | """ 8 | Implementation of Chinese Sign Language Dataset(50 signers with 5 times) 9 | """ 10 | class CSL_Isolated(Dataset): 11 | def __init__(self, data_path, label_path, frames=16, num_classes=500, train=True, transform=None): 12 | super(CSL_Isolated, self).__init__() 13 | self.data_path = data_path 14 | self.label_path = label_path 15 | self.train = train 16 | self.transform = transform 17 | self.frames = frames 18 | self.num_classes = num_classes 19 | self.signers = 50 20 | self.repetition = 5 21 | if self.train: 22 | self.videos_per_folder = int(0.8 * self.signers * self.repetition) 23 | else: 24 | self.videos_per_folder = int(0.2 * self.signers * self.repetition) 25 | self.data_folder = [] 26 | try: 27 | obs_path = [os.path.join(self.data_path, item) for item in os.listdir(self.data_path)] 28 | self.data_folder = sorted([item for item in obs_path if os.path.isdir(item)]) 29 | except Exception as e: 30 | print("Something wrong with your data path!!!") 31 | raise 32 | self.labels = {} 33 | try: 34 | label_file = open(self.label_path, 'r') 35 | for line in label_file.readlines(): 36 | line = line.strip() 37 | line = line.split('\t') 38 | self.labels[line[0]] = line[1] 39 | except Exception as e: 40 | raise 41 | 42 | def 
read_images(self, folder_path): 43 | assert len(os.listdir(folder_path)) >= self.frames, "Too few images in your data folder: " + str(folder_path) 44 | images = [] 45 | start = 1 46 | step = int(len(os.listdir(folder_path))/self.frames) 47 | for i in range(self.frames): 48 | image = Image.open(os.path.join(folder_path, '{:06d}.jpg').format(start+i*step)) #.convert('L') 49 | if self.transform is not None: 50 | image = self.transform(image) 51 | images.append(image) 52 | 53 | images = torch.stack(images, dim=0) 54 | # switch dimension for 3d cnn 55 | images = images.permute(1, 0, 2, 3) 56 | # print(images.shape) 57 | return images 58 | 59 | def __len__(self): 60 | return self.num_classes * self.videos_per_folder 61 | 62 | def __getitem__(self, idx): 63 | top_folder = self.data_folder[int(idx/self.videos_per_folder)] 64 | selected_folders = [os.path.join(top_folder, item) for item in os.listdir(top_folder)] 65 | selected_folders = sorted([item for item in selected_folders if os.path.isdir(item)]) 66 | if self.train: 67 | selected_folder = selected_folders[idx%self.videos_per_folder] 68 | else: 69 | selected_folder = selected_folders[idx%self.videos_per_folder + int(0.8*self.signers*self.repetition)] 70 | images = self.read_images(selected_folder) 71 | # print(selected_folder, int(idx/self.videos_per_folder)) 72 | # print(self.labels['{:06d}'.format(int(idx/self.videos_per_folder))]) 73 | # label = self.labels['{:06d}'.format(int(idx/self.videos_per_folder))] 74 | label = torch.LongTensor([int(idx/self.videos_per_folder)]) 75 | 76 | return {'data': images, 'label': label} 77 | 78 | def label_to_word(self, label): 79 | if isinstance(label, torch.Tensor): 80 | return self.labels['{:06d}'.format(label.item())] 81 | elif isinstance(label, int): 82 | return self.labels['{:06d}'.format(label)] 83 | 84 | 85 | """ 86 | Implementation of CSL Skeleton Dataset 87 | """ 88 | class CSL_Skeleton(Dataset): 89 | joints_index = {'SPINEBASE': 0, 'SPINEMID': 1, 'NECK': 2, 'HEAD': 3, 'SHOULDERLEFT':4, 90 | 'ELBOWLEFT': 5, 'WRISTLEFT': 6, 'HANDLEFT': 7, 'SHOULDERRIGHT': 8, 91 | 'ELBOWRIGHT': 9, 'WRISTRIGHT': 10, 'HANDRIGHT': 11, 'HIPLEFT': 12, 92 | 'KNEELEFT': 13, 'ANKLELEFT': 14, 'FOOTLEFT': 15, 'HIPRIGHT': 16, 93 | 'KNEERIGHT': 17, 'ANKLERIGHT': 18, 'FOOTRIGHT': 19, 'SPINESHOULDER': 20, 94 | 'HANDTIPLEFT': 21, 'THUMBLEFT': 22, 'HANDTIPRIGHT': 23, 'THUMBRIGHT': 24} 95 | def __init__(self, data_path, label_path, frames=16, num_classes=500, selected_joints=None, split_to_channels=False, train=True, transform=None): 96 | super(CSL_Skeleton, self).__init__() 97 | self.data_path = data_path 98 | self.label_path = label_path 99 | self.frames = frames 100 | self.num_classes = num_classes 101 | self.selected_joints = selected_joints 102 | self.split_to_channels = split_to_channels 103 | self.train = train 104 | self.transform = transform 105 | self.signers = 50 106 | self.repetition = 5 107 | if self.train: 108 | self.txt_per_folder = int(0.8 * self.signers * self.repetition) 109 | else: 110 | self.txt_per_folder = int(0.2 * self.signers * self.repetition) 111 | self.data_folder = [] 112 | try: 113 | obs_path = [os.path.join(self.data_path, item) for item in os.listdir(self.data_path)] 114 | self.data_folder = sorted([item for item in obs_path if os.path.isdir(item)]) 115 | except Exception as e: 116 | print("Something wrong with your data path!!!") 117 | raise 118 | self.labels = {} 119 | try: 120 | label_file = open(self.label_path, 'r') 121 | for line in label_file.readlines(): 122 | line = line.strip() 123 | line = 
line.split('\t') 124 | self.labels[line[0]] = line[1] 125 | except Exception as e: 126 | raise 127 | 128 | def read_file(self, txt_path): 129 | txt_file = open(txt_path, 'r') 130 | all_skeletons = [] 131 | for line in txt_file.readlines(): 132 | line = line.split(' ') 133 | skeleton = [int(item) for item in line if item != '\n'] 134 | selected_x = [] 135 | selected_y = [] 136 | # select specific joints 137 | if self.selected_joints is not None: 138 | for joint in self.selected_joints: 139 | assert joint in self.joints_index, 'JOINT ' + joint + ' DOES NOT EXIST!!!' 140 | selected_x.append(skeleton[2*self.joints_index[joint]]) 141 | selected_y.append(skeleton[2*self.joints_index[joint]+1]) 142 | else: 143 | for i in range(len(skeleton)): 144 | if i % 2 == 0: 145 | selected_x.append(skeleton[i]) 146 | else: 147 | selected_y.append(skeleton[i]) 148 | # print(selected_x, selected_y) 149 | if self.split_to_channels: 150 | selected_skeleton = torch.FloatTensor([selected_x, selected_y]) 151 | else: 152 | selected_skeleton = torch.FloatTensor(selected_x + selected_y) 153 | # print(selected_skeleton.shape) 154 | if self.transform is not None: 155 | selected_skeleton = self.transform(selected_skeleton) 156 | all_skeletons.append(selected_skeleton) 157 | # print(all_skeletons) 158 | skeletons = [] 159 | start = 0 160 | step = int(len(all_skeletons)/self.frames) 161 | for i in range(self.frames): 162 | skeletons.append(all_skeletons[start+i*step]) 163 | skeletons = torch.stack(skeletons, dim=0) 164 | # print(skeletons.shape) 165 | 166 | return skeletons 167 | 168 | def __len__(self): 169 | return self.num_classes * self.txt_per_folder 170 | 171 | def __getitem__(self, idx): 172 | top_folder = self.data_folder[int(idx/self.txt_per_folder)] 173 | selected_txts = [os.path.join(top_folder, item) for item in os.listdir(top_folder)] 174 | selected_txts = sorted([item for item in selected_txts if item.endswith('.txt')]) 175 | if self.train: 176 | selected_txt = selected_txts[idx%self.txt_per_folder] 177 | else: 178 | selected_txt = selected_txts[idx%self.txt_per_folder + int(0.8*self.signers*self.repetition)] 179 | # print(selected_txt) 180 | data = self.read_file(selected_txt) 181 | label = torch.LongTensor([int(idx/self.txt_per_folder)]) 182 | 183 | return {'data': data, 'label': label} 184 | 185 | def label_to_word(self, label): 186 | if isinstance(label, torch.Tensor): 187 | return self.labels['{:06d}'.format(label.item())] 188 | elif isinstance(label, int): 189 | return self.labels['{:06d}'.format(label)] 190 | 191 | 192 | """ 193 | Implementation of CSL Continuous Dataset (Word Level) 194 | """ 195 | class CSL_Continuous(Dataset): 196 | def __init__(self, data_path, dict_path, corpus_path, frames=128, train=True, transform=None): 197 | super(CSL_Continuous, self).__init__() 198 | self.data_path = data_path 199 | self.dict_path = dict_path 200 | self.corpus_path = corpus_path 201 | self.frames = frames 202 | self.train = train 203 | self.transform = transform 204 | self.num_sentences = 100 205 | self.signers = 50 206 | self.repetition = 5 207 | if self.train: 208 | self.videos_per_folder = int(0.8 * self.signers * self.repetition) 209 | else: 210 | self.videos_per_folder = int(0.2 * self.signers * self.repetition) 211 | # dictionary 212 | self.dict = {'<pad>': 0, '<sos>': 1, '<eos>': 2} 213 | self.output_dim = 3 214 | try: 215 | dict_file = open(self.dict_path, 'r') 216 | for line in dict_file.readlines(): 217 | line = line.strip().split('\t') 218 | # word with multiple expressions 219 | if '(' in line[1] and ')' in
line[1]: 220 | for delimiter in ['(', ')', '、']: 221 | line[1] = line[1].replace(delimiter, " ") 222 | words = line[1].split() 223 | else: 224 | words = [line[1]] 225 | # print(words) 226 | for word in words: 227 | self.dict[word] = self.output_dim 228 | self.output_dim += 1 229 | except Exception as e: 230 | raise 231 | # img data 232 | self.data_folder = [] 233 | try: 234 | obs_path = [os.path.join(self.data_path, item) for item in os.listdir(self.data_path)] 235 | self.data_folder = sorted([item for item in obs_path if os.path.isdir(item)]) 236 | except Exception as e: 237 | raise 238 | # corpus 239 | self.corpus = {} 240 | self.unknown = set() 241 | try: 242 | corpus_file = open(self.corpus_path, 'r') 243 | for line in corpus_file.readlines(): 244 | line = line.strip().split() 245 | sentence = line[1] 246 | raw_sentence = (line[1]+'.')[:-1] 247 | paired = [False for i in range(len(line[1]))] 248 | # print(id(raw_sentence), id(line[1]), id(sentence)) 249 | # pair long words with higher priority 250 | for token in sorted(self.dict, key=len, reverse=True): 251 | index = raw_sentence.find(token) 252 | # print(index, line[1]) 253 | if index != -1 and not paired[index]: 254 | line[1] = line[1].replace(token, " "+token+" ") 255 | # mark as paired 256 | for i in range(len(token)): 257 | paired[index+i] = True 258 | # add sos 259 | tokens = [self.dict['<sos>']] 260 | for token in line[1].split(): 261 | if token in self.dict: 262 | tokens.append(self.dict[token]) 263 | else: 264 | self.unknown.add(token) 265 | # add eos 266 | tokens.append(self.dict['<eos>']) 267 | self.corpus[line[0]] = tokens 268 | except Exception as e: 269 | raise 270 | # add padding 271 | length = [len(tokens) for key, tokens in self.corpus.items()] 272 | self.max_length = max(length) 273 | # print(max(length)) 274 | for key, tokens in self.corpus.items(): 275 | if len(tokens) < self.max_length: 276 | tokens.extend([self.dict['<pad>']]*(self.max_length-len(tokens))) 277 | # print(self.corpus) 278 | # print(self.unknown) 279 | 280 | def read_images(self, folder_path): 281 | assert len(os.listdir(folder_path)) >= self.frames, "Too few images in your data folder: " + str(folder_path) 282 | images = [] 283 | start = 1 284 | step = int(len(os.listdir(folder_path))/self.frames) 285 | for i in range(self.frames): 286 | image = Image.open(os.path.join(folder_path, '{:06d}.jpg').format(start+i*step)) #.convert('L') 287 | if self.transform is not None: 288 | image = self.transform(image) 289 | images.append(image) 290 | 291 | images = torch.stack(images, dim=0) 292 | # switch dimension 293 | images = images.permute(1, 0, 2, 3) 294 | # print(images.shape) 295 | return images 296 | 297 | def __len__(self): 298 | return self.num_sentences * self.videos_per_folder 299 | 300 | def __getitem__(self, idx): 301 | top_folder = self.data_folder[int(idx/self.videos_per_folder)] 302 | selected_folders = [os.path.join(top_folder, item) for item in os.listdir(top_folder)] 303 | selected_folders = sorted([item for item in selected_folders if os.path.isdir(item)]) 304 | if self.train: 305 | selected_folder = selected_folders[idx%self.videos_per_folder] 306 | else: 307 | selected_folder = selected_folders[idx%self.videos_per_folder + int(0.8*self.signers*self.repetition)] 308 | images = self.read_images(selected_folder) 309 | # print(selected_folder, int(idx/self.videos_per_folder)) 310 | # print(self.corpus['{:06d}'.format(int(idx/self.videos_per_folder))]) 311 | tokens = torch.LongTensor(self.corpus['{:06d}'.format(int(idx/self.videos_per_folder))]) 312 | 313 |
return images, tokens 314 | 315 | 316 | """ 317 | Implementation of CSL Continuous Dataset (Character Level) 318 | """ 319 | class CSL_Continuous_Char(Dataset): 320 | def __init__(self, data_path, corpus_path, frames=128, train=True, transform=None): 321 | super(CSL_Continuous_Char, self).__init__() 322 | self.data_path = data_path 323 | self.corpus_path = corpus_path 324 | self.frames = frames 325 | self.train = train 326 | self.transform = transform 327 | self.num_sentences = 100 328 | self.signers = 50 329 | self.repetition = 5 330 | if self.train: 331 | self.videos_per_folder = int(0.8 * self.signers * self.repetition) 332 | else: 333 | self.videos_per_folder = int(0.2 * self.signers * self.repetition) 334 | # dictionary 335 | self.dict = {'<pad>': 0, '<sos>': 1, '<eos>': 2} 336 | self.output_dim = 3 337 | try: 338 | dict_file = open(self.corpus_path, 'r') 339 | for line in dict_file.readlines(): 340 | line = line.strip().split() 341 | sentence = line[1] 342 | for char in sentence: 343 | if char not in self.dict: 344 | self.dict[char] = self.output_dim 345 | self.output_dim += 1 346 | except Exception as e: 347 | raise 348 | # img data 349 | self.data_folder = [] 350 | try: 351 | obs_path = [os.path.join(self.data_path, item) for item in os.listdir(self.data_path)] 352 | self.data_folder = sorted([item for item in obs_path if os.path.isdir(item)]) 353 | except Exception as e: 354 | raise 355 | # corpus 356 | self.corpus = {} 357 | self.unknown = set() 358 | try: 359 | corpus_file = open(self.corpus_path, 'r') 360 | for line in corpus_file.readlines(): 361 | line = line.strip().split() 362 | sentence = line[1] 363 | raw_sentence = (line[1]+'.')[:-1] 364 | paired = [False for i in range(len(line[1]))] 365 | # print(id(raw_sentence), id(line[1]), id(sentence)) 366 | # pair long words with higher priority 367 | for token in sorted(self.dict, key=len, reverse=True): 368 | index = raw_sentence.find(token) 369 | # print(index, line[1]) 370 | if index != -1 and not paired[index]: 371 | line[1] = line[1].replace(token, " "+token+" ") 372 | # mark as paired 373 | for i in range(len(token)): 374 | paired[index+i] = True 375 | # add sos 376 | tokens = [self.dict['<sos>']] 377 | for token in line[1].split(): 378 | if token in self.dict: 379 | tokens.append(self.dict[token]) 380 | else: 381 | self.unknown.add(token) 382 | # add eos 383 | tokens.append(self.dict['<eos>']) 384 | self.corpus[line[0]] = tokens 385 | except Exception as e: 386 | raise 387 | # add padding 388 | length = [len(tokens) for key, tokens in self.corpus.items()] 389 | self.max_length = max(length) 390 | # print(max(length)) 391 | for key, tokens in self.corpus.items(): 392 | if len(tokens) < self.max_length: 393 | tokens.extend([self.dict['<pad>']]*(self.max_length-len(tokens))) 394 | # print(self.corpus) 395 | # print(self.unknown) 396 | 397 | def read_images(self, folder_path): 398 | assert len(os.listdir(folder_path)) >= self.frames, "Too few images in your data folder: " + str(folder_path) 399 | images = [] 400 | start = 1 401 | step = int(len(os.listdir(folder_path))/self.frames) 402 | for i in range(self.frames): 403 | image = Image.open(os.path.join(folder_path, '{:06d}.jpg').format(start+i*step)) #.convert('L') 404 | if self.transform is not None: 405 | image = self.transform(image) 406 | images.append(image) 407 | 408 | images = torch.stack(images, dim=0) 409 | # switch dimension 410 | images = images.permute(1, 0, 2, 3) 411 | # print(images.shape) 412 | return images 413 | 414 | def __len__(self): 415 | return self.num_sentences *
self.videos_per_folder 416 | 417 | def __getitem__(self, idx): 418 | top_folder = self.data_folder[int(idx/self.videos_per_folder)] 419 | selected_folders = [os.path.join(top_folder, item) for item in os.listdir(top_folder)] 420 | selected_folders = sorted([item for item in selected_folders if os.path.isdir(item)]) 421 | if self.train: 422 | selected_folder = selected_folders[idx%self.videos_per_folder] 423 | else: 424 | selected_folder = selected_folders[idx%self.videos_per_folder + int(0.8*self.signers*self.repetition)] 425 | images = self.read_images(selected_folder) 426 | # print(selected_folder, int(idx/self.videos_per_folder)) 427 | # print(self.corpus['{:06d}'.format(int(idx/self.videos_per_folder))]) 428 | tokens = torch.LongTensor(self.corpus['{:06d}'.format(int(idx/self.videos_per_folder))]) 429 | 430 | return images, tokens 431 | 432 | 433 | # Test 434 | if __name__ == '__main__': 435 | transform = transforms.Compose([transforms.Resize([128, 128]), transforms.ToTensor()]) 436 | # dataset = CSL_Isolated(data_path="/home/haodong/Data/CSL_Isolated/color_video_125000", 437 | # label_path='/home/haodong/Data/CSL_Isolated/dictionary.txt', transform=transform) # print(len(dataset)) 438 | # print(dataset[1000]['images'].shape) 439 | # dataset = CSL_Skeleton(data_path="/home/haodong/Data/CSL_Isolated/xf500_body_depth_txt", 440 | # label_path="/home/haodong/Data/CSL_Isolated/dictionary.txt", selected_joints=['SPINEBASE', 'SPINEMID', 'HANDTIPRIGHT'], split_to_channels=True) 441 | # print(dataset[1000]) 442 | # label = dataset[1000]['label'] 443 | # print(dataset.label_to_word(label)) 444 | # dataset[1000] 445 | dataset = CSL_Continuous( 446 | data_path="/home/haodong/Data/CSL_Continuous/color", 447 | dict_path="/home/haodong/Data/CSL_Continuous/dictionary.txt", 448 | corpus_path="/home/haodong/Data/CSL_Continuous/corpus.txt", 449 | train=True, transform=transform 450 | ) 451 | # dataset = CSL_Continuous_Char( 452 | # data_path="/home/haodong/Data/CSL_Continuous/color", 453 | # corpus_path="/home/haodong/Data/CSL_Continuous/corpus.txt", 454 | # train=True, transform=transform 455 | # ) 456 | print(len(dataset)) 457 | images, tokens = dataset[1000] 458 | print(images.shape, tokens) 459 | print(dataset.output_dim) 460 | -------------------------------------------------------------------------------- /log/3dresnet18_100_acc.svg: -------------------------------------------------------------------------------- 1 | [SVG plot: accuracy curve for 3D ResNet-18, 100 classes; only axis tick labels survive text extraction] -------------------------------------------------------------------------------- /log/3dresnet18_100_loss.svg: -------------------------------------------------------------------------------- 1 | [SVG plot: loss curve for 3D ResNet-18, 100 classes; only axis tick labels survive text extraction] -------------------------------------------------------------------------------- /log/3dresnet34_100_acc.svg: -------------------------------------------------------------------------------- 1 | [SVG plot: accuracy curve for 3D ResNet-34, 100 classes; only axis tick labels survive text extraction] -------------------------------------------------------------------------------- /log/3dresnet34_100_loss.svg: -------------------------------------------------------------------------------- 1 | [SVG plot: loss curve for 3D ResNet-34, 100 classes; only axis tick labels survive text extraction] -------------------------------------------------------------------------------- /models/Attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | """ 6 |
Attention blocks 7 | Reference: Learn To Pay Attention 8 | """ 9 | class ProjectorBlock(nn.Module): 10 | def __init__(self, in_channels, out_channels): 11 | super(ProjectorBlock, self).__init__() 12 | self.op = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, 13 | kernel_size=1, padding=0, bias=False) 14 | 15 | def forward(self, x): 16 | return self.op(x) 17 | 18 | 19 | class ProjectorBlock3D(nn.Module): 20 | def __init__(self, in_channels, out_channels): 21 | super(ProjectorBlock3D, self).__init__() 22 | self.op = nn.Conv3d(in_channels=in_channels, out_channels=out_channels, 23 | kernel_size=1, padding=0, bias=False) 24 | 25 | def forward(self, x): 26 | return self.op(x) 27 | 28 | 29 | class LinearAttentionBlock(nn.Module): 30 | def __init__(self, in_channels, normalize_attn=True): 31 | super(LinearAttentionBlock, self).__init__() 32 | self.normalize_attn = normalize_attn 33 | self.op = nn.Conv2d(in_channels=in_channels, out_channels=1, 34 | kernel_size=1, padding=0, bias=False) 35 | 36 | def forward(self, l, g): 37 | N, C, H, W = l.size() 38 | c = self.op(l+g) # (batch_size,1,H,W) 39 | if self.normalize_attn: 40 | a = F.softmax(c.view(N,1,-1), dim=2).view(N,1,H,W) 41 | else: 42 | a = torch.sigmoid(c) 43 | g = torch.mul(a.expand_as(l), l) 44 | if self.normalize_attn: 45 | g = g.view(N,C,-1).sum(dim=2) # (batch_size,C) 46 | else: 47 | g = F.adaptive_avg_pool2d(g, (1,1)).view(N,C) 48 | return c.view(N,1,H,W), g 49 | 50 | 51 | class LinearAttentionBlock3D(nn.Module): 52 | def __init__(self, in_channels, normalize_attn=True): 53 | super(LinearAttentionBlock3D, self).__init__() 54 | self.normalize_attn = normalize_attn 55 | self.op = nn.Conv3d(in_channels=in_channels, out_channels=1, 56 | kernel_size=1, padding=0, bias=False) 57 | 58 | def forward(self, l, g): 59 | N, C, T, H, W = l.size() 60 | c = self.op(l+g) # (batch_size,1,T,H,W) 61 | if self.normalize_attn: 62 | a = F.softmax(c.view(N,1,-1), dim=2).view(N,1,T,H,W) 63 | else: 64 | a = torch.sigmoid(c) 65 | g = torch.mul(a.expand_as(l), l) 66 | if self.normalize_attn: 67 | g = g.view(N,C,-1).sum(dim=2) # (batch_size,C) 68 | else: 69 | g = F.adaptive_avg_pool3d(g, (1,1,1)).view(N,C) 70 | return c.view(N,1,T,H,W), g 71 | 72 | """ 73 | Dense attention block 74 | Reference: https://github.com/philipperemy/keras-attention-mechanism 75 | """ 76 | class LSTMAttentionBlock(nn.Module): 77 | def __init__(self, hidden_size): 78 | super(LSTMAttentionBlock, self).__init__() 79 | self.hidden_size = hidden_size 80 | self.fc1 = nn.Linear(self.hidden_size, self.hidden_size, bias=False) 81 | self.fc2 = nn.Linear(self.hidden_size*2, self.hidden_size, bias=False) 82 | 83 | def forward(self, hidden_states): 84 | # (batch_size, time_steps, hidden_size) 85 | score_first_part = self.fc1(hidden_states) 86 | # (batch_size, hidden_size) 87 | h_t = hidden_states[:,-1,:] 88 | # (batch_size, time_steps) 89 | score = torch.bmm(score_first_part, h_t.unsqueeze(2)).squeeze(2) 90 | attention_weights = F.softmax(score, dim=1) 91 | # (batch_size, hidden_size) 92 | context_vector = torch.bmm(hidden_states.permute(0,2,1), attention_weights.unsqueeze(2)).squeeze(2) 93 | # (batch_size, hidden_size*2) 94 | pre_activation = torch.cat((context_vector, h_t), dim=1) 95 | # (batch_size, hidden_size) 96 | attention_vector = self.fc2(pre_activation) 97 | attention_vector = torch.tanh(attention_vector) 98 | 99 | return attention_vector 100 | 101 | # Test 102 | if __name__ == '__main__': 103 | # 2d block 104 | attention_block = LinearAttentionBlock(in_channels=3) 105 | l = 
torch.randn(16, 3, 128, 128) 106 | g = torch.randn(16, 3, 128, 128) 107 | print(attention_block(l, g)) 108 | # 3d block 109 | attention_block_3d = LinearAttentionBlock3D(in_channels=3) 110 | l = torch.randn(16, 3, 16, 128, 128) 111 | g = torch.randn(16, 3, 16, 128, 128) 112 | print(attention_block_3d(l, g)) 113 | # LSTM block 114 | attention_block_lstm = LSTMAttentionBlock(hidden_size=256) 115 | hidden_states = torch.randn(32, 16, 256) 116 | print(attention_block_lstm(hidden_states).shape) 117 | -------------------------------------------------------------------------------- /models/Conv3D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from torch.hub import load_state_dict_from_url 7 | import torchvision 8 | from functools import partial 9 | from collections import OrderedDict 10 | import math 11 | 12 | import os,inspect,sys 13 | currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 14 | sys.path.insert(0,currentdir) 15 | from Attention import ProjectorBlock3D, LinearAttentionBlock3D 16 | 17 | """ 18 | Implementation of 3D CNN. 19 | """ 20 | class CNN3D(nn.Module): 21 | def __init__(self, sample_size=128, sample_duration=16, drop_p=0.0, hidden1=512, hidden2=256, num_classes=100): 22 | super(CNN3D, self).__init__() 23 | self.sample_size = sample_size 24 | self.sample_duration = sample_duration 25 | self.num_classes = num_classes 26 | 27 | # network params 28 | self.ch1, self.ch2, self.ch3 = 32, 48, 48 29 | self.k1, self.k2, self.k3 = (3,7,7), (3,7,7), (3,5,5) 30 | self.s1, self.s2, self.s3 = (2,2,2), (2,2,2), (2,2,2) 31 | self.p1, self.p2, self.p3 = (0,0,0), (0,0,0), (0,0,0) 32 | self.d1, self.d2, self.d3 = (1,1,1), (1,1,1), (1,1,1) 33 | self.hidden1, self.hidden2 = hidden1, hidden2 34 | self.drop_p = drop_p 35 | self.pool_k, self.pool_s, self.pool_p, self.pool_d = (1,2,2), (1,2,2), (0,0,0), (1,1,1) 36 | # Conv1 37 | self.conv1_output_shape = self.compute_output_shape(self.sample_duration, self.sample_size, 38 | self.sample_size, self.k1, self.s1, self.p1, self.d1) 39 | # self.conv1_output_shape = self.compute_output_shape(self.conv1_output_shape[0], self.conv1_output_shape[1], 40 | # self.conv1_output_shape[2], self.pool_k, self.pool_s, self.pool_p, self.pool_d) 41 | # Conv2 42 | self.conv2_output_shape = self.compute_output_shape(self.conv1_output_shape[0], self.conv1_output_shape[1], 43 | self.conv1_output_shape[2], self.k2, self.s2, self.p2, self.d2) 44 | # self.conv2_output_shape = self.compute_output_shape(self.conv2_output_shape[0], self.conv2_output_shape[1], 45 | # self.conv2_output_shape[2], self.pool_k, self.pool_s, self.pool_p, self.pool_d) 46 | # Conv3 47 | self.conv3_output_shape = self.compute_output_shape(self.conv2_output_shape[0], self.conv2_output_shape[1], 48 | self.conv2_output_shape[2], self.k3, self.s3, self.p3, self.d3) 49 | # print(self.conv1_output_shape, self.conv2_output_shape, self.conv3_output_shape) 50 | 51 | # network architecture 52 | # in_channels=1 for grayscale, 3 for rgb 53 | self.conv1 = nn.Conv3d(in_channels=3, out_channels=self.ch1, kernel_size=self.k1, 54 | stride=self.s1, padding=self.p1, dilation=self.d1) 55 | self.bn1 = nn.BatchNorm3d(self.ch1) 56 | self.conv2 = nn.Conv3d(in_channels=self.ch1, out_channels=self.ch2, kernel_size=self.k2, 57 | stride=self.s2, padding=self.p2, dilation=self.d2) 58 | self.bn2 = nn.BatchNorm3d(self.ch2) 59 
| self.conv3 = nn.Conv3d(in_channels=self.ch2, out_channels=self.ch3, kernel_size=self.k3, 60 | stride=self.s3, padding=self.p3, dilation=self.d3) 61 | self.bn3 = nn.BatchNorm3d(self.ch3) 62 | self.relu = nn.ReLU(inplace=True) 63 | self.drop = nn.Dropout3d(p=self.drop_p) 64 | self.pool = nn.MaxPool3d(kernel_size=self.pool_k) 65 | self.fc1 = nn.Linear(self.ch3 * self.conv3_output_shape[0] * self.conv3_output_shape[1] * self.conv3_output_shape[2], self.hidden1) 66 | self.fc2 = nn.Linear(self.hidden1, self.hidden2) 67 | self.fc3 = nn.Linear(self.hidden2, self.num_classes) 68 | 69 | def forward(self, x): 70 | # Conv1 71 | x = self.conv1(x) 72 | x = self.bn1(x) 73 | x = self.relu(x) 74 | # x = self.pool(x) 75 | # x = self.drop(x) 76 | # Conv2 77 | x = self.conv2(x) 78 | x = self.bn2(x) 79 | x = self.relu(x) 80 | # x = self.pool(x) 81 | # x = self.drop(x) 82 | # Conv3 83 | x = self.conv3(x) 84 | x = self.bn3(x) 85 | x = self.relu(x) 86 | # x = self.drop(x) 87 | # MLP 88 | # print(x.shape) 89 | # x.size(0) ------ batch_size 90 | x = x.view(x.size(0), -1) 91 | x = F.relu(self.fc1(x)) 92 | x = F.relu(self.fc2(x)) 93 | x = F.dropout(x, p=self.drop_p, training=self.training) 94 | x = self.fc3(x) 95 | 96 | return x 97 | 98 | def compute_output_shape(self, D_in, H_in, W_in, k, s, p, d): 99 | # Conv 100 | D_out = np.floor((D_in + 2*p[0] - d[0]*(k[0] - 1) - 1)/s[0] + 1).astype(int) 101 | H_out = np.floor((H_in + 2*p[1] - d[1]*(k[1] - 1) - 1)/s[1] + 1).astype(int) 102 | W_out = np.floor((W_in + 2*p[2] - d[2]*(k[2] - 1) - 1)/s[2] + 1).astype(int) 103 | 104 | return D_out, H_out, W_out 105 | 106 | 107 | """ 108 | Implementation of 3D Resnet 109 | Reference: Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet? 110 | """ 111 | class BasicBlock(nn.Module): 112 | expansion = 1 113 | # planes refer to the number of feature maps 114 | def __init__(self, inplanes, planes, stride=1, downsample=None): 115 | super(BasicBlock, self).__init__() 116 | self.stride = stride 117 | self.downsample = downsample 118 | self.conv1 = nn.Conv3d( 119 | inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 120 | self.bn1 = nn.BatchNorm3d(planes) 121 | self.relu = nn.ReLU(inplace=True) 122 | self.conv2 = nn.Conv3d( 123 | planes, planes, kernel_size=3, padding=1, bias=False) 124 | self.bn2 = nn.BatchNorm3d(planes) 125 | 126 | def forward(self, x): 127 | residual = x 128 | # conv1 129 | out = self.conv1(x) 130 | out = self.bn1(out) 131 | out = self.relu(out) 132 | # conv2 133 | out = self.conv2(out) 134 | out = self.bn2(out) 135 | # downsample 136 | if self.downsample is not None: 137 | residual = self.downsample(x) 138 | 139 | # print(out.shape, residual.shape) 140 | out += residual 141 | out = self.relu(out) 142 | 143 | return out 144 | 145 | 146 | class Bottleneck(nn.Module): 147 | expansion = 4 148 | # planes refer to the number of feature maps 149 | def __init__(self, inplanes, planes, stride=1, downsample=None): 150 | super(Bottleneck, self).__init__() 151 | self.stride = stride 152 | self.downsample = downsample 153 | self.conv1 = nn.Conv3d( 154 | inplanes, planes, kernel_size=1, bias=False) # kernal_size=1 don't need padding 155 | self.bn1 = nn.BatchNorm3d(planes) 156 | self.conv2 = nn.Conv3d( 157 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 158 | self.bn2 = nn.BatchNorm3d(planes) 159 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 160 | self.bn3 = nn.BatchNorm3d(planes * 4) 161 | self.relu = nn.ReLU(inplace=True) 162 | 163 | def 
forward(self, x): 164 | residual = x 165 | # conv1 166 | out = self.conv1(x) 167 | out = self.bn1(out) 168 | out = self.relu(out) 169 | # conv2 170 | out = self.conv2(out) 171 | out = self.bn2(out) 172 | out = self.relu(out) 173 | # conv3 174 | out = self.conv3(out) 175 | out = self.bn3(out) 176 | # downsample 177 | if self.downsample is not None: 178 | residual = self.downsample(x) 179 | 180 | # print(out.shape, residual.shape) 181 | out += residual 182 | out = self.relu(out) 183 | 184 | return out 185 | 186 | 187 | def downsample_basic_block(x, planes, stride): 188 | # decrease data resolution if stride not equals to 1 189 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 190 | # shape: (batch_size, channel, t, h, w) 191 | # try to match the channel size 192 | zero_pads = torch.Tensor( 193 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 194 | out.size(4)).zero_() 195 | if isinstance(out.data, torch.cuda.FloatTensor): 196 | zero_pads = zero_pads.cuda() 197 | 198 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 199 | 200 | return out 201 | 202 | 203 | class ResNet(nn.Module): 204 | def __init__(self, block, layers, shortcut_type, sample_size, sample_duration, attention=False, num_classes=500): 205 | super(ResNet, self).__init__() 206 | # initialize inplanes to 64, it'll be changed later 207 | self.inplanes = 64 208 | self.conv1 = nn.Conv3d( 209 | 3, 64, kernel_size=7, stride=(1, 2, 2), padding=(3, 3, 3), bias=False) 210 | self.bn1 = nn.BatchNorm3d(64) 211 | self.relu = nn.ReLU(inplace=True) 212 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 213 | # layers refers to the number of blocks in each layer 214 | self.layer1 = self._make_layer( 215 | block, 64, layers[0], shortcut_type, stride=1) 216 | self.layer2 = self._make_layer( 217 | block, 128, layers[1], shortcut_type, stride=2) 218 | self.layer3 = self._make_layer( 219 | block, 256, layers[2], shortcut_type, stride=2) 220 | self.layer4 = self._make_layer( 221 | block, 512, layers[3], shortcut_type, stride=2) 222 | # calclatue kernal size for average pooling 223 | last_duration = int(math.ceil(sample_duration / 16)) 224 | last_size = int(math.ceil(sample_size / 32)) 225 | self.avgpool = nn.AvgPool3d( 226 | (last_duration, last_size, last_size), stride=1) 227 | # attention blocks 228 | self.attention = attention 229 | if self.attention: 230 | self.attn1 = LinearAttentionBlock3D(in_channels=512*block.expansion, normalize_attn=True) 231 | self.attn2 = LinearAttentionBlock3D(in_channels=512*block.expansion, normalize_attn=True) 232 | self.attn3 = LinearAttentionBlock3D(in_channels=512*block.expansion, normalize_attn=True) 233 | self.attn4 = LinearAttentionBlock3D(in_channels=512*block.expansion, normalize_attn=True) 234 | self.projector1 = ProjectorBlock3D(in_channels=64*block.expansion, out_channels=512*block.expansion) 235 | self.projector2 = ProjectorBlock3D(in_channels=128*block.expansion, out_channels=512*block.expansion) 236 | self.projector3 = ProjectorBlock3D(in_channels=256*block.expansion, out_channels=512*block.expansion) 237 | self.fc = nn.Linear(512 * block.expansion * 4, num_classes) 238 | else: 239 | self.fc = nn.Linear(512 * block.expansion, num_classes) 240 | # init the weights 241 | for m in self.modules(): 242 | if isinstance(m, nn.Conv3d): 243 | m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out') 244 | elif isinstance(m, nn.BatchNorm3d): 245 | m.weight.data.fill_(1) 246 | m.bias.data.zero_() 247 | 248 | def _make_layer(self, block, planes, blocks, 
shortcut_type, stride): 249 | downsample = None 250 | # when the in-channel and the out-channel dismatch, downsample!!! 251 | if stride != 1 or self.inplanes != planes * block.expansion: 252 | # stride once for downsample and block. 253 | if shortcut_type == 'A': 254 | downsample = partial( 255 | downsample_basic_block, 256 | planes=planes * block.expansion, 257 | stride=stride) 258 | else: 259 | downsample = nn.Sequential( 260 | nn.Conv3d( 261 | self.inplanes, 262 | planes * block.expansion, 263 | kernel_size=1, 264 | stride=stride, 265 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 266 | 267 | layers = [] 268 | # only the first block needs downsample. 269 | layers.append(block(self.inplanes, planes, stride, downsample)) 270 | # change inplanes for the next layer 271 | self.inplanes = planes * block.expansion 272 | for i in range(1, blocks): 273 | layers.append(block(self.inplanes, planes)) 274 | 275 | return nn.Sequential(*layers) 276 | 277 | def forward(self, x): 278 | x = self.conv1(x) 279 | x = self.bn1(x) 280 | x = self.relu(x) 281 | x = self.maxpool(x) 282 | 283 | l1 = self.layer1(x) 284 | l2 = self.layer2(l1) 285 | l3 = self.layer3(l2) 286 | l4 = self.layer4(l3) 287 | 288 | g = self.avgpool(l4) 289 | # attention 290 | if self.attention: 291 | # print(l1.shape, l2.shape, l3.shape, l4.shape, g.shape) 292 | c1, g1 = self.attn1(self.projector1(l1), g) 293 | c2, g2 = self.attn2(self.projector2(l2), g) 294 | c3, g3 = self.attn3(self.projector3(l3), g) 295 | c4, g4 = self.attn4(l4, g) 296 | g = torch.cat((g1,g2,g3,g4), dim=1) 297 | x = self.fc(g) 298 | else: 299 | c1, c2, c3, c4 = None, None, None, None 300 | # x.size(0) ------ batch_size 301 | g = g.view(g.size(0), -1) 302 | x = self.fc(g) 303 | 304 | return [x, c1, c2, c3, c4] 305 | 306 | def load_my_state_dict(self, state_dict): 307 | my_state_dict = self.state_dict() 308 | for name, param in state_dict.items(): 309 | if name == 'fc.weight' or name == 'fc.bias': 310 | continue 311 | my_state_dict[name].copy_(param.data) 312 | 313 | 314 | model_urls = { 315 | 'resnet18': 'https://www.jianguoyun.com/c/dl-file/resnet-18-kinetics.pth?dt=q67aev&kv=YXF6QHpqdS5lZHUuY24&sd=a54cr&ud=B8Sbfz0nRv1pG8YNAbo0KiCnzvJHDsLYQsWjtT4b1j8&vr=1', 316 | 'resnet34': 'https://www.jianguoyun.com/c/dl-file/resnet-34-kinetics.pth?dt=q67acv&kv=YXF6QHpqdS5lZHUuY24&sd=a54cr&ud=BftTcvolMjyywptfxelwwjXJksCaU0ektvfMwCbMD1I&vr=1', 317 | 'resnet50': 'https://www.jianguoyun.com/c/dl-file/resnet-50-kinetics.pth?dt=q67atr&kv=YXF6QHpqdS5lZHUuY24&sd=a54cr&ud=uKpTbIK63qX3bHs2weOGqYYc2gtssQi-o7UqpoTaG6Q&vr=1', 318 | 'resnet101': '', 319 | 'resnet152': '', 320 | 'resnet200': '', 321 | } 322 | 323 | 324 | def resnet18(pretrained=False, progress=True, **kwargs): 325 | """Constructs a ResNet-18 model. 326 | """ 327 | model = ResNet(BasicBlock, [2, 2, 2, 2], shortcut_type='A', **kwargs) 328 | if pretrained: 329 | checkpoint = load_state_dict_from_url(model_urls['resnet18'], 330 | progress=progress) 331 | state_dict = checkpoint['state_dict'] 332 | 333 | new_state_dict = OrderedDict() 334 | for k, v in state_dict.items(): 335 | name = k[7:] # remove 'module.' 336 | new_state_dict[name]=v 337 | model.load_my_state_dict(new_state_dict) 338 | 339 | return model 340 | 341 | 342 | def resnet34(pretrained=False, progress=True, **kwargs): 343 | """Constructs a ResNet-34 model. 
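    Keyword arguments such as ``sample_size``, ``sample_duration``, ``attention`` and
    ``num_classes`` are forwarded unchanged to :class:`ResNet` (BasicBlock, layers
    [3, 4, 6, 3], shortcut type 'A'). With ``pretrained=True`` the Kinetics checkpoint
    referenced in ``model_urls['resnet34']`` is fetched, its ``module.`` key prefix is
    stripped, and the weights are loaded through ``load_my_state_dict``, which skips the
    final ``fc`` layer so that ``num_classes`` may differ from the checkpoint's.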
344 | """ 345 | model = ResNet(BasicBlock, [3, 4, 6, 3], shortcut_type='A', **kwargs) 346 | if pretrained: 347 | checkpoint = load_state_dict_from_url(model_urls['resnet34'], 348 | progress=progress) 349 | state_dict = checkpoint['state_dict'] 350 | 351 | new_state_dict = OrderedDict() 352 | for k, v in state_dict.items(): 353 | name = k[7:] # remove 'module.' 354 | new_state_dict[name]=v 355 | model.load_my_state_dict(new_state_dict) 356 | 357 | return model 358 | 359 | 360 | def resnet50(pretrained=False, progress=True, **kwargs): 361 | """Constructs a ResNet-50 model. 362 | """ 363 | model = ResNet(Bottleneck, [3, 4, 6, 3], shortcut_type='B', **kwargs) 364 | if pretrained: 365 | checkpoint = load_state_dict_from_url(model_urls['resnet50'], 366 | progress=progress) 367 | state_dict = checkpoint['state_dict'] 368 | 369 | new_state_dict = OrderedDict() 370 | for k, v in state_dict.items(): 371 | name = k[7:] # remove 'module.' 372 | new_state_dict[name]=v 373 | model.load_my_state_dict(new_state_dict) 374 | 375 | return model 376 | 377 | 378 | def resnet101(pretrained=False, progress=True, **kwargs): 379 | """Constructs a ResNet-101 model. 380 | """ 381 | model = ResNet(Bottleneck, [3, 4, 23, 3], shortcut_type='B', **kwargs) 382 | if pretrained: 383 | checkpoint = load_state_dict_from_url(model_urls['resnet101'], 384 | progress=progress) 385 | state_dict = checkpoint['state_dict'] 386 | 387 | new_state_dict = OrderedDict() 388 | for k, v in state_dict.items(): 389 | name = k[7:] # remove 'module.' 390 | new_state_dict[name]=v 391 | model.load_my_state_dict(new_state_dict) 392 | 393 | return model 394 | 395 | 396 | def resnet152(pretrained=False, progress=True, **kwargs): 397 | """Constructs a ResNet-101 model. 398 | """ 399 | model = ResNet(Bottleneck, [3, 8, 36, 3], shortcut_type='B', **kwargs) 400 | if pretrained: 401 | checkpoint = load_state_dict_from_url(model_urls['resnet152'], 402 | progress=progress) 403 | state_dict = checkpoint['state_dict'] 404 | 405 | new_state_dict = OrderedDict() 406 | for k, v in state_dict.items(): 407 | name = k[7:] # remove 'module.' 408 | new_state_dict[name]=v 409 | model.load_my_state_dict(new_state_dict) 410 | 411 | return model 412 | 413 | 414 | def resnet200(pretrained=False, progress=True, **kwargs): 415 | """Constructs a ResNet-101 model. 416 | """ 417 | model = ResNet(Bottleneck, [3, 24, 36, 3], shortcut_type='B', **kwargs) 418 | if pretrained: 419 | checkpoint = load_state_dict_from_url(model_urls['resnet200'], 420 | progress=progress) 421 | state_dict = checkpoint['state_dict'] 422 | 423 | new_state_dict = OrderedDict() 424 | for k, v in state_dict.items(): 425 | name = k[7:] # remove 'module.' 
426 | new_state_dict[name]=v 427 | model.load_my_state_dict(new_state_dict) 428 | 429 | return model 430 | 431 | 432 | """ 433 | 3D CNN Models from torchvision.models 434 | Reference: https://pytorch.org/docs/stable/torchvision/models.html#video-classification 435 | """ 436 | class r3d_18(nn.Module): 437 | def __init__(self, pretrained=True, num_classes=500): 438 | super(r3d_18, self).__init__() 439 | self.pretrained = pretrained 440 | self.num_classes = num_classes 441 | model = torchvision.models.video.r3d_18(pretrained=self.pretrained) 442 | # delete the last fc layer 443 | modules = list(model.children())[:-1] 444 | # print(modules) 445 | self.r3d_18 = nn.Sequential(*modules) 446 | self.fc1 = nn.Linear(model.fc.in_features, self.num_classes) 447 | 448 | def forward(self, x): 449 | out = self.r3d_18(x) 450 | # print(out.shape) 451 | # Flatten the layer to fc 452 | out = out.flatten(1) 453 | out = self.fc1(out) 454 | 455 | return out 456 | 457 | 458 | class mc3_18(nn.Module): 459 | def __init__(self, pretrained=True, num_classes=500): 460 | super(mc3_18, self).__init__() 461 | self.pretrained = pretrained 462 | self.num_classes = num_classes 463 | model = torchvision.models.video.mc3_18(pretrained=self.pretrained) 464 | # delete the last fc layer 465 | modules = list(model.children())[:-1] 466 | # print(modules) 467 | self.mc3_18 = nn.Sequential(*modules) 468 | self.fc1 = nn.Linear(model.fc.in_features, self.num_classes) 469 | 470 | def forward(self, x): 471 | out = self.mc3_18(x) 472 | # print(out.shape) 473 | # Flatten the layer to fc 474 | out = out.flatten(1) 475 | out = self.fc1(out) 476 | 477 | return out 478 | 479 | 480 | class r2plus1d_18(nn.Module): 481 | def __init__(self, pretrained=True, num_classes=500): 482 | super(r2plus1d_18, self).__init__() 483 | self.pretrained = pretrained 484 | self.num_classes = num_classes 485 | model = torchvision.models.video.r2plus1d_18(pretrained=self.pretrained) 486 | # delete the last fc layer 487 | modules = list(model.children())[:-1] 488 | # print(modules) 489 | self.r2plus1d_18 = nn.Sequential(*modules) 490 | self.fc1 = nn.Linear(model.fc.in_features, self.num_classes) 491 | 492 | def forward(self, x): 493 | out = self.r2plus1d_18(x) 494 | # print(out.shape) 495 | # Flatten the layer to fc 496 | out = out.flatten(1) 497 | out = self.fc1(out) 498 | 499 | return out 500 | 501 | 502 | # Test 503 | if __name__ == '__main__': 504 | import sys 505 | sys.path.append("..") 506 | import torchvision.transforms as transforms 507 | from dataset import CSL_Isolated 508 | sample_size = 128 509 | sample_duration = 16 510 | num_classes = 500 511 | transform = transforms.Compose([transforms.Resize([sample_size, sample_size]), transforms.ToTensor()]) 512 | dataset = CSL_Isolated(data_path="/home/haodong/Data/CSL_Isolated/color_video_125000", 513 | label_path="/home/haodong/Data/CSL_Isolated/dictionary.txt", frames=sample_duration, 514 | num_classes=num_classes, transform=transform) 515 | # cnn3d = CNN3D(sample_size=sample_size, sample_duration=sample_duration, num_classes=num_classes) 516 | cnn3d = resnet50(pretrained=True, progress=True, sample_size=sample_size, sample_duration=sample_duration, attention=True, num_classes=num_classes) 517 | # cnn3d = r3d_18(pretrained=True, num_classes=num_classes) 518 | # cnn3d = mc3_18(pretrained=True, num_classes=num_classes) 519 | # cnn3d = r2plus1d_18(pretrained=True, num_classes=num_classes) 520 | # print(dataset[0]['images'].shape) 521 | print(cnn3d(dataset[0]['data'].unsqueeze(0))) 522 | 523 | # Test for loading 
pretrained models 524 | # state_dict = torch.load('resnet-18-kinetics.pth') 525 | # for name, param in state_dict.items(): 526 | # print(name) 527 | # # print(state_dict['arch']) 528 | # # print(state_dict['optimizer']) 529 | # # print(state_dict['epoch']) 530 | -------------------------------------------------------------------------------- /models/ConvLSTM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torchvision.models as models 6 | import math 7 | 8 | import os,inspect,sys 9 | currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 10 | sys.path.insert(0,currentdir) 11 | from Attention import LSTMAttentionBlock 12 | 13 | """ 14 | Implementation of CNN+LSTM. 15 | """ 16 | class CRNN(nn.Module): 17 | def __init__(self, sample_size=256, sample_duration=16, num_classes=100, 18 | lstm_hidden_size=512, lstm_num_layers=1): 19 | super(CRNN, self).__init__() 20 | self.sample_size = sample_size 21 | self.sample_duration = sample_duration 22 | self.num_classes = num_classes 23 | 24 | # network params 25 | self.ch1, self.ch2, self.ch3, self.ch4 = 64, 128, 256, 512 26 | self.k1, self.k2, self.k3, self.k4 = (7, 7), (3, 3), (3, 3), (3, 3) 27 | self.s1, self.s2, self.s3, self.s4 = (2, 2), (1, 1), (1, 1), (1, 1) 28 | self.p1, self.p2, self.p3, self.p4 = (0, 0), (0, 0), (0, 0), (0, 0) 29 | self.d1, self.d2, self.d3, self.d4 = (1, 1), (1, 1), (1, 1), (1, 1) 30 | self.lstm_input_size = self.ch4 31 | self.lstm_hidden_size = lstm_hidden_size 32 | self.lstm_num_layers = lstm_num_layers 33 | 34 | # network architecture 35 | # in_channels=3 for rgb 36 | self.conv1 = nn.Sequential( 37 | nn.Conv2d(in_channels=3, out_channels=self.ch1, kernel_size=self.k1, stride=self.s1, padding=self.p1, dilation=self.d1), 38 | nn.BatchNorm2d(self.ch1, momentum=0.01), 39 | nn.ReLU(inplace=True), 40 | nn.Conv2d(in_channels=self.ch1, out_channels=self.ch1, kernel_size=1, stride=1), 41 | nn.MaxPool2d(kernel_size=2), 42 | ) 43 | self.conv2 = nn.Sequential( 44 | nn.Conv2d(in_channels=self.ch1, out_channels=self.ch2, kernel_size=self.k2, stride=self.s2, padding=self.p2, dilation=self.d2), 45 | nn.BatchNorm2d(self.ch2, momentum=0.01), 46 | nn.ReLU(inplace=True), 47 | nn.Conv2d(in_channels=self.ch2, out_channels=self.ch2, kernel_size=1, stride=1), 48 | nn.MaxPool2d(kernel_size=2), 49 | ) 50 | self.conv3 = nn.Sequential( 51 | nn.Conv2d(in_channels=self.ch2, out_channels=self.ch3, kernel_size=self.k3, stride=self.s3, padding=self.p3, dilation=self.d3), 52 | nn.BatchNorm2d(self.ch3, momentum=0.01), 53 | nn.ReLU(inplace=True), 54 | nn.Conv2d(in_channels=self.ch3, out_channels=self.ch3, kernel_size=1, stride=1), 55 | nn.MaxPool2d(kernel_size=2), 56 | ) 57 | self.conv4 = nn.Sequential( 58 | nn.Conv2d(in_channels=self.ch3, out_channels=self.ch4, kernel_size=self.k4, stride=self.s4, padding=self.p4, dilation=self.d4), 59 | nn.BatchNorm2d(self.ch4, momentum=0.01), 60 | nn.ReLU(inplace=True), 61 | nn.Conv2d(in_channels=self.ch4, out_channels=self.ch4, kernel_size=1, stride=1), 62 | nn.AdaptiveAvgPool2d((1,1)), 63 | ) 64 | self.lstm = nn.LSTM( 65 | input_size=self.lstm_input_size, 66 | hidden_size=self.lstm_hidden_size, 67 | num_layers=self.lstm_num_layers, 68 | batch_first=True, 69 | ) 70 | self.fc1 = nn.Linear(self.lstm_hidden_size, self.num_classes) 71 | 72 | def forward(self, x): 73 | # CNN 74 | cnn_embed_seq = [] 75 | # print(x.shape) 76 | # x: (batch_size, 
channel, t, h, w) 77 | for t in range(x.size(2)): 78 | # Conv 79 | out = self.conv1(x[:, :, t, :, :]) 80 | out = self.conv2(out) 81 | out = self.conv3(out) 82 | out = self.conv4(out) 83 | # print(out.shape) 84 | out = out.view(out.size(0), -1) 85 | cnn_embed_seq.append(out) 86 | 87 | cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0) 88 | # print(cnn_embed_seq.shape) 89 | # batch first 90 | cnn_embed_seq = cnn_embed_seq.transpose_(0, 1) 91 | 92 | # LSTM 93 | # use faster code paths 94 | self.lstm.flatten_parameters() 95 | out, (h_n, c_n) = self.lstm(cnn_embed_seq, None) 96 | # MLP 97 | # out: (batch, seq, feature), choose the last time step 98 | out = self.fc1(out[:, -1, :]) 99 | 100 | return out 101 | 102 | 103 | """ 104 | Implementation of Resnet+LSTM 105 | """ 106 | class ResCRNN(nn.Module): 107 | def __init__(self, sample_size=256, sample_duration=16, num_classes=100, 108 | lstm_hidden_size=512, lstm_num_layers=1, arch="resnet18", 109 | attention=False): 110 | super(ResCRNN, self).__init__() 111 | self.sample_size = sample_size 112 | self.sample_duration = sample_duration 113 | self.num_classes = num_classes 114 | 115 | # network params 116 | self.lstm_hidden_size = lstm_hidden_size 117 | self.lstm_num_layers = lstm_num_layers 118 | self.attention = attention 119 | 120 | # network architecture 121 | if arch == "resnet18": 122 | resnet = models.resnet18(pretrained=True) 123 | elif arch == "resnet34": 124 | resnet = models.resnet34(pretrained=True) 125 | elif arch == "resnet50": 126 | resnet = models.resnet50(pretrained=True) 127 | elif arch == "resnet101": 128 | resnet = models.resnet101(pretrained=True) 129 | elif arch == "resnet152": 130 | resnet = models.resnet152(pretrained=True) 131 | # delete the last fc layer 132 | modules = list(resnet.children())[:-1] 133 | self.resnet = nn.Sequential(*modules) 134 | self.lstm = nn.LSTM( 135 | input_size=resnet.fc.in_features, 136 | hidden_size=self.lstm_hidden_size, 137 | num_layers=self.lstm_num_layers, 138 | batch_first=True, 139 | ) 140 | if self.attention: 141 | self.attn_block = LSTMAttentionBlock(hidden_size=self.lstm_hidden_size) 142 | self.fc1 = nn.Linear(self.lstm_hidden_size, self.num_classes) 143 | 144 | def forward(self, x): 145 | # CNN 146 | cnn_embed_seq = [] 147 | # x: (batch_size, channel, t, h, w) 148 | for t in range(x.size(2)): 149 | # with torch.no_grad(): 150 | out = self.resnet(x[:, :, t, :, :]) 151 | # print(out.shape) 152 | out = out.view(out.size(0), -1) 153 | cnn_embed_seq.append(out) 154 | 155 | cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0) 156 | # print(cnn_embed_seq.shape) 157 | # batch first 158 | cnn_embed_seq = cnn_embed_seq.transpose_(0, 1) 159 | 160 | # LSTM 161 | # use faster code paths 162 | self.lstm.flatten_parameters() 163 | out, (h_n, c_n) = self.lstm(cnn_embed_seq, None) 164 | # MLP 165 | if self.attention: 166 | out = self.fc1(self.attn_block(out)) 167 | else: 168 | # out: (batch, seq, feature), choose the last time step 169 | out = self.fc1(out[:, -1, :]) 170 | 171 | return out 172 | 173 | 174 | # Test 175 | if __name__ == '__main__': 176 | import sys 177 | sys.path.append("..") 178 | import torchvision.transforms as transforms 179 | from dataset import CSL_Isolated 180 | sample_size = 128 181 | sample_duration = 16 182 | num_classes = 500 183 | transform = transforms.Compose([transforms.Resize([sample_size, sample_size]), transforms.ToTensor()]) 184 | dataset = CSL_Isolated(data_path="/home/haodong/Data/CSL_Isolated/color_video_125000", 185 | 
label_path="/home/haodong/Data/CSL_Isolated/dictionary.txt", frames=sample_duration, 186 | num_classes=num_classes, transform=transform) 187 | # crnn = CRNN() 188 | crnn = ResCRNN(sample_size=sample_size, sample_duration=sample_duration, num_classes=num_classes, arch="resnet152") 189 | print(crnn(dataset[0]['data'].unsqueeze(0))) 190 | -------------------------------------------------------------------------------- /models/GCN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy as np 6 | 7 | """ 8 | Implementation of Spatial Temporal Graph Convolutional Network 9 | Reference: Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition 10 | """ 11 | class Graph(): 12 | """ The Graph to model the skeletons extracted by the openpose 13 | Args: 14 | strategy (string): must be one of the follow candidates 15 | - uniform: Uniform Labeling 16 | - distance: Distance Partitioning 17 | - spatial: Spatial Configuration 18 | 19 | layout (string): must be one of the follow candidates 20 | - openpose: Is consists of 18 joints. 21 | - ntu-rgb+d: Is consists of 25 joints. 22 | 23 | max_hop (int): the maximal distance between two connected nodes 24 | dilation (int): controls the spacing between the kernel points 25 | """ 26 | def __init__(self, 27 | layout='openpose', 28 | strategy='uniform', 29 | max_hop=1, 30 | dilation=1): 31 | self.max_hop = max_hop 32 | self.dilation = dilation 33 | 34 | self.get_edge(layout) 35 | self.hop_dis = get_hop_distance( 36 | self.num_node, self.edge, max_hop=max_hop) 37 | self.get_adjacency(strategy) 38 | 39 | def __str__(self): 40 | return self.A 41 | 42 | def get_edge(self, layout): 43 | if layout == 'openpose': 44 | self.num_node = 18 45 | self_link = [(i, i) for i in range(self.num_node)] 46 | neighbor_link = [(4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 47 | 11), 48 | (10, 9), (9, 8), (11, 5), (8, 2), (5, 1), (2, 1), 49 | (0, 1), (15, 0), (14, 0), (17, 15), (16, 14)] 50 | self.edge = self_link + neighbor_link 51 | self.center = 1 52 | elif layout == 'ntu-rgb+d': 53 | self.num_node = 25 54 | # link to itself 55 | self_link = [(i, i) for i in range(self.num_node)] 56 | # link to neighbors 57 | neighbor_1base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), 58 | (6, 5), (7, 6), (8, 7), (9, 21), (10, 9), 59 | (11, 10), (12, 11), (13, 1), (14, 13), (15, 14), 60 | (16, 15), (17, 1), (18, 17), (19, 18), (20, 19), 61 | (22, 23), (23, 8), (24, 25), (25, 12)] 62 | neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] 63 | self.edge = self_link + neighbor_link 64 | self.center = 21 - 1 65 | elif layout == 'ntu_edge': 66 | self.num_node = 24 67 | self_link = [(i, i) for i in range(self.num_node)] 68 | neighbor_1base = [(1, 2), (3, 2), (4, 3), (5, 2), (6, 5), (7, 6), 69 | (8, 7), (9, 2), (10, 9), (11, 10), (12, 11), 70 | (13, 1), (14, 13), (15, 14), (16, 15), (17, 1), 71 | (18, 17), (19, 18), (20, 19), (21, 22), (22, 8), 72 | (23, 24), (24, 12)] 73 | neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] 74 | self.edge = self_link + neighbor_link 75 | self.center = 2 76 | # elif layout=='customer settings' 77 | # pass 78 | else: 79 | raise ValueError("Do Not Exist This Layout.") 80 | 81 | def get_adjacency(self, strategy): 82 | valid_hop = range(0, self.max_hop + 1, self.dilation) 83 | adjacency = np.zeros((self.num_node, self.num_node)) 84 | for hop in valid_hop: 85 | 
adjacency[self.hop_dis == hop] = 1 86 | normalize_adjacency = normalize_digraph(adjacency) 87 | 88 | if strategy == 'uniform': 89 | A = np.zeros((1, self.num_node, self.num_node)) 90 | A[0] = normalize_adjacency 91 | self.A = A 92 | elif strategy == 'distance': 93 | A = np.zeros((len(valid_hop), self.num_node, self.num_node)) 94 | for i, hop in enumerate(valid_hop): 95 | A[i][self.hop_dis == hop] = normalize_adjacency[self.hop_dis == 96 | hop] 97 | self.A = A 98 | elif strategy == 'spatial': 99 | A = [] 100 | for hop in valid_hop: 101 | a_root = np.zeros((self.num_node, self.num_node)) 102 | a_close = np.zeros((self.num_node, self.num_node)) 103 | a_further = np.zeros((self.num_node, self.num_node)) 104 | for i in range(self.num_node): 105 | for j in range(self.num_node): 106 | if self.hop_dis[j, i] == hop: 107 | if self.hop_dis[j, self.center] == self.hop_dis[ 108 | i, self.center]: 109 | a_root[j, i] = normalize_adjacency[j, i] 110 | elif self.hop_dis[j, self. 111 | center] > self.hop_dis[i, self. 112 | center]: 113 | a_close[j, i] = normalize_adjacency[j, i] 114 | else: 115 | a_further[j, i] = normalize_adjacency[j, i] 116 | if hop == 0: 117 | A.append(a_root) 118 | else: 119 | A.append(a_root + a_close) 120 | A.append(a_further) 121 | A = np.stack(A) 122 | self.A = A 123 | else: 124 | raise ValueError("Do Not Exist This Strategy") 125 | 126 | 127 | def get_hop_distance(num_node, edge, max_hop=1): 128 | # link matrix 129 | A = np.zeros((num_node, num_node)) 130 | for i, j in edge: 131 | A[j, i] = 1 132 | A[i, j] = 1 133 | 134 | # compute hop steps 135 | hop_dis = np.zeros((num_node, num_node)) + np.inf 136 | transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)] 137 | arrive_mat = (np.stack(transfer_mat) > 0) 138 | for d in range(max_hop, -1, -1): 139 | hop_dis[arrive_mat[d]] = d 140 | return hop_dis 141 | 142 | 143 | def normalize_digraph(A): 144 | Dl = np.sum(A, 0) 145 | num_node = A.shape[0] 146 | Dn = np.zeros((num_node, num_node)) 147 | for i in range(num_node): 148 | if Dl[i] > 0: 149 | Dn[i, i] = Dl[i]**(-1) 150 | AD = np.dot(A, Dn) 151 | return AD 152 | 153 | 154 | def normalize_undigraph(A): 155 | Dl = np.sum(A, 0) 156 | num_node = A.shape[0] 157 | Dn = np.zeros((num_node, num_node)) 158 | for i in range(num_node): 159 | if Dl[i] > 0: 160 | Dn[i, i] = Dl[i]**(-0.5) 161 | DAD = np.dot(np.dot(Dn, A), Dn) 162 | return DAD 163 | 164 | 165 | class ConvTemporalGraphical(nn.Module): 166 | 167 | r"""The basic module for applying a graph convolution. 168 | 169 | Args: 170 | in_channels (int): Number of channels in the input sequence data 171 | out_channels (int): Number of channels produced by the convolution 172 | kernel_size (int): Size of the graph convolving kernel 173 | t_kernel_size (int): Size of the temporal convolving kernel 174 | t_stride (int, optional): Stride of the temporal convolution. Default: 1 175 | t_padding (int, optional): Temporal zero-padding added to both sides of 176 | the input. Default: 0 177 | t_dilation (int, optional): Spacing between temporal kernel elements. 178 | Default: 1 179 | bias (bool, optional): If ``True``, adds a learnable bias to the output. 
180 | Default: ``True`` 181 | 182 | Shape: 183 | - Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format 184 | - Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format 185 | - Output[0]: Outpu graph sequence in :math:`(N, out_channels, T_{out}, V)` format 186 | - Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format 187 | 188 | where 189 | :math:`N` is a batch size, 190 | :math:`K` is the spatial kernel size, as :math:`K == kernel_size[1]`, 191 | :math:`T_{in}/T_{out}` is a length of input/output sequence, 192 | :math:`V` is the number of graph nodes. 193 | """ 194 | 195 | def __init__(self, 196 | in_channels, 197 | out_channels, 198 | kernel_size, 199 | t_kernel_size=1, 200 | t_stride=1, 201 | t_padding=0, 202 | t_dilation=1, 203 | bias=True): 204 | super().__init__() 205 | 206 | self.kernel_size = kernel_size 207 | self.conv = nn.Conv2d( 208 | in_channels, 209 | out_channels * kernel_size, 210 | kernel_size=(t_kernel_size, 1), 211 | padding=(t_padding, 0), 212 | stride=(t_stride, 1), 213 | dilation=(t_dilation, 1), 214 | bias=bias) 215 | 216 | def forward(self, x, A): 217 | assert A.size(0) == self.kernel_size 218 | 219 | x = self.conv(x) 220 | 221 | n, kc, t, v = x.size() 222 | x = x.view(n, self.kernel_size, kc//self.kernel_size, t, v) 223 | x = torch.einsum('nkctv,kvw->nctw', (x, A)) 224 | 225 | return x.contiguous(), A 226 | 227 | 228 | class st_gcn(nn.Module): 229 | r"""Applies a spatial temporal graph convolution over an input graph sequence. 230 | 231 | Args: 232 | in_channels (int): Number of channels in the input sequence data 233 | out_channels (int): Number of channels produced by the convolution 234 | kernel_size (tuple): Size of the temporal convolving kernel and graph convolving kernel 235 | stride (int, optional): Stride of the temporal convolution. Default: 1 236 | dropout (int, optional): Dropout rate of the final output. Default: 0 237 | residual (bool, optional): If ``True``, applies a residual mechanism. Default: ``True`` 238 | 239 | Shape: 240 | - Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format 241 | - Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format 242 | - Output[0]: Outpu graph sequence in :math:`(N, out_channels, T_{out}, V)` format 243 | - Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format 244 | 245 | where 246 | :math:`N` is a batch size, 247 | :math:`K` is the spatial kernel size, as :math:`K == kernel_size[1]`, 248 | :math:`T_{in}/T_{out}` is a length of input/output sequence, 249 | :math:`V` is the number of graph nodes. 
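        Example (illustrative only; the concrete sizes below are assumptions,
        not values used elsewhere in this repository)::

            >>> layer = st_gcn(64, 128, kernel_size=(9, 3), stride=2)
            >>> x = torch.randn(8, 64, 32, 25)   # (N, C, T_in, V)
            >>> A = torch.rand(3, 25, 25)        # (K, V, V), K == kernel_size[1]
            >>> out, A = layer(x, A)             # out: (8, 128, 16, 25)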
250 | 251 | """ 252 | 253 | def __init__(self, 254 | in_channels, 255 | out_channels, 256 | kernel_size, 257 | stride=1, 258 | dropout=0, 259 | residual=True): 260 | super().__init__() 261 | 262 | assert len(kernel_size) == 2 263 | assert kernel_size[0] % 2 == 1 264 | padding = ((kernel_size[0] - 1) // 2, 0) 265 | 266 | self.gcn = ConvTemporalGraphical(in_channels, out_channels, 267 | kernel_size[1]) 268 | 269 | self.tcn = nn.Sequential( 270 | nn.BatchNorm2d(out_channels), 271 | nn.ReLU(inplace=True), 272 | nn.Conv2d( 273 | out_channels, 274 | out_channels, 275 | (kernel_size[0], 1), 276 | (stride, 1), 277 | padding, 278 | ), 279 | nn.BatchNorm2d(out_channels), 280 | nn.Dropout(dropout, inplace=True), 281 | ) 282 | 283 | if not residual: 284 | self.residual = lambda x: 0 285 | 286 | elif (in_channels == out_channels) and (stride == 1): 287 | self.residual = lambda x: x 288 | 289 | else: 290 | self.residual = nn.Sequential( 291 | nn.Conv2d( 292 | in_channels, 293 | out_channels, 294 | kernel_size=1, 295 | stride=(stride, 1)), 296 | nn.BatchNorm2d(out_channels), 297 | ) 298 | 299 | self.relu = nn.ReLU(inplace=True) 300 | 301 | def forward(self, x, A): 302 | 303 | res = self.residual(x) 304 | x, A = self.gcn(x, A) 305 | x = self.tcn(x) + res 306 | 307 | return self.relu(x), A 308 | 309 | 310 | class GCN(nn.Module): 311 | r"""Spatial temporal graph convolutional networks. 312 | 313 | Args: 314 | in_channels (int): Number of channels in the input data 315 | num_class (int): Number of classes for the classification task 316 | graph_args (dict): The arguments for building the graph 317 | edge_importance_weighting (bool): If ``True``, adds a learnable 318 | importance weighting to the edges of the graph 319 | **kwargs (optional): Other parameters for graph convolution units 320 | 321 | Shape: 322 | - Input: :math:`(N, in_channels, T_{in}, V_{in}, M_{in})` 323 | - Output: :math:`(N, num_class)` where 324 | :math:`N` is a batch size, 325 | :math:`T_{in}` is a length of input sequence, 326 | :math:`V_{in}` is the number of graph nodes, 327 | :math:`M_{in}` is the number of instance in a frame. 
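        Note: ``forward`` first appends a singleton instance dimension and permutes the
        axes (the "Trick" comment at its top), so the tensor actually handed to the
        network in this repository appears to be the skeleton batch of shape
        :math:`(N, T_{in}, in_channels, V_{in})` produced by ``CSL_Skeleton`` with
        ``split_to_channels=True`` (see the test block at the bottom of this file); the
        shape documented above describes the layout after that rearrangement.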
328 | """ 329 | 330 | def __init__(self, in_channels, num_class, graph_args, 331 | edge_importance_weighting, **kwargs): 332 | super().__init__() 333 | 334 | # load graph 335 | self.graph = Graph(**graph_args) 336 | A = torch.tensor(self.graph.A, dtype=torch.float32, requires_grad=False) 337 | self.register_buffer('A', A) 338 | 339 | # build networks 340 | spatial_kernel_size = A.size(0) 341 | temporal_kernel_size = 9 342 | kernel_size = (temporal_kernel_size, spatial_kernel_size) 343 | self.data_bn = nn.BatchNorm1d(in_channels * A.size(1)) 344 | kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'} 345 | self.st_gcn_networks = nn.ModuleList(( 346 | st_gcn(in_channels, 64, kernel_size, 1, residual=False, **kwargs0), 347 | st_gcn(64, 64, kernel_size, 1, **kwargs), 348 | st_gcn(64, 64, kernel_size, 1, **kwargs), 349 | st_gcn(64, 64, kernel_size, 1, **kwargs), 350 | st_gcn(64, 128, kernel_size, 2, **kwargs), 351 | st_gcn(128, 128, kernel_size, 1, **kwargs), 352 | st_gcn(128, 128, kernel_size, 1, **kwargs), 353 | st_gcn(128, 256, kernel_size, 2, **kwargs), 354 | st_gcn(256, 256, kernel_size, 1, **kwargs), 355 | st_gcn(256, 256, kernel_size, 1, **kwargs), 356 | )) 357 | 358 | # initialize parameters for edge importance weighting 359 | if edge_importance_weighting: 360 | self.edge_importance = nn.ParameterList([ 361 | nn.Parameter(torch.ones(self.A.size())) 362 | for i in self.st_gcn_networks 363 | ]) 364 | else: 365 | self.edge_importance = [1] * len(self.st_gcn_networks) 366 | 367 | # fcn for prediction 368 | self.fcn = nn.Conv2d(256, num_class, kernel_size=1) 369 | 370 | def forward(self, x): 371 | # Trick: add new dimension & switch data dimension 372 | x = x.unsqueeze(-1) 373 | x = x.permute(0, 2, 1, 3, 4) 374 | # print(x.shape) 375 | 376 | # data normalization 377 | N, C, T, V, M = x.size() 378 | x = x.permute(0, 4, 3, 1, 2).contiguous() 379 | x = x.view(N * M, V * C, T) 380 | x = self.data_bn(x) 381 | x = x.view(N, M, V, C, T) 382 | x = x.permute(0, 1, 3, 4, 2).contiguous() 383 | x = x.view(N * M, C, T, V) 384 | 385 | # forwad 386 | for gcn, importance in zip(self.st_gcn_networks, self.edge_importance): 387 | x, _ = gcn(x, self.A * importance) 388 | 389 | # global pooling 390 | x = F.avg_pool2d(x, x.size()[2:]) 391 | x = x.view(N, M, -1, 1, 1).mean(dim=1) 392 | 393 | # prediction 394 | x = self.fcn(x) 395 | x = x.view(x.size(0), -1) 396 | 397 | return x 398 | 399 | def extract_feature(self, x): 400 | 401 | # data normalization 402 | N, C, T, V, M = x.size() 403 | x = x.permute(0, 4, 3, 1, 2).contiguous() 404 | x = x.view(N * M, V * C, T) 405 | x = self.data_bn(x) 406 | x = x.view(N, M, V, C, T) 407 | x = x.permute(0, 1, 3, 4, 2).contiguous() 408 | x = x.view(N * M, C, T, V) 409 | 410 | # forwad 411 | for gcn, importance in zip(self.st_gcn_networks, self.edge_importance): 412 | x, _ = gcn(x, self.A * importance) 413 | 414 | _, c, t, v = x.size() 415 | feature = x.view(N, M, c, t, v).permute(0, 2, 3, 4, 1) 416 | 417 | # prediction 418 | x = self.fcn(x) 419 | output = x.view(N, M, -1, t, v).permute(0, 2, 3, 4, 1) 420 | 421 | return output, feature 422 | 423 | 424 | # Test 425 | if __name__ == '__main__': 426 | import sys 427 | sys.path.append("..") 428 | from dataset import CSL_Skeleton 429 | dataset = CSL_Skeleton(data_path="/home/haodong/Data/CSL_Isolated_1/xf500_body_depth_txt", 430 | label_path="/home/haodong/Data/CSL_Isolated_1/dictionary.txt", split_to_channels=True) 431 | gcn = GCN(in_channels=2, num_class=500, graph_args={'layout': 'ntu-rgb+d'}, 432 | 
edge_importance_weighting=True) 433 | print(dataset[0]['images'].unsqueeze(0).shape) 434 | gcn(dataset[0]['images'].unsqueeze(0)) 435 | -------------------------------------------------------------------------------- /models/RNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | """ 6 | Implementation of LSTM 7 | Reference: SIGN LANGUAGE RECOGNITION WITH LONG SHORT-TERM MEMORY 8 | """ 9 | class LSTM(nn.Module): 10 | def __init__(self, lstm_input_size=512, lstm_hidden_size=512, lstm_num_layers=3, 11 | num_classes=100, hidden1=256, drop_p=0.0): 12 | super(LSTM, self).__init__() 13 | # network params 14 | self.lstm_input_size = lstm_input_size 15 | self.lstm_hidden_size = lstm_hidden_size 16 | self.lstm_num_layers = lstm_num_layers 17 | self.num_classes = num_classes 18 | self.hidden1 = hidden1 19 | self.drop_p = drop_p 20 | 21 | # network architecture 22 | self.lstm = nn.LSTM( 23 | input_size=self.lstm_input_size, 24 | hidden_size=self.lstm_hidden_size, 25 | num_layers=self.lstm_num_layers, 26 | batch_first=True, 27 | ) 28 | self.drop = nn.Dropout2d(p=self.drop_p) 29 | self.fc1 = nn.Linear(self.lstm_hidden_size, self.hidden1) 30 | self.fc2 = nn.Linear(self.hidden1, self.num_classes) 31 | 32 | def forward(self, x): 33 | # LSTM 34 | # use faster code paths 35 | self.lstm.flatten_parameters() 36 | # print(x.shape) 37 | # batch first: (batch, seq, feature) 38 | out, (h_n, c_n) = self.lstm(x, None) 39 | # MLP 40 | # out: (batch, seq, feature), choose the last time step 41 | out = F.relu(self.fc1(out[:, -1, :])) 42 | out = F.dropout(out, p=self.drop_p, training=self.training) 43 | out = self.fc2(out) 44 | 45 | return out 46 | 47 | 48 | """ 49 | Implementation of GRU 50 | """ 51 | class GRU(nn.Module): 52 | def __init__(self, gru_input_size=512, gru_hidden_size=512, gru_num_layers=3, 53 | num_classes=100, hidden1=256, drop_p=0.0): 54 | super(GRU, self).__init__() 55 | # network params 56 | self.gru_input_size = gru_input_size 57 | self.gru_hidden_size = gru_hidden_size 58 | self.gru_num_layers = gru_num_layers 59 | self.num_classes = num_classes 60 | self.hidden1 = hidden1 61 | self.drop_p = drop_p 62 | 63 | # network architecture 64 | self.gru = nn.GRU( 65 | input_size=self.gru_input_size, 66 | hidden_size=self.gru_hidden_size, 67 | num_layers=self.gru_num_layers, 68 | batch_first=True, 69 | ) 70 | self.drop = nn.Dropout2d(p=self.drop_p) 71 | self.fc1 = nn.Linear(self.gru_hidden_size, self.hidden1) 72 | self.fc2 = nn.Linear(self.hidden1, self.num_classes) 73 | 74 | def forward(self, x): 75 | # GRU 76 | # use faster code paths 77 | self.gru.flatten_parameters() 78 | # print(x.shape) 79 | # batch first: (batch, seq, feature) 80 | out, hidden = self.gru(x, None) 81 | # MLP 82 | # out: (batch, seq, feature), choose the last time step 83 | out = F.relu(self.fc1(out[:, -1, :])) 84 | out = F.dropout(out, p=self.drop_p, training=self.training) 85 | out = self.fc2(out) 86 | 87 | return out 88 | 89 | # Test 90 | if __name__ == '__main__': 91 | import sys 92 | sys.path.append("..") 93 | from dataset import CSL_Skeleton 94 | selected_joints = ['HANDLEFT', 'HANDRIGHT', 'ELBOWLEFT', 'ELBOWRIGHT'] 95 | dataset = CSL_Skeleton(data_path="/home/haodong/Data/CSL_Isolated/xf500_body_depth_txt", 96 | label_path="/home/haodong/Data/CSL_Isolated/dictionary.txt", selected_joints=selected_joints) 97 | input_size = len(selected_joints)*2 98 | # test LSTM 99 | lstm = LSTM(lstm_input_size=input_size) 100 | 
print(lstm(dataset[0]['data'].unsqueeze(0))) 101 | 102 | # test GRU 103 | gru = GRU(gru_input_size=input_size) 104 | print(gru(dataset[0]['data'].unsqueeze(0))) 105 | -------------------------------------------------------------------------------- /models/Seq2Seq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.models as models 4 | import random 5 | 6 | import os,inspect,sys 7 | currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 8 | sys.path.insert(0,currentdir) 9 | from ConvLSTM import ResCRNN 10 | 11 | """ 12 | Implementation of Sequence to Sequence Model 13 | Encoder: encode video spatial and temporal dynamics e.g. CNN+LSTM 14 | Decoder: decode the compressed info from encoder 15 | """ 16 | class Encoder(nn.Module): 17 | def __init__(self, lstm_hidden_size=512, arch="resnet18"): 18 | super(Encoder, self).__init__() 19 | self.lstm_hidden_size = lstm_hidden_size 20 | 21 | # network architecture 22 | if arch == "resnet18": 23 | resnet = models.resnet18(pretrained=True) 24 | elif arch == "resnet34": 25 | resnet = models.resnet34(pretrained=True) 26 | elif arch == "resnet50": 27 | resnet = models.resnet50(pretrained=True) 28 | elif arch == "resnet101": 29 | resnet = models.resnet101(pretrained=True) 30 | elif arch == "resnet152": 31 | resnet = models.resnet152(pretrained=True) 32 | # delete the last fc layer 33 | modules = list(resnet.children())[:-1] 34 | self.resnet = nn.Sequential(*modules) 35 | self.lstm = nn.LSTM( 36 | input_size=resnet.fc.in_features, 37 | hidden_size=self.lstm_hidden_size, 38 | batch_first=True, 39 | ) 40 | 41 | def forward(self, x): 42 | # CNN 43 | cnn_embed_seq = [] 44 | # x: (batch_size, channel, t, h, w) 45 | for t in range(x.size(2)): 46 | # with torch.no_grad(): 47 | out = self.resnet(x[:, :, t, :, :]) 48 | # print(out.shape) 49 | out = out.view(out.size(0), -1) 50 | cnn_embed_seq.append(out) 51 | 52 | cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0) 53 | # batch first 54 | cnn_embed_seq = cnn_embed_seq.transpose_(0, 1) 55 | 56 | # LSTM 57 | # use faster code paths 58 | self.lstm.flatten_parameters() 59 | out, (h_n, c_n) = self.lstm(cnn_embed_seq, None) 60 | 61 | # num_layers * num_directions = 1 62 | return out, (h_n.squeeze(0), c_n.squeeze(0)) 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout): 67 | super(Decoder, self).__init__() 68 | self.output_dim = output_dim 69 | self.embedding = nn.Embedding(output_dim, emb_dim) 70 | self.rnn = nn.LSTM(emb_dim+enc_hid_dim, dec_hid_dim) 71 | self.fc = nn.Linear(emb_dim+enc_hid_dim+dec_hid_dim, output_dim) 72 | self.dropout = nn.Dropout(dropout) 73 | 74 | def forward(self, input, hidden, cell, context): 75 | # input(batch_size): last prediction 76 | # hidden(batch_size, dec_hid_dim): decoder last hidden state 77 | # cell(batch_size, dec_hid_dim): decoder last cell state 78 | # context(batch_size, enc_hid_dim): context vector 79 | # print(input.shape, hidden.shape, cell.shape, context.shape) 80 | # expand dim to (1, batch_size) 81 | input = input.unsqueeze(0) 82 | 83 | # embedded(1, batch_size, emb_dim): embed last prediction word 84 | embedded = self.dropout(self.embedding(input)) 85 | 86 | # rnn_input(1, batch_size, emb_dim+enc_hide_dim): concat embedded and context 87 | rnn_input = torch.cat((embedded, context.unsqueeze(0)), dim=2) 88 | 89 | # output(seq_len, batch, num_directions * hidden_size) 90 | # 
hidden(num_layers * num_directions, batch, hidden_size) 91 | output, (hidden, cell) = self.rnn(rnn_input, (hidden.unsqueeze(0), cell.unsqueeze(0))) 92 | 93 | # hidden(batch_size, dec_hid_dim) 94 | # cell(batch_size, dec_hid_dim) 95 | # embedded(1, batch_size, emb_dim) 96 | hidden = hidden.squeeze(0) 97 | cell = cell.squeeze(0) 98 | embedded = embedded.squeeze(0) 99 | 100 | # prediction 101 | prediction = self.fc(torch.cat((embedded, context, hidden), dim=1)) 102 | 103 | return prediction, (hidden, cell) 104 | 105 | 106 | class Seq2Seq(nn.Module): 107 | def __init__(self, encoder, decoder, device): 108 | super(Seq2Seq, self).__init__() 109 | self.encoder = encoder 110 | self.decoder = decoder 111 | self.device = device 112 | 113 | def forward(self, imgs, target, teacher_forcing_ratio=0.5): 114 | # imgs: (batch_size, channels, T, H, W) 115 | # target: (batch_size, trg len) 116 | batch_size = imgs.shape[0] 117 | trg_len = target.shape[1] 118 | trg_vocab_size = self.decoder.output_dim 119 | 120 | # tensor to store decoder outputs 121 | outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device) 122 | 123 | # encoder_outputs(batch, seq_len, hidden_size): all hidden states of input sequence 124 | encoder_outputs, (hidden, cell) = self.encoder(imgs) 125 | 126 | # compute context vector 127 | context = encoder_outputs.mean(dim=1) 128 | 129 | # first input to the decoder is the tokens 130 | input = target[:,0] 131 | 132 | for t in range(1, trg_len): 133 | # decode 134 | output, (hidden, cell) = self.decoder(input, hidden, cell, context) 135 | 136 | # store prediction 137 | outputs[t] = output 138 | 139 | # decide whether to do teacher foring 140 | teacher_force = random.random() < teacher_forcing_ratio 141 | 142 | # get the highest predicted token 143 | top1 = output.argmax(1) 144 | 145 | # apply teacher forcing 146 | input = target[:,t] if teacher_force else top1 147 | 148 | return outputs 149 | 150 | 151 | # Test 152 | if __name__ == '__main__': 153 | # test encoder 154 | encoder = Encoder(lstm_hidden_size=512) 155 | # imgs = torch.randn(16, 3, 8, 128, 128) 156 | # print(encoder(imgs)) 157 | 158 | # test decoder 159 | decoder = Decoder(output_dim=500, emb_dim=256, enc_hid_dim=512, dec_hid_dim=512, dropout=0.5) 160 | # input = torch.LongTensor(16).random_(0, 500) 161 | # hidden = torch.randn(16, 512) 162 | # cell = torch.randn(16, 512) 163 | # context = torch.randn(16, 512) 164 | # print(decoder(input, hidden, cell, context)) 165 | 166 | # test seq2seq 167 | device = torch.device("cpu") 168 | seq2seq = Seq2Seq(encoder=encoder, decoder=decoder, device=device) 169 | imgs = torch.randn(16, 3, 8, 128, 128) 170 | target = torch.LongTensor(16, 8).random_(0, 500) 171 | print(seq2seq(imgs, target).argmax(dim=2).permute(1,0)) # batch first 172 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.metrics import accuracy_score 3 | 4 | def test(model, criterion, dataloader, device, epoch, logger, writer): 5 | model.eval() 6 | losses = [] 7 | all_label = [] 8 | all_pred = [] 9 | 10 | with torch.no_grad(): 11 | for batch_idx, data in enumerate(dataloader): 12 | # get the inputs and labels 13 | inputs, labels = data['data'].to(device), data['label'].to(device) 14 | # forward 15 | outputs = model(inputs) 16 | if isinstance(outputs, list): 17 | outputs = outputs[0] 18 | # compute the loss 19 | loss = criterion(outputs, labels.squeeze()) 20 | 
losses.append(loss.item()) 21 | # collect labels & prediction 22 | prediction = torch.max(outputs, 1)[1] 23 | all_label.extend(labels.squeeze()) 24 | all_pred.extend(prediction) 25 | # Compute the average loss & accuracy 26 | test_loss = sum(losses)/len(losses) 27 | all_label = torch.stack(all_label, dim=0) 28 | all_pred = torch.stack(all_pred, dim=0) 29 | test_acc = accuracy_score(all_label.squeeze().cpu().data.squeeze().numpy(), all_pred.cpu().data.squeeze().numpy()) 30 | # Log 31 | writer.add_scalars('Loss', {'test': test_loss}, epoch+1) 32 | writer.add_scalars('Accuracy', {'test': test_acc}, epoch+1) 33 | logger.info("Average Test Loss: {:.6f} | Acc: {:.2f}%".format(test_loss, test_acc*100)) 34 | 35 | if __name__ == '__main__': 36 | import os 37 | import argparse 38 | from torch.utils.data import DataLoader 39 | import torchvision.transforms as transforms 40 | from dataset import CSL_Isolated 41 | from models.Conv3D import resnet18, resnet34, resnet50, r2plus1d_18 42 | # Arguments 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('--data_path', default='/home/haodong/Data/CSL_Isolated/color_video_125000', 45 | type=str, help='Data path for testing') 46 | parser.add_argument('--label_path', default='/home/haodong/Data/CSL_Isolated/dictionary.txt', 47 | type=str, help='Label path for testing') 48 | parser.add_argument('--model', default='3dresnet18', 49 | type=str, help='Choose a model for testing') 50 | parser.add_argument('--model_path', default='3dresnet18.pth', 51 | type=str, help='Model state dict path') 52 | parser.add_argument('--num_classes', default=500, 53 | type=int, help='Number of classes for testing') 54 | parser.add_argument('--batch_size', default=32, 55 | type=int, help='Batch size for testing') 56 | parser.add_argument('--sample_size', default=128, 57 | type=int, help='Sample size for testing') 58 | parser.add_argument('--sample_duration', default=16, 59 | type=int, help='Sample duration for testing') 60 | parser.add_argument('--no_cuda', action='store_true', 61 | help='If true, dont use cuda') 62 | parser.add_argument('--cuda_devices', default='2', 63 | type=str, help='Cuda visible devices') 64 | args = parser.parse_args() 65 | 66 | # Path setting 67 | data_path = args.data_path 68 | label_path = args.label_path 69 | model_path = args.model_path 70 | # Use specific gpus 71 | os.environ["CUDA_VISIBLE_DEVICES"]=args.cuda_devices 72 | # Device setting 73 | if torch.cuda.is_available() and not args.no_cuda: 74 | device = torch.device("cuda") 75 | else: 76 | device = torch.device("cpu") 77 | 78 | # Hyperparams 79 | num_classes = args.num_classes 80 | batch_size = args.batch_size 81 | sample_size = args.sample_size 82 | sample_duration = args.sample_duration 83 | 84 | # Start testing 85 | # Load data 86 | transform = transforms.Compose([transforms.Resize([sample_size, sample_size]), 87 | transforms.ToTensor(), 88 | transforms.Normalize(mean=[0.5], std=[0.5])]) 89 | test_set = CSL_Isolated(data_path=data_path, label_path=label_path, frames=sample_duration, 90 | num_classes=num_classes, train=False, transform=transform) 91 | test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True) 92 | 93 | # Create model 94 | if args.model == '3dresnet18': 95 | model = resnet18(pretrained=True, progress=True, sample_size=sample_size, 96 | sample_duration=sample_duration, num_classes=num_classes).to(device) 97 | elif args.model == '3dresnet34': 98 | model = resnet34(pretrained=True, progress=True, sample_size=sample_size, 99 | 
sample_duration=sample_duration, num_classes=num_classes).to(device) 100 | elif args.model == '3dresnet50': 101 | model = resnet50(pretrained=True, progress=True, sample_size=sample_size, 102 | sample_duration=sample_duration, num_classes=num_classes).to(device) 103 | elif args.model == 'r2plus1d': 104 | model = r2plus1d_18(pretrained=True, num_classes=num_classes).to(device) 105 | # Run the model in parallel on multiple GPUs 106 | if torch.cuda.device_count() > 1: 107 | model = torch.nn.DataParallel(model) 108 | # Load model 109 | model.load_state_dict(torch.load(model_path)) 110 | 111 | # Test the model 112 | model.eval() 113 | all_label = [] 114 | all_pred = [] 115 | 116 | with torch.no_grad(): 117 | for batch_idx, data in enumerate(test_loader): 118 | # get the inputs and labels 119 | inputs, labels = data['data'].to(device), data['label'].to(device) 120 | # forward 121 | outputs = model(inputs) 122 | # collect labels & prediction 123 | prediction = torch.max(outputs, 1)[1] 124 | all_label.extend(labels.squeeze()) 125 | all_pred.extend(prediction) 126 | # Compute the average loss & accuracy 127 | all_label = torch.stack(all_label, dim=0) 128 | all_pred = torch.stack(all_pred, dim=0) 129 | test_acc = accuracy_score(all_label.squeeze().cpu().data.squeeze().numpy(), all_pred.cpu().data.squeeze().numpy()) 130 | print("Test Acc: {:.2f}%".format(test_acc*100)) 131 | -------------------------------------------------------------------------------- /tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.utils.tensorboard import SummaryWriter 4 | import torchvision.utils as utils 5 | import cv2 6 | from datetime import datetime 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from sklearn.metrics import confusion_matrix 10 | 11 | 12 | def get_label_and_pred(model, dataloader, device): 13 | all_label = [] 14 | all_pred = [] 15 | with torch.no_grad(): 16 | for batch_idx, data in enumerate(dataloader): 17 | # get the inputs and labels 18 | inputs, labels = data['data'].to(device), data['label'].to(device) 19 | # forward 20 | outputs = model(inputs) 21 | if isinstance(outputs, list): 22 | outputs = outputs[0] 23 | # collect labels & prediction 24 | prediction = torch.max(outputs, 1)[1] 25 | all_label.extend(labels.squeeze()) 26 | all_pred.extend(prediction) 27 | # Compute accuracy 28 | all_label = torch.stack(all_label, dim=0) 29 | all_pred = torch.stack(all_pred, dim=0) 30 | all_label = all_label.squeeze().cpu().data.squeeze().numpy() 31 | all_pred = all_pred.cpu().data.squeeze().numpy() 32 | return all_label, all_pred 33 | 34 | 35 | def plot_confusion_matrix(model, dataloader, device, save_path='confmat.png', normalize=True): 36 | # Get prediction 37 | all_label, all_pred = get_label_and_pred(model, dataloader, device) 38 | confmat = confusion_matrix(all_label, all_pred) 39 | 40 | # Normalize the matrix 41 | if normalize: 42 | confmat = confmat.astype('float') / confmat.sum(axis=1)[:, np.newaxis] 43 | # Draw matrix 44 | plt.figure(figsize=(20,20)) 45 | # confmat = np.random.rand(100,100) 46 | plt.imshow(confmat, interpolation='nearest', cmap=plt.cm.Blues) 47 | plt.colorbar() 48 | # Add ticks 49 | ticks = np.arange(100) 50 | plt.xticks(ticks, fontsize=8) 51 | plt.yticks(ticks, fontsize=8) 52 | plt.grid(True) 53 | # Add title & labels 54 | plt.title('Confusion matrix', fontsize=20) 55 | plt.xlabel('Predicted label', fontsize=20) 56 | plt.ylabel('True label', fontsize=20) 57 | # Save figure 58 |
plt.savefig(save_path) 59 | 60 | # Ranking 61 | sorted_index = np.diag(confmat).argsort() 62 | for i in range(10): 63 | # print(type(sorted_index[i])) 64 | print(test_set.label_to_word(int(sorted_index[i])), confmat[sorted_index[i]][sorted_index[i]]) 65 | # Save to csv 66 | np.savetxt('matrix.csv', confmat, delimiter=',') 67 | 68 | 69 | def visualize_attn(I, c): 70 | # Image 71 | img = I.permute((1,2,0)).cpu().numpy() 72 | # Heatmap 73 | N, C, H, W = c.size() 74 | a = F.softmax(c.view(N,C,-1), dim=2).view(N,C,H,W) 75 | up_factor = 128/H 76 | # print(up_factor, I.size(), c.size()) 77 | if up_factor > 1: 78 | a = F.interpolate(a, scale_factor=up_factor, mode='bilinear', align_corners=False) 79 | attn = utils.make_grid(a, nrow=4, normalize=True, scale_each=True) 80 | attn = attn.permute((1,2,0)).mul(255).byte().cpu().numpy() 81 | attn = cv2.applyColorMap(attn, cv2.COLORMAP_JET) 82 | attn = cv2.cvtColor(attn, cv2.COLOR_BGR2RGB) 83 | # Add the heatmap to the image 84 | vis = 0.6 * img + 0.4 * attn 85 | return torch.from_numpy(vis).permute(2,0,1) 86 | 87 | 88 | def plot_attention_map(model, dataloader, device): 89 | # Summary writer 90 | writer = SummaryWriter("runs/attention_{:%Y-%m-%d_%H-%M-%S}".format(datetime.now())) 91 | 92 | model.eval() 93 | with torch.no_grad(): 94 | for batch_idx, data in enumerate(dataloader): 95 | # get images 96 | inputs = data['data'].to(device) 97 | if batch_idx == 0: 98 | images = inputs[0:16,:,:,:,:] 99 | I = utils.make_grid(images[:,:,0,:,:], nrow=4, normalize=True, scale_each=True) 100 | writer.add_image('origin', I) 101 | _, c1, c2, c3, c4 = model(images) 102 | # print(I.shape, c1.shape, c2.shape, c3.shape, c4.shape) 103 | attn1 = visualize_attn(I, c1[:,:,0,:,:]) 104 | writer.add_image('attn1', attn1) 105 | attn2 = visualize_attn(I, c2[:,:,0,:,:]) 106 | writer.add_image('attn2', attn2) 107 | attn3 = visualize_attn(I, c3[:,:,0,:,:]) 108 | writer.add_image('attn3', attn3) 109 | attn4 = visualize_attn(I, c4[:,:,0,:,:]) 110 | writer.add_image('attn4', attn4) 111 | break 112 | 113 | 114 | """ 115 | Calculate Word Error Rate 116 | Word Error Rate = (Substitutions + Insertions + Deletions) / Number of Words Spoken 117 | Reference: 118 | https://holianh.github.io/portfolio/Cach-tinh-WER/ 119 | https://github.com/imalic3/python-word-error-rate 120 | """ 121 | def wer(r, h): 122 | # initialisation 123 | d = np.zeros((len(r)+1)*(len(h)+1), dtype=np.uint8) 124 | d = d.reshape((len(r)+1, len(h)+1)) 125 | for i in range(len(r)+1): 126 | for j in range(len(h)+1): 127 | if i == 0: 128 | d[0][j] = j 129 | elif j == 0: 130 | d[i][0] = i 131 | 132 | # computation 133 | for i in range(1, len(r)+1): 134 | for j in range(1, len(h)+1): 135 | if r[i-1] == h[j-1]: 136 | d[i][j] = d[i-1][j-1] 137 | else: 138 | substitution = d[i-1][j-1] + 1 139 | insertion = d[i][j-1] + 1 140 | deletion = d[i-1][j] + 1 141 | d[i][j] = min(substitution, insertion, deletion) 142 | 143 | return float(d[len(r)][len(h)]) / len(r) * 100 144 | 145 | 146 | if __name__ == '__main__': 147 | # Calculate WER 148 | r = [1,2,3,4] 149 | h = [1,1,3,5,6] 150 | print(wer(r, h)) 151 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.metrics import accuracy_score 3 | from tools import wer 4 | 5 | def train_epoch(model, criterion, optimizer, dataloader, device, epoch, logger, log_interval, writer): 6 | model.train() 7 | losses = [] 8 | all_label = [] 9 | all_pred = [] 
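    # losses collects the per-batch loss values, while all_label / all_pred accumulate
    # the per-sample label and prediction tensors; they are stacked after the loop so
    # that an epoch-level accuracy can be reported alongside the per-iteration score.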
10 | 11 | for batch_idx, data in enumerate(dataloader): 12 | # get the inputs and labels 13 | inputs, labels = data['data'].to(device), data['label'].to(device) 14 | 15 | optimizer.zero_grad() 16 | # forward 17 | outputs = model(inputs) 18 | if isinstance(outputs, list): 19 | outputs = outputs[0] 20 | 21 | # compute the loss 22 | loss = criterion(outputs, labels.squeeze()) 23 | losses.append(loss.item()) 24 | 25 | # compute the accuracy 26 | prediction = torch.max(outputs, 1)[1] 27 | all_label.extend(labels.squeeze()) 28 | all_pred.extend(prediction) 29 | score = accuracy_score(labels.squeeze().cpu().data.squeeze().numpy(), prediction.cpu().data.squeeze().numpy()) 30 | 31 | # backward & optimize 32 | loss.backward() 33 | optimizer.step() 34 | 35 | if (batch_idx + 1) % log_interval == 0: 36 | logger.info("epoch {:3d} | iteration {:5d} | Loss {:.6f} | Acc {:.2f}%".format(epoch+1, batch_idx+1, loss.item(), score*100)) 37 | 38 | # Compute the average loss & accuracy 39 | training_loss = sum(losses)/len(losses) 40 | all_label = torch.stack(all_label, dim=0) 41 | all_pred = torch.stack(all_pred, dim=0) 42 | training_acc = accuracy_score(all_label.squeeze().cpu().data.squeeze().numpy(), all_pred.cpu().data.squeeze().numpy()) 43 | # Log 44 | writer.add_scalars('Loss', {'train': training_loss}, epoch+1) 45 | writer.add_scalars('Accuracy', {'train': training_acc}, epoch+1) 46 | logger.info("Average Training Loss of Epoch {}: {:.6f} | Acc: {:.2f}%".format(epoch+1, training_loss, training_acc*100)) 47 | 48 | 49 | def train_seq2seq(model, criterion, optimizer, clip, dataloader, device, epoch, logger, log_interval, writer): 50 | model.train() 51 | losses = [] 52 | all_trg = [] 53 | all_pred = [] 54 | all_wer = [] 55 | 56 | for batch_idx, (imgs, target) in enumerate(dataloader): 57 | imgs = imgs.to(device) 58 | target = target.to(device) 59 | 60 | optimizer.zero_grad() 61 | # forward 62 | outputs = model(imgs, target) 63 | 64 | # target: (batch_size, trg len) 65 | # outputs: (trg_len, batch_size, output_dim) 66 | # skip sos 67 | output_dim = outputs.shape[-1] 68 | outputs = outputs[1:].view(-1, output_dim) 69 | target = target.permute(1,0)[1:].reshape(-1) 70 | 71 | # compute the loss 72 | loss = criterion(outputs, target) 73 | losses.append(loss.item()) 74 | 75 | # compute the accuracy 76 | prediction = torch.max(outputs, 1)[1] 77 | score = accuracy_score(target.cpu().data.squeeze().numpy(), prediction.cpu().data.squeeze().numpy()) 78 | all_trg.extend(target) 79 | all_pred.extend(prediction) 80 | 81 | # compute wer 82 | # prediction: ((trg_len-1)*batch_size) 83 | # target: ((trg_len-1)*batch_size) 84 | batch_size = imgs.shape[0] 85 | prediction = prediction.view(-1, batch_size).permute(1,0).tolist() 86 | target = target.view(-1, batch_size).permute(1,0).tolist() 87 | wers = [] 88 | for i in range(batch_size): 89 | # add mask(remove padding, sos, eos) 90 | prediction[i] = [item for item in prediction[i] if item not in [0,1,2]] 91 | target[i] = [item for item in target[i] if item not in [0,1,2]] 92 | wers.append(wer(target[i], prediction[i])) 93 | all_wer.extend(wers) 94 | 95 | # backward & optimize 96 | loss.backward() 97 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip) 98 | optimizer.step() 99 | 100 | if (batch_idx + 1) % log_interval == 0: 101 | logger.info("epoch {:3d} | iteration {:5d} | Loss {:.6f} | Acc {:.2f}% | WER {:.2f}%".format(epoch+1, batch_idx+1, loss.item(), score*100, sum(wers)/len(wers))) 102 | 103 | # Compute the average loss & accuracy 104 | training_loss = 
sum(losses)/len(losses) 105 | all_trg = torch.stack(all_trg, dim=0) 106 | all_pred = torch.stack(all_pred, dim=0) 107 | training_acc = accuracy_score(all_trg.cpu().data.squeeze().numpy(), all_pred.cpu().data.squeeze().numpy()) 108 | training_wer = sum(all_wer)/len(all_wer) 109 | # Log 110 | writer.add_scalars('Loss', {'train': training_loss}, epoch+1) 111 | writer.add_scalars('Accuracy', {'train': training_acc}, epoch+1) 112 | writer.add_scalars('WER', {'train': training_wer}, epoch+1) 113 | logger.info("Average Training Loss of Epoch {}: {:.6f} | Acc: {:.2f}% | WER {:.2f}%".format(epoch+1, training_loss, training_acc*100, training_wer)) 114 | -------------------------------------------------------------------------------- /validation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.metrics import accuracy_score 3 | from tools import wer 4 | 5 | def val_epoch(model, criterion, dataloader, device, epoch, logger, writer): 6 | model.eval() 7 | losses = [] 8 | all_label = [] 9 | all_pred = [] 10 | 11 | with torch.no_grad(): 12 | for batch_idx, data in enumerate(dataloader): 13 | # get the inputs and labels 14 | inputs, labels = data['data'].to(device), data['label'].to(device) 15 | # forward 16 | outputs = model(inputs) 17 | if isinstance(outputs, list): 18 | outputs = outputs[0] 19 | # compute the loss 20 | loss = criterion(outputs, labels.squeeze()) 21 | losses.append(loss.item()) 22 | # collect labels & prediction 23 | prediction = torch.max(outputs, 1)[1] 24 | all_label.extend(labels.squeeze()) 25 | all_pred.extend(prediction) 26 | # Compute the average loss & accuracy 27 | validation_loss = sum(losses)/len(losses) 28 | all_label = torch.stack(all_label, dim=0) 29 | all_pred = torch.stack(all_pred, dim=0) 30 | validation_acc = accuracy_score(all_label.squeeze().cpu().data.squeeze().numpy(), all_pred.cpu().data.squeeze().numpy()) 31 | # Log 32 | writer.add_scalars('Loss', {'validation': validation_loss}, epoch+1) 33 | writer.add_scalars('Accuracy', {'validation': validation_acc}, epoch+1) 34 | logger.info("Average Validation Loss of Epoch {}: {:.6f} | Acc: {:.2f}%".format(epoch+1, validation_loss, validation_acc*100)) 35 | 36 | 37 | def val_seq2seq(model, criterion, dataloader, device, epoch, logger, writer): 38 | model.eval() 39 | losses = [] 40 | all_trg = [] 41 | all_pred = [] 42 | all_wer = [] 43 | 44 | with torch.no_grad(): 45 | for batch_idx, (imgs, target) in enumerate(dataloader): 46 | imgs = imgs.to(device) 47 | target = target.to(device) 48 | 49 | # forward(no teacher forcing) 50 | outputs = model(imgs, target, 0) 51 | 52 | # target: (batch_size, trg len) 53 | # outputs: (trg_len, batch_size, output_dim) 54 | # skip sos 55 | output_dim = outputs.shape[-1] 56 | outputs = outputs[1:].view(-1, output_dim) 57 | target = target.permute(1,0)[1:].reshape(-1) 58 | 59 | # compute the loss 60 | loss = criterion(outputs, target) 61 | losses.append(loss.item()) 62 | 63 | # compute the accuracy 64 | prediction = torch.max(outputs, 1)[1] 65 | score = accuracy_score(target.cpu().data.squeeze().numpy(), prediction.cpu().data.squeeze().numpy()) 66 | all_trg.extend(target) 67 | all_pred.extend(prediction) 68 | 69 | # compute wer 70 | # prediction: ((trg_len-1)*batch_size) 71 | # target: ((trg_len-1)*batch_size) 72 | batch_size = imgs.shape[0] 73 | prediction = prediction.view(-1, batch_size).permute(1,0).tolist() 74 | target = target.view(-1, batch_size).permute(1,0).tolist() 75 | wers = [] 76 | for i in range(batch_size): 77 | # 
add mask(remove padding, eos, sos) 78 | prediction[i] = [item for item in prediction[i] if item not in [0,1,2]] 79 | target[i] = [item for item in target[i] if item not in [0,1,2]] 80 | wers.append(wer(target[i], prediction[i])) 81 | all_wer.extend(wers) 82 | 83 | # Compute the average loss & accuracy 84 | validation_loss = sum(losses)/len(losses) 85 | all_trg = torch.stack(all_trg, dim=0) 86 | all_pred = torch.stack(all_pred, dim=0) 87 | validation_acc = accuracy_score(all_trg.cpu().data.squeeze().numpy(), all_pred.cpu().data.squeeze().numpy()) 88 | validation_wer = sum(all_wer)/len(all_wer) 89 | # Log 90 | writer.add_scalars('Loss', {'validation': validation_loss}, epoch+1) 91 | writer.add_scalars('Accuracy', {'validation': validation_acc}, epoch+1) 92 | writer.add_scalars('WER', {'validation': validation_wer}, epoch+1) 93 | logger.info("Average Validation Loss of Epoch {}: {:.6f} | Acc: {:.2f}% | WER: {:.2f}%".format(epoch+1, validation_loss, validation_acc*100, validation_wer)) 94 | --------------------------------------------------------------------------------
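The driver scripts listed in the tree above (e.g. CSL_Isolated_Conv3D.py) are not included in this dump, so the following is only a minimal, illustrative sketch of how the helpers above could be wired together for the isolated-word setting. Every value that does not appear in the files above -- the placeholder paths, batch size, learning rate, epoch count, optimizer and logger setup -- is an assumption for illustration, not the authors' configuration.

# Illustrative driver sketch (assumptions: paths, hyper-parameters, optimizer choice).
import logging
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import torchvision.transforms as transforms

from dataset import CSL_Isolated
from models.Conv3D import resnet18
from train import train_epoch
from validation import val_epoch

if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("slr_example")
    writer = SummaryWriter("runs/slr_example")

    sample_size, sample_duration, num_classes = 128, 16, 500
    transform = transforms.Compose([transforms.Resize([sample_size, sample_size]),
                                    transforms.ToTensor(),
                                    transforms.Normalize(mean=[0.5], std=[0.5])])
    # The data_path / label_path values are placeholders, not the original locations.
    train_set = CSL_Isolated(data_path="<path/to/color_videos>", label_path="<path/to/dictionary.txt>",
                             frames=sample_duration, num_classes=num_classes, train=True, transform=transform)
    val_set = CSL_Isolated(data_path="<path/to/color_videos>", label_path="<path/to/dictionary.txt>",
                           frames=sample_duration, num_classes=num_classes, train=False, transform=transform)
    train_loader = DataLoader(train_set, batch_size=16, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_set, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)

    # resnet18 here is the 3D ResNet defined in models/Conv3D.py above.
    model = resnet18(pretrained=False, sample_size=sample_size,
                     sample_duration=sample_duration, num_classes=num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    for epoch in range(30):
        train_epoch(model, criterion, optimizer, train_loader, device, epoch,
                    logger, log_interval=20, writer=writer)
        val_epoch(model, criterion, val_loader, device, epoch, logger, writer)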