├── src_gestformer ├── readme.md ├── __pycache__ │ ├── test.cpython-38.pyc │ └── train.cpython-38.pyc ├── models │ ├── __pycache__ │ │ ├── temporal.cpython-38.pyc │ │ ├── attention.cpython-38.pyc │ │ └── model_utilizer.cpython-38.pyc │ ├── backbones │ │ ├── __pycache__ │ │ │ ├── c3d.cpython-38.pyc │ │ │ ├── r3d.cpython-38.pyc │ │ │ ├── vgg.cpython-38.pyc │ │ │ └── resnet.cpython-38.pyc │ │ ├── c3d.py │ │ ├── vgg.py │ │ ├── resnet.py │ │ └── r3d.py │ ├── temporal.py │ ├── model_utilizer.py │ └── attention.py ├── utils │ ├── __pycache__ │ │ ├── configer.cpython-38.pyc │ │ └── average_meter.cpython-38.pyc │ ├── average_meter.py │ ├── visualization.py │ ├── configer.py │ └── test.py ├── datasets │ ├── __pycache__ │ │ ├── Briareo.cpython-38.pyc │ │ └── NVGestures.cpython-38.pyc │ ├── utils │ │ ├── __pycache__ │ │ │ ├── normals.cpython-38.pyc │ │ │ ├── normalize.cpython-38.pyc │ │ │ ├── read_data.cpython-38.pyc │ │ │ ├── optical_flow.cpython-38.pyc │ │ │ └── utils_briareo.cpython-38.pyc │ │ ├── normalize.py │ │ ├── optical_flow.py │ │ ├── normals.py │ │ ├── read_data.py │ │ └── utils_briareo.py │ ├── NVGestures.py │ └── Briareo.py ├── hyperparameters │ ├── NVGestures │ │ ├── test.json │ │ └── train.json │ └── Briareo │ │ ├── test.json │ │ └── train.json ├── main.py ├── cs.py ├── test.py └── train.py └── README.md /src_gestformer/readme.md: -------------------------------------------------------------------------------- 1 | code for gestformer 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | GestFormer: Multiscale Wavelet Pooling Transformer Network for Dynamic Hand Gesture Recognition 2 | -------------------------------------------------------------------------------- /src_gestformer/__pycache__/test.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/__pycache__/test.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/__pycache__/train.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/__pycache__/train.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/__pycache__/temporal.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/__pycache__/temporal.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/utils/__pycache__/configer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/utils/__pycache__/configer.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/__pycache__/Briareo.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/__pycache__/Briareo.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/__pycache__/attention.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/__pycache__/attention.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/__pycache__/NVGestures.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/__pycache__/NVGestures.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/backbones/__pycache__/c3d.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/backbones/__pycache__/c3d.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/backbones/__pycache__/r3d.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/backbones/__pycache__/r3d.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/backbones/__pycache__/vgg.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/backbones/__pycache__/vgg.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/utils/__pycache__/average_meter.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/utils/__pycache__/average_meter.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/__pycache__/normals.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/utils/__pycache__/normals.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/__pycache__/model_utilizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/__pycache__/model_utilizer.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/backbones/__pycache__/resnet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/backbones/__pycache__/resnet.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/__pycache__/normalize.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/utils/__pycache__/normalize.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/__pycache__/read_data.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/utils/__pycache__/read_data.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/__pycache__/optical_flow.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/utils/__pycache__/optical_flow.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/__pycache__/utils_briareo.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/utils/__pycache__/utils_briareo.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/utils/average_meter.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Average Meter object, contain val, avg, sum and count on concurrent values""" 3 | def __init__(self): 4 | self.reset() 5 | 6 | def reset(self): 7 | self.val = 0. 8 | self.avg = 0. 9 | self.sum = 0. 10 | self.count = 0 11 | 12 | def update(self, val, n=1): 13 | self.val = val 14 | self.sum += val * n 15 | self.count += n 16 | self.avg = self.sum / self.count 17 | -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/normalize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def normalize(tensor: np.ndarray): 4 | """Normalize function for a single tensor. 
5 | 6 | Args: 7 | block (np.ndarray): input tensor 8 | Returns: 9 | np.ndarray: normalized tensor 10 | 11 | """ 12 | if len(tensor.shape) < 4: 13 | tensor = np.expand_dims(tensor, axis=2) 14 | mean = np.array([tensor[..., chn, :].mean() for chn in range(tensor.shape[2])]) 15 | std = np.array([tensor[..., chn, :].std() for chn in range(tensor.shape[2])]) 16 | return (tensor - mean[:, np.newaxis]) / std[:, np.newaxis] 17 | -------------------------------------------------------------------------------- /src_gestformer/hyperparameters/NVGestures/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Gesture Recognition", 3 | "dataset": "NVGestures", 4 | "phase": "test", 5 | "data": { 6 | "optical_flow": false, 7 | "type": "normal", 8 | "n_classes": 25, 9 | "n_frames": 40, 10 | "data_path": "path/to/NVGestures", 11 | "result_dir": "./result", 12 | "batch_size": 8 13 | }, 14 | "solver": { 15 | "workers": 4 16 | }, 17 | "network":{ 18 | "backbone": "resnet", 19 | "pretrained": true, 20 | "ff_size": 1024, 21 | "n_head": 8, 22 | "dropout2d" : 0.1, 23 | "dropout1d": 0.5, 24 | "n_module": 6 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src_gestformer/hyperparameters/Briareo/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Gesture Recognition", 3 | "dataset": "Briareo", 4 | "phase": "test", 5 | "data": { 6 | "optical_flow": false, 7 | "type": "rgb", 8 | "n_classes": 12, 9 | "n_frames": 40, 10 | "data_path": "path/to/Briareo", 11 | "result_dir": "./result", 12 | "batch_size": 2 13 | }, 14 | "solver": { 15 | "workers": 4 16 | }, 17 | "network":{ 18 | "backbone": "resnet", 19 | "pretrained": true, 20 | "ff_size": 1024, 21 | "n_head": 8, 22 | "dropout2d" : 0.1, 23 | "dropout1d": 0.5, 24 | "n_module": 6 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src_gestformer/utils/visualization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | def plot_depth(path, depth): 5 | """Plot a single depth map 6 | 7 | Attributes: 8 | path (str): Path to save the depth map 9 | depth (np.ndarray): Depth map data 10 | 11 | """ 12 | if len(depth.shape) > 2: 13 | if depth.shape[-1] != 1: 14 | raise ValueError("Wrong number of channel, 1 is required, got {}".format(depth.shape)) 15 | else: 16 | depth = depth.squeeze() 17 | tmp = np.zeros((depth.shape[0], depth.shape[1], 3)) 18 | tmp[..., 0] = depth.copy() 19 | tmp[..., 1] = depth.copy() 20 | tmp[..., 2] = depth.copy() 21 | tmp = ((tmp * 255) / tmp.max()).astype(np.uint8) 22 | cv2.imwrite(path, tmp) 23 | -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/optical_flow.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | def dense_flow(clip, rgb=True): 5 | """Calculate optical flow with Farneback algorithm 6 | 7 | Args: 8 | clip: input video clip 9 | rgb: if True, it will covert to gray level every frames 10 | Default: True 11 | 12 | Returns: 13 | flow: Calculated Optical flow 14 | 15 | """ 16 | prev = clip[..., 0] 17 | if rgb: 18 | prev = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY) 19 | flow = np.zeros((clip.shape[0], clip.shape[1], 2, clip.shape[-1] - 1)) 20 | for i in range(1 ,clip.shape[-1]): 21 | next = clip[..., i] 22 | if rgb: 23 | next = 
cv2.cvtColor(next, cv2.COLOR_BGR2GRAY) 24 | flow_calc = cv2.calcOpticalFlowFarneback(prev, next, None, 0.5, 3, 15, 3, 5, 1.2, 0) 25 | flow[..., i - 1] = flow_calc 26 | prev = next 27 | return flow -------------------------------------------------------------------------------- /src_gestformer/hyperparameters/Briareo/train.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Gesture Recognition", 3 | "dataset": "Briareo", 4 | "epochs": 100, 5 | "phase": "train", 6 | "data": { 7 | "optical_flow": false, 8 | "type": "ir", 9 | "n_classes": 12, 10 | "n_frames": 40, 11 | "data_path": "path/to/Briareo", 12 | "result_dir": "./result", 13 | "batch_size": 8 14 | }, 15 | "checkpoints": { 16 | "save_policy": "best", 17 | "save_name": "train_briareo_ir-xwavegatedffn_emb", 18 | "save_dir": "./checkpoints/", 19 | "save_iters": 30, 20 | "tb_path": "train_log" 21 | }, 22 | "solver": { 23 | "type": "AdamW", 24 | "workers": 4, 25 | "weight_decay": 0.0001, 26 | "base_lr": 0.0001, 27 | "decay_steps": [50, 75] 28 | }, 29 | "network":{ 30 | "backbone": "resnet", 31 | "pretrained": true, 32 | "ff_size": 1024, 33 | "n_head": 8, 34 | "dropout2d" : 0.1, 35 | "dropout1d": 0.5, 36 | "n_module": 6 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src_gestformer/hyperparameters/NVGestures/train.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Gesture Recognition", 3 | "dataset": "NVGestures", 4 | "epochs": 100, 5 | "phase": "train", 6 | "data": { 7 | "optical_flow": false, 8 | "type": "normal", 9 | "n_classes": 25, 10 | "n_frames": 40, 11 | "data_path": "path/to/NVGestures/", 12 | "result_dir": "./result", 13 | "batch_size":8 14 | }, 15 | "checkpoints": { 16 | "save_policy": "best", 17 | "save_name": "train_nv_normal-xwavegatedffn_multi", 18 | "save_dir": "./checkpoints/", 19 | "save_iters": 30, 20 | "tb_path": "train_log" 21 | }, 22 | "solver": { 23 | "type": "AdamW", 24 | "workers": 4, 25 | "weight_decay": 0.0001, 26 | "base_lr": 0.0001, 27 | "decay_steps": [50, 75] 28 | }, 29 | "network":{ 30 | "backbone": "resnet", 31 | "pretrained": true, 32 | "ff_size": 1024, 33 | "n_head": 8, 34 | "dropout2d" : 0.1, 35 | "dropout1d": 0.5, 36 | "n_module": 6 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src_gestformer/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from train import GestureTrainer 4 | from test import GestureTest 5 | from utils.configer import Configer 6 | 7 | import random 8 | import numpy as np 9 | import torch 10 | 11 | SEED = 1994 12 | random.seed(SEED) 13 | np.random.seed(SEED) 14 | torch.manual_seed(SEED) 15 | if torch.cuda.is_available(): 16 | torch.cuda.manual_seed(SEED) 17 | torch.backends.cudnn.deterministic = True # To have ~deterministic results 18 | 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--disable-cuda', action='store_true', 23 | help='Disable CUDA') 24 | parser.add_argument('--hypes', default=None, type=str, 25 | dest='hypes', help='The file of the hyper parameters.') 26 | parser.add_argument('--phase', default='train', type=str, 27 | dest='phase', help='The phase of module.') 28 | parser.add_argument('--gpu', default=[0, ], nargs='+', type=int, 29 | dest='gpu', help='The gpu used.') 30 | parser.add_argument('--resume', default=None, type=str, 31 | 
dest='resume', help='The path of pretrained model.') 32 | parser.add_argument('--nogesture', default=False, action='store_true', 33 | dest='nogesture', help='NoGesture CTC loss') 34 | 35 | args = parser.parse_args() 36 | args.device = None 37 | if not args.disable_cuda and torch.cuda.is_available(): 38 | args.device = torch.device('cuda:0') 39 | else: 40 | args.device = torch.device('cpu') 41 | 42 | torch.autograd.set_detect_anomaly(True) 43 | configer = Configer(args) 44 | if configer.get('phase') == 'train': 45 | model = GestureTrainer(configer) 46 | model.init_model() 47 | model.train() 48 | elif configer.get('phase') == 'test': 49 | model = GestureTest(configer) 50 | model.init_model() 51 | model.test() 52 | -------------------------------------------------------------------------------- /src_gestformer/models/temporal.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from models.backbones.resnet import resnet18 3 | from models.backbones.vgg import vgg16, vgg16_bn 4 | from models.backbones import c3d 5 | from models.backbones.r3d import r3d_18, r2plus1d_18 6 | from models.attention import EncoderSelfAttention 7 | 8 | backbone_dict = {'resnet': resnet18, 9 | 'vgg': vgg16, 'vgg_bn': vgg16_bn, 10 | 'c3d': c3d, 11 | 'r3d': r3d_18, 'r2plus1d': r2plus1d_18} 12 | 13 | class _GestureTransformer(nn.Module): 14 | """Multi Modal model for gesture recognition on 3 channel""" 15 | def __init__(self, backbone: nn.Module, in_planes: int, out_planes: int, 16 | pretrained: bool = False, dropout_backbone=0.1, 17 | **kwargs): 18 | super(_GestureTransformer, self).__init__() 19 | 20 | self.in_planes = in_planes 21 | self.backbone = backbone(pretrained, in_planes, dropout=dropout_backbone) 22 | 23 | self.self_attention = EncoderSelfAttention(512, 64, 64, **kwargs) 24 | 25 | self.pool = nn.AdaptiveAvgPool2d((1, 512)) 26 | self.classifier = nn.Linear(512, out_planes) 27 | 28 | 29 | def forward(self, x): 30 | shape = x.shape 31 | # print(x.shape) #8,40,192,256 32 | 33 | x = x.view(-1, self.in_planes, x.shape[-2], x.shape[-1]) 34 | # print(x.shape) #320,1,192,256 b*f, c,h,w 35 | 36 | x = self.backbone(x) 37 | # print(x.shape) 38 | x = x.view(shape[0], shape[1] // self.in_planes, -1) 39 | 40 | x = self.self_attention(x) 41 | 42 | x = self.pool(x).squeeze(dim=1) 43 | x = self.classifier(x) 44 | return x 45 | 46 | def GestureTransoformer(backbone: str="resnet", in_planes: int=3, n_classes: int=25, **kwargs): 47 | if backbone not in backbone_dict: 48 | raise NotImplementedError("Backbone type: [{}] is not implemented.".format(backbone)) 49 | model = _GestureTransformer(backbone_dict[backbone], in_planes, n_classes, **kwargs) 50 | return model -------------------------------------------------------------------------------- /src_gestformer/utils/configer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | class Configer(object): 5 | """Configuration details object 6 | 7 | Attributes: 8 | args (dict): Dictionary containing terminal parameters added to current procedure 9 | params (dict): Dictionary containing parameters in the json file provided 10 | 11 | """ 12 | def __init__(self, args): 13 | """Configer constructor 14 | 15 | Args: 16 | args (argparse.Namespace): Object containing terminal parameters 17 | 18 | """ 19 | self.args = args.__dict__ 20 | self.params = None 21 | 22 | if not os.path.exists(args.hypes): 23 | raise ValueError('Json Path: {} not exists!'.format(args.hypes)) 
24 | 25 | json_stream = open(args.hypes, 'r') 26 | self.params = json.load(json_stream) 27 | json_stream.close() 28 | 29 | def get(self, *keys): 30 | """Item getter 31 | 32 | Args: 33 | *keys (list of str): List of keys 34 | 35 | Returns: 36 | el (str): Value retrived from args or params at keys location 37 | 38 | """ 39 | if len(keys) == 0: 40 | return self.params 41 | 42 | key = keys[-1] 43 | if key in self.args and self.args[key] is not None: 44 | return self.args[key] 45 | 46 | el = self.params 47 | for key in keys: 48 | if key in el and el[key] is not None: 49 | el = el[key] 50 | else: 51 | return None 52 | return el 53 | 54 | def __getitem__(self, item): 55 | """Get item function, same for the get[item]""" 56 | if isinstance(item, tuple): 57 | return self.get(*item) 58 | else: 59 | return self.get(item) 60 | 61 | def __getattr__(self, item): 62 | """Get attr function, same for the get[item]""" 63 | return self.get(item) 64 | 65 | def __str__(self): 66 | """To string function for the whole configuration state""" 67 | out = "" 68 | out += "Args:\n" + "\n".join([f" {str(key)}: {str(value)}" for key, value in self.args.items()]) + "\n" 69 | out += "Params:\n" + "\n".join([f" {str(key)}: {str(value)}" for key, value in self.params.items()]) 70 | return out 71 | -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/normals.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def normals(depthmap, normalize=True, keep_dims=True): 4 | """Calculate depth normals as normals = gF(x,y,z) = (-dF/dx, -dF/dy, 1) 5 | 6 | Args: 7 | depthmap (np.ndarray): depth map of any dtype, single channel, len(depthmap.shape) == 3 8 | normalize (bool, optional): if True, normals will be normalized to have unit-magnitude 9 | Default: True 10 | keep_dims (bool, optional): 11 | if True, normals shape will be equals to depthmap shape, 12 | if False, normals shape will be smaller than depthmap shape. 13 | Default: True 14 | 15 | Returns: 16 | Depth normals 17 | 18 | """ 19 | depthmap = np.asarray(depthmap, np.float32) 20 | 21 | if keep_dims is True: 22 | mask = depthmap != 0 23 | else: 24 | mask = depthmap[1:-1, 1:-1] != 0 25 | 26 | if keep_dims is True: 27 | normals = np.zeros((depthmap.shape[0], depthmap.shape[1], 3), dtype=np.float32) 28 | normals[1:-1, 1:-1, 0] = - (depthmap[2:, 1:-1] - depthmap[:-2, 1:-1]) / 2 29 | normals[1:-1, 1:-1, 1] = - (depthmap[1:-1, 2:] - depthmap[1:-1, :-2]) / 2 30 | else: 31 | normals = np.zeros((depthmap.shape[0] - 2, depthmap.shape[1] - 2, 3), dtype=np.float32) 32 | normals[:, :, 0] = - (depthmap[2:, 1:-1] - depthmap[:-2, 1:-1]) / 2 33 | normals[:, :, 1] = - (depthmap[1:-1, 2:] - depthmap[1:-1, :-2]) / 2 34 | normals[:, :, 2] = 1 35 | 36 | normals[~mask] = [0, 0, 0] 37 | 38 | if normalize: 39 | div = np.linalg.norm(normals[mask], ord=2, axis=-1, keepdims=True).repeat(3, axis=-1) + 1e-12 40 | normals[mask] /= div 41 | 42 | return normals 43 | 44 | 45 | def normals_multi(depthmaps, normalize=True, keep_dims=True): 46 | """Calculate depth normals for multiple depthmaps inputs 47 | 48 | Args: 49 | depthmap (np.ndarray): multiple input depth maps 50 | normalize (bool, optional): if True, normals will be normalized to have unit-magnitude 51 | Default: True 52 | keep_dims (bool, optional): 53 | if True, normals shape will be equals to depthmap shape, 54 | if False, normals shape will be smaller than depthmap shape. 
55 | Default: True 56 | 57 | Returns: 58 | Depth normals 59 | 60 | """ 61 | n_out = np.zeros((depthmaps.shape[0], depthmaps.shape[1], 3, depthmaps.shape[-1])) 62 | for i in range(depthmaps.shape[-1]): 63 | n_out[..., i] = normals(depthmaps[..., 0, i], normalize, keep_dims) 64 | return n_out -------------------------------------------------------------------------------- /src_gestformer/cs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | # df1 = pd.read_csv('csv/Briareo/normal.csv', header = None) # place your csv1 in df1 4 | 5 | #df1[df1.columns.drop('A')] 6 | o = pd.read_csv('csv/Briareo/original.csv', header = None) 7 | df1 = pd.read_csv('csv/Briareo/normal.csv', header = None) # place your csv1 in df1 8 | df2 = pd.read_csv('csv/Briareo/depth.csv', header = None) # place your csv2 in df2 9 | df3 = pd.read_csv('csv/Briareo/ir.csv', header = None) # place your csv2 in df2 10 | df4 = pd.read_csv('csv/Briareo/rgbop.csv', header = None) # place your csv2 in df2 11 | df5 = pd.read_csv('csv/Briareo/rgb.csv', header = None) # place your csv2 in df2 12 | #df2[df2.columns.drop('A')] 13 | #df4 = pd.read_csv('csv/Briareo/color.csv', header = None) 14 | 15 | o1 = o.iloc[:,:].values.tolist() 16 | #print(type(o1)) 17 | 18 | rate_in_1 = df1.iloc[:,:].values.tolist() #store the values of the 3rd column from csv1 to a list 19 | rate_out_1 = df2.iloc[:,:].values.tolist() #store the values of the 4th column from csv1 to a list 20 | rate_out_2 = df3.iloc[:,:].values.tolist() #store the values of the 4th column from csv1 to a list 21 | rate_in_2 = df4.iloc[:,:].values.tolist() #store the values of the 4th column from csv1 to a list 22 | rate_in_5 = df5.iloc[:,:].values.tolist() #store the values of the 4th column from csv1 to a list 23 | 24 | 25 | # rate_in_2 = df2.iloc[:,2].values.tolist() #store the values of the 3rd column from csv1 to a list 26 | #rate_out_2 = df2.iloc[:,3].values.tolist() #store the values of the 4th column from csv1 to a list 27 | 28 | # add the values of 2 rate in lists into rate_in_total list 29 | # rate_in_total = [x+y for x, y in zip(rate_in_1, rate_out_1)] # add the values of 2 rate out lists into rate_out_total list 30 | # rate_in_total = [max(x,y) for (x, y) in zip(rate_in_1, rate_out_2)] 31 | # rate_in_total = [np.add(x,y)/2 for (x, y) in zip(rate_in_1, rate_out_1)] 32 | # rate_in_total = [max(max(x,y),z) for (x, y, z) in zip(rate_in_1, rate_out_1, rate_out_2)] 33 | # rate_in_total = [np.add(np.add(x,y),z)/3 for (x, y, z) in zip(rate_in_1, rate_out_1, rate_out_2)] 34 | # rate_in_total = [np.add(np.add(np.add(x,y),w),z)/4 for (x, y,z,w) in zip(rate_in_1, rate_out_1,rate_out_2,rate_in_2)] 35 | rate_in_total = [np.add(np.add(np.add(np.add(x,y),w),z),k)/5 for (x, y,z,w,k) in zip(rate_in_1, rate_out_1,rate_out_2,rate_in_2,rate_in_5)] 36 | #print(rate_in_total[1]) 37 | 38 | final_df = pd.DataFrame(rate_in_total) 39 | #print(final_df) 40 | with open('csv/Briareo/ir_rgb.csv', 'a', newline='') as csvfile: 41 | final_df.to_csv(csvfile, mode='a',header=False,index =False) 42 | # print(csvfile) 43 | #print(np.where(max(rate_in_total[1]))) 44 | 45 | #print(len(rate_in_total)) 46 | c=0 47 | for x in range(len(rate_in_total)): 48 | #print(np.argmax(rate_in_total[x], axis=0)) 49 | #print(o1[x]) 50 | #print(np.argmax(rate_in_total[x], axis=0)==o1[x]) 51 | if np.argmax(rate_in_total[x], axis=0)==o1[x]: 52 | c +=1 53 | #print(c) 54 | print( c / 218) 55 | # print( c / 482) 56 | 
-------------------------------------------------------------------------------- /src_gestformer/models/backbones/c3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class _C3D(nn.Module): 6 | """ 7 | The C3D network as described in [1]. 8 | """ 9 | 10 | def __init__(self, drop_prob: float): 11 | super(_C3D, self).__init__() 12 | 13 | self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 14 | self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)) 15 | 16 | self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 17 | self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) 18 | 19 | self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 20 | self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 21 | self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) 22 | 23 | self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 24 | self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 25 | self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) 26 | 27 | self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 28 | self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 29 | self.pool5 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1)) 30 | 31 | self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) 32 | 33 | self.dropout = nn.Dropout3d(p=drop_prob) 34 | 35 | self.fc6 = nn.Linear(8192, 4096) 36 | self.fc7 = nn.Linear(4096, 4096) 37 | self.fc8 = nn.Linear(4096, 487) 38 | 39 | self.relu = nn.ReLU() 40 | self.softmax = nn.Softmax() 41 | 42 | 43 | def forward(self, x): 44 | h = self.relu(self.conv1(x)) 45 | h = self.dropout(h) 46 | h = self.pool1(h) 47 | 48 | h = self.relu(self.conv2(h)) 49 | h = self.dropout(h) 50 | h = self.pool2(h) 51 | 52 | h = self.relu(self.conv3a(h)) 53 | h = self.dropout(h) 54 | h = self.relu(self.conv3b(h)) 55 | h = self.dropout(h) 56 | h = self.pool3(h) 57 | 58 | h = self.relu(self.conv4a(h)) 59 | h = self.dropout(h) 60 | h = self.relu(self.conv4b(h)) 61 | h = self.dropout(h) 62 | h = self.pool4(h) 63 | 64 | h = self.relu(self.conv5a(h)) 65 | h = self.dropout(h) 66 | h = self.relu(self.conv5b(h)) 67 | h = self.dropout(h) 68 | h = self.pool5(h) 69 | 70 | h = self.avgpool(h) 71 | 72 | return h.squeeze() 73 | 74 | 75 | def C3D(pretrained, in_planes: int=3, dropout=0., **kwargs): 76 | model = _C3D(drop_prob=dropout) 77 | if pretrained: 78 | state_dict = torch.load("./c3d.pickle") 79 | model.load_state_dict(state_dict) 80 | if in_planes in [1, 2]: 81 | w = model.conv1._parameters['weight'].data 82 | model.conv1 = nn.Conv3d(in_planes, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 83 | if in_planes == 1: 84 | model.conv1._parameters['weight'].data = w.mean(dim=1, keepdim=True) 85 | else: 86 | model.conv1._parameters['weight'].data = w[:, :-1] * 1.5 87 | model.conv1 = nn.Conv3d(in_planes, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 88 | model.fc6 = None 89 | model.fc7 = None 90 | model.fc8 = None 91 | return model -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/read_data.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | from pathlib import Path 5 | 6 | def load_split_nvgesture(file_with_split='./nvgesture_train_correct.lst', 
list_split=list()): 7 | with open(file_with_split, 'rt') as f: 8 | dict_name = file_with_split[file_with_split.rfind('/') + 1:] 9 | dict_name = dict_name[:dict_name.find('_')] 10 | 11 | for line in f: 12 | params = line.split(' ') 13 | params_dictionary = dict() 14 | 15 | params_dictionary['dataset'] = dict_name 16 | 17 | path = params[0].split(':')[1] 18 | for param in params[1:]: 19 | parsed = param.split(':') 20 | key = parsed[0] 21 | if key == 'label': 22 | # make label start from 0 23 | label = int(parsed[1]) - 1 24 | params_dictionary['label'] = label 25 | elif key in ('depth', 'color', 'duo_left'): 26 | # first store path 27 | params_dictionary[key] = path + '/' + parsed[1] 28 | # store start frame 29 | params_dictionary[key + '_start'] = int(parsed[2]) 30 | 31 | params_dictionary[key + '_end'] = int(parsed[3]) 32 | 33 | params_dictionary['duo_right'] = params_dictionary['duo_left'].replace('duo_left', 'duo_right') 34 | params_dictionary['duo_right_start'] = params_dictionary['duo_left_start'] 35 | params_dictionary['duo_right_end'] = params_dictionary['duo_left_end'] 36 | 37 | params_dictionary['duo_disparity'] = params_dictionary['duo_left'].replace('duo_left', 'duo_disparity') 38 | params_dictionary['duo_disparity_start'] = params_dictionary['duo_left_start'] 39 | params_dictionary['duo_disparity_end'] = params_dictionary['duo_left_end'] 40 | 41 | list_split.append(params_dictionary) 42 | 43 | return list_split 44 | 45 | 46 | def load_data_from_file(data_path, example_config, sensor, image_width, image_height, nogesture = False): 47 | path = example_config[sensor] + ".avi" 48 | path = Path(data_path) / path[path.find('/') + 1:] 49 | start_frame = example_config[sensor + '_start'] 50 | end_frame = example_config[sensor + '_end'] 51 | label = example_config['label'] 52 | 53 | if end_frame - start_frame > 80: 54 | new_start = (end_frame - start_frame) // 2 - 40 + start_frame 55 | new_end = (end_frame - start_frame) // 2 + 40 + start_frame 56 | start_frame = new_start 57 | end_frame = new_end 58 | 59 | chnum = 3 if sensor == "color" else 1 60 | 61 | video_container = np.zeros((image_height, image_width, chnum, 160 if nogesture else 80), dtype=np.uint8) 62 | 63 | cap = cv2.VideoCapture(str(path)) 64 | 65 | if nogesture: 66 | start_offset = 40 if start_frame >= 40 else start_frame 67 | end_offset = 40 if int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - end_frame >= 40 \ 68 | else int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - end_frame 69 | frames_to_load = range(start_frame - start_offset, end_frame + end_offset) 70 | else: 71 | frames_to_load = range(start_frame, end_frame) 72 | 73 | cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame) 74 | for indx, frameIndx in enumerate(frames_to_load): 75 | ret, frame = cap.read() 76 | if ret: 77 | frame = cv2.resize(frame, (image_width, image_height)) 78 | if sensor != "color": 79 | frame = frame[..., 0] 80 | frame = frame[..., np.newaxis] 81 | video_container[..., indx] = frame 82 | else: 83 | print("Could not load frame") 84 | 85 | cap.release() 86 | 87 | if nogesture: 88 | return video_container, label, (start_offset, end_offset) 89 | else: 90 | return video_container, label, None -------------------------------------------------------------------------------- /src_gestformer/datasets/NVGestures.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.utils.data.dataset import Dataset 4 | 5 | import numpy as np 6 | import cv2 7 | 8 | from datasets.utils.read_data import load_split_nvgesture, 
load_data_from_file 9 | from datasets.utils.normals import normals_multi 10 | from datasets.utils.normalize import normalize 11 | 12 | from pathlib import Path 13 | 14 | class NVGesture(Dataset): 15 | """NVGesture Dataset class""" 16 | def __init__(self, configer, path, split="train", data_type="depth", transforms=None, n_frames=40, optical_flow=False): 17 | """Constructor method for NVGesture Dataset class 18 | 19 | Args: 20 | configer (Configer): Configer object for current procedure phase (train, test, val) 21 | split (str, optional): Current procedure phase (train, test, val) 22 | data_type (str, optional): Input data type (depth, rgb, normals, ir) 23 | transform (Object, optional): Data augmentation transformation for every data 24 | n_frames (int, optional): Number of frames selected for every input clip 25 | optical_flow (bool, optional): Flag to choose if calculate optical flow or not 26 | 27 | """ 28 | super().__init__() 29 | 30 | print("Loading NVGestures {} dataset...".format(split.upper()), end=" ") 31 | 32 | self.dataset_path = Path(path) / "nvgesture_arch" / "nvGesture_v1" 33 | self.split = split 34 | self.data_type = data_type 35 | self.transforms = transforms 36 | self.optical_flow = optical_flow 37 | if self.data_type in ["normal", "normals"] and self.optical_flow: 38 | raise NotImplementedError("Optical flow for normals image is not supported.") 39 | 40 | file_lists = self.dataset_path / \ 41 | "nvgesture_{}_correct_cvpr2016_v2.lst".format(self.split if self.split == "train" else "test") 42 | 43 | self.data_list = list() 44 | load_split_nvgesture(file_with_split=str(file_lists), list_split=self.data_list) 45 | 46 | if self.data_type in ["depth_z", "depth", "normal", "normals"]: 47 | self.sensor = "depth" 48 | elif self.data_type == "wrapped": 49 | self.sensor = "wrapped" 50 | elif self.data_type in ["rgb", "color"]: 51 | self.sensor = "color" 52 | elif self.data_type == "ir": 53 | self.sensor = "duo_left" 54 | else: 55 | raise NotImplementedError 56 | print("done.") 57 | 58 | def __len__(self): 59 | return len(self.data_list) 60 | 61 | def __getitem__(self, idx): 62 | data, label, offsets = load_data_from_file(self.dataset_path, example_config=self.data_list[idx], sensor=self.sensor, 63 | image_width=320, image_height=240) 64 | if self.optical_flow: 65 | if self.transforms: 66 | aug_det = self.transforms.to_deterministic() 67 | data = np.array([aug_det.augment_image(data[..., i]) 68 | for i in range(data.shape[-1])]).transpose(1, 2, 3, 0) 69 | prev = data[..., 0] 70 | if self.data_type == "rgb": 71 | prev = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY) 72 | data = data[..., [0, 1] + [*range(2, data.shape[-1], 2)]] 73 | flow = np.zeros((data.shape[0], data.shape[1], 2, data.shape[-1] - 1)) 74 | for i in range(1, data.shape[-1]): 75 | next = data[..., i] 76 | if self.data_type == "rgb": 77 | next = cv2.cvtColor(next, cv2.COLOR_BGR2GRAY) 78 | # print(flow.shape) 79 | flow[..., i - 1] = cv2.calcOpticalFlowFarneback(prev, next, None, 0.5, 3, 15, 3, 5, 1.2, 0) 80 | prev = next 81 | data = flow 82 | 83 | data = data[..., [*range(0, data.shape[-1], 2)]] # Our settings is working with static clip containing 40 frames 84 | 85 | if self.data_type in ["normal", "normals"]: 86 | data = normals_multi(data) 87 | else: 88 | data = normalize(data) 89 | 90 | if self.transforms is not None and not self.optical_flow: 91 | aug_det = self.transforms.to_deterministic() 92 | data = np.array([aug_det.augment_image(data[..., i]) for i in range(data.shape[-1])]).transpose(1, 2, 3, 0) 93 | 94 | data = 
np.concatenate(data.transpose(3, 0, 1, 2), axis=2).transpose(2, 0, 1) 95 | data = torch.from_numpy(data) 96 | label = torch.LongTensor(np.asarray([label])) 97 | 98 | return data.float(), label -------------------------------------------------------------------------------- /src_gestformer/models/backbones/vgg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchvision.models.utils import load_state_dict_from_url 4 | 5 | 6 | __all__ = [ 7 | 'VGG', 'vgg16', 'vgg16_bn', 8 | ] 9 | 10 | 11 | model_urls = { 12 | 'vgg16': 'https://download.pytorch.org/models/vgg16-397923af.pth', 13 | 'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth', 14 | } 15 | 16 | 17 | class VGG(nn.Module): 18 | 19 | def __init__(self, features, num_classes=1000, init_weights=True): 20 | super(VGG, self).__init__() 21 | self.features = features 22 | self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) 23 | self.classifier = nn.Sequential( 24 | nn.Linear(512 * 7 * 7, 4096), 25 | nn.ReLU(True), 26 | nn.Dropout(), 27 | nn.Linear(4096, 4096), 28 | nn.ReLU(True), 29 | nn.Dropout(), 30 | nn.Linear(4096, num_classes), 31 | ) 32 | if init_weights: 33 | self._initialize_weights() 34 | 35 | def forward(self, x): 36 | x = self.features(x) 37 | x = self.avgpool(x) 38 | x = torch.flatten(x, 1) 39 | return x 40 | 41 | def _initialize_weights(self): 42 | for m in self.modules(): 43 | if isinstance(m, nn.Conv2d): 44 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 45 | if m.bias is not None: 46 | nn.init.constant_(m.bias, 0) 47 | elif isinstance(m, nn.BatchNorm2d): 48 | nn.init.constant_(m.weight, 1) 49 | nn.init.constant_(m.bias, 0) 50 | elif isinstance(m, nn.Linear): 51 | nn.init.normal_(m.weight, 0, 0.01) 52 | nn.init.constant_(m.bias, 0) 53 | 54 | 55 | def make_layers(cfg, batch_norm=False): 56 | layers = [] 57 | in_channels = 3 58 | for v in cfg: 59 | if v == 'M': 60 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 61 | else: 62 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 63 | if batch_norm: 64 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 65 | else: 66 | layers += [conv2d, nn.ReLU(inplace=True)] 67 | in_channels = v 68 | return nn.Sequential(*layers) 69 | 70 | 71 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'] 72 | 73 | 74 | def _vgg(arch, batch_norm, pretrained, in_planes, drop_prob, **kwargs): 75 | if pretrained: 76 | kwargs['init_weights'] = False 77 | model = VGG(make_layers(cfg, batch_norm=batch_norm), **kwargs) 78 | if pretrained: 79 | state_dict = load_state_dict_from_url(model_urls[arch], 80 | progress=True) 81 | model.load_state_dict(state_dict) 82 | 83 | if in_planes in [1, 2]: 84 | w = model.features._modules['0']._parameters['weight'].data 85 | model.features._modules['0'] = nn.Conv2d(in_planes, 64, kernel_size=3, padding=1, bias=batch_norm is False) 86 | if in_planes == 1: 87 | model.conv1._parameters['weight'].data = w.mean(dim=1, keepdim=True) 88 | else: 89 | model.conv1._parameters['weight'].data = w[:, :-1] * 1.5 90 | else: 91 | model.features._modules['0'] = nn.Conv2d(in_planes, 64, kernel_size=3, padding=1, bias=batch_norm is False) 92 | if drop_prob > 0: 93 | new_features = list() 94 | for el in model.features: 95 | new_features.append(el) 96 | if isinstance(el, nn.ReLU): 97 | new_features.append(nn.Dropout2d(p=drop_prob)) 98 | model.features = nn.Sequential(*new_features) 99 | 100 | model.avgpool = 
nn.AdaptiveAvgPool2d((1, 1)) 101 | model.classifier = None 102 | return model 103 | 104 | 105 | def vgg16(pretrained=False, in_planes: int=3, dropout2d: float=0., **kwargs): 106 | r"""VGG 16-layer model (configuration "D") 107 | `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_ 108 | 109 | Args: 110 | pretrained (bool): If True, returns a model pre-trained on ImageNet 111 | progress (bool): If True, displays a progress bar of the download to stderr 112 | """ 113 | return _vgg('vgg16', False, pretrained, in_planes, dropout2d, **kwargs) 114 | 115 | 116 | def vgg16_bn(pretrained=False, in_planes: int=3, dropout2d: float=0., **kwargs): 117 | r"""VGG 16-layer model (configuration "D") with batch normalization 118 | `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_ 119 | 120 | Args: 121 | pretrained (bool): If True, returns a model pre-trained on ImageNet 122 | progress (bool): If True, displays a progress bar of the download to stderr 123 | """ 124 | return _vgg('vgg16_bn', True, pretrained, in_planes, dropout2d, **kwargs) -------------------------------------------------------------------------------- /src_gestformer/datasets/Briareo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import torch 4 | from pathlib import Path 5 | 6 | import cv2 7 | import numpy as np 8 | 9 | from torch.utils.data.dataset import Dataset 10 | 11 | from datasets.utils.normals import normals_multi 12 | from datasets.utils.normalize import normalize 13 | from datasets.utils.optical_flow import dense_flow 14 | 15 | from datasets.utils.utils_briareo import from_json_to_list 16 | 17 | 18 | class Briareo(Dataset): 19 | """Briareo Dataset class""" 20 | def __init__(self, configer, path, split="train", data_type='depth', transforms=None, n_frames=30, optical_flow=False): 21 | """Constructor method for Briareo Dataset class 22 | 23 | Args: 24 | configer (Configer): Configer object for current procedure phase (train, test, val) 25 | split (str, optional): Current procedure phase (train, test, val) 26 | data_type (str, optional): Input data type (depth, rgb, normals, ir) 27 | transform (Object, optional): Data augmentation transformation for every data 28 | n_frames (int, optional): Number of frames selected for every input clip 29 | optical_flow (bool, optional): Flag to choose if calculate optical flow or not 30 | 31 | """ 32 | super().__init__() 33 | 34 | self.dataset_path = Path(path) 35 | self.split = split 36 | self.data_type = data_type 37 | self.optical_flow = optical_flow 38 | if self.data_type in ["normal", "normals"] and self.optical_flow: 39 | raise NotImplementedError("Optical flow for normals image is not supported.") 40 | 41 | self.transforms = transforms 42 | self.n_frames = n_frames if not optical_flow else n_frames + 1 43 | 44 | print("Loading Briareo {} dataset...".format(split.upper()), end=" ") 45 | if data_type in ["normal", "normals"]: 46 | data_type = "depth" 47 | data = np.load(self.dataset_path / "splits" / (self.split if self.split != "val" else "train") / 48 | "{}_{}.npz".format(data_type, self.split), allow_pickle=True)['arr_0'] 49 | 50 | # Prepare clip for the selected number of frames n_frame 51 | fixed_data = list() 52 | for i, record in enumerate(data): 53 | paths = record['data'] 54 | 55 | center_of_list = math.floor(len(paths) / 2) 56 | crop_limit = math.floor(self.n_frames / 2) 57 | 58 | start = center_of_list - crop_limit 59 | end = center_of_list + crop_limit 60 | 
paths_cropped = paths[start: end + 1 if self.n_frames % 2 == 1 else end] 61 | if self.data_type == 'leapmotion': 62 | valid = np.array(record['valid'][start: end + 1 if self.n_frames % 2 == 1 else end]) 63 | if valid.sum() == len(valid): 64 | data[i]['data'] = paths_cropped 65 | fixed_data.append(data[i]) 66 | else: 67 | data[i]['data'] = paths_cropped 68 | fixed_data.append(data[i]) 69 | 70 | self.data = np.array(fixed_data) 71 | print("done.") 72 | 73 | def __len__(self): 74 | return len(self.data) 75 | 76 | def __getitem__(self, idx): 77 | paths = self.data[idx]['data'] 78 | label = self.data[idx]['label'] 79 | 80 | clip = list() 81 | for p in paths: 82 | if self.data_type == "leapmotion": 83 | img = from_json_to_list(os.path.join(self.dataset_path, p))[0] 84 | else: 85 | if self.data_type in ["depth", "normal", "normals"]: 86 | img = np.load(str(self.dataset_path / p), allow_pickle=True)['arr_0'] 87 | if self.data_type in ["normal", "normals"]: 88 | img *= 1000 89 | elif self.data_type in ["ir"]: 90 | img = cv2.imread(str(self.dataset_path / p), cv2.IMREAD_ANYDEPTH) 91 | else: 92 | img = cv2.imread(str(self.dataset_path / p), cv2.IMREAD_COLOR) 93 | img = cv2.resize(img, (224, 224)) 94 | if self.data_type != "rgb": 95 | img = np.expand_dims(img, axis=2) 96 | clip.append(img) 97 | 98 | clip = np.array(clip).transpose(1, 2, 3, 0) 99 | 100 | if self.data_type in ["normal", "normals"]: 101 | clip = normals_multi(clip) 102 | else: 103 | if self.optical_flow: 104 | clip = dense_flow(clip, self.data_type == "rgb") 105 | clip = normalize(clip) 106 | 107 | if self.transforms is not None: 108 | aug_det = self.transforms.to_deterministic() 109 | clip = np.array([aug_det.augment_image(clip[..., i]) for i in range(clip.shape[-1])]).transpose(1, 2, 3, 0) 110 | 111 | clip = torch.from_numpy(clip.reshape(clip.shape[0], clip.shape[1], -1).transpose(2, 0, 1)) 112 | label = torch.LongTensor(np.asarray([label])) 113 | return clip.float(), label -------------------------------------------------------------------------------- /src_gestformer/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from torch.utils.data import DataLoader 5 | import imgaug.augmenters as iaa 6 | 7 | # Import Datasets 8 | from datasets.Briareo import Briareo 9 | from datasets.NVGestures import NVGesture 10 | from models.model_utilizer import ModuleUtilizer 11 | 12 | # Import Model 13 | from models.temporal import GestureTransoformer 14 | 15 | # Import Utils 16 | from tqdm import tqdm 17 | from utils.average_meter import AverageMeter 18 | from torchstat import stat 19 | import time 20 | import torchsummary 21 | from fvcore.nn import FlopCountAnalysis 22 | 23 | 24 | # Setting seeds 25 | def worker_init_fn(worker_id): 26 | np.random.seed(torch.initial_seed() % 2 ** 32) 27 | 28 | class GestureTest(object): 29 | """Gesture Recognition Test class 30 | 31 | Attributes: 32 | configer (Configer): Configer object, contains procedure configuration. 
33 | train_loader (torch.utils.data.DataLoader): Train data loader variable 34 | val_loader (torch.utils.data.DataLoader): Val data loader variable 35 | test_loader (torch.utils.data.DataLoader): Test data loader variable 36 | net (torch.nn.Module): Network used for the current procedure 37 | lr (int): Learning rate value 38 | optimizer (torch.nn.optim.optimizer): Optimizer for training procedure 39 | iters (int): Starting iteration number, not zero if resuming training 40 | epoch (int): Starting epoch number, not zero if resuming training 41 | scheduler (torch.optim.lr_scheduler): Scheduler to utilize during training 42 | 43 | """ 44 | 45 | def __init__(self, configer): 46 | self.configer = configer 47 | 48 | self.data_path = configer.get("data", "data_path") #: str: Path to data directory 49 | 50 | # Train val and test accuracy 51 | self.accuracy = AverageMeter() 52 | 53 | # DataLoaders 54 | self.data_loader = None 55 | 56 | # Module load and save utility 57 | self.device = self.configer.get("device") 58 | self.model_utility = ModuleUtilizer(self.configer) #: Model utility for load, save and update optimizer 59 | self.net = None 60 | 61 | # Training procedure 62 | self.transforms = None 63 | 64 | # Other useful data 65 | self.backbone = self.configer.get("network", "backbone") #: str: Backbone type 66 | self.in_planes = None #: int: Input channels 67 | self.clip_length = self.configer.get("data", "n_frames") #: int: Number of frames per sequence 68 | self.n_classes = self.configer.get("data", "n_classes") #: int: Total number of classes for dataset 69 | self.data_type = self.configer.get("data", "type") #: str: Type of data (rgb, depth, ir, leapmotion) 70 | self.dataset = self.configer.get("dataset").lower() #: str: Type of dataset 71 | self.optical_flow = self.configer.get("data", "optical_flow") 72 | if self.optical_flow is None: 73 | self.optical_flow = True 74 | 75 | def init_model(self): 76 | """Initialize model and other data for procedure""" 77 | 78 | if self.optical_flow is True: 79 | self.in_planes = 2 80 | elif self.data_type in ["depth", "ir"]: 81 | self.in_planes = 1 82 | else: 83 | self.in_planes = 3 84 | 85 | # Selecting correct model and normalization variable based on type variable 86 | self.net = GestureTransoformer(self.backbone, self.in_planes, self.n_classes, 87 | pretrained=self.configer.get("network", "pretrained"), 88 | n_head=self.configer.get("network", "n_head"), 89 | dropout_backbone=self.configer.get("network", "dropout2d"), 90 | dropout_transformer=self.configer.get("network", "dropout1d"), 91 | dff=self.configer.get("network", "ff_size"), 92 | n_module=self.configer.get("network", "n_module") 93 | ) 94 | 95 | self.net, _, _, _ = self.model_utility.load_net(self.net) 96 | 97 | # Selecting Dataset and DataLoader 98 | if self.dataset == "briareo": 99 | Dataset = Briareo 100 | self.transforms = iaa.CenterCropToFixedSize(200, 200) 101 | elif self.dataset == "nvgestures": 102 | Dataset = NVGesture 103 | self.transforms = iaa.CenterCropToFixedSize(256, 192) 104 | else: 105 | raise NotImplementedError(f"Dataset not supported: {self.configer.get('dataset')}") 106 | 107 | # Setting Dataloaders 108 | self.data_loader = DataLoader( 109 | Dataset(self.configer, self.data_path, split="test", data_type=self.data_type, 110 | transforms=self.transforms, n_frames=self.clip_length, 111 | optical_flow=self.optical_flow), 112 | batch_size=1, shuffle=False, drop_last=True, 113 | num_workers=self.configer.get('solver', 'workers'), pin_memory=True, worker_init_fn=worker_init_fn) 
114 | 115 | def __test(self): 116 | """Testing function.""" 117 | self.net.eval() 118 | c = 0 119 | tot = 0 120 | with torch.no_grad(): 121 | for i, data_tuple in enumerate(tqdm(self.data_loader, desc="Test")): 122 | """ 123 | input, gt 124 | """ 125 | inputs = data_tuple[0].to(self.device) 126 | gt = data_tuple[1].to(self.device) 127 | # print(self.device) 128 | flops = FlopCountAnalysis(self.net, inputs) 129 | print(flops.total()/1e9) 130 | torchsummary.summary(self.net, inputs[0].shape) 131 | 132 | start_time = time.time() 133 | output = self.net(inputs) 134 | end_time = time.time() 135 | 136 | predicted = torch.argmax(output.detach(), dim=1) 137 | correct = gt.detach().squeeze(dim=1) 138 | 139 | if predicted == correct: 140 | c += 1 141 | tot += 1 142 | break 143 | 144 | accuracy = c / tot 145 | inference_time = end_time - start_time 146 | print("Inference time:", inference_time, "seconds") 147 | 148 | print("Accuracy: {}".format(accuracy)) 149 | 150 | def test(self): 151 | self.__test() 152 | 153 | 154 | def update_metrics(self, split: str, loss, bs, accuracy=None): 155 | self.losses[split].update(loss, bs) 156 | if accuracy is not None: 157 | self.accuracy[split].update(accuracy, bs) 158 | if split == "train" and self.iters % self.save_iters == 0: 159 | self.tbx_summary.add_scalar('{}_loss'.format(split), self.losses[split].avg, self.iters) 160 | self.tbx_summary.add_scalar('{}_accuracy'.format(split), self.accuracy[split].avg, self.iters) 161 | self.losses[split].reset() 162 | self.accuracy[split].reset() -------------------------------------------------------------------------------- /src_gestformer/models/model_utilizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | 5 | from pathlib import Path 6 | 7 | class ModuleUtilizer(object): 8 | """Module utility class 9 | 10 | Attributes: 11 | configer (Configer): Configer object, contains procedure configuration. 12 | 13 | """ 14 | def __init__(self, configer): 15 | """Class constructor for Module utility""" 16 | self.configer = configer 17 | self.device = self.configer.get("device") 18 | 19 | self.save_policy = self.configer.get("checkpoints", "save_policy") 20 | if self.save_policy in ["early_stop", "earlystop"]: 21 | self.save = self.early_stop 22 | elif self.save_policy == "all": 23 | self.save = self.save_all 24 | else: 25 | self.save = self.save_best 26 | 27 | self.best_accuracy = 0 28 | self.last_improvement = 0 29 | 30 | def update_optimizer(self, net, iters): 31 | """Load optimizer and adjust learning rate during training, if using SGD. 
32 | 33 | Args: 34 | net (torch.nn.Module): Module in use 35 | iters (int): current iteration number 36 | 37 | Returns: 38 | optimizer (torch.optim.optimizer): PyTorch Optimizer 39 | lr (float): Learning rate for training procedure 40 | 41 | """ 42 | optim = self.configer.get('solver', 'type') 43 | decay = self.configer.get('solver', 'weight_decay') 44 | 45 | if optim == "Adam": 46 | print("Using Adam.") 47 | lr = self.configer.get('solver', 'base_lr') 48 | optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr, 49 | weight_decay=decay) 50 | 51 | elif optim == "AdamW": 52 | lr = self.configer.get('solver', 'base_lr') 53 | optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, net.parameters()), lr=lr, 54 | weight_decay=decay) 55 | 56 | elif optim == "RMSProp": 57 | lr = self.configer.get('solver', 'base_lr') 58 | optimizer = torch.optim.RMSprop(filter(lambda p: p.requires_grad, net.parameters()), lr=lr, 59 | weight_decay=decay) 60 | 61 | elif optim == "SGD": 62 | print("Using SGD") 63 | policy = self.configer.get('solver', 'lr_policy') 64 | 65 | if policy == 'fixed': 66 | lr = self.configer.get('solver', 'base_lr') 67 | 68 | elif policy == 'step': 69 | gamma = self.configer.get('solver', 'gamma') 70 | ratio = gamma ** (iters // self.configer.get('solver', 'step_size')) 71 | lr = self.configer.get('solver', 'base_lr') * ratio 72 | 73 | elif policy == 'exp': 74 | lr = self.configer.get('solver', 'base_lr') * (self.configer.get('solver', 'gamma') ** iters) 75 | 76 | elif policy == 'inv': 77 | power = -self.configer.get('solver', 'power') 78 | ratio = (1 + self.configer.get('solver', 'gamma') * iters) ** power 79 | lr = self.configer.get('solver', 'base_lr') * ratio 80 | 81 | elif policy == 'multistep': 82 | lr = self.configer.get('solver', 'base_lr') 83 | for step_value in self.configer.get('solver', 'stepvalue'): 84 | if iters >= step_value: 85 | lr *= self.configer.get('solver', 'gamma') 86 | else: 87 | break 88 | else: 89 | raise NotImplementedError('Policy:{} is not valid.'.format(policy)) 90 | 91 | optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr = lr, 92 | momentum=self.configer.get('solver', 'momentum'), weight_decay=decay) 93 | 94 | else: 95 | raise NotImplementedError('Optimizer: {} is not valid.'.format(optim)) 96 | 97 | return optimizer, lr 98 | 99 | def load_net(self, net): 100 | """Loading net method. If resume is True load from provided checkpoint, if False load new DataParallel 101 | 102 | Args: 103 | net (torch.nn.Module): Module in use 104 | 105 | Returns: 106 | net (torch.nn.DataParallel): Loaded Network module 107 | iters (int): Loaded current iteration number, 0 if Resume is False 108 | epoch (int): Loaded current epoch number, 0 if Resume is False 109 | optimizer (torch.nn.optimizer): Loaded optimizer state, None if Resume is False 110 | 111 | """ 112 | iters = 0 113 | epoch = 0 114 | optimizer = None 115 | if self.configer.get('resume') is not None: 116 | print('Restoring checkpoint: ', self.configer.get('resume')) 117 | checkpoint_dict = torch.load(self.configer.get('resume')) 118 | # Remove "module." 
from DataParallel, if present 119 | checkpoint_dict['state_dict'] = {k[len('module.'):] if k.startswith('module.') else k: v for k, v in 120 | checkpoint_dict['state_dict'].items()} 121 | net.load_state_dict(checkpoint_dict['state_dict']) 122 | iters = checkpoint_dict['iter'] if 'iter' in checkpoint_dict else 0 123 | optimizer = checkpoint_dict['optimizer'] if 'optimizer' in checkpoint_dict else None 124 | epoch = checkpoint_dict['epoch'] if 'epoch' in checkpoint_dict else None 125 | net = nn.DataParallel(net, device_ids=self.configer.get('gpu')).to(self.device) 126 | return net, iters, epoch, optimizer 127 | 128 | def _save_net(self, net, optimizer, iters, epoch, all=False): 129 | """Saving net state method. 130 | 131 | Args: 132 | net (torch.nn.Module): Module in use 133 | optimizer (torch.nn.optimizer): Optimizer state to save 134 | iters (int): Current iteration number to save 135 | epoch (int): Current epoch number to save 136 | 137 | """ 138 | state = { 139 | 'iter': iters, 140 | 'epoch': epoch, 141 | 'state_dict': net.state_dict(), 142 | 'optimizer': optimizer.state_dict() 143 | } 144 | checkpoints_dir = str(Path(self.configer.get('checkpoints', 'save_dir')) / self.configer.get("dataset")) 145 | if not os.path.exists(checkpoints_dir): 146 | os.makedirs(checkpoints_dir) 147 | if all: 148 | latest_name = '{}_{}.pth'.format(self.configer.get('checkpoints', 'save_name'), epoch) 149 | else: 150 | latest_name = 'best_{}.pth'.format(self.configer.get('checkpoints', 'save_name')) 151 | torch.save(state, os.path.join(checkpoints_dir, latest_name)) 152 | 153 | def save_all(self, accuracy, net, optimizer, iters, epoch): 154 | self._save_net(net, optimizer, iters, epoch, all=True) 155 | return accuracy 156 | 157 | def save_best(self, accuracy, net, optimizer, iters, epoch): 158 | if accuracy > self.best_accuracy: 159 | self.best_accuracy = accuracy 160 | self._save_net(net, optimizer, iters, epoch) 161 | print(accuracy) 162 | return self.best_accuracy 163 | else: 164 | return 0 165 | 166 | def early_stop(self, accuracy, net, optimizer, iters, epoch): 167 | ret = self.save_best(accuracy, net, optimizer, iters, epoch) 168 | if ret > 0: 169 | self.last_improvement = 0 170 | else: 171 | self.last_improvement += 1 172 | if self.last_improvement >= self.configer.get("checkpoints", "early_stop"): 173 | return -1 174 | else: 175 | return ret -------------------------------------------------------------------------------- /src_gestformer/utils/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import csv 4 | import pandas as pd 5 | 6 | from torch.utils.data import DataLoader 7 | import imgaug.augmenters as iaa 8 | 9 | # Import Datasets 10 | from datasets.Briareo import Briareo 11 | from datasets.NVGestures import NVGesture 12 | from models.model_utilizer import ModuleUtilizer 13 | 14 | # Import Model 15 | from models.temporal import GestureTransoformer 16 | 17 | # Import Utils 18 | from tqdm import tqdm 19 | from utils.average_meter import AverageMeter 20 | 21 | # Setting seeds 22 | 23 | from torchstat import stat 24 | def worker_init_fn(worker_id): 25 | np.random.seed(torch.initial_seed() % 2 ** 32) 26 | 27 | class GestureTest(object): 28 | """Gesture Recognition Test class 29 | 30 | Attributes: 31 | configer (Configer): Configer object, contains procedure configuration. 
32 | train_loader (torch.utils.data.DataLoader): Train data loader variable 33 | val_loader (torch.utils.data.DataLoader): Val data loader variable 34 | test_loader (torch.utils.data.DataLoader): Test data loader variable 35 | net (torch.nn.Module): Network used for the current procedure 36 | lr (int): Learning rate value 37 | optimizer (torch.nn.optim.optimizer): Optimizer for training procedure 38 | iters (int): Starting iteration number, not zero if resuming training 39 | epoch (int): Starting epoch number, not zero if resuming training 40 | scheduler (torch.optim.lr_scheduler): Scheduler to utilize during training 41 | 42 | """ 43 | 44 | def __init__(self, configer): 45 | self.configer = configer 46 | 47 | self.data_path = configer.get("data", "data_path") #: str: Path to data directory 48 | 49 | # Train val and test accuracy 50 | self.accuracy = AverageMeter() 51 | 52 | # DataLoaders 53 | self.data_loader = None 54 | 55 | # Module load and save utility 56 | self.device = self.configer.get("device") 57 | self.model_utility = ModuleUtilizer(self.configer) #: Model utility for load, save and update optimizer 58 | self.net = None 59 | 60 | # Training procedure 61 | self.transforms = None 62 | 63 | # Other useful data 64 | self.backbone = self.configer.get("network", "backbone") #: str: Backbone type 65 | self.in_planes = None #: int: Input channels 66 | self.clip_length = self.configer.get("data", "n_frames") #: int: Number of frames per sequence 67 | self.n_classes = self.configer.get("data", "n_classes") #: int: Total number of classes for dataset 68 | self.data_type = self.configer.get("data", "type") #: str: Type of data (rgb, depth, ir, leapmotion) 69 | self.dataset = self.configer.get("dataset").lower() #: str: Type of dataset 70 | self.optical_flow = self.configer.get("data", "optical_flow") 71 | if self.optical_flow is None: 72 | self.optical_flow = True 73 | 74 | def init_model(self): 75 | """Initialize model and other data for procedure""" 76 | 77 | if self.optical_flow is True: 78 | self.in_planes = 2 79 | elif self.data_type in ["depth", "ir"]: 80 | self.in_planes = 1 81 | else: 82 | self.in_planes = 3 83 | 84 | # Selecting correct model and normalization variable based on type variable 85 | self.net = GestureTransoformer(self.backbone, self.in_planes, self.n_classes, 86 | pretrained=self.configer.get("network", "pretrained"), 87 | n_head=self.configer.get("network", "n_head"), 88 | dropout_backbone=self.configer.get("network", "dropout2d"), 89 | dropout_transformer=self.configer.get("network", "dropout1d"), 90 | dff=self.configer.get("network", "ff_size"), 91 | n_module=self.configer.get("network", "n_module") 92 | ) 93 | 94 | self.net, _, _, _ = self.model_utility.load_net(self.net) 95 | 96 | # Selecting Dataset and DataLoader 97 | if self.dataset == "briareo": 98 | Dataset = Briareo 99 | self.transforms = iaa.CenterCropToFixedSize(200, 200) 100 | elif self.dataset == "nvgestures": 101 | Dataset = NVGesture 102 | self.transforms = iaa.CenterCropToFixedSize(256, 192) 103 | else: 104 | raise NotImplementedError(f"Dataset not supported: {self.configer.get('dataset')}") 105 | 106 | # Setting Dataloaders 107 | self.data_loader = DataLoader( 108 | Dataset(self.configer, self.data_path, split="val", data_type=self.data_type, 109 | transforms=self.transforms, n_frames=self.clip_length, 110 | optical_flow=self.optical_flow), 111 | batch_size=1, shuffle=False, drop_last=True, 112 | num_workers=self.configer.get('solver', 'workers'), pin_memory=True, worker_init_fn=worker_init_fn) 113 
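    # The evaluation routine below runs the restored network in eval mode over the
    # "val" split with batch_size=1. For every clip it appends the softmax
    # probabilities as one CSV row to csv/Briareo/rgbop.csv (a hard-coded path that is
    # used regardless of the selected dataset) and accumulates top-1 accuracy as
    # correct / total, printed once the loader is exhausted. Note that open(..., 'a')
    # creates the file but not its parent folders, so the csv/Briareo/ directory must
    # exist before running the test.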
| 114 | def __test(self): 115 | """Testing function.""" 116 | self.net.eval() 117 | # self.net.cuda() 118 | num_params = sum(p.numel() for p in self.net.parameters()) 119 | print(f'Number of parameters in the model: {num_params}') 120 | device = next(self.net.parameters()).device 121 | print("device", device) 122 | 123 | if 'cuda' in str(device): 124 | print("Model is on CUDA") 125 | else: 126 | print("Model is on CPU") 127 | c = 0 128 | tot = 0 129 | with torch.no_grad(): 130 | with open('csv/Briareo/rgbop.csv', 'a', newline='') as csvfile: 131 | for i, data_tuple in enumerate(tqdm(self.data_loader, desc="Test")): 132 | """ 133 | input, gt 134 | """ 135 | # self.device= torch.device('cpu') 136 | 137 | inputs = data_tuple[0].to(self.device) 138 | gt = data_tuple[1].to(self.device) 139 | 140 | output = self.net(inputs) 141 | # with torch.cuda.device(0): 142 | # # print(inputs.shape.device) 143 | # print(stat(self.net, inputs[0].shape)) 144 | #print(output) 145 | #_, predicted = torch.max(output, 1) 146 | #print(predicted) 147 | 148 | sm = torch.nn.Softmax(dim=1) 149 | prob=sm(output) 150 | #print(i) 151 | predicted = torch.argmax(output.detach(), dim=1) 152 | #print(predicted) 153 | arr = prob.cpu().numpy() 154 | # print(arr) 155 | 156 | hist_df = pd.DataFrame(arr) 157 | # print(hist_df) 158 | #log_file.close() 159 | hist_df.to_csv(csvfile, mode='a',header=False, index=False) 160 | 161 | correct = gt.detach().squeeze(dim=1) 162 | #print(gt) 163 | #hist_df = pd.DataFrame(correct) 164 | #hist_df.to_csv(csvfile, mode='a',header=False) 165 | 166 | if predicted == correct: 167 | c += 1 168 | tot += 1 169 | 170 | #print(predicted) 171 | #print(correct) 172 | accuracy = c / tot 173 | #print(tot) 174 | 175 | print("Accuracy: {}".format(accuracy)) 176 | 177 | def test(self): 178 | self.__test() 179 | 180 | 181 | def update_metrics(self, split: str, loss, bs, accuracy=None): 182 | self.losses[split].update(loss, bs) 183 | if accuracy is not None: 184 | self.accuracy[split].update(accuracy, bs) 185 | if split == "train" and self.iters % self.save_iters == 0: 186 | self.tbx_summary.add_scalar('{}_loss'.format(split), self.losses[split].avg, self.iters) 187 | self.tbx_summary.add_scalar('{}_accuracy'.format(split), self.accuracy[split].avg, self.iters) 188 | self.losses[split].reset() 189 | self.accuracy[split].reset() -------------------------------------------------------------------------------- /src_gestformer/models/backbones/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchvision.models.utils import load_state_dict_from_url 4 | 5 | 6 | __all__ = ['ResNet', 'resnet18'] 7 | 8 | 9 | model_urls = { 10 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 11 | } 12 | 13 | 14 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 15 | """3x3 convolution with padding""" 16 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 17 | padding=dilation, groups=groups, bias=False, dilation=dilation) 18 | 19 | 20 | def conv1x1(in_planes, out_planes, stride=1): 21 | """1x1 convolution""" 22 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 23 | 24 | 25 | class BasicBlock(nn.Module): 26 | expansion = 1 27 | 28 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 29 | base_width=64, dilation=1, dropout=0., norm_layer=None): 30 | super(BasicBlock, self).__init__() 31 | if norm_layer is None: 32 | norm_layer 
= nn.BatchNorm2d 33 | if groups != 1 or base_width != 64: 34 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 35 | if dilation > 1: 36 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 37 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1 38 | self.conv1 = conv3x3(inplanes, planes, stride) 39 | self.bn1 = norm_layer(planes) 40 | self.relu = nn.ReLU(inplace=True) 41 | self.conv2 = conv3x3(planes, planes) 42 | self.bn2 = norm_layer(planes) 43 | self.downsample = downsample 44 | self.stride = stride 45 | self.dropout = nn.Dropout2d(p=dropout) 46 | 47 | def forward(self, x): 48 | identity = x 49 | 50 | out = self.conv1(x) 51 | out = self.bn1(out) 52 | out = self.relu(out) 53 | out = self.dropout(out) 54 | 55 | out = self.conv2(out) 56 | out = self.bn2(out) 57 | 58 | if self.downsample is not None: 59 | identity = self.downsample(x) 60 | out = self.dropout(out) 61 | 62 | out += identity 63 | out = self.relu(out) 64 | out = self.dropout(out) 65 | 66 | return out 67 | 68 | 69 | class ResNet(nn.Module): 70 | 71 | def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, 72 | groups=1, width_per_group=64, dropout=0., replace_stride_with_dilation=None, 73 | norm_layer=None): 74 | super(ResNet, self).__init__() 75 | if norm_layer is None: 76 | norm_layer = nn.BatchNorm2d 77 | self._norm_layer = norm_layer 78 | 79 | self.inplanes = 64 80 | self.dilation = 1 81 | self.drop_prob = dropout 82 | if replace_stride_with_dilation is None: 83 | # each element in the tuple indicates if we should replace 84 | # the 2x2 stride with a dilated convolution instead 85 | replace_stride_with_dilation = [False, False, False] 86 | if len(replace_stride_with_dilation) != 3: 87 | raise ValueError("replace_stride_with_dilation should be None " 88 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) 89 | self.groups = groups 90 | self.base_width = width_per_group 91 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) 92 | self.bn1 = norm_layer(self.inplanes) 93 | self.relu = nn.ReLU(inplace=True) 94 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 95 | self.layer1 = self._make_layer(block, 64, layers[0]) 96 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, 97 | dilate=replace_stride_with_dilation[0]) 98 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, 99 | dilate=replace_stride_with_dilation[1]) 100 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, 101 | dilate=replace_stride_with_dilation[2]) 102 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 103 | self.fc = nn.Linear(512 * block.expansion, num_classes) 104 | 105 | for m in self.modules(): 106 | if isinstance(m, nn.Conv2d): 107 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 108 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 109 | nn.init.constant_(m.weight, 1) 110 | nn.init.constant_(m.bias, 0) 111 | 112 | # Zero-initialize the last BN in each residual branch, 113 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
114 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 115 | if zero_init_residual: 116 | for m in self.modules(): 117 | if isinstance(m, Bottleneck): 118 | nn.init.constant_(m.bn3.weight, 0) 119 | elif isinstance(m, BasicBlock): 120 | nn.init.constant_(m.bn2.weight, 0) 121 | 122 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False): 123 | norm_layer = self._norm_layer 124 | downsample = None 125 | previous_dilation = self.dilation 126 | if dilate: 127 | self.dilation *= stride 128 | stride = 1 129 | if stride != 1 or self.inplanes != planes * block.expansion: 130 | downsample = nn.Sequential( 131 | conv1x1(self.inplanes, planes * block.expansion, stride), 132 | norm_layer(planes * block.expansion), 133 | ) 134 | 135 | layers = [] 136 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups, 137 | self.base_width, previous_dilation, self.drop_prob, norm_layer)) 138 | self.inplanes = planes * block.expansion 139 | for _ in range(1, blocks): 140 | layers.append(block(self.inplanes, planes, groups=self.groups, 141 | base_width=self.base_width, dilation=self.dilation, 142 | dropout=self.drop_prob, norm_layer=norm_layer)) 143 | 144 | return nn.Sequential(*layers) 145 | 146 | def _forward_impl(self, x): 147 | # See note [TorchScript super()] 148 | x = self.conv1(x) 149 | x = self.bn1(x) 150 | x = self.relu(x) 151 | x = self.maxpool(x) 152 | 153 | x1 = self.layer1(x) 154 | x2 = self.layer2(x1) 155 | x3 = self.layer3(x2) 156 | x4 = self.layer4(x3) 157 | 158 | x = self.avgpool(x4) 159 | x = torch.flatten(x, 1) 160 | # print(x.shape) 161 | # print(x1.shape) 162 | # print(x2.shape) 163 | # print(x3.shape) 164 | # print(x4.shape) 165 | 166 | return x 167 | 168 | def forward(self, x): 169 | return self._forward_impl(x) 170 | 171 | 172 | def _resnet(arch, block, layers, pretrained, progress, in_planes, **kwargs): 173 | model = ResNet(block, layers, **kwargs) 174 | if pretrained: 175 | state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) 176 | model.load_state_dict(state_dict) 177 | if in_planes in [1, 2]: 178 | w = model._modules['conv1']._parameters['weight'].data 179 | model.conv1 = nn.Conv2d(in_planes, 64, kernel_size=7, stride=2, padding=3, bias=False) 180 | if in_planes == 1: 181 | model.conv1._parameters['weight'].data = w.mean(dim=1, keepdim=True) 182 | else: 183 | model.conv1._parameters['weight'].data = w[:, :-1] * 1.5 184 | else: 185 | model.conv1 = nn.Conv2d(in_planes, 64, kernel_size=7, stride=2, padding=3, bias=False) 186 | model.fc = None 187 | return model 188 | 189 | 190 | def resnet18(pretrained=False, in_planes: int=3, **kwargs): 191 | r"""ResNet-18 model from 192 | `"Deep Residual Learning for Image Recognition" `_ 193 | 194 | Args: 195 | pretrained (bool): If True, returns a model pre-trained on ImageNet 196 | progress (bool): If True, displays a progress bar of the download to stderr 197 | """ 198 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, True, in_planes, 199 | **kwargs) -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/utils_briareo.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # full 675, full_no_fingers 145, mod 192 4 | def from_json_to_list(json_file): 5 | with open(json_file) as f: 6 | j = json.load(f) 7 | if j['frame'] != 'invalid': 8 | j_vector = [ 9 | # palm 10 | j['frame']['right_hand']['palm_position'][0], 11 | 
j['frame']['right_hand']['palm_position'][1], 12 | j['frame']['right_hand']['palm_position'][2], 13 | j['frame']['right_hand']['palm_position'][3], 14 | j['frame']['right_hand']['palm_position'][4], 15 | j['frame']['right_hand']['palm_position'][5], 16 | j['frame']['right_hand']['palm_normal'][0], 17 | j['frame']['right_hand']['palm_normal'][1], 18 | j['frame']['right_hand']['palm_normal'][2], 19 | j['frame']['right_hand']['palm_normal'][3], 20 | j['frame']['right_hand']['palm_normal'][4], 21 | j['frame']['right_hand']['palm_normal'][5], 22 | j['frame']['right_hand']['palm_velocity'][0], 23 | j['frame']['right_hand']['palm_velocity'][1], 24 | j['frame']['right_hand']['palm_velocity'][2], 25 | j['frame']['right_hand']['palm_velocity'][3], 26 | j['frame']['right_hand']['palm_velocity'][4], 27 | j['frame']['right_hand']['palm_velocity'][5], 28 | j['frame']['right_hand']['palm_width'], 29 | j['frame']['right_hand']['pinch_strength'], 30 | j['frame']['right_hand']['grab_strength'], 31 | j['frame']['right_hand']['direction'][0], 32 | j['frame']['right_hand']['direction'][1], 33 | j['frame']['right_hand']['direction'][2], 34 | j['frame']['right_hand']['direction'][3], 35 | j['frame']['right_hand']['direction'][4], 36 | j['frame']['right_hand']['direction'][5], 37 | j['frame']['right_hand']['sphere_center'][0], 38 | j['frame']['right_hand']['sphere_center'][1], 39 | j['frame']['right_hand']['sphere_center'][2], 40 | j['frame']['right_hand']['sphere_center'][3], 41 | j['frame']['right_hand']['sphere_center'][4], 42 | j['frame']['right_hand']['sphere_center'][5], 43 | j['frame']['right_hand']['sphere_radius'], 44 | # wrist 45 | j['frame']['right_hand']['wrist_position'][0], 46 | j['frame']['right_hand']['wrist_position'][1], 47 | j['frame']['right_hand']['wrist_position'][2], 48 | j['frame']['right_hand']['wrist_position'][3], 49 | j['frame']['right_hand']['wrist_position'][4], 50 | j['frame']['right_hand']['wrist_position'][5], 51 | # pointables 52 | j['frame']['right_hand']['pointables']['p_0']['tip_position'][0], 53 | j['frame']['right_hand']['pointables']['p_0']['tip_position'][1], 54 | j['frame']['right_hand']['pointables']['p_0']['tip_position'][2], 55 | j['frame']['right_hand']['pointables']['p_0']['tip_position'][3], 56 | j['frame']['right_hand']['pointables']['p_0']['tip_position'][4], 57 | j['frame']['right_hand']['pointables']['p_0']['tip_position'][5], 58 | j['frame']['right_hand']['pointables']['p_0']['tip_velocity'][0], 59 | j['frame']['right_hand']['pointables']['p_0']['tip_velocity'][1], 60 | j['frame']['right_hand']['pointables']['p_0']['tip_velocity'][2], 61 | j['frame']['right_hand']['pointables']['p_0']['tip_velocity'][3], 62 | j['frame']['right_hand']['pointables']['p_0']['tip_velocity'][4], 63 | j['frame']['right_hand']['pointables']['p_0']['tip_velocity'][5], 64 | j['frame']['right_hand']['pointables']['p_0']['direction'][0], 65 | j['frame']['right_hand']['pointables']['p_0']['direction'][1], 66 | j['frame']['right_hand']['pointables']['p_0']['direction'][2], 67 | j['frame']['right_hand']['pointables']['p_0']['direction'][3], 68 | j['frame']['right_hand']['pointables']['p_0']['direction'][4], 69 | j['frame']['right_hand']['pointables']['p_0']['direction'][5], 70 | j['frame']['right_hand']['pointables']['p_0']['width'], 71 | j['frame']['right_hand']['pointables']['p_0']['length'], 72 | float(j['frame']['right_hand']['pointables']['p_0']['is_extended']), 73 | j['frame']['right_hand']['pointables']['p_1']['tip_position'][0], 74 | 
j['frame']['right_hand']['pointables']['p_1']['tip_position'][1], 75 | j['frame']['right_hand']['pointables']['p_1']['tip_position'][2], 76 | j['frame']['right_hand']['pointables']['p_1']['tip_position'][3], 77 | j['frame']['right_hand']['pointables']['p_1']['tip_position'][4], 78 | j['frame']['right_hand']['pointables']['p_1']['tip_position'][5], 79 | j['frame']['right_hand']['pointables']['p_1']['tip_velocity'][0], 80 | j['frame']['right_hand']['pointables']['p_1']['tip_velocity'][1], 81 | j['frame']['right_hand']['pointables']['p_1']['tip_velocity'][2], 82 | j['frame']['right_hand']['pointables']['p_1']['tip_velocity'][3], 83 | j['frame']['right_hand']['pointables']['p_1']['tip_velocity'][4], 84 | j['frame']['right_hand']['pointables']['p_1']['tip_velocity'][5], 85 | j['frame']['right_hand']['pointables']['p_1']['direction'][0], 86 | j['frame']['right_hand']['pointables']['p_1']['direction'][1], 87 | j['frame']['right_hand']['pointables']['p_1']['direction'][2], 88 | j['frame']['right_hand']['pointables']['p_1']['direction'][3], 89 | j['frame']['right_hand']['pointables']['p_1']['direction'][4], 90 | j['frame']['right_hand']['pointables']['p_1']['direction'][5], 91 | j['frame']['right_hand']['pointables']['p_1']['width'], 92 | j['frame']['right_hand']['pointables']['p_1']['length'], 93 | float(j['frame']['right_hand']['pointables']['p_1']['is_extended']), 94 | j['frame']['right_hand']['pointables']['p_2']['tip_position'][0], 95 | j['frame']['right_hand']['pointables']['p_2']['tip_position'][1], 96 | j['frame']['right_hand']['pointables']['p_2']['tip_position'][2], 97 | j['frame']['right_hand']['pointables']['p_2']['tip_position'][3], 98 | j['frame']['right_hand']['pointables']['p_2']['tip_position'][4], 99 | j['frame']['right_hand']['pointables']['p_2']['tip_position'][5], 100 | j['frame']['right_hand']['pointables']['p_2']['tip_velocity'][0], 101 | j['frame']['right_hand']['pointables']['p_2']['tip_velocity'][1], 102 | j['frame']['right_hand']['pointables']['p_2']['tip_velocity'][2], 103 | j['frame']['right_hand']['pointables']['p_2']['tip_velocity'][3], 104 | j['frame']['right_hand']['pointables']['p_2']['tip_velocity'][4], 105 | j['frame']['right_hand']['pointables']['p_2']['tip_velocity'][5], 106 | j['frame']['right_hand']['pointables']['p_2']['direction'][0], 107 | j['frame']['right_hand']['pointables']['p_2']['direction'][1], 108 | j['frame']['right_hand']['pointables']['p_2']['direction'][2], 109 | j['frame']['right_hand']['pointables']['p_2']['direction'][3], 110 | j['frame']['right_hand']['pointables']['p_2']['direction'][4], 111 | j['frame']['right_hand']['pointables']['p_2']['direction'][5], 112 | j['frame']['right_hand']['pointables']['p_2']['width'], 113 | j['frame']['right_hand']['pointables']['p_2']['length'], 114 | float(j['frame']['right_hand']['pointables']['p_2']['is_extended']), 115 | j['frame']['right_hand']['pointables']['p_3']['tip_position'][0], 116 | j['frame']['right_hand']['pointables']['p_3']['tip_position'][1], 117 | j['frame']['right_hand']['pointables']['p_3']['tip_position'][2], 118 | j['frame']['right_hand']['pointables']['p_3']['tip_position'][3], 119 | j['frame']['right_hand']['pointables']['p_3']['tip_position'][4], 120 | j['frame']['right_hand']['pointables']['p_3']['tip_position'][5], 121 | j['frame']['right_hand']['pointables']['p_3']['tip_velocity'][0], 122 | j['frame']['right_hand']['pointables']['p_3']['tip_velocity'][1], 123 | j['frame']['right_hand']['pointables']['p_3']['tip_velocity'][2], 124 | 
j['frame']['right_hand']['pointables']['p_3']['tip_velocity'][3], 125 | j['frame']['right_hand']['pointables']['p_3']['tip_velocity'][4], 126 | j['frame']['right_hand']['pointables']['p_3']['tip_velocity'][5], 127 | j['frame']['right_hand']['pointables']['p_3']['direction'][0], 128 | j['frame']['right_hand']['pointables']['p_3']['direction'][1], 129 | j['frame']['right_hand']['pointables']['p_3']['direction'][2], 130 | j['frame']['right_hand']['pointables']['p_3']['direction'][3], 131 | j['frame']['right_hand']['pointables']['p_3']['direction'][4], 132 | j['frame']['right_hand']['pointables']['p_3']['direction'][5], 133 | j['frame']['right_hand']['pointables']['p_3']['width'], 134 | j['frame']['right_hand']['pointables']['p_3']['length'], 135 | float(j['frame']['right_hand']['pointables']['p_3']['is_extended']), 136 | j['frame']['right_hand']['pointables']['p_4']['tip_position'][0], 137 | j['frame']['right_hand']['pointables']['p_4']['tip_position'][1], 138 | j['frame']['right_hand']['pointables']['p_4']['tip_position'][2], 139 | j['frame']['right_hand']['pointables']['p_4']['tip_position'][3], 140 | j['frame']['right_hand']['pointables']['p_4']['tip_position'][4], 141 | j['frame']['right_hand']['pointables']['p_4']['tip_position'][5], 142 | j['frame']['right_hand']['pointables']['p_4']['tip_velocity'][0], 143 | j['frame']['right_hand']['pointables']['p_4']['tip_velocity'][1], 144 | j['frame']['right_hand']['pointables']['p_4']['tip_velocity'][2], 145 | j['frame']['right_hand']['pointables']['p_4']['tip_velocity'][3], 146 | j['frame']['right_hand']['pointables']['p_4']['tip_velocity'][4], 147 | j['frame']['right_hand']['pointables']['p_4']['tip_velocity'][5], 148 | j['frame']['right_hand']['pointables']['p_4']['direction'][0], 149 | j['frame']['right_hand']['pointables']['p_4']['direction'][1], 150 | j['frame']['right_hand']['pointables']['p_4']['direction'][2], 151 | j['frame']['right_hand']['pointables']['p_4']['direction'][3], 152 | j['frame']['right_hand']['pointables']['p_4']['direction'][4], 153 | j['frame']['right_hand']['pointables']['p_4']['direction'][5], 154 | j['frame']['right_hand']['pointables']['p_4']['width'], 155 | j['frame']['right_hand']['pointables']['p_4']['length'], 156 | float(j['frame']['right_hand']['pointables']['p_4']['is_extended']), 157 | ] 158 | else: 159 | j_vector = False 160 | 161 | return j_vector, j -------------------------------------------------------------------------------- /src_gestformer/models/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from einops import rearrange, repeat 5 | import torch.nn.functional as F 6 | from pytorch_wavelets import DWTForward, DWTInverse 7 | 8 | def position_embedding(input, d_model): 9 | input = input.view(-1, 1) 10 | dim = torch.arange(d_model // 2, dtype=torch.float32, device=input.device).view(1, -1) 11 | sin = torch.sin(input / 10000 ** (2 * dim / d_model)) 12 | cos = torch.cos(input / 10000 ** (2 * dim / d_model)) 13 | 14 | out = torch.zeros((input.shape[0], d_model), device=input.device) 15 | out[:, ::2] = sin 16 | out[:, 1::2] = cos 17 | return out 18 | 19 | def sinusoid_encoding_table(max_len, d_model): 20 | pos = torch.arange(max_len, dtype=torch.float32) 21 | out = position_embedding(pos, d_model) 22 | return out 23 | 24 | class ScaledDotProductAttention(nn.Module): 25 | """ 26 | Scaled dot-product attention 27 | """ 28 | 29 | def __init__(self, d_model, d_k, d_v, h): 30 | """ 31 | :param 
d_model: Output dimensionality of the model 32 | :param d_k: Dimensionality of queries and keys 33 | :param d_v: Dimensionality of values 34 | :param h: Number of heads 35 | """ 36 | super(ScaledDotProductAttention, self).__init__() 37 | self.fc_q = nn.Linear(d_model, h * d_k) 38 | self.fc_k = nn.Linear(d_model, h * d_k) 39 | self.fc_v = nn.Linear(d_model, h * d_v) 40 | self.fc_o = nn.Linear(h * d_v, d_model) 41 | 42 | self.d_model = d_model 43 | self.d_k = d_k 44 | self.d_v = d_v 45 | self.h = h 46 | 47 | self.init_weights(gain=1.0) 48 | 49 | def init_weights(self, gain=1.0): 50 | nn.init.xavier_normal_(self.fc_q.weight, gain=gain) 51 | nn.init.xavier_normal_(self.fc_k.weight, gain=gain) 52 | nn.init.xavier_normal_(self.fc_v.weight, gain=gain) 53 | nn.init.xavier_normal_(self.fc_o.weight, gain=gain) 54 | nn.init.constant_(self.fc_q.bias, 0) 55 | nn.init.constant_(self.fc_k.bias, 0) 56 | nn.init.constant_(self.fc_v.bias, 0) 57 | nn.init.constant_(self.fc_o.bias, 0) 58 | 59 | def forward(self, queries, keys, values): 60 | """ 61 | Computes 62 | :param queries: Queries (b_s, nq, d_model) 63 | :param keys: Keys (b_s, nk, d_model) 64 | :param values: Values (b_s, nk, d_model) 65 | :return: 66 | """ 67 | b_s, nq = queries.shape[:2] 68 | nk = keys.shape[1] 69 | q = self.fc_q(queries).view(b_s, nq, self.h, self.d_k).permute(0, 2, 1, 3) # (b_s, h, nq, d_k) 70 | k = self.fc_k(keys).view(b_s, nk, self.h, self.d_k).permute(0, 2, 3, 1) # (b_s, h, d_k, nk) 71 | v = self.fc_v(values).view(b_s, nk, self.h, self.d_v).permute(0, 2, 1, 3) # (b_s, h, nk, d_v) 72 | 73 | att = torch.matmul(q, k) / np.sqrt(self.d_k) # (b_s, h, nq, nk) 74 | 75 | att = torch.softmax(att, -1) 76 | 77 | out = torch.matmul(att, v).permute(0, 2, 1, 3).contiguous().view(b_s, nq, self.h * self.d_v) # (b_s, nq, h*d_v) 78 | out = self.fc_o(out) # (b_s, nq, d_model) 79 | return out 80 | 81 | class MultiHeadAttention(nn.Module): 82 | """ 83 | Multi-head attention layer with Dropout and Layer Normalization. 
84 | """ 85 | 86 | def __init__(self, d_model, d_k, d_v, h, dff=2048, dropout=.1): 87 | super(MultiHeadAttention, self).__init__() 88 | 89 | # self.attention = ScaledDotProductAttention(d_model=d_model, d_k=d_k, d_v=d_v, h=h) 90 | self.dropout = nn.Dropout(p=dropout) 91 | self.layer_norm = nn.LayerNorm(d_model) 92 | # self.layer_norm2 = nn.LayerNorm(d_model) 93 | 94 | self.fc = nn.Sequential(*[nn.Linear(d_model, dff), nn.ReLU(inplace=True), nn.Dropout(p=dropout), 95 | nn.Linear(dff, d_model)]) 96 | 97 | def forward(self, queries, keys, values): 98 | att = self.attention(queries, keys, values) 99 | att = self.dropout(att) 100 | # att = self.layer_norm(queries + att) 101 | att = self.fc(att) 102 | att = self.dropout(att) 103 | return self.layer_norm(queries + att) 104 | 105 | class ScaledDotProductAttention_(nn.Module): 106 | """ 107 | Scaled dot-product attention 108 | """ 109 | 110 | def __init__(self, d_model, d_k, d_v, h): 111 | """ 112 | :param d_model: Output dimensionality of the model 113 | :param d_k: Dimensionality of queries and keys 114 | :param d_v: Dimensionality of values 115 | :param h: Number of heads 116 | """ 117 | super(ScaledDotProductAttention_, self).__init__() 118 | # print(d_model) 119 | self.fc_q = nn.Linear(d_model, h * d_k) 120 | self.fc_k = nn.Linear(d_model, h * d_k) 121 | self.fc_v = nn.Linear(d_model, h * d_v) 122 | self.fc_o = nn.Linear(h * d_v, d_model) 123 | 124 | self.d_model = d_model 125 | self.d_k = d_k 126 | self.d_v = d_v 127 | self.h = h 128 | 129 | 130 | self.init_weights(gain=1.0) 131 | 132 | def init_weights(self, gain=1.0): 133 | nn.init.xavier_normal_(self.fc_q.weight, gain=gain) 134 | nn.init.xavier_normal_(self.fc_k.weight, gain=gain) 135 | nn.init.xavier_normal_(self.fc_v.weight, gain=gain) 136 | nn.init.xavier_normal_(self.fc_o.weight, gain=gain) 137 | nn.init.constant_(self.fc_q.bias, 0) 138 | nn.init.constant_(self.fc_k.bias, 0) 139 | nn.init.constant_(self.fc_v.bias, 0) 140 | nn.init.constant_(self.fc_o.bias, 0) 141 | 142 | def forward(self, queries, keys, values): 143 | """ 144 | Computes 145 | :param queries: Queries (b_s, nq, d_model) 146 | :param keys: Keys (b_s, nk, d_model) 147 | :param values: Values (b_s, nk, d_model) 148 | :return: 149 | """ 150 | b_s, nq = queries.shape[:2] 151 | # nk = keys.shape[1] 152 | # print(queries.shape) 153 | # print(b_s) 154 | q = self.fc_q(queries).view(b_s, self.h, self.d_k).permute(0, 1, 2) # (b_s, h, nq, d_k) 155 | k = self.fc_k(keys).view(b_s, self.h, self.d_k).permute(0, 2,1) # (b_s, h, d_k, nk) 156 | v = self.fc_v(values).view(b_s, self.h, self.d_v).permute(0, 1, 2) # (b_s, h, nk, d_v) 157 | 158 | att = torch.matmul(q, k) / np.sqrt(self.d_k) # (b_s, h, nq, nk) 159 | 160 | att = torch.softmax(att, -1) 161 | 162 | 163 | out = torch.matmul(att, v).permute(0, 1,2).contiguous().view(b_s, self.h * self.d_v) # (b_s, nq, h*d_v) 164 | out = self.fc_o(out) # (b_s, nq, d_model) 165 | return out 166 | 167 | class Pooling(nn.Module): 168 | """ 169 | Implementation of pooling for PoolFormer 170 | --pool_size: pooling size 171 | """ 172 | def __init__(self, pool_size=3): 173 | super().__init__() 174 | self.pool = nn.AvgPool1d( 175 | pool_size, stride=1, padding=pool_size//2, count_include_pad=False) 176 | 177 | def forward(self, x): 178 | return self.pool(x) 179 | 180 | class SSL(nn.Module): 181 | def __init__(self, channels): 182 | super(SSL, self).__init__() 183 | 184 | self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels, bias=False)#conv_block_my(channels, channels, 
kernel_size = 3, stride=1, padding = 1, dilation=1) 185 | self.conv5 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels, bias=False)#conv_block_my(channels, channels, kernel_size = 3, stride=1, padding = 5, dilation=5) 186 | self.conv7 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels, bias=False)#conv_block_my(channels, channels, kernel_size = 3, stride=1, padding = 7, dilation=7) 187 | self.conv9 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels, bias=False)#conv_block_my(channels, channels, kernel_size = 3, stride=1, padding = 9, dilation=9) 188 | 189 | self.conv_cat = nn.Conv2d(channels*4, channels, kernel_size=3, padding=1, groups=channels, bias=False)#conv_block_my(channels*4, channels, kernel_size = 3, stride = 1, padding = 1, dilation=1) 190 | 191 | def forward(self, x): 192 | 193 | aa = DWTForward(J=1, mode='zero', wave='db3').cuda(device=0) 194 | yl, yh = aa(x) 195 | 196 | yh_out = yh[0] 197 | ylh = yh_out[:,:,0,:,:] 198 | yhl = yh_out[:,:,1,:,:] 199 | yhh = yh_out[:,:,2,:,:] 200 | 201 | conv_rec1 = self.conv5(yl) 202 | conv_rec5 = self.conv5(ylh) 203 | conv_rec7 = self.conv7(yhl) 204 | conv_rec9 = self.conv9(yhh) 205 | 206 | cat_all = torch.stack((conv_rec5, conv_rec7, conv_rec9),dim=2) 207 | rec_yh = [] 208 | rec_yh.append(cat_all) 209 | 210 | 211 | ifm = DWTInverse(wave='db3', mode='zero').cuda(device=0) 212 | Y = ifm((conv_rec1, rec_yh)) 213 | 214 | return Y 215 | 216 | class MultiHeadAttention_(nn.Module): 217 | """ 218 | Multi-head attention layer with Dropout and Layer Normalization. 219 | """ 220 | 221 | def __init__(self, d_model, d_k, d_v, h, dff=2048, dropout=.1): 222 | super(MultiHeadAttention_, self).__init__() 223 | 224 | # self.attention = ScaledDotProductAttention_(d_model=d_model, d_k=d_k, d_v=d_v, h=h) 225 | self.s = SSL(1) 226 | # self.token_mixer = Pooling(pool_size=3) 227 | self.dropout = nn.Dropout(p=dropout) 228 | self.layer_norm = nn.LayerNorm(d_model) 229 | # self.layer_norm2 = nn.LayerNorm(d_model) 230 | self.token_mixer1 = Pooling(pool_size=3) 231 | self.token_mixer2 = Pooling(pool_size=5) 232 | self.token_mixer3 = Pooling(pool_size=7) 233 | 234 | hidden_features = int(d_model*2.66) 235 | 236 | self.project_in = nn.Conv1d(d_model, hidden_features*2, kernel_size=1) 237 | 238 | self.dwconv = nn.Conv1d(hidden_features*2, hidden_features*2, kernel_size=3, stride=1, padding=1, groups=hidden_features*2) 239 | 240 | self.project_out = nn.Conv1d(hidden_features, d_model, kernel_size=1) 241 | 242 | 243 | # self.fc = nn.Sequential(*[nn.Linear(d_model, dff), nn.ReLU(inplace=True), nn.Dropout(p=dropout), 244 | # nn.Linear(dff, d_model)]) 245 | 246 | # def forward(self, queries, keys, values): 247 | def forward(self, x): 248 | x = x.unsqueeze(1) 249 | x = self.s(x) 250 | # print(x.shape) 251 | x = x.squeeze(1) 252 | att1 = self.token_mixer1(x) 253 | att2 = self.token_mixer2(x) 254 | att3 = self.token_mixer3(x) 255 | 256 | att =( att1 +att2+ att3 )/3 257 | # att = self.token_mixer(x) 258 | # print(att.shape) 259 | # att = self.attention(queries, keys, values) 260 | att = self.dropout(att) 261 | # att = self.layer_norm(queries + att) 262 | g = self.project_in(att.permute(0, 2, 1)) 263 | x1, x2 = self.dwconv(g).chunk(2, dim=1) 264 | g = F.gelu(x1) * x2 265 | g = self.project_out(g) 266 | 267 | att = self.dropout(g.permute(0, 2, 1)) 268 | 269 | return self.layer_norm( x+att) 270 | 271 | class EncoderSelfAttention(nn.Module): 272 | def __init__(self, d_model, d_k, d_v, n_head, dff=2048, 
dropout_transformer=.1, n_module=6): 273 | super(EncoderSelfAttention, self).__init__() 274 | # self.Spatial_pos_embed = nn.Parameter(torch.zeros(1, num_joints, embed_dim_ratio)) 275 | self.Spatial_embedding= nn.Linear(d_model, d_model) 276 | 277 | # self.spatial_encoder = nn.ModuleList([MultiHeadAttention_(d_model*4, d_k*4, d_v*4, n_head, dff, dropout_transformer) 278 | # for _ in range(n_module)]) 279 | # self.weighted_mean = torch.nn.Conv1d(in_channels=d_model*4, out_channels=d_model, kernel_size=1) 280 | 281 | self.encoder = nn.ModuleList([MultiHeadAttention_(d_model, d_k, d_v, n_head, dff, dropout_transformer) 282 | for _ in range(n_module)]) 283 | 284 | 285 | def forward(self, x): 286 | x = self.Spatial_embedding(x) 287 | in_encoder = x + sinusoid_encoding_table(x.shape[1], x.shape[2]).expand(x.shape).cuda(device=0) 288 | for l in self.encoder: 289 | # in_encoder = l(in_encoder, in_encoder, in_encoder) 290 | in_encoder = l(in_encoder) 291 | # print(in_encoder.shape) # 8,40,512 292 | return in_encoder 293 | -------------------------------------------------------------------------------- /src_gestformer/train.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | from torch.utils.data import DataLoader 8 | import imgaug.augmenters as iaa 9 | 10 | # Import Datasets 11 | from datasets.Briareo import Briareo 12 | from datasets.NVGestures import NVGesture 13 | from models.model_utilizer import ModuleUtilizer 14 | 15 | # Import Model 16 | from models.temporal import GestureTransoformer 17 | from torch.optim.lr_scheduler import MultiStepLR 18 | 19 | # Import loss 20 | 21 | # Import Utils 22 | from tqdm import tqdm 23 | from utils.average_meter import AverageMeter 24 | from tensorboardX import SummaryWriter 25 | 26 | # Setting seeds 27 | def worker_init_fn(worker_id): 28 | np.random.seed(torch.initial_seed() % 2 ** 32) 29 | 30 | class GestureTrainer(object): 31 | """Gesture Recognition Train class 32 | 33 | Attributes: 34 | configer (Configer): Configer object, contains procedure configuration. 
35 | train_loader (torch.utils.data.DataLoader): Train data loader variable 36 | val_loader (torch.utils.data.DataLoader): Val data loader variable 37 | test_loader (torch.utils.data.DataLoader): Test data loader variable 38 | net (torch.nn.Module): Network used for the current procedure 39 | lr (int): Learning rate value 40 | optimizer (torch.nn.optim.optimizer): Optimizer for training procedure 41 | iters (int): Starting iteration number, not zero if resuming training 42 | epoch (int): Starting epoch number, not zero if resuming training 43 | scheduler (torch.optim.lr_scheduler): Scheduler to utilize during training 44 | 45 | """ 46 | 47 | def __init__(self, configer): 48 | self.configer = configer 49 | 50 | self.data_path = configer.get("data", "data_path") #: str: Path to data directory 51 | 52 | # Losses 53 | self.losses = { 54 | 'train': AverageMeter(), #: Train loss avg meter 55 | 'val': AverageMeter(), #: Val loss avg meter 56 | 'test': AverageMeter() #: Test loss avg meter 57 | } 58 | 59 | # Train val and test accuracy 60 | self.accuracy = { 61 | 'train': AverageMeter(), #: Train accuracy avg meter 62 | 'val': AverageMeter(), #: Val accuracy avg meter 63 | 'test': AverageMeter() #: Test accuracy avg meter 64 | } 65 | 66 | # DataLoaders 67 | self.train_loader = None 68 | self.val_loader = None 69 | self.test_loader = None 70 | 71 | # Module load and save utility 72 | self.device = self.configer.get("device") 73 | self.model_utility = ModuleUtilizer(self.configer) #: Model utility for load, save and update optimizer 74 | self.net = None 75 | self.lr = None 76 | 77 | # Training procedure 78 | self.optimizer = None 79 | self.iters = None 80 | self.epoch = 0 81 | self.train_transforms = None 82 | self.val_transforms = None 83 | self.loss = None 84 | 85 | # Tensorboard and Metrics 86 | self.tbx_summary = SummaryWriter(str(Path(configer.get('checkpoints', 'tb_path')) #: Summary Writer plot 87 | / configer.get("dataset") #: data with TensorboardX 88 | / configer.get('checkpoints', 'save_name'))) 89 | self.tbx_summary.add_text('parameters', str(self.configer).replace("\n", "\n\n")) 90 | self.save_iters = self.configer.get('checkpoints', 'save_iters') #: int: Saving ratio 91 | 92 | # Other useful data 93 | self.backbone = self.configer.get("network", "backbone") #: str: Backbone type 94 | self.in_planes = None #: int: Input channels 95 | self.clip_length = self.configer.get("data", "n_frames") #: int: Number of frames per sequence 96 | self.n_classes = self.configer.get("data", "n_classes") #: int: Total number of classes for dataset 97 | self.data_type = self.configer.get("data", "type") #: str: Type of data (rgb, depth, ir, leapmotion) 98 | self.dataset = self.configer.get("dataset").lower() #: str: Type of dataset 99 | self.optical_flow = self.configer.get("data", "optical_flow") 100 | if self.optical_flow is None: 101 | self.optical_flow = True 102 | self.scheduler = None 103 | 104 | def init_model(self): 105 | """Initialize model and other data for procedure""" 106 | 107 | if self.optical_flow is True: 108 | self.in_planes = 2 109 | elif self.data_type in ["depth", "ir"]: 110 | self.in_planes = 1 111 | else: 112 | self.in_planes = 3 113 | 114 | self.loss = nn.CrossEntropyLoss().to(self.device) 115 | 116 | # Selecting correct model and normalization variable based on type variable 117 | self.net = GestureTransoformer(self.backbone, self.in_planes, self.n_classes, 118 | pretrained=self.configer.get("network", "pretrained"), 119 | n_head=self.configer.get("network", "n_head"), 120 | 
dropout_backbone=self.configer.get("network", "dropout2d"), 121 | dropout_transformer=self.configer.get("network", "dropout1d"), 122 | dff=self.configer.get("network", "ff_size"), 123 | n_module=self.configer.get("network", "n_module") 124 | ) 125 | 126 | # Initializing training 127 | self.iters = 0 128 | self.epoch = None 129 | phase = self.configer.get('phase') 130 | 131 | # Starting or resuming procedure 132 | if phase == 'train': 133 | self.net, self.iters, self.epoch, optim_dict = self.model_utility.load_net(self.net) 134 | else: 135 | raise ValueError('Phase: {} is not valid.'.format(phase)) 136 | 137 | if self.epoch is None: 138 | self.epoch = 0 139 | 140 | # ToDo Restore optimizer and scheduler from checkpoint 141 | self.optimizer, self.lr = self.model_utility.update_optimizer(self.net, self.iters) 142 | self.scheduler = MultiStepLR(self.optimizer, self.configer["solver", "decay_steps"], gamma=0.1) 143 | 144 | # Resuming training, restoring optimizer value 145 | if optim_dict is not None: 146 | print("Resuming training from epoch {}.".format(self.epoch)) 147 | self.optimizer.load_state_dict(optim_dict) 148 | 149 | # Selecting Dataset and DataLoader 150 | if self.dataset == "briareo": 151 | Dataset = Briareo 152 | self.train_transforms = iaa.Sequential([ 153 | iaa.Resize((0.85, 1.15)), 154 | iaa.CropToFixedSize(width=190, height=190), 155 | iaa.Rotate((-15, 15)) 156 | ]) 157 | self.val_transforms = iaa.CenterCropToFixedSize(200, 200) 158 | 159 | elif self.dataset == "nvgestures": 160 | Dataset = NVGesture 161 | self.train_transforms = iaa.Sequential([ 162 | iaa.Resize((0.8, 1.2)), 163 | iaa.CropToFixedSize(width=256, height=192), 164 | iaa.Rotate((-15, 15)) 165 | ]) 166 | self.val_transforms = iaa.CenterCropToFixedSize(256, 192) 167 | else: 168 | raise NotImplementedError(f"Dataset not supported: {self.configer.get('dataset')}") 169 | 170 | # Setting Dataloaders 171 | self.train_loader = DataLoader( 172 | Dataset(self.configer, self.data_path, split="train", data_type=self.data_type, 173 | transforms=self.train_transforms, n_frames=self.clip_length, optical_flow=self.optical_flow), 174 | batch_size=self.configer.get('data', 'batch_size'), shuffle=True, drop_last=True, 175 | num_workers=self.configer.get('solver', 'workers'), pin_memory=True, worker_init_fn=worker_init_fn) 176 | self.val_loader = DataLoader( 177 | Dataset(self.configer, self.data_path, split="val", data_type=self.data_type, 178 | transforms=self.val_transforms, n_frames=self.clip_length, optical_flow=self.optical_flow), 179 | batch_size=self.configer.get('data', 'batch_size'), shuffle=False, drop_last=True, 180 | num_workers=self.configer.get('solver', 'workers'), pin_memory=True, worker_init_fn=worker_init_fn) 181 | if self.dataset == "nvgestures": 182 | self.test_loader = None 183 | else: 184 | self.test_loader = DataLoader( 185 | Dataset(self.configer, self.data_path, split="test", data_type=self.data_type, 186 | transforms=self.val_transforms, n_frames=self.clip_length, optical_flow=self.optical_flow), 187 | batch_size=1, shuffle=False, drop_last=True, 188 | num_workers=self.configer.get('solver', 'workers'), pin_memory=True, worker_init_fn=worker_init_fn) 189 | 190 | def __train(self): 191 | """Train function for every epoch.""" 192 | 193 | self.net.train() 194 | for data_tuple in tqdm(self.train_loader, desc="Train"): 195 | """ 196 | input, gt 197 | """ 198 | inputs = data_tuple[0].to(self.device) 199 | gt = data_tuple[1].to(self.device) 200 | 201 | output = self.net(inputs) 202 | 203 | 
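            # One optimisation step per batch: clear the gradients accumulated on the
            # previous step, compute the cross-entropy loss against the squeezed
            # ground-truth labels, backpropagate, clip the global gradient norm to 1
            # to keep updates stable, then apply the optimizer step. Batch accuracy is
            # derived afterwards from the argmax of the detached logits and pushed to
            # the running meters via update_metrics.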
self.optimizer.zero_grad() 204 | loss = self.loss(output, gt.squeeze(dim=1)) 205 | loss.backward() 206 | 207 | torch.nn.utils.clip_grad_norm_(self.net.parameters(), max_norm=1) 208 | self.optimizer.step() 209 | 210 | predicted = torch.argmax(output.detach(), dim=1) 211 | correct = gt.detach().squeeze(dim=1) 212 | 213 | self.iters += 1 214 | self.update_metrics("train", loss.item(), inputs.size(0), 215 | float((predicted==correct).sum()) / len(correct)) 216 | 217 | 218 | def __val(self): 219 | """Validation function.""" 220 | self.net.eval() 221 | 222 | with torch.no_grad(): 223 | # for i, data_tuple in enumerate(tqdm(self.val_loader, desc="Val", postfix=str(self.accuracy["val"].avg))): 224 | for i, data_tuple in enumerate(tqdm(self.val_loader, desc="Val", postfix=""+str(np.random.randint(200)))): 225 | """ 226 | input, gt 227 | """ 228 | inputs = data_tuple[0].to(self.device) 229 | gt = data_tuple[1].to(self.device) 230 | 231 | output = self.net(inputs) 232 | loss = self.loss(output, gt.squeeze(dim=1)) 233 | 234 | predicted = torch.argmax(output.detach(), dim=1) 235 | correct = gt.detach().squeeze(dim=1) 236 | 237 | self.iters += 1 238 | self.update_metrics("val", loss.item(), inputs.size(0), 239 | float((predicted == correct).sum()) / len(correct)) 240 | 241 | self.tbx_summary.add_scalar('val_loss', self.losses["val"].avg, self.epoch + 1) 242 | self.tbx_summary.add_scalar('val_accuracy', self.accuracy["val"].avg, self.epoch + 1) 243 | accuracy = self.accuracy["val"].avg 244 | self.accuracy["val"].reset() 245 | self.losses["val"].reset() 246 | 247 | ret = self.model_utility.save(accuracy, self.net, self.optimizer, self.iters, self.epoch + 1) 248 | if ret < 0: 249 | return -1 250 | elif ret > 0 and self.test_loader is not None: 251 | self.__test() 252 | return ret 253 | 254 | def __test(self): 255 | """Testing function.""" 256 | self.net.eval() 257 | 258 | with torch.no_grad(): 259 | for i, data_tuple in enumerate(tqdm(self.test_loader, desc="Test", postfix=str(self.accuracy["test"].avg))): 260 | """ 261 | input, gt 262 | """ 263 | inputs = data_tuple[0].to(self.device) 264 | gt = data_tuple[1].to(self.device) 265 | 266 | output = self.net(inputs) 267 | loss = self.loss(output, gt.squeeze(dim=1)) 268 | 269 | predicted = torch.argmax(output.detach(), dim=1) 270 | correct = gt.detach().squeeze(dim=1) 271 | 272 | self.iters += 1 273 | self.update_metrics("test", loss.item(), inputs.size(0), 274 | float((predicted == correct).sum()) / len(correct)) 275 | self.tbx_summary.add_scalar('test_loss', self.losses["test"].avg, self.epoch + 1) 276 | self.tbx_summary.add_scalar('test_accuracy', self.accuracy["test"].avg, self.epoch + 1) 277 | self.losses["test"].reset() 278 | self.accuracy["test"].reset() 279 | 280 | def train(self): 281 | for n in range(self.configer.get("epochs")): 282 | print("Starting epoch {}".format(self.epoch + 1)) 283 | self.__train() 284 | ret = self.__val() 285 | if ret < 0: 286 | print("Got no improvement for {} epochs, current epoch is {}." 
287 | .format(self.configer.get("checkpoints", "early_stop"), n)) 288 | break 289 | self.epoch += 1 290 | 291 | def update_metrics(self, split: str, loss, bs, accuracy=None): 292 | self.losses[split].update(loss, bs) 293 | if accuracy is not None: 294 | self.accuracy[split].update(accuracy, bs) 295 | if split == "train" and self.iters % self.save_iters == 0: 296 | self.tbx_summary.add_scalar('{}_loss'.format(split), self.losses[split].avg, self.iters) 297 | self.tbx_summary.add_scalar('{}_accuracy'.format(split), self.accuracy[split].avg, self.iters) 298 | self.losses[split].reset() 299 | self.accuracy[split].reset() -------------------------------------------------------------------------------- /src_gestformer/models/backbones/r3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from torchvision.models.utils import load_state_dict_from_url 5 | 6 | 7 | __all__ = ['r3d_18', 'mc3_18', 'r2plus1d_18'] 8 | 9 | model_urls = { 10 | 'r3d_18': 'https://download.pytorch.org/models/r3d_18-b3b3357e.pth', 11 | 'mc3_18': 'https://download.pytorch.org/models/mc3_18-a90a0ba3.pth', 12 | 'r2plus1d_18': 'https://download.pytorch.org/models/r2plus1d_18-91a641e6.pth', 13 | } 14 | 15 | 16 | class Conv3DSimple(nn.Conv3d): 17 | def __init__(self, 18 | in_planes, 19 | out_planes, 20 | midplanes=None, 21 | stride=1, 22 | padding=1): 23 | 24 | super(Conv3DSimple, self).__init__( 25 | in_channels=in_planes, 26 | out_channels=out_planes, 27 | kernel_size=(3, 3, 3), 28 | stride=stride, 29 | padding=padding, 30 | bias=False) 31 | 32 | @staticmethod 33 | def get_downsample_stride(stride): 34 | return (stride, stride, stride) 35 | 36 | 37 | class Conv2Plus1D(nn.Sequential): 38 | 39 | def __init__(self, 40 | in_planes, 41 | out_planes, 42 | midplanes, 43 | stride=1, 44 | padding=1): 45 | super(Conv2Plus1D, self).__init__( 46 | nn.Conv3d(in_planes, midplanes, kernel_size=(1, 3, 3), 47 | stride=(1, stride, stride), padding=(0, padding, padding), 48 | bias=False), 49 | nn.BatchNorm3d(midplanes), 50 | nn.ReLU(inplace=True), 51 | nn.Conv3d(midplanes, out_planes, kernel_size=(3, 1, 1), 52 | stride=(stride, 1, 1), padding=(padding, 0, 0), 53 | bias=False)) 54 | 55 | @staticmethod 56 | def get_downsample_stride(stride): 57 | return (stride, stride, stride) 58 | 59 | 60 | class Conv3DNoTemporal(nn.Conv3d): 61 | 62 | def __init__(self, 63 | in_planes, 64 | out_planes, 65 | midplanes=None, 66 | stride=1, 67 | padding=1): 68 | 69 | super(Conv3DNoTemporal, self).__init__( 70 | in_channels=in_planes, 71 | out_channels=out_planes, 72 | kernel_size=(1, 3, 3), 73 | stride=(1, stride, stride), 74 | padding=(0, padding, padding), 75 | bias=False) 76 | 77 | @staticmethod 78 | def get_downsample_stride(stride): 79 | return (1, stride, stride) 80 | 81 | 82 | class BasicBlock(nn.Module): 83 | 84 | expansion = 1 85 | 86 | def __init__(self, inplanes, planes, conv_builder, stride=1, downsample=None): 87 | midplanes = (inplanes * planes * 3 * 3 * 3) // (inplanes * 3 * 3 + 3 * planes) 88 | 89 | super(BasicBlock, self).__init__() 90 | self.dropout = nn.Dropout3d(p = 0.1) 91 | self.conv1 = nn.Sequential( 92 | conv_builder(inplanes, planes, midplanes, stride), 93 | nn.BatchNorm3d(planes), 94 | nn.ReLU(inplace=True) 95 | ) 96 | self.conv2 = nn.Sequential( 97 | conv_builder(planes, planes, midplanes), 98 | nn.BatchNorm3d(planes) 99 | ) 100 | self.relu = nn.ReLU(inplace=True) 101 | self.downsample = downsample 102 | self.stride = stride 103 | 104 | def forward(self, 
x): 105 | residual = x 106 | 107 | out = self.conv1(x) 108 | out = self.conv2(out) 109 | if self.downsample is not None: 110 | residual = self.downsample(x) 111 | 112 | out += residual 113 | out = self.relu(out) 114 | 115 | return out 116 | 117 | 118 | class Bottleneck(nn.Module): 119 | expansion = 4 120 | 121 | def __init__(self, inplanes, planes, conv_builder, stride=1, downsample=None): 122 | 123 | super(Bottleneck, self).__init__() 124 | midplanes = (inplanes * planes * 3 * 3 * 3) // (inplanes * 3 * 3 + 3 * planes) 125 | 126 | # 1x1x1 127 | self.conv1 = nn.Sequential( 128 | nn.Conv3d(inplanes, planes, kernel_size=1, bias=False), 129 | nn.BatchNorm3d(planes), 130 | nn.ReLU(inplace=True) 131 | ) 132 | # Second kernel 133 | self.conv2 = nn.Sequential( 134 | conv_builder(planes, planes, midplanes, stride), 135 | nn.BatchNorm3d(planes), 136 | nn.ReLU(inplace=True) 137 | ) 138 | 139 | # 1x1x1 140 | self.conv3 = nn.Sequential( 141 | nn.Conv3d(planes, planes * self.expansion, kernel_size=1, bias=False), 142 | nn.BatchNorm3d(planes * self.expansion) 143 | ) 144 | self.relu = nn.ReLU(inplace=True) 145 | self.downsample = downsample 146 | self.stride = stride 147 | 148 | def forward(self, x): 149 | residual = x 150 | 151 | out = self.conv1(x) 152 | out = self.conv2(out) 153 | out = self.conv3(out) 154 | 155 | if self.downsample is not None: 156 | residual = self.downsample(x) 157 | 158 | out += residual 159 | out = self.relu(out) 160 | 161 | return out 162 | 163 | 164 | class BasicStem(nn.Sequential): 165 | """The default conv-batchnorm-relu stem 166 | """ 167 | def __init__(self, pretrained): 168 | super(BasicStem, self).__init__( 169 | nn.Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), 170 | padding=(1, 3, 3), bias=False), 171 | nn.BatchNorm3d(64), 172 | nn.ReLU(inplace=True)) 173 | 174 | 175 | class R2Plus1dStem(nn.Sequential): 176 | """R(2+1)D stem is different than the default one as it uses separated 3D convolution 177 | """ 178 | def __init__(self, pretrained): 179 | super(R2Plus1dStem, self).__init__( 180 | nn.Conv3d(3, 45, kernel_size=(1, 7, 7), 181 | stride=(1, 2, 2), padding=(0, 3, 3), 182 | bias=False), 183 | nn.BatchNorm3d(45), 184 | nn.ReLU(inplace=True), 185 | nn.Conv3d(45, 64, kernel_size=(3, 1, 1), 186 | stride=(1, 1, 1), padding=(1, 0, 0), 187 | bias=False), 188 | nn.BatchNorm3d(64), 189 | nn.ReLU(inplace=True)) 190 | 191 | 192 | class VideoResNet(nn.Module): 193 | 194 | def __init__(self, pretrained, block, conv_makers, layers, 195 | stem, drop_prob, num_classes=400, 196 | zero_init_residual=False): 197 | """Generic resnet video generator. 198 | 199 | Args: 200 | block (nn.Module): resnet building block 201 | conv_makers (list(functions)): generator function for each layer 202 | layers (List[int]): number of blocks per layer 203 | stem (nn.Module, optional): Resnet stem, if None, defaults to conv-bn-relu. Defaults to None. 204 | num_classes (int, optional): Dimension of the final FC layer. Defaults to 400. 205 | zero_init_residual (bool, optional): Zero init bottleneck residual BN. Defaults to False. 
206 | """ 207 | super(VideoResNet, self).__init__() 208 | self.inplanes = 64 209 | 210 | self.stem = stem(pretrained) 211 | 212 | self.layer1 = self._make_layer(block, conv_makers[0], 64, layers[0], stride=1) 213 | self.layer2 = self._make_layer(block, conv_makers[1], 128, layers[1], stride=2) 214 | self.layer3 = self._make_layer(block, conv_makers[2], 256, layers[2], stride=2) 215 | self.layer4 = self._make_layer(block, conv_makers[3], 512, layers[3], stride=2) 216 | 217 | self.dropout = nn.Dropout3d(drop_prob) 218 | 219 | self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) 220 | self.fc = nn.Linear(512 * block.expansion, num_classes) 221 | 222 | # init weights 223 | self._initialize_weights() 224 | 225 | if zero_init_residual: 226 | for m in self.modules(): 227 | if isinstance(m, Bottleneck): 228 | nn.init.constant_(m.bn3.weight, 0) 229 | 230 | def forward(self, x): 231 | x = self.stem(x) 232 | 233 | x = self.layer1(x) 234 | x = self.dropout(x) 235 | x = self.layer2(x) 236 | x = self.dropout(x) 237 | x = self.layer3(x) 238 | x = self.dropout(x) 239 | x = self.layer4(x) 240 | x = self.dropout(x) 241 | 242 | x = self.avgpool(x) 243 | 244 | x = x.flatten(1) 245 | 246 | return x 247 | 248 | def _make_layer(self, block, conv_builder, planes, blocks, stride=1): 249 | downsample = None 250 | 251 | if stride != 1 or self.inplanes != planes * block.expansion: 252 | ds_stride = conv_builder.get_downsample_stride(stride) 253 | downsample = nn.Sequential( 254 | nn.Conv3d(self.inplanes, planes * block.expansion, 255 | kernel_size=1, stride=ds_stride, bias=False), 256 | nn.BatchNorm3d(planes * block.expansion) 257 | ) 258 | layers = [] 259 | layers.append(block(self.inplanes, planes, conv_builder, stride, downsample)) 260 | 261 | self.inplanes = planes * block.expansion 262 | for i in range(1, blocks): 263 | layers.append(block(self.inplanes, planes, conv_builder)) 264 | 265 | return nn.Sequential(*layers) 266 | 267 | def _initialize_weights(self): 268 | for m in self.modules(): 269 | if isinstance(m, nn.Conv3d): 270 | nn.init.kaiming_normal_(m.weight, mode='fan_out', 271 | nonlinearity='relu') 272 | if m.bias is not None: 273 | nn.init.constant_(m.bias, 0) 274 | elif isinstance(m, nn.BatchNorm3d): 275 | nn.init.constant_(m.weight, 1) 276 | nn.init.constant_(m.bias, 0) 277 | elif isinstance(m, nn.Linear): 278 | nn.init.normal_(m.weight, 0, 0.01) 279 | nn.init.constant_(m.bias, 0) 280 | 281 | 282 | def _video_resnet(arch, pretrained=False, in_planes=3, dropout=0., **kwargs): 283 | model = VideoResNet(pretrained, drop_prob=dropout, **kwargs) 284 | 285 | if pretrained: 286 | state_dict = load_state_dict_from_url(model_urls[arch], progress=True) 287 | model.load_state_dict(state_dict) 288 | if in_planes in [1, 2]: 289 | w = model.stem._modules['0']._parameters['weight'] 290 | if kwargs['stem'].__name__ == 'BasicStem': 291 | model.stem._modules['0'] = nn.Conv3d(in_planes, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), 292 | padding=(1, 3, 3), bias=False) 293 | elif kwargs['stem'].__name__ == 'R2Plus1dStem': 294 | model.stem._modules['0'] = nn.Conv3d(in_planes, 45, kernel_size=(1, 7, 7), stride=(1, 2, 2), 295 | padding=(0, 3, 3), bias=False) 296 | if in_planes == 1: 297 | model.stem._modules['0']._parameters['weight'].data = w.sum(dim=1, keepdim=True) 298 | else: 299 | model.stem._modules['0']._parameters['weight'].data = w[:, :-1] * 1.5 300 | else: 301 | if kwargs['stem'].__name__ == 'BasicStem': 302 | model.stem._modules['0'] = nn.Conv3d(in_planes, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), 303 | padding=(1, 
3, 3), bias=False) 304 | elif kwargs['stem'].__name__ == 'R2Plus1dStem': 305 | model.stem._modules['0'] = nn.Conv3d(in_planes, 45, kernel_size=(1, 7, 7), stride=(1, 2, 2), 306 | padding=(0, 3, 3), bias=False) 307 | model.fc = None 308 | return model 309 | 310 | 311 | def r3d_18(pretrained=False, in_planes: int=3, dropout: float=0., **kwargs): 312 | """Construct 18 layer Resnet3D model as in 313 | https://arxiv.org/abs/1711.11248 314 | 315 | Args: 316 | pretrained (bool): If True, returns a model pre-trained on Kinetics-400 317 | progress (bool): If True, displays a progress bar of the download to stderr 318 | 319 | Returns: 320 | nn.Module: R3D-18 network 321 | """ 322 | 323 | return _video_resnet('r3d_18', 324 | pretrained, in_planes, dropout, 325 | block=BasicBlock, 326 | conv_makers=[Conv3DSimple] * 4, 327 | layers=[2, 2, 2, 2], 328 | stem=BasicStem, **kwargs) 329 | 330 | 331 | def mc3_18(pretrained=False, in_planes: int=3, dropout: float=0., **kwargs): 332 | """Constructor for 18 layer Mixed Convolution network as in 333 | https://arxiv.org/abs/1711.11248 334 | 335 | Args: 336 | pretrained (bool): If True, returns a model pre-trained on Kinetics-400 337 | progress (bool): If True, displays a progress bar of the download to stderr 338 | 339 | Returns: 340 | nn.Module: MC3 Network definition 341 | """ 342 | return _video_resnet('mc3_18', 343 | pretrained, in_planes, dropout, 344 | block=BasicBlock, 345 | conv_makers=[Conv3DSimple] + [Conv3DNoTemporal] * 3, 346 | layers=[2, 2, 2, 2], 347 | stem=BasicStem, **kwargs) 348 | 349 | 350 | def r2plus1d_18(pretrained=False, in_planes: int=3, dropout: float=0., **kwargs): 351 | """Constructor for the 18 layer deep R(2+1)D network as in 352 | https://arxiv.org/abs/1711.11248 353 | 354 | Args: 355 | pretrained (bool): If True, returns a model pre-trained on Kinetics-400 356 | progress (bool): If True, displays a progress bar of the download to stderr 357 | 358 | Returns: 359 | nn.Module: R(2+1)D-18 network 360 | """ 361 | return _video_resnet('r2plus1d_18', 362 | pretrained, in_planes, dropout, 363 | block=BasicBlock, 364 | conv_makers=[Conv2Plus1D] * 4, 365 | layers=[2, 2, 2, 2], 366 | stem=R2Plus1dStem, **kwargs) 367 | 368 | if __name__ == "__main__": 369 | m = r3d_18(False, 1) --------------------------------------------------------------------------------
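A minimal usage sketch for the video backbones above (not part of the original sources). It assumes the repository's import layout, with src_gestformer as the working directory, and a torchvision version that still provides torchvision.models.utils, which r3d.py imports. The example only exercises the r3d_18 constructor defined in this file: in_planes rebuilds the stem convolution for depth/IR (1-channel) or optical-flow (2-channel) clips, and because the classification head is set to None the network returns pooled 512-dimensional clip features.

import torch

from models.backbones.r3d import r3d_18

# Optical-flow variant: two input channels, light 3D dropout. pretrained=False, so no
# Kinetics checkpoint is downloaded and the stem is simply rebuilt with in_planes=2.
net = r3d_18(pretrained=False, in_planes=2, dropout=0.1)
net.eval()
print(net.stem[0])   # Conv3d(2, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), ...)

# Dummy clip laid out as (batch, channels, frames, height, width).
clip = torch.randn(1, 2, 16, 112, 112)
with torch.no_grad():
    feats = net(clip)
print(feats.shape)   # torch.Size([1, 512]); fc is None, so only pooled features are returned

When pretrained=True is used instead, _video_resnet folds the Kinetics RGB stem weights into the smaller input: summed over channels for 1-channel data, or truncated to the first two channels and scaled by 1.5 for 2-channel data, exactly as written above.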