├── src_gestformer ├── readme.md ├── __pycache__ │ ├── test.cpython-38.pyc │ └── train.cpython-38.pyc ├── models │ ├── __pycache__ │ │ ├── temporal.cpython-38.pyc │ │ ├── attention.cpython-38.pyc │ │ └── model_utilizer.cpython-38.pyc │ ├── backbones │ │ ├── __pycache__ │ │ │ ├── c3d.cpython-38.pyc │ │ │ ├── r3d.cpython-38.pyc │ │ │ ├── vgg.cpython-38.pyc │ │ │ └── resnet.cpython-38.pyc │ │ ├── c3d.py │ │ ├── vgg.py │ │ ├── resnet.py │ │ └── r3d.py │ ├── temporal.py │ ├── model_utilizer.py │ └── attention.py ├── utils │ ├── __pycache__ │ │ ├── configer.cpython-38.pyc │ │ └── average_meter.cpython-38.pyc │ ├── average_meter.py │ ├── visualization.py │ ├── configer.py │ └── test.py ├── datasets │ ├── __pycache__ │ │ ├── Briareo.cpython-38.pyc │ │ └── NVGestures.cpython-38.pyc │ ├── utils │ │ ├── __pycache__ │ │ │ ├── normals.cpython-38.pyc │ │ │ ├── normalize.cpython-38.pyc │ │ │ ├── read_data.cpython-38.pyc │ │ │ ├── optical_flow.cpython-38.pyc │ │ │ └── utils_briareo.cpython-38.pyc │ │ ├── normalize.py │ │ ├── optical_flow.py │ │ ├── normals.py │ │ ├── read_data.py │ │ └── utils_briareo.py │ ├── NVGestures.py │ └── Briareo.py ├── hyperparameters │ ├── NVGestures │ │ ├── test.json │ │ └── train.json │ └── Briareo │ │ ├── test.json │ │ └── train.json ├── main.py ├── cs.py ├── test.py └── train.py └── README.md /src_gestformer/readme.md: -------------------------------------------------------------------------------- 1 | code for gestformer 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | GestFormer: Multiscale Wavelet Pooling Transformer Network for Dynamic Hand Gesture Recognition 2 | -------------------------------------------------------------------------------- /src_gestformer/__pycache__/test.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/__pycache__/test.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/__pycache__/train.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/__pycache__/train.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/__pycache__/temporal.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/__pycache__/temporal.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/utils/__pycache__/configer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/utils/__pycache__/configer.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/__pycache__/Briareo.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/__pycache__/Briareo.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/__pycache__/attention.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/__pycache__/attention.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/__pycache__/NVGestures.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/__pycache__/NVGestures.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/backbones/__pycache__/c3d.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/backbones/__pycache__/c3d.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/backbones/__pycache__/r3d.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/backbones/__pycache__/r3d.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/backbones/__pycache__/vgg.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/backbones/__pycache__/vgg.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/utils/__pycache__/average_meter.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/utils/__pycache__/average_meter.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/__pycache__/normals.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/utils/__pycache__/normals.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/__pycache__/model_utilizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/__pycache__/model_utilizer.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/models/backbones/__pycache__/resnet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/models/backbones/__pycache__/resnet.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/__pycache__/normalize.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/utils/__pycache__/normalize.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/__pycache__/read_data.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/utils/__pycache__/read_data.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/__pycache__/optical_flow.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/utils/__pycache__/optical_flow.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/__pycache__/utils_briareo.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mallikagarg/GestFormer/HEAD/src_gestformer/datasets/utils/__pycache__/utils_briareo.cpython-38.pyc -------------------------------------------------------------------------------- /src_gestformer/utils/average_meter.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Average Meter object, contain val, avg, sum and count on concurrent values""" 3 | def __init__(self): 4 | self.reset() 5 | 6 | def reset(self): 7 | self.val = 0. 8 | self.avg = 0. 9 | self.sum = 0. 10 | self.count = 0 11 | 12 | def update(self, val, n=1): 13 | self.val = val 14 | self.sum += val * n 15 | self.count += n 16 | self.avg = self.sum / self.count 17 | -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/normalize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def normalize(tensor: np.ndarray): 4 | """Normalize function for a single tensor. 
5 | 6 | Args: 7 | block (np.ndarray): input tensor 8 | Returns: 9 | np.ndarray: normalized tensor 10 | 11 | """ 12 | if len(tensor.shape) < 4: 13 | tensor = np.expand_dims(tensor, axis=2) 14 | mean = np.array([tensor[..., chn, :].mean() for chn in range(tensor.shape[2])]) 15 | std = np.array([tensor[..., chn, :].std() for chn in range(tensor.shape[2])]) 16 | return (tensor - mean[:, np.newaxis]) / std[:, np.newaxis] 17 | -------------------------------------------------------------------------------- /src_gestformer/hyperparameters/NVGestures/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Gesture Recognition", 3 | "dataset": "NVGestures", 4 | "phase": "test", 5 | "data": { 6 | "optical_flow": false, 7 | "type": "normal", 8 | "n_classes": 25, 9 | "n_frames": 40, 10 | "data_path": "path/to/NVGestures", 11 | "result_dir": "./result", 12 | "batch_size": 8 13 | }, 14 | "solver": { 15 | "workers": 4 16 | }, 17 | "network":{ 18 | "backbone": "resnet", 19 | "pretrained": true, 20 | "ff_size": 1024, 21 | "n_head": 8, 22 | "dropout2d" : 0.1, 23 | "dropout1d": 0.5, 24 | "n_module": 6 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src_gestformer/hyperparameters/Briareo/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Gesture Recognition", 3 | "dataset": "Briareo", 4 | "phase": "test", 5 | "data": { 6 | "optical_flow": false, 7 | "type": "rgb", 8 | "n_classes": 12, 9 | "n_frames": 40, 10 | "data_path": "path/to/Briareo", 11 | "result_dir": "./result", 12 | "batch_size": 2 13 | }, 14 | "solver": { 15 | "workers": 4 16 | }, 17 | "network":{ 18 | "backbone": "resnet", 19 | "pretrained": true, 20 | "ff_size": 1024, 21 | "n_head": 8, 22 | "dropout2d" : 0.1, 23 | "dropout1d": 0.5, 24 | "n_module": 6 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src_gestformer/utils/visualization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | def plot_depth(path, depth): 5 | """Plot a single depth map 6 | 7 | Attributes: 8 | path (str): Path to save the depth map 9 | depth (np.ndarray): Depth map data 10 | 11 | """ 12 | if len(depth.shape) > 2: 13 | if depth.shape[-1] != 1: 14 | raise ValueError("Wrong number of channel, 1 is required, got {}".format(depth.shape)) 15 | else: 16 | depth = depth.squeeze() 17 | tmp = np.zeros((depth.shape[0], depth.shape[1], 3)) 18 | tmp[..., 0] = depth.copy() 19 | tmp[..., 1] = depth.copy() 20 | tmp[..., 2] = depth.copy() 21 | tmp = ((tmp * 255) / tmp.max()).astype(np.uint8) 22 | cv2.imwrite(path, tmp) 23 | -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/optical_flow.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | def dense_flow(clip, rgb=True): 5 | """Calculate optical flow with Farneback algorithm 6 | 7 | Args: 8 | clip: input video clip 9 | rgb: if True, it will covert to gray level every frames 10 | Default: True 11 | 12 | Returns: 13 | flow: Calculated Optical flow 14 | 15 | """ 16 | prev = clip[..., 0] 17 | if rgb: 18 | prev = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY) 19 | flow = np.zeros((clip.shape[0], clip.shape[1], 2, clip.shape[-1] - 1)) 20 | for i in range(1 ,clip.shape[-1]): 21 | next = clip[..., i] 22 | if rgb: 23 | next = 
cv2.cvtColor(next, cv2.COLOR_BGR2GRAY) 24 | flow_calc = cv2.calcOpticalFlowFarneback(prev, next, None, 0.5, 3, 15, 3, 5, 1.2, 0) 25 | flow[..., i - 1] = flow_calc 26 | prev = next 27 | return flow -------------------------------------------------------------------------------- /src_gestformer/hyperparameters/Briareo/train.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Gesture Recognition", 3 | "dataset": "Briareo", 4 | "epochs": 100, 5 | "phase": "train", 6 | "data": { 7 | "optical_flow": false, 8 | "type": "ir", 9 | "n_classes": 12, 10 | "n_frames": 40, 11 | "data_path": "path/to/Briareo", 12 | "result_dir": "./result", 13 | "batch_size": 8 14 | }, 15 | "checkpoints": { 16 | "save_policy": "best", 17 | "save_name": "train_briareo_ir-xwavegatedffn_emb", 18 | "save_dir": "./checkpoints/", 19 | "save_iters": 30, 20 | "tb_path": "train_log" 21 | }, 22 | "solver": { 23 | "type": "AdamW", 24 | "workers": 4, 25 | "weight_decay": 0.0001, 26 | "base_lr": 0.0001, 27 | "decay_steps": [50, 75] 28 | }, 29 | "network":{ 30 | "backbone": "resnet", 31 | "pretrained": true, 32 | "ff_size": 1024, 33 | "n_head": 8, 34 | "dropout2d" : 0.1, 35 | "dropout1d": 0.5, 36 | "n_module": 6 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src_gestformer/hyperparameters/NVGestures/train.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Gesture Recognition", 3 | "dataset": "NVGestures", 4 | "epochs": 100, 5 | "phase": "train", 6 | "data": { 7 | "optical_flow": false, 8 | "type": "normal", 9 | "n_classes": 25, 10 | "n_frames": 40, 11 | "data_path": "path/to/NVGestures/", 12 | "result_dir": "./result", 13 | "batch_size":8 14 | }, 15 | "checkpoints": { 16 | "save_policy": "best", 17 | "save_name": "train_nv_normal-xwavegatedffn_multi", 18 | "save_dir": "./checkpoints/", 19 | "save_iters": 30, 20 | "tb_path": "train_log" 21 | }, 22 | "solver": { 23 | "type": "AdamW", 24 | "workers": 4, 25 | "weight_decay": 0.0001, 26 | "base_lr": 0.0001, 27 | "decay_steps": [50, 75] 28 | }, 29 | "network":{ 30 | "backbone": "resnet", 31 | "pretrained": true, 32 | "ff_size": 1024, 33 | "n_head": 8, 34 | "dropout2d" : 0.1, 35 | "dropout1d": 0.5, 36 | "n_module": 6 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src_gestformer/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from train import GestureTrainer 4 | from test import GestureTest 5 | from utils.configer import Configer 6 | 7 | import random 8 | import numpy as np 9 | import torch 10 | 11 | SEED = 1994 12 | random.seed(SEED) 13 | np.random.seed(SEED) 14 | torch.manual_seed(SEED) 15 | if torch.cuda.is_available(): 16 | torch.cuda.manual_seed(SEED) 17 | torch.backends.cudnn.deterministic = True # To have ~deterministic results 18 | 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--disable-cuda', action='store_true', 23 | help='Disable CUDA') 24 | parser.add_argument('--hypes', default=None, type=str, 25 | dest='hypes', help='The file of the hyper parameters.') 26 | parser.add_argument('--phase', default='train', type=str, 27 | dest='phase', help='The phase of module.') 28 | parser.add_argument('--gpu', default=[0, ], nargs='+', type=int, 29 | dest='gpu', help='The gpu used.') 30 | parser.add_argument('--resume', default=None, type=str, 31 | 
dest='resume', help='The path of pretrained model.') 32 | parser.add_argument('--nogesture', default=False, action='store_true', 33 | dest='nogesture', help='NoGesture CTC loss') 34 | 35 | args = parser.parse_args() 36 | args.device = None 37 | if not args.disable_cuda and torch.cuda.is_available(): 38 | args.device = torch.device('cuda:0') 39 | else: 40 | args.device = torch.device('cpu') 41 | 42 | torch.autograd.set_detect_anomaly(True) 43 | configer = Configer(args) 44 | if configer.get('phase') == 'train': 45 | model = GestureTrainer(configer) 46 | model.init_model() 47 | model.train() 48 | elif configer.get('phase') == 'test': 49 | model = GestureTest(configer) 50 | model.init_model() 51 | model.test() 52 | -------------------------------------------------------------------------------- /src_gestformer/models/temporal.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from models.backbones.resnet import resnet18 3 | from models.backbones.vgg import vgg16, vgg16_bn 4 | from models.backbones import c3d 5 | from models.backbones.r3d import r3d_18, r2plus1d_18 6 | from models.attention import EncoderSelfAttention 7 | 8 | backbone_dict = {'resnet': resnet18, 9 | 'vgg': vgg16, 'vgg_bn': vgg16_bn, 10 | 'c3d': c3d, 11 | 'r3d': r3d_18, 'r2plus1d': r2plus1d_18} 12 | 13 | class _GestureTransformer(nn.Module): 14 | """Multi Modal model for gesture recognition on 3 channel""" 15 | def __init__(self, backbone: nn.Module, in_planes: int, out_planes: int, 16 | pretrained: bool = False, dropout_backbone=0.1, 17 | **kwargs): 18 | super(_GestureTransformer, self).__init__() 19 | 20 | self.in_planes = in_planes 21 | self.backbone = backbone(pretrained, in_planes, dropout=dropout_backbone) 22 | 23 | self.self_attention = EncoderSelfAttention(512, 64, 64, **kwargs) 24 | 25 | self.pool = nn.AdaptiveAvgPool2d((1, 512)) 26 | self.classifier = nn.Linear(512, out_planes) 27 | 28 | 29 | def forward(self, x): 30 | shape = x.shape 31 | # print(x.shape) #8,40,192,256 32 | 33 | x = x.view(-1, self.in_planes, x.shape[-2], x.shape[-1]) 34 | # print(x.shape) #320,1,192,256 b*f, c,h,w 35 | 36 | x = self.backbone(x) 37 | # print(x.shape) 38 | x = x.view(shape[0], shape[1] // self.in_planes, -1) 39 | 40 | x = self.self_attention(x) 41 | 42 | x = self.pool(x).squeeze(dim=1) 43 | x = self.classifier(x) 44 | return x 45 | 46 | def GestureTransoformer(backbone: str="resnet", in_planes: int=3, n_classes: int=25, **kwargs): 47 | if backbone not in backbone_dict: 48 | raise NotImplementedError("Backbone type: [{}] is not implemented.".format(backbone)) 49 | model = _GestureTransformer(backbone_dict[backbone], in_planes, n_classes, **kwargs) 50 | return model -------------------------------------------------------------------------------- /src_gestformer/utils/configer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | class Configer(object): 5 | """Configuration details object 6 | 7 | Attributes: 8 | args (dict): Dictionary containing terminal parameters added to current procedure 9 | params (dict): Dictionary containing parameters in the json file provided 10 | 11 | """ 12 | def __init__(self, args): 13 | """Configer constructor 14 | 15 | Args: 16 | args (argparse.Namespace): Object containing terminal parameters 17 | 18 | """ 19 | self.args = args.__dict__ 20 | self.params = None 21 | 22 | if not os.path.exists(args.hypes): 23 | raise ValueError('Json Path: {} not exists!'.format(args.hypes)) 
24 | 25 | json_stream = open(args.hypes, 'r') 26 | self.params = json.load(json_stream) 27 | json_stream.close() 28 | 29 | def get(self, *keys): 30 | """Item getter 31 | 32 | Args: 33 | *keys (list of str): List of keys 34 | 35 | Returns: 36 | el (str): Value retrived from args or params at keys location 37 | 38 | """ 39 | if len(keys) == 0: 40 | return self.params 41 | 42 | key = keys[-1] 43 | if key in self.args and self.args[key] is not None: 44 | return self.args[key] 45 | 46 | el = self.params 47 | for key in keys: 48 | if key in el and el[key] is not None: 49 | el = el[key] 50 | else: 51 | return None 52 | return el 53 | 54 | def __getitem__(self, item): 55 | """Get item function, same for the get[item]""" 56 | if isinstance(item, tuple): 57 | return self.get(*item) 58 | else: 59 | return self.get(item) 60 | 61 | def __getattr__(self, item): 62 | """Get attr function, same for the get[item]""" 63 | return self.get(item) 64 | 65 | def __str__(self): 66 | """To string function for the whole configuration state""" 67 | out = "" 68 | out += "Args:\n" + "\n".join([f" {str(key)}: {str(value)}" for key, value in self.args.items()]) + "\n" 69 | out += "Params:\n" + "\n".join([f" {str(key)}: {str(value)}" for key, value in self.params.items()]) 70 | return out 71 | -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/normals.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def normals(depthmap, normalize=True, keep_dims=True): 4 | """Calculate depth normals as normals = gF(x,y,z) = (-dF/dx, -dF/dy, 1) 5 | 6 | Args: 7 | depthmap (np.ndarray): depth map of any dtype, single channel, len(depthmap.shape) == 3 8 | normalize (bool, optional): if True, normals will be normalized to have unit-magnitude 9 | Default: True 10 | keep_dims (bool, optional): 11 | if True, normals shape will be equals to depthmap shape, 12 | if False, normals shape will be smaller than depthmap shape. 13 | Default: True 14 | 15 | Returns: 16 | Depth normals 17 | 18 | """ 19 | depthmap = np.asarray(depthmap, np.float32) 20 | 21 | if keep_dims is True: 22 | mask = depthmap != 0 23 | else: 24 | mask = depthmap[1:-1, 1:-1] != 0 25 | 26 | if keep_dims is True: 27 | normals = np.zeros((depthmap.shape[0], depthmap.shape[1], 3), dtype=np.float32) 28 | normals[1:-1, 1:-1, 0] = - (depthmap[2:, 1:-1] - depthmap[:-2, 1:-1]) / 2 29 | normals[1:-1, 1:-1, 1] = - (depthmap[1:-1, 2:] - depthmap[1:-1, :-2]) / 2 30 | else: 31 | normals = np.zeros((depthmap.shape[0] - 2, depthmap.shape[1] - 2, 3), dtype=np.float32) 32 | normals[:, :, 0] = - (depthmap[2:, 1:-1] - depthmap[:-2, 1:-1]) / 2 33 | normals[:, :, 1] = - (depthmap[1:-1, 2:] - depthmap[1:-1, :-2]) / 2 34 | normals[:, :, 2] = 1 35 | 36 | normals[~mask] = [0, 0, 0] 37 | 38 | if normalize: 39 | div = np.linalg.norm(normals[mask], ord=2, axis=-1, keepdims=True).repeat(3, axis=-1) + 1e-12 40 | normals[mask] /= div 41 | 42 | return normals 43 | 44 | 45 | def normals_multi(depthmaps, normalize=True, keep_dims=True): 46 | """Calculate depth normals for multiple depthmaps inputs 47 | 48 | Args: 49 | depthmap (np.ndarray): multiple input depth maps 50 | normalize (bool, optional): if True, normals will be normalized to have unit-magnitude 51 | Default: True 52 | keep_dims (bool, optional): 53 | if True, normals shape will be equals to depthmap shape, 54 | if False, normals shape will be smaller than depthmap shape. 
55 | Default: True 56 | 57 | Returns: 58 | Depth normals 59 | 60 | """ 61 | n_out = np.zeros((depthmaps.shape[0], depthmaps.shape[1], 3, depthmaps.shape[-1])) 62 | for i in range(depthmaps.shape[-1]): 63 | n_out[..., i] = normals(depthmaps[..., 0, i], normalize, keep_dims) 64 | return n_out -------------------------------------------------------------------------------- /src_gestformer/cs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | # df1 = pd.read_csv('csv/Briareo/normal.csv', header = None) # place your csv1 in df1 4 | 5 | #df1[df1.columns.drop('A')] 6 | o = pd.read_csv('csv/Briareo/original.csv', header = None) 7 | df1 = pd.read_csv('csv/Briareo/normal.csv', header = None) # place your csv1 in df1 8 | df2 = pd.read_csv('csv/Briareo/depth.csv', header = None) # place your csv2 in df2 9 | df3 = pd.read_csv('csv/Briareo/ir.csv', header = None) # place your csv2 in df2 10 | df4 = pd.read_csv('csv/Briareo/rgbop.csv', header = None) # place your csv2 in df2 11 | df5 = pd.read_csv('csv/Briareo/rgb.csv', header = None) # place your csv2 in df2 12 | #df2[df2.columns.drop('A')] 13 | #df4 = pd.read_csv('csv/Briareo/color.csv', header = None) 14 | 15 | o1 = o.iloc[:,:].values.tolist() 16 | #print(type(o1)) 17 | 18 | rate_in_1 = df1.iloc[:,:].values.tolist() #store the values of the 3rd column from csv1 to a list 19 | rate_out_1 = df2.iloc[:,:].values.tolist() #store the values of the 4th column from csv1 to a list 20 | rate_out_2 = df3.iloc[:,:].values.tolist() #store the values of the 4th column from csv1 to a list 21 | rate_in_2 = df4.iloc[:,:].values.tolist() #store the values of the 4th column from csv1 to a list 22 | rate_in_5 = df5.iloc[:,:].values.tolist() #store the values of the 4th column from csv1 to a list 23 | 24 | 25 | # rate_in_2 = df2.iloc[:,2].values.tolist() #store the values of the 3rd column from csv1 to a list 26 | #rate_out_2 = df2.iloc[:,3].values.tolist() #store the values of the 4th column from csv1 to a list 27 | 28 | # add the values of 2 rate in lists into rate_in_total list 29 | # rate_in_total = [x+y for x, y in zip(rate_in_1, rate_out_1)] # add the values of 2 rate out lists into rate_out_total list 30 | # rate_in_total = [max(x,y) for (x, y) in zip(rate_in_1, rate_out_2)] 31 | # rate_in_total = [np.add(x,y)/2 for (x, y) in zip(rate_in_1, rate_out_1)] 32 | # rate_in_total = [max(max(x,y),z) for (x, y, z) in zip(rate_in_1, rate_out_1, rate_out_2)] 33 | # rate_in_total = [np.add(np.add(x,y),z)/3 for (x, y, z) in zip(rate_in_1, rate_out_1, rate_out_2)] 34 | # rate_in_total = [np.add(np.add(np.add(x,y),w),z)/4 for (x, y,z,w) in zip(rate_in_1, rate_out_1,rate_out_2,rate_in_2)] 35 | rate_in_total = [np.add(np.add(np.add(np.add(x,y),w),z),k)/5 for (x, y,z,w,k) in zip(rate_in_1, rate_out_1,rate_out_2,rate_in_2,rate_in_5)] 36 | #print(rate_in_total[1]) 37 | 38 | final_df = pd.DataFrame(rate_in_total) 39 | #print(final_df) 40 | with open('csv/Briareo/ir_rgb.csv', 'a', newline='') as csvfile: 41 | final_df.to_csv(csvfile, mode='a',header=False,index =False) 42 | # print(csvfile) 43 | #print(np.where(max(rate_in_total[1]))) 44 | 45 | #print(len(rate_in_total)) 46 | c=0 47 | for x in range(len(rate_in_total)): 48 | #print(np.argmax(rate_in_total[x], axis=0)) 49 | #print(o1[x]) 50 | #print(np.argmax(rate_in_total[x], axis=0)==o1[x]) 51 | if np.argmax(rate_in_total[x], axis=0)==o1[x]: 52 | c +=1 53 | #print(c) 54 | print( c / 218) 55 | # print( c / 482) 56 | 
-------------------------------------------------------------------------------- /src_gestformer/models/backbones/c3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class _C3D(nn.Module): 6 | """ 7 | The C3D network as described in [1]. 8 | """ 9 | 10 | def __init__(self, drop_prob: float): 11 | super(_C3D, self).__init__() 12 | 13 | self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 14 | self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)) 15 | 16 | self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 17 | self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) 18 | 19 | self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 20 | self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 21 | self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) 22 | 23 | self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 24 | self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 25 | self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) 26 | 27 | self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 28 | self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 29 | self.pool5 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1)) 30 | 31 | self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) 32 | 33 | self.dropout = nn.Dropout3d(p=drop_prob) 34 | 35 | self.fc6 = nn.Linear(8192, 4096) 36 | self.fc7 = nn.Linear(4096, 4096) 37 | self.fc8 = nn.Linear(4096, 487) 38 | 39 | self.relu = nn.ReLU() 40 | self.softmax = nn.Softmax() 41 | 42 | 43 | def forward(self, x): 44 | h = self.relu(self.conv1(x)) 45 | h = self.dropout(h) 46 | h = self.pool1(h) 47 | 48 | h = self.relu(self.conv2(h)) 49 | h = self.dropout(h) 50 | h = self.pool2(h) 51 | 52 | h = self.relu(self.conv3a(h)) 53 | h = self.dropout(h) 54 | h = self.relu(self.conv3b(h)) 55 | h = self.dropout(h) 56 | h = self.pool3(h) 57 | 58 | h = self.relu(self.conv4a(h)) 59 | h = self.dropout(h) 60 | h = self.relu(self.conv4b(h)) 61 | h = self.dropout(h) 62 | h = self.pool4(h) 63 | 64 | h = self.relu(self.conv5a(h)) 65 | h = self.dropout(h) 66 | h = self.relu(self.conv5b(h)) 67 | h = self.dropout(h) 68 | h = self.pool5(h) 69 | 70 | h = self.avgpool(h) 71 | 72 | return h.squeeze() 73 | 74 | 75 | def C3D(pretrained, in_planes: int=3, dropout=0., **kwargs): 76 | model = _C3D(drop_prob=dropout) 77 | if pretrained: 78 | state_dict = torch.load("./c3d.pickle") 79 | model.load_state_dict(state_dict) 80 | if in_planes in [1, 2]: 81 | w = model.conv1._parameters['weight'].data 82 | model.conv1 = nn.Conv3d(in_planes, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 83 | if in_planes == 1: 84 | model.conv1._parameters['weight'].data = w.mean(dim=1, keepdim=True) 85 | else: 86 | model.conv1._parameters['weight'].data = w[:, :-1] * 1.5 87 | model.conv1 = nn.Conv3d(in_planes, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 88 | model.fc6 = None 89 | model.fc7 = None 90 | model.fc8 = None 91 | return model -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/read_data.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | from pathlib import Path 5 | 6 | def load_split_nvgesture(file_with_split='./nvgesture_train_correct.lst', 
list_split=list()): 7 | with open(file_with_split, 'rt') as f: 8 | dict_name = file_with_split[file_with_split.rfind('/') + 1:] 9 | dict_name = dict_name[:dict_name.find('_')] 10 | 11 | for line in f: 12 | params = line.split(' ') 13 | params_dictionary = dict() 14 | 15 | params_dictionary['dataset'] = dict_name 16 | 17 | path = params[0].split(':')[1] 18 | for param in params[1:]: 19 | parsed = param.split(':') 20 | key = parsed[0] 21 | if key == 'label': 22 | # make label start from 0 23 | label = int(parsed[1]) - 1 24 | params_dictionary['label'] = label 25 | elif key in ('depth', 'color', 'duo_left'): 26 | # first store path 27 | params_dictionary[key] = path + '/' + parsed[1] 28 | # store start frame 29 | params_dictionary[key + '_start'] = int(parsed[2]) 30 | 31 | params_dictionary[key + '_end'] = int(parsed[3]) 32 | 33 | params_dictionary['duo_right'] = params_dictionary['duo_left'].replace('duo_left', 'duo_right') 34 | params_dictionary['duo_right_start'] = params_dictionary['duo_left_start'] 35 | params_dictionary['duo_right_end'] = params_dictionary['duo_left_end'] 36 | 37 | params_dictionary['duo_disparity'] = params_dictionary['duo_left'].replace('duo_left', 'duo_disparity') 38 | params_dictionary['duo_disparity_start'] = params_dictionary['duo_left_start'] 39 | params_dictionary['duo_disparity_end'] = params_dictionary['duo_left_end'] 40 | 41 | list_split.append(params_dictionary) 42 | 43 | return list_split 44 | 45 | 46 | def load_data_from_file(data_path, example_config, sensor, image_width, image_height, nogesture = False): 47 | path = example_config[sensor] + ".avi" 48 | path = Path(data_path) / path[path.find('/') + 1:] 49 | start_frame = example_config[sensor + '_start'] 50 | end_frame = example_config[sensor + '_end'] 51 | label = example_config['label'] 52 | 53 | if end_frame - start_frame > 80: 54 | new_start = (end_frame - start_frame) // 2 - 40 + start_frame 55 | new_end = (end_frame - start_frame) // 2 + 40 + start_frame 56 | start_frame = new_start 57 | end_frame = new_end 58 | 59 | chnum = 3 if sensor == "color" else 1 60 | 61 | video_container = np.zeros((image_height, image_width, chnum, 160 if nogesture else 80), dtype=np.uint8) 62 | 63 | cap = cv2.VideoCapture(str(path)) 64 | 65 | if nogesture: 66 | start_offset = 40 if start_frame >= 40 else start_frame 67 | end_offset = 40 if int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - end_frame >= 40 \ 68 | else int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - end_frame 69 | frames_to_load = range(start_frame - start_offset, end_frame + end_offset) 70 | else: 71 | frames_to_load = range(start_frame, end_frame) 72 | 73 | cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame) 74 | for indx, frameIndx in enumerate(frames_to_load): 75 | ret, frame = cap.read() 76 | if ret: 77 | frame = cv2.resize(frame, (image_width, image_height)) 78 | if sensor != "color": 79 | frame = frame[..., 0] 80 | frame = frame[..., np.newaxis] 81 | video_container[..., indx] = frame 82 | else: 83 | print("Could not load frame") 84 | 85 | cap.release() 86 | 87 | if nogesture: 88 | return video_container, label, (start_offset, end_offset) 89 | else: 90 | return video_container, label, None -------------------------------------------------------------------------------- /src_gestformer/datasets/NVGestures.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.utils.data.dataset import Dataset 4 | 5 | import numpy as np 6 | import cv2 7 | 8 | from datasets.utils.read_data import load_split_nvgesture, 
load_data_from_file 9 | from datasets.utils.normals import normals_multi 10 | from datasets.utils.normalize import normalize 11 | 12 | from pathlib import Path 13 | 14 | class NVGesture(Dataset): 15 | """NVGesture Dataset class""" 16 | def __init__(self, configer, path, split="train", data_type="depth", transforms=None, n_frames=40, optical_flow=False): 17 | """Constructor method for NVGesture Dataset class 18 | 19 | Args: 20 | configer (Configer): Configer object for current procedure phase (train, test, val) 21 | split (str, optional): Current procedure phase (train, test, val) 22 | data_type (str, optional): Input data type (depth, rgb, normals, ir) 23 | transform (Object, optional): Data augmentation transformation for every data 24 | n_frames (int, optional): Number of frames selected for every input clip 25 | optical_flow (bool, optional): Flag to choose if calculate optical flow or not 26 | 27 | """ 28 | super().__init__() 29 | 30 | print("Loading NVGestures {} dataset...".format(split.upper()), end=" ") 31 | 32 | self.dataset_path = Path(path) / "nvgesture_arch" / "nvGesture_v1" 33 | self.split = split 34 | self.data_type = data_type 35 | self.transforms = transforms 36 | self.optical_flow = optical_flow 37 | if self.data_type in ["normal", "normals"] and self.optical_flow: 38 | raise NotImplementedError("Optical flow for normals image is not supported.") 39 | 40 | file_lists = self.dataset_path / \ 41 | "nvgesture_{}_correct_cvpr2016_v2.lst".format(self.split if self.split == "train" else "test") 42 | 43 | self.data_list = list() 44 | load_split_nvgesture(file_with_split=str(file_lists), list_split=self.data_list) 45 | 46 | if self.data_type in ["depth_z", "depth", "normal", "normals"]: 47 | self.sensor = "depth" 48 | elif self.data_type == "wrapped": 49 | self.sensor = "wrapped" 50 | elif self.data_type in ["rgb", "color"]: 51 | self.sensor = "color" 52 | elif self.data_type == "ir": 53 | self.sensor = "duo_left" 54 | else: 55 | raise NotImplementedError 56 | print("done.") 57 | 58 | def __len__(self): 59 | return len(self.data_list) 60 | 61 | def __getitem__(self, idx): 62 | data, label, offsets = load_data_from_file(self.dataset_path, example_config=self.data_list[idx], sensor=self.sensor, 63 | image_width=320, image_height=240) 64 | if self.optical_flow: 65 | if self.transforms: 66 | aug_det = self.transforms.to_deterministic() 67 | data = np.array([aug_det.augment_image(data[..., i]) 68 | for i in range(data.shape[-1])]).transpose(1, 2, 3, 0) 69 | prev = data[..., 0] 70 | if self.data_type == "rgb": 71 | prev = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY) 72 | data = data[..., [0, 1] + [*range(2, data.shape[-1], 2)]] 73 | flow = np.zeros((data.shape[0], data.shape[1], 2, data.shape[-1] - 1)) 74 | for i in range(1, data.shape[-1]): 75 | next = data[..., i] 76 | if self.data_type == "rgb": 77 | next = cv2.cvtColor(next, cv2.COLOR_BGR2GRAY) 78 | # print(flow.shape) 79 | flow[..., i - 1] = cv2.calcOpticalFlowFarneback(prev, next, None, 0.5, 3, 15, 3, 5, 1.2, 0) 80 | prev = next 81 | data = flow 82 | 83 | data = data[..., [*range(0, data.shape[-1], 2)]] # Our settings is working with static clip containing 40 frames 84 | 85 | if self.data_type in ["normal", "normals"]: 86 | data = normals_multi(data) 87 | else: 88 | data = normalize(data) 89 | 90 | if self.transforms is not None and not self.optical_flow: 91 | aug_det = self.transforms.to_deterministic() 92 | data = np.array([aug_det.augment_image(data[..., i]) for i in range(data.shape[-1])]).transpose(1, 2, 3, 0) 93 | 94 | data = 
np.concatenate(data.transpose(3, 0, 1, 2), axis=2).transpose(2, 0, 1) 95 | data = torch.from_numpy(data) 96 | label = torch.LongTensor(np.asarray([label])) 97 | 98 | return data.float(), label -------------------------------------------------------------------------------- /src_gestformer/models/backbones/vgg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchvision.models.utils import load_state_dict_from_url 4 | 5 | 6 | __all__ = [ 7 | 'VGG', 'vgg16', 'vgg16_bn', 8 | ] 9 | 10 | 11 | model_urls = { 12 | 'vgg16': 'https://download.pytorch.org/models/vgg16-397923af.pth', 13 | 'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth', 14 | } 15 | 16 | 17 | class VGG(nn.Module): 18 | 19 | def __init__(self, features, num_classes=1000, init_weights=True): 20 | super(VGG, self).__init__() 21 | self.features = features 22 | self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) 23 | self.classifier = nn.Sequential( 24 | nn.Linear(512 * 7 * 7, 4096), 25 | nn.ReLU(True), 26 | nn.Dropout(), 27 | nn.Linear(4096, 4096), 28 | nn.ReLU(True), 29 | nn.Dropout(), 30 | nn.Linear(4096, num_classes), 31 | ) 32 | if init_weights: 33 | self._initialize_weights() 34 | 35 | def forward(self, x): 36 | x = self.features(x) 37 | x = self.avgpool(x) 38 | x = torch.flatten(x, 1) 39 | return x 40 | 41 | def _initialize_weights(self): 42 | for m in self.modules(): 43 | if isinstance(m, nn.Conv2d): 44 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 45 | if m.bias is not None: 46 | nn.init.constant_(m.bias, 0) 47 | elif isinstance(m, nn.BatchNorm2d): 48 | nn.init.constant_(m.weight, 1) 49 | nn.init.constant_(m.bias, 0) 50 | elif isinstance(m, nn.Linear): 51 | nn.init.normal_(m.weight, 0, 0.01) 52 | nn.init.constant_(m.bias, 0) 53 | 54 | 55 | def make_layers(cfg, batch_norm=False): 56 | layers = [] 57 | in_channels = 3 58 | for v in cfg: 59 | if v == 'M': 60 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 61 | else: 62 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 63 | if batch_norm: 64 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 65 | else: 66 | layers += [conv2d, nn.ReLU(inplace=True)] 67 | in_channels = v 68 | return nn.Sequential(*layers) 69 | 70 | 71 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'] 72 | 73 | 74 | def _vgg(arch, batch_norm, pretrained, in_planes, drop_prob, **kwargs): 75 | if pretrained: 76 | kwargs['init_weights'] = False 77 | model = VGG(make_layers(cfg, batch_norm=batch_norm), **kwargs) 78 | if pretrained: 79 | state_dict = load_state_dict_from_url(model_urls[arch], 80 | progress=True) 81 | model.load_state_dict(state_dict) 82 | 83 | if in_planes in [1, 2]: 84 | w = model.features._modules['0']._parameters['weight'].data 85 | model.features._modules['0'] = nn.Conv2d(in_planes, 64, kernel_size=3, padding=1, bias=batch_norm is False) 86 | if in_planes == 1: 87 | model.conv1._parameters['weight'].data = w.mean(dim=1, keepdim=True) 88 | else: 89 | model.conv1._parameters['weight'].data = w[:, :-1] * 1.5 90 | else: 91 | model.features._modules['0'] = nn.Conv2d(in_planes, 64, kernel_size=3, padding=1, bias=batch_norm is False) 92 | if drop_prob > 0: 93 | new_features = list() 94 | for el in model.features: 95 | new_features.append(el) 96 | if isinstance(el, nn.ReLU): 97 | new_features.append(nn.Dropout2d(p=drop_prob)) 98 | model.features = nn.Sequential(*new_features) 99 | 100 | model.avgpool = 
nn.AdaptiveAvgPool2d((1, 1)) 101 | model.classifier = None 102 | return model 103 | 104 | 105 | def vgg16(pretrained=False, in_planes: int=3, dropout2d: float=0., **kwargs): 106 | r"""VGG 16-layer model (configuration "D") 107 | `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_ 108 | 109 | Args: 110 | pretrained (bool): If True, returns a model pre-trained on ImageNet 111 | progress (bool): If True, displays a progress bar of the download to stderr 112 | """ 113 | return _vgg('vgg16', False, pretrained, in_planes, dropout2d, **kwargs) 114 | 115 | 116 | def vgg16_bn(pretrained=False, in_planes: int=3, dropout2d: float=0., **kwargs): 117 | r"""VGG 16-layer model (configuration "D") with batch normalization 118 | `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_ 119 | 120 | Args: 121 | pretrained (bool): If True, returns a model pre-trained on ImageNet 122 | progress (bool): If True, displays a progress bar of the download to stderr 123 | """ 124 | return _vgg('vgg16_bn', True, pretrained, in_planes, dropout2d, **kwargs) -------------------------------------------------------------------------------- /src_gestformer/datasets/Briareo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import torch 4 | from pathlib import Path 5 | 6 | import cv2 7 | import numpy as np 8 | 9 | from torch.utils.data.dataset import Dataset 10 | 11 | from datasets.utils.normals import normals_multi 12 | from datasets.utils.normalize import normalize 13 | from datasets.utils.optical_flow import dense_flow 14 | 15 | from datasets.utils.utils_briareo import from_json_to_list 16 | 17 | 18 | class Briareo(Dataset): 19 | """Briareo Dataset class""" 20 | def __init__(self, configer, path, split="train", data_type='depth', transforms=None, n_frames=30, optical_flow=False): 21 | """Constructor method for Briareo Dataset class 22 | 23 | Args: 24 | configer (Configer): Configer object for current procedure phase (train, test, val) 25 | split (str, optional): Current procedure phase (train, test, val) 26 | data_type (str, optional): Input data type (depth, rgb, normals, ir) 27 | transform (Object, optional): Data augmentation transformation for every data 28 | n_frames (int, optional): Number of frames selected for every input clip 29 | optical_flow (bool, optional): Flag to choose if calculate optical flow or not 30 | 31 | """ 32 | super().__init__() 33 | 34 | self.dataset_path = Path(path) 35 | self.split = split 36 | self.data_type = data_type 37 | self.optical_flow = optical_flow 38 | if self.data_type in ["normal", "normals"] and self.optical_flow: 39 | raise NotImplementedError("Optical flow for normals image is not supported.") 40 | 41 | self.transforms = transforms 42 | self.n_frames = n_frames if not optical_flow else n_frames + 1 43 | 44 | print("Loading Briareo {} dataset...".format(split.upper()), end=" ") 45 | if data_type in ["normal", "normals"]: 46 | data_type = "depth" 47 | data = np.load(self.dataset_path / "splits" / (self.split if self.split != "val" else "train") / 48 | "{}_{}.npz".format(data_type, self.split), allow_pickle=True)['arr_0'] 49 | 50 | # Prepare clip for the selected number of frames n_frame 51 | fixed_data = list() 52 | for i, record in enumerate(data): 53 | paths = record['data'] 54 | 55 | center_of_list = math.floor(len(paths) / 2) 56 | crop_limit = math.floor(self.n_frames / 2) 57 | 58 | start = center_of_list - crop_limit 59 | end = center_of_list + crop_limit 60 | 
paths_cropped = paths[start: end + 1 if self.n_frames % 2 == 1 else end] 61 | if self.data_type == 'leapmotion': 62 | valid = np.array(record['valid'][start: end + 1 if self.n_frames % 2 == 1 else end]) 63 | if valid.sum() == len(valid): 64 | data[i]['data'] = paths_cropped 65 | fixed_data.append(data[i]) 66 | else: 67 | data[i]['data'] = paths_cropped 68 | fixed_data.append(data[i]) 69 | 70 | self.data = np.array(fixed_data) 71 | print("done.") 72 | 73 | def __len__(self): 74 | return len(self.data) 75 | 76 | def __getitem__(self, idx): 77 | paths = self.data[idx]['data'] 78 | label = self.data[idx]['label'] 79 | 80 | clip = list() 81 | for p in paths: 82 | if self.data_type == "leapmotion": 83 | img = from_json_to_list(os.path.join(self.dataset_path, p))[0] 84 | else: 85 | if self.data_type in ["depth", "normal", "normals"]: 86 | img = np.load(str(self.dataset_path / p), allow_pickle=True)['arr_0'] 87 | if self.data_type in ["normal", "normals"]: 88 | img *= 1000 89 | elif self.data_type in ["ir"]: 90 | img = cv2.imread(str(self.dataset_path / p), cv2.IMREAD_ANYDEPTH) 91 | else: 92 | img = cv2.imread(str(self.dataset_path / p), cv2.IMREAD_COLOR) 93 | img = cv2.resize(img, (224, 224)) 94 | if self.data_type != "rgb": 95 | img = np.expand_dims(img, axis=2) 96 | clip.append(img) 97 | 98 | clip = np.array(clip).transpose(1, 2, 3, 0) 99 | 100 | if self.data_type in ["normal", "normals"]: 101 | clip = normals_multi(clip) 102 | else: 103 | if self.optical_flow: 104 | clip = dense_flow(clip, self.data_type == "rgb") 105 | clip = normalize(clip) 106 | 107 | if self.transforms is not None: 108 | aug_det = self.transforms.to_deterministic() 109 | clip = np.array([aug_det.augment_image(clip[..., i]) for i in range(clip.shape[-1])]).transpose(1, 2, 3, 0) 110 | 111 | clip = torch.from_numpy(clip.reshape(clip.shape[0], clip.shape[1], -1).transpose(2, 0, 1)) 112 | label = torch.LongTensor(np.asarray([label])) 113 | return clip.float(), label -------------------------------------------------------------------------------- /src_gestformer/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from torch.utils.data import DataLoader 5 | import imgaug.augmenters as iaa 6 | 7 | # Import Datasets 8 | from datasets.Briareo import Briareo 9 | from datasets.NVGestures import NVGesture 10 | from models.model_utilizer import ModuleUtilizer 11 | 12 | # Import Model 13 | from models.temporal import GestureTransoformer 14 | 15 | # Import Utils 16 | from tqdm import tqdm 17 | from utils.average_meter import AverageMeter 18 | from torchstat import stat 19 | import time 20 | import torchsummary 21 | from fvcore.nn import FlopCountAnalysis 22 | 23 | 24 | # Setting seeds 25 | def worker_init_fn(worker_id): 26 | np.random.seed(torch.initial_seed() % 2 ** 32) 27 | 28 | class GestureTest(object): 29 | """Gesture Recognition Test class 30 | 31 | Attributes: 32 | configer (Configer): Configer object, contains procedure configuration. 
33 | train_loader (torch.utils.data.DataLoader): Train data loader variable 34 | val_loader (torch.utils.data.DataLoader): Val data loader variable 35 | test_loader (torch.utils.data.DataLoader): Test data loader variable 36 | net (torch.nn.Module): Network used for the current procedure 37 | lr (int): Learning rate value 38 | optimizer (torch.nn.optim.optimizer): Optimizer for training procedure 39 | iters (int): Starting iteration number, not zero if resuming training 40 | epoch (int): Starting epoch number, not zero if resuming training 41 | scheduler (torch.optim.lr_scheduler): Scheduler to utilize during training 42 | 43 | """ 44 | 45 | def __init__(self, configer): 46 | self.configer = configer 47 | 48 | self.data_path = configer.get("data", "data_path") #: str: Path to data directory 49 | 50 | # Train val and test accuracy 51 | self.accuracy = AverageMeter() 52 | 53 | # DataLoaders 54 | self.data_loader = None 55 | 56 | # Module load and save utility 57 | self.device = self.configer.get("device") 58 | self.model_utility = ModuleUtilizer(self.configer) #: Model utility for load, save and update optimizer 59 | self.net = None 60 | 61 | # Training procedure 62 | self.transforms = None 63 | 64 | # Other useful data 65 | self.backbone = self.configer.get("network", "backbone") #: str: Backbone type 66 | self.in_planes = None #: int: Input channels 67 | self.clip_length = self.configer.get("data", "n_frames") #: int: Number of frames per sequence 68 | self.n_classes = self.configer.get("data", "n_classes") #: int: Total number of classes for dataset 69 | self.data_type = self.configer.get("data", "type") #: str: Type of data (rgb, depth, ir, leapmotion) 70 | self.dataset = self.configer.get("dataset").lower() #: str: Type of dataset 71 | self.optical_flow = self.configer.get("data", "optical_flow") 72 | if self.optical_flow is None: 73 | self.optical_flow = True 74 | 75 | def init_model(self): 76 | """Initialize model and other data for procedure""" 77 | 78 | if self.optical_flow is True: 79 | self.in_planes = 2 80 | elif self.data_type in ["depth", "ir"]: 81 | self.in_planes = 1 82 | else: 83 | self.in_planes = 3 84 | 85 | # Selecting correct model and normalization variable based on type variable 86 | self.net = GestureTransoformer(self.backbone, self.in_planes, self.n_classes, 87 | pretrained=self.configer.get("network", "pretrained"), 88 | n_head=self.configer.get("network", "n_head"), 89 | dropout_backbone=self.configer.get("network", "dropout2d"), 90 | dropout_transformer=self.configer.get("network", "dropout1d"), 91 | dff=self.configer.get("network", "ff_size"), 92 | n_module=self.configer.get("network", "n_module") 93 | ) 94 | 95 | self.net, _, _, _ = self.model_utility.load_net(self.net) 96 | 97 | # Selecting Dataset and DataLoader 98 | if self.dataset == "briareo": 99 | Dataset = Briareo 100 | self.transforms = iaa.CenterCropToFixedSize(200, 200) 101 | elif self.dataset == "nvgestures": 102 | Dataset = NVGesture 103 | self.transforms = iaa.CenterCropToFixedSize(256, 192) 104 | else: 105 | raise NotImplementedError(f"Dataset not supported: {self.configer.get('dataset')}") 106 | 107 | # Setting Dataloaders 108 | self.data_loader = DataLoader( 109 | Dataset(self.configer, self.data_path, split="test", data_type=self.data_type, 110 | transforms=self.transforms, n_frames=self.clip_length, 111 | optical_flow=self.optical_flow), 112 | batch_size=1, shuffle=False, drop_last=True, 113 | num_workers=self.configer.get('solver', 'workers'), pin_memory=True, worker_init_fn=worker_init_fn) 
114 | 115 | def __test(self): 116 | """Testing function.""" 117 | self.net.eval() 118 | c = 0 119 | tot = 0 120 | with torch.no_grad(): 121 | for i, data_tuple in enumerate(tqdm(self.data_loader, desc="Test")): 122 | """ 123 | input, gt 124 | """ 125 | inputs = data_tuple[0].to(self.device) 126 | gt = data_tuple[1].to(self.device) 127 | # print(self.device) 128 | flops = FlopCountAnalysis(self.net, inputs) 129 | print(flops.total()/1e9) 130 | torchsummary.summary(self.net, inputs[0].shape) 131 | 132 | start_time = time.time() 133 | output = self.net(inputs) 134 | end_time = time.time() 135 | 136 | predicted = torch.argmax(output.detach(), dim=1) 137 | correct = gt.detach().squeeze(dim=1) 138 | 139 | if predicted == correct: 140 | c += 1 141 | tot += 1 142 | break 143 | 144 | accuracy = c / tot 145 | inference_time = end_time - start_time 146 | print("Inference time:", inference_time, "seconds") 147 | 148 | print("Accuracy: {}".format(accuracy)) 149 | 150 | def test(self): 151 | self.__test() 152 | 153 | 154 | def update_metrics(self, split: str, loss, bs, accuracy=None): 155 | self.losses[split].update(loss, bs) 156 | if accuracy is not None: 157 | self.accuracy[split].update(accuracy, bs) 158 | if split == "train" and self.iters % self.save_iters == 0: 159 | self.tbx_summary.add_scalar('{}_loss'.format(split), self.losses[split].avg, self.iters) 160 | self.tbx_summary.add_scalar('{}_accuracy'.format(split), self.accuracy[split].avg, self.iters) 161 | self.losses[split].reset() 162 | self.accuracy[split].reset() -------------------------------------------------------------------------------- /src_gestformer/models/model_utilizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | 5 | from pathlib import Path 6 | 7 | class ModuleUtilizer(object): 8 | """Module utility class 9 | 10 | Attributes: 11 | configer (Configer): Configer object, contains procedure configuration. 12 | 13 | """ 14 | def __init__(self, configer): 15 | """Class constructor for Module utility""" 16 | self.configer = configer 17 | self.device = self.configer.get("device") 18 | 19 | self.save_policy = self.configer.get("checkpoints", "save_policy") 20 | if self.save_policy in ["early_stop", "earlystop"]: 21 | self.save = self.early_stop 22 | elif self.save_policy == "all": 23 | self.save = self.save_all 24 | else: 25 | self.save = self.save_best 26 | 27 | self.best_accuracy = 0 28 | self.last_improvement = 0 29 | 30 | def update_optimizer(self, net, iters): 31 | """Load optimizer and adjust learning rate during training, if using SGD. 
32 | 33 | Args: 34 | net (torch.nn.Module): Module in use 35 | iters (int): current iteration number 36 | 37 | Returns: 38 | optimizer (torch.optim.optimizer): PyTorch Optimizer 39 | lr (float): Learning rate for training procedure 40 | 41 | """ 42 | optim = self.configer.get('solver', 'type') 43 | decay = self.configer.get('solver', 'weight_decay') 44 | 45 | if optim == "Adam": 46 | print("Using Adam.") 47 | lr = self.configer.get('solver', 'base_lr') 48 | optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr, 49 | weight_decay=decay) 50 | 51 | elif optim == "AdamW": 52 | lr = self.configer.get('solver', 'base_lr') 53 | optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, net.parameters()), lr=lr, 54 | weight_decay=decay) 55 | 56 | elif optim == "RMSProp": 57 | lr = self.configer.get('solver', 'base_lr') 58 | optimizer = torch.optim.RMSprop(filter(lambda p: p.requires_grad, net.parameters()), lr=lr, 59 | weight_decay=decay) 60 | 61 | elif optim == "SGD": 62 | print("Using SGD") 63 | policy = self.configer.get('solver', 'lr_policy') 64 | 65 | if policy == 'fixed': 66 | lr = self.configer.get('solver', 'base_lr') 67 | 68 | elif policy == 'step': 69 | gamma = self.configer.get('solver', 'gamma') 70 | ratio = gamma ** (iters // self.configer.get('solver', 'step_size')) 71 | lr = self.configer.get('solver', 'base_lr') * ratio 72 | 73 | elif policy == 'exp': 74 | lr = self.configer.get('solver', 'base_lr') * (self.configer.get('solver', 'gamma') ** iters) 75 | 76 | elif policy == 'inv': 77 | power = -self.configer.get('solver', 'power') 78 | ratio = (1 + self.configer.get('solver', 'gamma') * iters) ** power 79 | lr = self.configer.get('solver', 'base_lr') * ratio 80 | 81 | elif policy == 'multistep': 82 | lr = self.configer.get('solver', 'base_lr') 83 | for step_value in self.configer.get('solver', 'stepvalue'): 84 | if iters >= step_value: 85 | lr *= self.configer.get('solver', 'gamma') 86 | else: 87 | break 88 | else: 89 | raise NotImplementedError('Policy:{} is not valid.'.format(policy)) 90 | 91 | optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr = lr, 92 | momentum=self.configer.get('solver', 'momentum'), weight_decay=decay) 93 | 94 | else: 95 | raise NotImplementedError('Optimizer: {} is not valid.'.format(optim)) 96 | 97 | return optimizer, lr 98 | 99 | def load_net(self, net): 100 | """Loading net method. If resume is True load from provided checkpoint, if False load new DataParallel 101 | 102 | Args: 103 | net (torch.nn.Module): Module in use 104 | 105 | Returns: 106 | net (torch.nn.DataParallel): Loaded Network module 107 | iters (int): Loaded current iteration number, 0 if Resume is False 108 | epoch (int): Loaded current epoch number, 0 if Resume is False 109 | optimizer (torch.nn.optimizer): Loaded optimizer state, None if Resume is False 110 | 111 | """ 112 | iters = 0 113 | epoch = 0 114 | optimizer = None 115 | if self.configer.get('resume') is not None: 116 | print('Restoring checkpoint: ', self.configer.get('resume')) 117 | checkpoint_dict = torch.load(self.configer.get('resume')) 118 | # Remove "module." 
from DataParallel, if present 119 | checkpoint_dict['state_dict'] = {k[len('module.'):] if k.startswith('module.') else k: v for k, v in 120 | checkpoint_dict['state_dict'].items()} 121 | net.load_state_dict(checkpoint_dict['state_dict']) 122 | iters = checkpoint_dict['iter'] if 'iter' in checkpoint_dict else 0 123 | optimizer = checkpoint_dict['optimizer'] if 'optimizer' in checkpoint_dict else None 124 | epoch = checkpoint_dict['epoch'] if 'epoch' in checkpoint_dict else None 125 | net = nn.DataParallel(net, device_ids=self.configer.get('gpu')).to(self.device) 126 | return net, iters, epoch, optimizer 127 | 128 | def _save_net(self, net, optimizer, iters, epoch, all=False): 129 | """Saving net state method. 130 | 131 | Args: 132 | net (torch.nn.Module): Module in use 133 | optimizer (torch.nn.optimizer): Optimizer state to save 134 | iters (int): Current iteration number to save 135 | epoch (int): Current epoch number to save 136 | 137 | """ 138 | state = { 139 | 'iter': iters, 140 | 'epoch': epoch, 141 | 'state_dict': net.state_dict(), 142 | 'optimizer': optimizer.state_dict() 143 | } 144 | checkpoints_dir = str(Path(self.configer.get('checkpoints', 'save_dir')) / self.configer.get("dataset")) 145 | if not os.path.exists(checkpoints_dir): 146 | os.makedirs(checkpoints_dir) 147 | if all: 148 | latest_name = '{}_{}.pth'.format(self.configer.get('checkpoints', 'save_name'), epoch) 149 | else: 150 | latest_name = 'best_{}.pth'.format(self.configer.get('checkpoints', 'save_name')) 151 | torch.save(state, os.path.join(checkpoints_dir, latest_name)) 152 | 153 | def save_all(self, accuracy, net, optimizer, iters, epoch): 154 | self._save_net(net, optimizer, iters, epoch, all=True) 155 | return accuracy 156 | 157 | def save_best(self, accuracy, net, optimizer, iters, epoch): 158 | if accuracy > self.best_accuracy: 159 | self.best_accuracy = accuracy 160 | self._save_net(net, optimizer, iters, epoch) 161 | print(accuracy) 162 | return self.best_accuracy 163 | else: 164 | return 0 165 | 166 | def early_stop(self, accuracy, net, optimizer, iters, epoch): 167 | ret = self.save_best(accuracy, net, optimizer, iters, epoch) 168 | if ret > 0: 169 | self.last_improvement = 0 170 | else: 171 | self.last_improvement += 1 172 | if self.last_improvement >= self.configer.get("checkpoints", "early_stop"): 173 | return -1 174 | else: 175 | return ret -------------------------------------------------------------------------------- /src_gestformer/utils/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import csv 4 | import pandas as pd 5 | 6 | from torch.utils.data import DataLoader 7 | import imgaug.augmenters as iaa 8 | 9 | # Import Datasets 10 | from datasets.Briareo import Briareo 11 | from datasets.NVGestures import NVGesture 12 | from models.model_utilizer import ModuleUtilizer 13 | 14 | # Import Model 15 | from models.temporal import GestureTransoformer 16 | 17 | # Import Utils 18 | from tqdm import tqdm 19 | from utils.average_meter import AverageMeter 20 | 21 | # Setting seeds 22 | 23 | from torchstat import stat 24 | def worker_init_fn(worker_id): 25 | np.random.seed(torch.initial_seed() % 2 ** 32) 26 | 27 | class GestureTest(object): 28 | """Gesture Recognition Test class 29 | 30 | Attributes: 31 | configer (Configer): Configer object, contains procedure configuration. 
32 | train_loader (torch.utils.data.DataLoader): Train data loader variable 33 | val_loader (torch.utils.data.DataLoader): Val data loader variable 34 | test_loader (torch.utils.data.DataLoader): Test data loader variable 35 | net (torch.nn.Module): Network used for the current procedure 36 | lr (int): Learning rate value 37 | optimizer (torch.nn.optim.optimizer): Optimizer for training procedure 38 | iters (int): Starting iteration number, not zero if resuming training 39 | epoch (int): Starting epoch number, not zero if resuming training 40 | scheduler (torch.optim.lr_scheduler): Scheduler to utilize during training 41 | 42 | """ 43 | 44 | def __init__(self, configer): 45 | self.configer = configer 46 | 47 | self.data_path = configer.get("data", "data_path") #: str: Path to data directory 48 | 49 | # Train val and test accuracy 50 | self.accuracy = AverageMeter() 51 | 52 | # DataLoaders 53 | self.data_loader = None 54 | 55 | # Module load and save utility 56 | self.device = self.configer.get("device") 57 | self.model_utility = ModuleUtilizer(self.configer) #: Model utility for load, save and update optimizer 58 | self.net = None 59 | 60 | # Training procedure 61 | self.transforms = None 62 | 63 | # Other useful data 64 | self.backbone = self.configer.get("network", "backbone") #: str: Backbone type 65 | self.in_planes = None #: int: Input channels 66 | self.clip_length = self.configer.get("data", "n_frames") #: int: Number of frames per sequence 67 | self.n_classes = self.configer.get("data", "n_classes") #: int: Total number of classes for dataset 68 | self.data_type = self.configer.get("data", "type") #: str: Type of data (rgb, depth, ir, leapmotion) 69 | self.dataset = self.configer.get("dataset").lower() #: str: Type of dataset 70 | self.optical_flow = self.configer.get("data", "optical_flow") 71 | if self.optical_flow is None: 72 | self.optical_flow = True 73 | 74 | def init_model(self): 75 | """Initialize model and other data for procedure""" 76 | 77 | if self.optical_flow is True: 78 | self.in_planes = 2 79 | elif self.data_type in ["depth", "ir"]: 80 | self.in_planes = 1 81 | else: 82 | self.in_planes = 3 83 | 84 | # Selecting correct model and normalization variable based on type variable 85 | self.net = GestureTransoformer(self.backbone, self.in_planes, self.n_classes, 86 | pretrained=self.configer.get("network", "pretrained"), 87 | n_head=self.configer.get("network", "n_head"), 88 | dropout_backbone=self.configer.get("network", "dropout2d"), 89 | dropout_transformer=self.configer.get("network", "dropout1d"), 90 | dff=self.configer.get("network", "ff_size"), 91 | n_module=self.configer.get("network", "n_module") 92 | ) 93 | 94 | self.net, _, _, _ = self.model_utility.load_net(self.net) 95 | 96 | # Selecting Dataset and DataLoader 97 | if self.dataset == "briareo": 98 | Dataset = Briareo 99 | self.transforms = iaa.CenterCropToFixedSize(200, 200) 100 | elif self.dataset == "nvgestures": 101 | Dataset = NVGesture 102 | self.transforms = iaa.CenterCropToFixedSize(256, 192) 103 | else: 104 | raise NotImplementedError(f"Dataset not supported: {self.configer.get('dataset')}") 105 | 106 | # Setting Dataloaders 107 | self.data_loader = DataLoader( 108 | Dataset(self.configer, self.data_path, split="val", data_type=self.data_type, 109 | transforms=self.transforms, n_frames=self.clip_length, 110 | optical_flow=self.optical_flow), 111 | batch_size=1, shuffle=False, drop_last=True, 112 | num_workers=self.configer.get('solver', 'workers'), pin_memory=True, worker_init_fn=worker_init_fn) 113 
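    # The evaluation routine below runs the restored network in eval mode over the
    # "val" split with batch_size=1. For every clip it appends the softmax
    # probabilities as one CSV row to csv/Briareo/rgbop.csv (a hard-coded path that is
    # used regardless of the selected dataset) and accumulates top-1 accuracy as
    # correct / total, printed once the loader is exhausted. Note that open(..., 'a')
    # creates the file but not its parent folders, so the csv/Briareo/ directory must
    # exist before running the test.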
| 114 | def __test(self): 115 | """Testing function.""" 116 | self.net.eval() 117 | # self.net.cuda() 118 | num_params = sum(p.numel() for p in self.net.parameters()) 119 | print(f'Number of parameters in the model: {num_params}') 120 | device = next(self.net.parameters()).device 121 | print("device", device) 122 | 123 | if 'cuda' in str(device): 124 | print("Model is on CUDA") 125 | else: 126 | print("Model is on CPU") 127 | c = 0 128 | tot = 0 129 | with torch.no_grad(): 130 | with open('csv/Briareo/rgbop.csv', 'a', newline='') as csvfile: 131 | for i, data_tuple in enumerate(tqdm(self.data_loader, desc="Test")): 132 | """ 133 | input, gt 134 | """ 135 | # self.device= torch.device('cpu') 136 | 137 | inputs = data_tuple[0].to(self.device) 138 | gt = data_tuple[1].to(self.device) 139 | 140 | output = self.net(inputs) 141 | # with torch.cuda.device(0): 142 | # # print(inputs.shape.device) 143 | # print(stat(self.net, inputs[0].shape)) 144 | #print(output) 145 | #_, predicted = torch.max(output, 1) 146 | #print(predicted) 147 | 148 | sm = torch.nn.Softmax(dim=1) 149 | prob=sm(output) 150 | #print(i) 151 | predicted = torch.argmax(output.detach(), dim=1) 152 | #print(predicted) 153 | arr = prob.cpu().numpy() 154 | # print(arr) 155 | 156 | hist_df = pd.DataFrame(arr) 157 | # print(hist_df) 158 | #log_file.close() 159 | hist_df.to_csv(csvfile, mode='a',header=False, index=False) 160 | 161 | correct = gt.detach().squeeze(dim=1) 162 | #print(gt) 163 | #hist_df = pd.DataFrame(correct) 164 | #hist_df.to_csv(csvfile, mode='a',header=False) 165 | 166 | if predicted == correct: 167 | c += 1 168 | tot += 1 169 | 170 | #print(predicted) 171 | #print(correct) 172 | accuracy = c / tot 173 | #print(tot) 174 | 175 | print("Accuracy: {}".format(accuracy)) 176 | 177 | def test(self): 178 | self.__test() 179 | 180 | 181 | def update_metrics(self, split: str, loss, bs, accuracy=None): 182 | self.losses[split].update(loss, bs) 183 | if accuracy is not None: 184 | self.accuracy[split].update(accuracy, bs) 185 | if split == "train" and self.iters % self.save_iters == 0: 186 | self.tbx_summary.add_scalar('{}_loss'.format(split), self.losses[split].avg, self.iters) 187 | self.tbx_summary.add_scalar('{}_accuracy'.format(split), self.accuracy[split].avg, self.iters) 188 | self.losses[split].reset() 189 | self.accuracy[split].reset() -------------------------------------------------------------------------------- /src_gestformer/models/backbones/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchvision.models.utils import load_state_dict_from_url 4 | 5 | 6 | __all__ = ['ResNet', 'resnet18'] 7 | 8 | 9 | model_urls = { 10 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 11 | } 12 | 13 | 14 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 15 | """3x3 convolution with padding""" 16 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 17 | padding=dilation, groups=groups, bias=False, dilation=dilation) 18 | 19 | 20 | def conv1x1(in_planes, out_planes, stride=1): 21 | """1x1 convolution""" 22 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 23 | 24 | 25 | class BasicBlock(nn.Module): 26 | expansion = 1 27 | 28 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 29 | base_width=64, dilation=1, dropout=0., norm_layer=None): 30 | super(BasicBlock, self).__init__() 31 | if norm_layer is None: 32 | norm_layer 
= nn.BatchNorm2d 33 | if groups != 1 or base_width != 64: 34 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 35 | if dilation > 1: 36 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 37 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1 38 | self.conv1 = conv3x3(inplanes, planes, stride) 39 | self.bn1 = norm_layer(planes) 40 | self.relu = nn.ReLU(inplace=True) 41 | self.conv2 = conv3x3(planes, planes) 42 | self.bn2 = norm_layer(planes) 43 | self.downsample = downsample 44 | self.stride = stride 45 | self.dropout = nn.Dropout2d(p=dropout) 46 | 47 | def forward(self, x): 48 | identity = x 49 | 50 | out = self.conv1(x) 51 | out = self.bn1(out) 52 | out = self.relu(out) 53 | out = self.dropout(out) 54 | 55 | out = self.conv2(out) 56 | out = self.bn2(out) 57 | 58 | if self.downsample is not None: 59 | identity = self.downsample(x) 60 | out = self.dropout(out) 61 | 62 | out += identity 63 | out = self.relu(out) 64 | out = self.dropout(out) 65 | 66 | return out 67 | 68 | 69 | class ResNet(nn.Module): 70 | 71 | def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, 72 | groups=1, width_per_group=64, dropout=0., replace_stride_with_dilation=None, 73 | norm_layer=None): 74 | super(ResNet, self).__init__() 75 | if norm_layer is None: 76 | norm_layer = nn.BatchNorm2d 77 | self._norm_layer = norm_layer 78 | 79 | self.inplanes = 64 80 | self.dilation = 1 81 | self.drop_prob = dropout 82 | if replace_stride_with_dilation is None: 83 | # each element in the tuple indicates if we should replace 84 | # the 2x2 stride with a dilated convolution instead 85 | replace_stride_with_dilation = [False, False, False] 86 | if len(replace_stride_with_dilation) != 3: 87 | raise ValueError("replace_stride_with_dilation should be None " 88 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) 89 | self.groups = groups 90 | self.base_width = width_per_group 91 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) 92 | self.bn1 = norm_layer(self.inplanes) 93 | self.relu = nn.ReLU(inplace=True) 94 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 95 | self.layer1 = self._make_layer(block, 64, layers[0]) 96 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, 97 | dilate=replace_stride_with_dilation[0]) 98 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, 99 | dilate=replace_stride_with_dilation[1]) 100 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, 101 | dilate=replace_stride_with_dilation[2]) 102 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 103 | self.fc = nn.Linear(512 * block.expansion, num_classes) 104 | 105 | for m in self.modules(): 106 | if isinstance(m, nn.Conv2d): 107 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 108 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 109 | nn.init.constant_(m.weight, 1) 110 | nn.init.constant_(m.bias, 0) 111 | 112 | # Zero-initialize the last BN in each residual branch, 113 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
114 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 115 | if zero_init_residual: 116 | for m in self.modules(): 117 | if isinstance(m, Bottleneck): 118 | nn.init.constant_(m.bn3.weight, 0) 119 | elif isinstance(m, BasicBlock): 120 | nn.init.constant_(m.bn2.weight, 0) 121 | 122 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False): 123 | norm_layer = self._norm_layer 124 | downsample = None 125 | previous_dilation = self.dilation 126 | if dilate: 127 | self.dilation *= stride 128 | stride = 1 129 | if stride != 1 or self.inplanes != planes * block.expansion: 130 | downsample = nn.Sequential( 131 | conv1x1(self.inplanes, planes * block.expansion, stride), 132 | norm_layer(planes * block.expansion), 133 | ) 134 | 135 | layers = [] 136 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups, 137 | self.base_width, previous_dilation, self.drop_prob, norm_layer)) 138 | self.inplanes = planes * block.expansion 139 | for _ in range(1, blocks): 140 | layers.append(block(self.inplanes, planes, groups=self.groups, 141 | base_width=self.base_width, dilation=self.dilation, 142 | dropout=self.drop_prob, norm_layer=norm_layer)) 143 | 144 | return nn.Sequential(*layers) 145 | 146 | def _forward_impl(self, x): 147 | # See note [TorchScript super()] 148 | x = self.conv1(x) 149 | x = self.bn1(x) 150 | x = self.relu(x) 151 | x = self.maxpool(x) 152 | 153 | x1 = self.layer1(x) 154 | x2 = self.layer2(x1) 155 | x3 = self.layer3(x2) 156 | x4 = self.layer4(x3) 157 | 158 | x = self.avgpool(x4) 159 | x = torch.flatten(x, 1) 160 | # print(x.shape) 161 | # print(x1.shape) 162 | # print(x2.shape) 163 | # print(x3.shape) 164 | # print(x4.shape) 165 | 166 | return x 167 | 168 | def forward(self, x): 169 | return self._forward_impl(x) 170 | 171 | 172 | def _resnet(arch, block, layers, pretrained, progress, in_planes, **kwargs): 173 | model = ResNet(block, layers, **kwargs) 174 | if pretrained: 175 | state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) 176 | model.load_state_dict(state_dict) 177 | if in_planes in [1, 2]: 178 | w = model._modules['conv1']._parameters['weight'].data 179 | model.conv1 = nn.Conv2d(in_planes, 64, kernel_size=7, stride=2, padding=3, bias=False) 180 | if in_planes == 1: 181 | model.conv1._parameters['weight'].data = w.mean(dim=1, keepdim=True) 182 | else: 183 | model.conv1._parameters['weight'].data = w[:, :-1] * 1.5 184 | else: 185 | model.conv1 = nn.Conv2d(in_planes, 64, kernel_size=7, stride=2, padding=3, bias=False) 186 | model.fc = None 187 | return model 188 | 189 | 190 | def resnet18(pretrained=False, in_planes: int=3, **kwargs): 191 | r"""ResNet-18 model from 192 | `"Deep Residual Learning for Image Recognition" `_ 193 | 194 | Args: 195 | pretrained (bool): If True, returns a model pre-trained on ImageNet 196 | progress (bool): If True, displays a progress bar of the download to stderr 197 | """ 198 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, True, in_planes, 199 | **kwargs) -------------------------------------------------------------------------------- /src_gestformer/datasets/utils/utils_briareo.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # full 675, full_no_fingers 145, mod 192 4 | def from_json_to_list(json_file): 5 | with open(json_file) as f: 6 | j = json.load(f) 7 | if j['frame'] != 'invalid': 8 | j_vector = [ 9 | # palm 10 | j['frame']['right_hand']['palm_position'][0], 11 | 
j['frame']['right_hand']['palm_position'][1], 12 | j['frame']['right_hand']['palm_position'][2], 13 | j['frame']['right_hand']['palm_position'][3], 14 | j['frame']['right_hand']['palm_position'][4], 15 | j['frame']['right_hand']['palm_position'][5], 16 | j['frame']['right_hand']['palm_normal'][0], 17 | j['frame']['right_hand']['palm_normal'][1], 18 | j['frame']['right_hand']['palm_normal'][2], 19 | j['frame']['right_hand']['palm_normal'][3], 20 | j['frame']['right_hand']['palm_normal'][4], 21 | j['frame']['right_hand']['palm_normal'][5], 22 | j['frame']['right_hand']['palm_velocity'][0], 23 | j['frame']['right_hand']['palm_velocity'][1], 24 | j['frame']['right_hand']['palm_velocity'][2], 25 | j['frame']['right_hand']['palm_velocity'][3], 26 | j['frame']['right_hand']['palm_velocity'][4], 27 | j['frame']['right_hand']['palm_velocity'][5], 28 | j['frame']['right_hand']['palm_width'], 29 | j['frame']['right_hand']['pinch_strength'], 30 | j['frame']['right_hand']['grab_strength'], 31 | j['frame']['right_hand']['direction'][0], 32 | j['frame']['right_hand']['direction'][1], 33 | j['frame']['right_hand']['direction'][2], 34 | j['frame']['right_hand']['direction'][3], 35 | j['frame']['right_hand']['direction'][4], 36 | j['frame']['right_hand']['direction'][5], 37 | j['frame']['right_hand']['sphere_center'][0], 38 | j['frame']['right_hand']['sphere_center'][1], 39 | j['frame']['right_hand']['sphere_center'][2], 40 | j['frame']['right_hand']['sphere_center'][3], 41 | j['frame']['right_hand']['sphere_center'][4], 42 | j['frame']['right_hand']['sphere_center'][5], 43 | j['frame']['right_hand']['sphere_radius'], 44 | # wrist 45 | j['frame']['right_hand']['wrist_position'][0], 46 | j['frame']['right_hand']['wrist_position'][1], 47 | j['frame']['right_hand']['wrist_position'][2], 48 | j['frame']['right_hand']['wrist_position'][3], 49 | j['frame']['right_hand']['wrist_position'][4], 50 | j['frame']['right_hand']['wrist_position'][5], 51 | # pointables 52 | j['frame']['right_hand']['pointables']['p_0']['tip_position'][0], 53 | j['frame']['right_hand']['pointables']['p_0']['tip_position'][1], 54 | j['frame']['right_hand']['pointables']['p_0']['tip_position'][2], 55 | j['frame']['right_hand']['pointables']['p_0']['tip_position'][3], 56 | j['frame']['right_hand']['pointables']['p_0']['tip_position'][4], 57 | j['frame']['right_hand']['pointables']['p_0']['tip_position'][5], 58 | j['frame']['right_hand']['pointables']['p_0']['tip_velocity'][0], 59 | j['frame']['right_hand']['pointables']['p_0']['tip_velocity'][1], 60 | j['frame']['right_hand']['pointables']['p_0']['tip_velocity'][2], 61 | j['frame']['right_hand']['pointables']['p_0']['tip_velocity'][3], 62 | j['frame']['right_hand']['pointables']['p_0']['tip_velocity'][4], 63 | j['frame']['right_hand']['pointables']['p_0']['tip_velocity'][5], 64 | j['frame']['right_hand']['pointables']['p_0']['direction'][0], 65 | j['frame']['right_hand']['pointables']['p_0']['direction'][1], 66 | j['frame']['right_hand']['pointables']['p_0']['direction'][2], 67 | j['frame']['right_hand']['pointables']['p_0']['direction'][3], 68 | j['frame']['right_hand']['pointables']['p_0']['direction'][4], 69 | j['frame']['right_hand']['pointables']['p_0']['direction'][5], 70 | j['frame']['right_hand']['pointables']['p_0']['width'], 71 | j['frame']['right_hand']['pointables']['p_0']['length'], 72 | float(j['frame']['right_hand']['pointables']['p_0']['is_extended']), 73 | j['frame']['right_hand']['pointables']['p_1']['tip_position'][0], 74 | 
j['frame']['right_hand']['pointables']['p_1']['tip_position'][1], 75 | j['frame']['right_hand']['pointables']['p_1']['tip_position'][2], 76 | j['frame']['right_hand']['pointables']['p_1']['tip_position'][3], 77 | j['frame']['right_hand']['pointables']['p_1']['tip_position'][4], 78 | j['frame']['right_hand']['pointables']['p_1']['tip_position'][5], 79 | j['frame']['right_hand']['pointables']['p_1']['tip_velocity'][0], 80 | j['frame']['right_hand']['pointables']['p_1']['tip_velocity'][1], 81 | j['frame']['right_hand']['pointables']['p_1']['tip_velocity'][2], 82 | j['frame']['right_hand']['pointables']['p_1']['tip_velocity'][3], 83 | j['frame']['right_hand']['pointables']['p_1']['tip_velocity'][4], 84 | j['frame']['right_hand']['pointables']['p_1']['tip_velocity'][5], 85 | j['frame']['right_hand']['pointables']['p_1']['direction'][0], 86 | j['frame']['right_hand']['pointables']['p_1']['direction'][1], 87 | j['frame']['right_hand']['pointables']['p_1']['direction'][2], 88 | j['frame']['right_hand']['pointables']['p_1']['direction'][3], 89 | j['frame']['right_hand']['pointables']['p_1']['direction'][4], 90 | j['frame']['right_hand']['pointables']['p_1']['direction'][5], 91 | j['frame']['right_hand']['pointables']['p_1']['width'], 92 | j['frame']['right_hand']['pointables']['p_1']['length'], 93 | float(j['frame']['right_hand']['pointables']['p_1']['is_extended']), 94 | j['frame']['right_hand']['pointables']['p_2']['tip_position'][0], 95 | j['frame']['right_hand']['pointables']['p_2']['tip_position'][1], 96 | j['frame']['right_hand']['pointables']['p_2']['tip_position'][2], 97 | j['frame']['right_hand']['pointables']['p_2']['tip_position'][3], 98 | j['frame']['right_hand']['pointables']['p_2']['tip_position'][4], 99 | j['frame']['right_hand']['pointables']['p_2']['tip_position'][5], 100 | j['frame']['right_hand']['pointables']['p_2']['tip_velocity'][0], 101 | j['frame']['right_hand']['pointables']['p_2']['tip_velocity'][1], 102 | j['frame']['right_hand']['pointables']['p_2']['tip_velocity'][2], 103 | j['frame']['right_hand']['pointables']['p_2']['tip_velocity'][3], 104 | j['frame']['right_hand']['pointables']['p_2']['tip_velocity'][4], 105 | j['frame']['right_hand']['pointables']['p_2']['tip_velocity'][5], 106 | j['frame']['right_hand']['pointables']['p_2']['direction'][0], 107 | j['frame']['right_hand']['pointables']['p_2']['direction'][1], 108 | j['frame']['right_hand']['pointables']['p_2']['direction'][2], 109 | j['frame']['right_hand']['pointables']['p_2']['direction'][3], 110 | j['frame']['right_hand']['pointables']['p_2']['direction'][4], 111 | j['frame']['right_hand']['pointables']['p_2']['direction'][5], 112 | j['frame']['right_hand']['pointables']['p_2']['width'], 113 | j['frame']['right_hand']['pointables']['p_2']['length'], 114 | float(j['frame']['right_hand']['pointables']['p_2']['is_extended']), 115 | j['frame']['right_hand']['pointables']['p_3']['tip_position'][0], 116 | j['frame']['right_hand']['pointables']['p_3']['tip_position'][1], 117 | j['frame']['right_hand']['pointables']['p_3']['tip_position'][2], 118 | j['frame']['right_hand']['pointables']['p_3']['tip_position'][3], 119 | j['frame']['right_hand']['pointables']['p_3']['tip_position'][4], 120 | j['frame']['right_hand']['pointables']['p_3']['tip_position'][5], 121 | j['frame']['right_hand']['pointables']['p_3']['tip_velocity'][0], 122 | j['frame']['right_hand']['pointables']['p_3']['tip_velocity'][1], 123 | j['frame']['right_hand']['pointables']['p_3']['tip_velocity'][2], 124 | 
j['frame']['right_hand']['pointables']['p_3']['tip_velocity'][3], 125 | j['frame']['right_hand']['pointables']['p_3']['tip_velocity'][4], 126 | j['frame']['right_hand']['pointables']['p_3']['tip_velocity'][5], 127 | j['frame']['right_hand']['pointables']['p_3']['direction'][0], 128 | j['frame']['right_hand']['pointables']['p_3']['direction'][1], 129 | j['frame']['right_hand']['pointables']['p_3']['direction'][2], 130 | j['frame']['right_hand']['pointables']['p_3']['direction'][3], 131 | j['frame']['right_hand']['pointables']['p_3']['direction'][4], 132 | j['frame']['right_hand']['pointables']['p_3']['direction'][5], 133 | j['frame']['right_hand']['pointables']['p_3']['width'], 134 | j['frame']['right_hand']['pointables']['p_3']['length'], 135 | float(j['frame']['right_hand']['pointables']['p_3']['is_extended']), 136 | j['frame']['right_hand']['pointables']['p_4']['tip_position'][0], 137 | j['frame']['right_hand']['pointables']['p_4']['tip_position'][1], 138 | j['frame']['right_hand']['pointables']['p_4']['tip_position'][2], 139 | j['frame']['right_hand']['pointables']['p_4']['tip_position'][3], 140 | j['frame']['right_hand']['pointables']['p_4']['tip_position'][4], 141 | j['frame']['right_hand']['pointables']['p_4']['tip_position'][5], 142 | j['frame']['right_hand']['pointables']['p_4']['tip_velocity'][0], 143 | j['frame']['right_hand']['pointables']['p_4']['tip_velocity'][1], 144 | j['frame']['right_hand']['pointables']['p_4']['tip_velocity'][2], 145 | j['frame']['right_hand']['pointables']['p_4']['tip_velocity'][3], 146 | j['frame']['right_hand']['pointables']['p_4']['tip_velocity'][4], 147 | j['frame']['right_hand']['pointables']['p_4']['tip_velocity'][5], 148 | j['frame']['right_hand']['pointables']['p_4']['direction'][0], 149 | j['frame']['right_hand']['pointables']['p_4']['direction'][1], 150 | j['frame']['right_hand']['pointables']['p_4']['direction'][2], 151 | j['frame']['right_hand']['pointables']['p_4']['direction'][3], 152 | j['frame']['right_hand']['pointables']['p_4']['direction'][4], 153 | j['frame']['right_hand']['pointables']['p_4']['direction'][5], 154 | j['frame']['right_hand']['pointables']['p_4']['width'], 155 | j['frame']['right_hand']['pointables']['p_4']['length'], 156 | float(j['frame']['right_hand']['pointables']['p_4']['is_extended']), 157 | ] 158 | else: 159 | j_vector = False 160 | 161 | return j_vector, j -------------------------------------------------------------------------------- /src_gestformer/models/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from einops import rearrange, repeat 5 | import torch.nn.functional as F 6 | from pytorch_wavelets import DWTForward, DWTInverse 7 | 8 | def position_embedding(input, d_model): 9 | input = input.view(-1, 1) 10 | dim = torch.arange(d_model // 2, dtype=torch.float32, device=input.device).view(1, -1) 11 | sin = torch.sin(input / 10000 ** (2 * dim / d_model)) 12 | cos = torch.cos(input / 10000 ** (2 * dim / d_model)) 13 | 14 | out = torch.zeros((input.shape[0], d_model), device=input.device) 15 | out[:, ::2] = sin 16 | out[:, 1::2] = cos 17 | return out 18 | 19 | def sinusoid_encoding_table(max_len, d_model): 20 | pos = torch.arange(max_len, dtype=torch.float32) 21 | out = position_embedding(pos, d_model) 22 | return out 23 | 24 | class ScaledDotProductAttention(nn.Module): 25 | """ 26 | Scaled dot-product attention 27 | """ 28 | 29 | def __init__(self, d_model, d_k, d_v, h): 30 | """ 31 | :param 
d_model: Output dimensionality of the model 32 | :param d_k: Dimensionality of queries and keys 33 | :param d_v: Dimensionality of values 34 | :param h: Number of heads 35 | """ 36 | super(ScaledDotProductAttention, self).__init__() 37 | self.fc_q = nn.Linear(d_model, h * d_k) 38 | self.fc_k = nn.Linear(d_model, h * d_k) 39 | self.fc_v = nn.Linear(d_model, h * d_v) 40 | self.fc_o = nn.Linear(h * d_v, d_model) 41 | 42 | self.d_model = d_model 43 | self.d_k = d_k 44 | self.d_v = d_v 45 | self.h = h 46 | 47 | self.init_weights(gain=1.0) 48 | 49 | def init_weights(self, gain=1.0): 50 | nn.init.xavier_normal_(self.fc_q.weight, gain=gain) 51 | nn.init.xavier_normal_(self.fc_k.weight, gain=gain) 52 | nn.init.xavier_normal_(self.fc_v.weight, gain=gain) 53 | nn.init.xavier_normal_(self.fc_o.weight, gain=gain) 54 | nn.init.constant_(self.fc_q.bias, 0) 55 | nn.init.constant_(self.fc_k.bias, 0) 56 | nn.init.constant_(self.fc_v.bias, 0) 57 | nn.init.constant_(self.fc_o.bias, 0) 58 | 59 | def forward(self, queries, keys, values): 60 | """ 61 | Computes 62 | :param queries: Queries (b_s, nq, d_model) 63 | :param keys: Keys (b_s, nk, d_model) 64 | :param values: Values (b_s, nk, d_model) 65 | :return: 66 | """ 67 | b_s, nq = queries.shape[:2] 68 | nk = keys.shape[1] 69 | q = self.fc_q(queries).view(b_s, nq, self.h, self.d_k).permute(0, 2, 1, 3) # (b_s, h, nq, d_k) 70 | k = self.fc_k(keys).view(b_s, nk, self.h, self.d_k).permute(0, 2, 3, 1) # (b_s, h, d_k, nk) 71 | v = self.fc_v(values).view(b_s, nk, self.h, self.d_v).permute(0, 2, 1, 3) # (b_s, h, nk, d_v) 72 | 73 | att = torch.matmul(q, k) / np.sqrt(self.d_k) # (b_s, h, nq, nk) 74 | 75 | att = torch.softmax(att, -1) 76 | 77 | out = torch.matmul(att, v).permute(0, 2, 1, 3).contiguous().view(b_s, nq, self.h * self.d_v) # (b_s, nq, h*d_v) 78 | out = self.fc_o(out) # (b_s, nq, d_model) 79 | return out 80 | 81 | class MultiHeadAttention(nn.Module): 82 | """ 83 | Multi-head attention layer with Dropout and Layer Normalization. 
84 | """ 85 | 86 | def __init__(self, d_model, d_k, d_v, h, dff=2048, dropout=.1): 87 | super(MultiHeadAttention, self).__init__() 88 | 89 | # self.attention = ScaledDotProductAttention(d_model=d_model, d_k=d_k, d_v=d_v, h=h) 90 | self.dropout = nn.Dropout(p=dropout) 91 | self.layer_norm = nn.LayerNorm(d_model) 92 | # self.layer_norm2 = nn.LayerNorm(d_model) 93 | 94 | self.fc = nn.Sequential(*[nn.Linear(d_model, dff), nn.ReLU(inplace=True), nn.Dropout(p=dropout), 95 | nn.Linear(dff, d_model)]) 96 | 97 | def forward(self, queries, keys, values): 98 | att = self.attention(queries, keys, values) 99 | att = self.dropout(att) 100 | # att = self.layer_norm(queries + att) 101 | att = self.fc(att) 102 | att = self.dropout(att) 103 | return self.layer_norm(queries + att) 104 | 105 | class ScaledDotProductAttention_(nn.Module): 106 | """ 107 | Scaled dot-product attention 108 | """ 109 | 110 | def __init__(self, d_model, d_k, d_v, h): 111 | """ 112 | :param d_model: Output dimensionality of the model 113 | :param d_k: Dimensionality of queries and keys 114 | :param d_v: Dimensionality of values 115 | :param h: Number of heads 116 | """ 117 | super(ScaledDotProductAttention_, self).__init__() 118 | # print(d_model) 119 | self.fc_q = nn.Linear(d_model, h * d_k) 120 | self.fc_k = nn.Linear(d_model, h * d_k) 121 | self.fc_v = nn.Linear(d_model, h * d_v) 122 | self.fc_o = nn.Linear(h * d_v, d_model) 123 | 124 | self.d_model = d_model 125 | self.d_k = d_k 126 | self.d_v = d_v 127 | self.h = h 128 | 129 | 130 | self.init_weights(gain=1.0) 131 | 132 | def init_weights(self, gain=1.0): 133 | nn.init.xavier_normal_(self.fc_q.weight, gain=gain) 134 | nn.init.xavier_normal_(self.fc_k.weight, gain=gain) 135 | nn.init.xavier_normal_(self.fc_v.weight, gain=gain) 136 | nn.init.xavier_normal_(self.fc_o.weight, gain=gain) 137 | nn.init.constant_(self.fc_q.bias, 0) 138 | nn.init.constant_(self.fc_k.bias, 0) 139 | nn.init.constant_(self.fc_v.bias, 0) 140 | nn.init.constant_(self.fc_o.bias, 0) 141 | 142 | def forward(self, queries, keys, values): 143 | """ 144 | Computes 145 | :param queries: Queries (b_s, nq, d_model) 146 | :param keys: Keys (b_s, nk, d_model) 147 | :param values: Values (b_s, nk, d_model) 148 | :return: 149 | """ 150 | b_s, nq = queries.shape[:2] 151 | # nk = keys.shape[1] 152 | # print(queries.shape) 153 | # print(b_s) 154 | q = self.fc_q(queries).view(b_s, self.h, self.d_k).permute(0, 1, 2) # (b_s, h, nq, d_k) 155 | k = self.fc_k(keys).view(b_s, self.h, self.d_k).permute(0, 2,1) # (b_s, h, d_k, nk) 156 | v = self.fc_v(values).view(b_s, self.h, self.d_v).permute(0, 1, 2) # (b_s, h, nk, d_v) 157 | 158 | att = torch.matmul(q, k) / np.sqrt(self.d_k) # (b_s, h, nq, nk) 159 | 160 | att = torch.softmax(att, -1) 161 | 162 | 163 | out = torch.matmul(att, v).permute(0, 1,2).contiguous().view(b_s, self.h * self.d_v) # (b_s, nq, h*d_v) 164 | out = self.fc_o(out) # (b_s, nq, d_model) 165 | return out 166 | 167 | class Pooling(nn.Module): 168 | """ 169 | Implementation of pooling for PoolFormer 170 | --pool_size: pooling size 171 | """ 172 | def __init__(self, pool_size=3): 173 | super().__init__() 174 | self.pool = nn.AvgPool1d( 175 | pool_size, stride=1, padding=pool_size//2, count_include_pad=False) 176 | 177 | def forward(self, x): 178 | return self.pool(x) 179 | 180 | class SSL(nn.Module): 181 | def __init__(self, channels): 182 | super(SSL, self).__init__() 183 | 184 | self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels, bias=False)#conv_block_my(channels, channels, 
kernel_size = 3, stride=1, padding = 1, dilation=1) 185 | self.conv5 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels, bias=False)#conv_block_my(channels, channels, kernel_size = 3, stride=1, padding = 5, dilation=5) 186 | self.conv7 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels, bias=False)#conv_block_my(channels, channels, kernel_size = 3, stride=1, padding = 7, dilation=7) 187 | self.conv9 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels, bias=False)#conv_block_my(channels, channels, kernel_size = 3, stride=1, padding = 9, dilation=9) 188 | 189 | self.conv_cat = nn.Conv2d(channels*4, channels, kernel_size=3, padding=1, groups=channels, bias=False)#conv_block_my(channels*4, channels, kernel_size = 3, stride = 1, padding = 1, dilation=1) 190 | 191 | def forward(self, x): 192 | 193 | aa = DWTForward(J=1, mode='zero', wave='db3').cuda(device=0) 194 | yl, yh = aa(x) 195 | 196 | yh_out = yh[0] 197 | ylh = yh_out[:,:,0,:,:] 198 | yhl = yh_out[:,:,1,:,:] 199 | yhh = yh_out[:,:,2,:,:] 200 | 201 | conv_rec1 = self.conv5(yl) 202 | conv_rec5 = self.conv5(ylh) 203 | conv_rec7 = self.conv7(yhl) 204 | conv_rec9 = self.conv9(yhh) 205 | 206 | cat_all = torch.stack((conv_rec5, conv_rec7, conv_rec9),dim=2) 207 | rec_yh = [] 208 | rec_yh.append(cat_all) 209 | 210 | 211 | ifm = DWTInverse(wave='db3', mode='zero').cuda(device=0) 212 | Y = ifm((conv_rec1, rec_yh)) 213 | 214 | return Y 215 | 216 | class MultiHeadAttention_(nn.Module): 217 | """ 218 | Multi-head attention layer with Dropout and Layer Normalization. 219 | """ 220 | 221 | def __init__(self, d_model, d_k, d_v, h, dff=2048, dropout=.1): 222 | super(MultiHeadAttention_, self).__init__() 223 | 224 | # self.attention = ScaledDotProductAttention_(d_model=d_model, d_k=d_k, d_v=d_v, h=h) 225 | self.s = SSL(1) 226 | # self.token_mixer = Pooling(pool_size=3) 227 | self.dropout = nn.Dropout(p=dropout) 228 | self.layer_norm = nn.LayerNorm(d_model) 229 | # self.layer_norm2 = nn.LayerNorm(d_model) 230 | self.token_mixer1 = Pooling(pool_size=3) 231 | self.token_mixer2 = Pooling(pool_size=5) 232 | self.token_mixer3 = Pooling(pool_size=7) 233 | 234 | hidden_features = int(d_model*2.66) 235 | 236 | self.project_in = nn.Conv1d(d_model, hidden_features*2, kernel_size=1) 237 | 238 | self.dwconv = nn.Conv1d(hidden_features*2, hidden_features*2, kernel_size=3, stride=1, padding=1, groups=hidden_features*2) 239 | 240 | self.project_out = nn.Conv1d(hidden_features, d_model, kernel_size=1) 241 | 242 | 243 | # self.fc = nn.Sequential(*[nn.Linear(d_model, dff), nn.ReLU(inplace=True), nn.Dropout(p=dropout), 244 | # nn.Linear(dff, d_model)]) 245 | 246 | # def forward(self, queries, keys, values): 247 | def forward(self, x): 248 | x = x.unsqueeze(1) 249 | x = self.s(x) 250 | # print(x.shape) 251 | x = x.squeeze(1) 252 | att1 = self.token_mixer1(x) 253 | att2 = self.token_mixer2(x) 254 | att3 = self.token_mixer3(x) 255 | 256 | att =( att1 +att2+ att3 )/3 257 | # att = self.token_mixer(x) 258 | # print(att.shape) 259 | # att = self.attention(queries, keys, values) 260 | att = self.dropout(att) 261 | # att = self.layer_norm(queries + att) 262 | g = self.project_in(att.permute(0, 2, 1)) 263 | x1, x2 = self.dwconv(g).chunk(2, dim=1) 264 | g = F.gelu(x1) * x2 265 | g = self.project_out(g) 266 | 267 | att = self.dropout(g.permute(0, 2, 1)) 268 | 269 | return self.layer_norm( x+att) 270 | 271 | class EncoderSelfAttention(nn.Module): 272 | def __init__(self, d_model, d_k, d_v, n_head, dff=2048, 
dropout_transformer=.1, n_module=6): 273 | super(EncoderSelfAttention, self).__init__() 274 | # self.Spatial_pos_embed = nn.Parameter(torch.zeros(1, num_joints, embed_dim_ratio)) 275 | self.Spatial_embedding= nn.Linear(d_model, d_model) 276 | 277 | # self.spatial_encoder = nn.ModuleList([MultiHeadAttention_(d_model*4, d_k*4, d_v*4, n_head, dff, dropout_transformer) 278 | # for _ in range(n_module)]) 279 | # self.weighted_mean = torch.nn.Conv1d(in_channels=d_model*4, out_channels=d_model, kernel_size=1) 280 | 281 | self.encoder = nn.ModuleList([MultiHeadAttention_(d_model, d_k, d_v, n_head, dff, dropout_transformer) 282 | for _ in range(n_module)]) 283 | 284 | 285 | def forward(self, x): 286 | x = self.Spatial_embedding(x) 287 | in_encoder = x + sinusoid_encoding_table(x.shape[1], x.shape[2]).expand(x.shape).cuda(device=0) 288 | for l in self.encoder: 289 | # in_encoder = l(in_encoder, in_encoder, in_encoder) 290 | in_encoder = l(in_encoder) 291 | # print(in_encoder.shape) # 8,40,512 292 | return in_encoder 293 | -------------------------------------------------------------------------------- /src_gestformer/train.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | from torch.utils.data import DataLoader 8 | import imgaug.augmenters as iaa 9 | 10 | # Import Datasets 11 | from datasets.Briareo import Briareo 12 | from datasets.NVGestures import NVGesture 13 | from models.model_utilizer import ModuleUtilizer 14 | 15 | # Import Model 16 | from models.temporal import GestureTransoformer 17 | from torch.optim.lr_scheduler import MultiStepLR 18 | 19 | # Import loss 20 | 21 | # Import Utils 22 | from tqdm import tqdm 23 | from utils.average_meter import AverageMeter 24 | from tensorboardX import SummaryWriter 25 | 26 | # Setting seeds 27 | def worker_init_fn(worker_id): 28 | np.random.seed(torch.initial_seed() % 2 ** 32) 29 | 30 | class GestureTrainer(object): 31 | """Gesture Recognition Train class 32 | 33 | Attributes: 34 | configer (Configer): Configer object, contains procedure configuration. 
35 | train_loader (torch.utils.data.DataLoader): Train data loader variable 36 | val_loader (torch.utils.data.DataLoader): Val data loader variable 37 | test_loader (torch.utils.data.DataLoader): Test data loader variable 38 | net (torch.nn.Module): Network used for the current procedure 39 | lr (int): Learning rate value 40 | optimizer (torch.nn.optim.optimizer): Optimizer for training procedure 41 | iters (int): Starting iteration number, not zero if resuming training 42 | epoch (int): Starting epoch number, not zero if resuming training 43 | scheduler (torch.optim.lr_scheduler): Scheduler to utilize during training 44 | 45 | """ 46 | 47 | def __init__(self, configer): 48 | self.configer = configer 49 | 50 | self.data_path = configer.get("data", "data_path") #: str: Path to data directory 51 | 52 | # Losses 53 | self.losses = { 54 | 'train': AverageMeter(), #: Train loss avg meter 55 | 'val': AverageMeter(), #: Val loss avg meter 56 | 'test': AverageMeter() #: Test loss avg meter 57 | } 58 | 59 | # Train val and test accuracy 60 | self.accuracy = { 61 | 'train': AverageMeter(), #: Train accuracy avg meter 62 | 'val': AverageMeter(), #: Val accuracy avg meter 63 | 'test': AverageMeter() #: Test accuracy avg meter 64 | } 65 | 66 | # DataLoaders 67 | self.train_loader = None 68 | self.val_loader = None 69 | self.test_loader = None 70 | 71 | # Module load and save utility 72 | self.device = self.configer.get("device") 73 | self.model_utility = ModuleUtilizer(self.configer) #: Model utility for load, save and update optimizer 74 | self.net = None 75 | self.lr = None 76 | 77 | # Training procedure 78 | self.optimizer = None 79 | self.iters = None 80 | self.epoch = 0 81 | self.train_transforms = None 82 | self.val_transforms = None 83 | self.loss = None 84 | 85 | # Tensorboard and Metrics 86 | self.tbx_summary = SummaryWriter(str(Path(configer.get('checkpoints', 'tb_path')) #: Summary Writer plot 87 | / configer.get("dataset") #: data with TensorboardX 88 | / configer.get('checkpoints', 'save_name'))) 89 | self.tbx_summary.add_text('parameters', str(self.configer).replace("\n", "\n\n")) 90 | self.save_iters = self.configer.get('checkpoints', 'save_iters') #: int: Saving ratio 91 | 92 | # Other useful data 93 | self.backbone = self.configer.get("network", "backbone") #: str: Backbone type 94 | self.in_planes = None #: int: Input channels 95 | self.clip_length = self.configer.get("data", "n_frames") #: int: Number of frames per sequence 96 | self.n_classes = self.configer.get("data", "n_classes") #: int: Total number of classes for dataset 97 | self.data_type = self.configer.get("data", "type") #: str: Type of data (rgb, depth, ir, leapmotion) 98 | self.dataset = self.configer.get("dataset").lower() #: str: Type of dataset 99 | self.optical_flow = self.configer.get("data", "optical_flow") 100 | if self.optical_flow is None: 101 | self.optical_flow = True 102 | self.scheduler = None 103 | 104 | def init_model(self): 105 | """Initialize model and other data for procedure""" 106 | 107 | if self.optical_flow is True: 108 | self.in_planes = 2 109 | elif self.data_type in ["depth", "ir"]: 110 | self.in_planes = 1 111 | else: 112 | self.in_planes = 3 113 | 114 | self.loss = nn.CrossEntropyLoss().to(self.device) 115 | 116 | # Selecting correct model and normalization variable based on type variable 117 | self.net = GestureTransoformer(self.backbone, self.in_planes, self.n_classes, 118 | pretrained=self.configer.get("network", "pretrained"), 119 | n_head=self.configer.get("network", "n_head"), 120 | 
dropout_backbone=self.configer.get("network", "dropout2d"), 121 | dropout_transformer=self.configer.get("network", "dropout1d"), 122 | dff=self.configer.get("network", "ff_size"), 123 | n_module=self.configer.get("network", "n_module") 124 | ) 125 | 126 | # Initializing training 127 | self.iters = 0 128 | self.epoch = None 129 | phase = self.configer.get('phase') 130 | 131 | # Starting or resuming procedure 132 | if phase == 'train': 133 | self.net, self.iters, self.epoch, optim_dict = self.model_utility.load_net(self.net) 134 | else: 135 | raise ValueError('Phase: {} is not valid.'.format(phase)) 136 | 137 | if self.epoch is None: 138 | self.epoch = 0 139 | 140 | # ToDo Restore optimizer and scheduler from checkpoint 141 | self.optimizer, self.lr = self.model_utility.update_optimizer(self.net, self.iters) 142 | self.scheduler = MultiStepLR(self.optimizer, self.configer["solver", "decay_steps"], gamma=0.1) 143 | 144 | # Resuming training, restoring optimizer value 145 | if optim_dict is not None: 146 | print("Resuming training from epoch {}.".format(self.epoch)) 147 | self.optimizer.load_state_dict(optim_dict) 148 | 149 | # Selecting Dataset and DataLoader 150 | if self.dataset == "briareo": 151 | Dataset = Briareo 152 | self.train_transforms = iaa.Sequential([ 153 | iaa.Resize((0.85, 1.15)), 154 | iaa.CropToFixedSize(width=190, height=190), 155 | iaa.Rotate((-15, 15)) 156 | ]) 157 | self.val_transforms = iaa.CenterCropToFixedSize(200, 200) 158 | 159 | elif self.dataset == "nvgestures": 160 | Dataset = NVGesture 161 | self.train_transforms = iaa.Sequential([ 162 | iaa.Resize((0.8, 1.2)), 163 | iaa.CropToFixedSize(width=256, height=192), 164 | iaa.Rotate((-15, 15)) 165 | ]) 166 | self.val_transforms = iaa.CenterCropToFixedSize(256, 192) 167 | else: 168 | raise NotImplementedError(f"Dataset not supported: {self.configer.get('dataset')}") 169 | 170 | # Setting Dataloaders 171 | self.train_loader = DataLoader( 172 | Dataset(self.configer, self.data_path, split="train", data_type=self.data_type, 173 | transforms=self.train_transforms, n_frames=self.clip_length, optical_flow=self.optical_flow), 174 | batch_size=self.configer.get('data', 'batch_size'), shuffle=True, drop_last=True, 175 | num_workers=self.configer.get('solver', 'workers'), pin_memory=True, worker_init_fn=worker_init_fn) 176 | self.val_loader = DataLoader( 177 | Dataset(self.configer, self.data_path, split="val", data_type=self.data_type, 178 | transforms=self.val_transforms, n_frames=self.clip_length, optical_flow=self.optical_flow), 179 | batch_size=self.configer.get('data', 'batch_size'), shuffle=False, drop_last=True, 180 | num_workers=self.configer.get('solver', 'workers'), pin_memory=True, worker_init_fn=worker_init_fn) 181 | if self.dataset == "nvgestures": 182 | self.test_loader = None 183 | else: 184 | self.test_loader = DataLoader( 185 | Dataset(self.configer, self.data_path, split="test", data_type=self.data_type, 186 | transforms=self.val_transforms, n_frames=self.clip_length, optical_flow=self.optical_flow), 187 | batch_size=1, shuffle=False, drop_last=True, 188 | num_workers=self.configer.get('solver', 'workers'), pin_memory=True, worker_init_fn=worker_init_fn) 189 | 190 | def __train(self): 191 | """Train function for every epoch.""" 192 | 193 | self.net.train() 194 | for data_tuple in tqdm(self.train_loader, desc="Train"): 195 | """ 196 | input, gt 197 | """ 198 | inputs = data_tuple[0].to(self.device) 199 | gt = data_tuple[1].to(self.device) 200 | 201 | output = self.net(inputs) 202 | 203 | 
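            # One optimisation step per batch: clear the gradients accumulated on the
            # previous step, compute the cross-entropy loss against the squeezed
            # ground-truth labels, backpropagate, clip the global gradient norm to 1
            # to keep updates stable, then apply the optimizer step. Batch accuracy is
            # derived afterwards from the argmax of the detached logits and pushed to
            # the running meters via update_metrics.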
self.optimizer.zero_grad() 204 | loss = self.loss(output, gt.squeeze(dim=1)) 205 | loss.backward() 206 | 207 | torch.nn.utils.clip_grad_norm_(self.net.parameters(), max_norm=1) 208 | self.optimizer.step() 209 | 210 | predicted = torch.argmax(output.detach(), dim=1) 211 | correct = gt.detach().squeeze(dim=1) 212 | 213 | self.iters += 1 214 | self.update_metrics("train", loss.item(), inputs.size(0), 215 | float((predicted==correct).sum()) / len(correct)) 216 | 217 | 218 | def __val(self): 219 | """Validation function.""" 220 | self.net.eval() 221 | 222 | with torch.no_grad(): 223 | # for i, data_tuple in enumerate(tqdm(self.val_loader, desc="Val", postfix=str(self.accuracy["val"].avg))): 224 | for i, data_tuple in enumerate(tqdm(self.val_loader, desc="Val", postfix=""+str(np.random.randint(200)))): 225 | """ 226 | input, gt 227 | """ 228 | inputs = data_tuple[0].to(self.device) 229 | gt = data_tuple[1].to(self.device) 230 | 231 | output = self.net(inputs) 232 | loss = self.loss(output, gt.squeeze(dim=1)) 233 | 234 | predicted = torch.argmax(output.detach(), dim=1) 235 | correct = gt.detach().squeeze(dim=1) 236 | 237 | self.iters += 1 238 | self.update_metrics("val", loss.item(), inputs.size(0), 239 | float((predicted == correct).sum()) / len(correct)) 240 | 241 | self.tbx_summary.add_scalar('val_loss', self.losses["val"].avg, self.epoch + 1) 242 | self.tbx_summary.add_scalar('val_accuracy', self.accuracy["val"].avg, self.epoch + 1) 243 | accuracy = self.accuracy["val"].avg 244 | self.accuracy["val"].reset() 245 | self.losses["val"].reset() 246 | 247 | ret = self.model_utility.save(accuracy, self.net, self.optimizer, self.iters, self.epoch + 1) 248 | if ret < 0: 249 | return -1 250 | elif ret > 0 and self.test_loader is not None: 251 | self.__test() 252 | return ret 253 | 254 | def __test(self): 255 | """Testing function.""" 256 | self.net.eval() 257 | 258 | with torch.no_grad(): 259 | for i, data_tuple in enumerate(tqdm(self.test_loader, desc="Test", postfix=str(self.accuracy["test"].avg))): 260 | """ 261 | input, gt 262 | """ 263 | inputs = data_tuple[0].to(self.device) 264 | gt = data_tuple[1].to(self.device) 265 | 266 | output = self.net(inputs) 267 | loss = self.loss(output, gt.squeeze(dim=1)) 268 | 269 | predicted = torch.argmax(output.detach(), dim=1) 270 | correct = gt.detach().squeeze(dim=1) 271 | 272 | self.iters += 1 273 | self.update_metrics("test", loss.item(), inputs.size(0), 274 | float((predicted == correct).sum()) / len(correct)) 275 | self.tbx_summary.add_scalar('test_loss', self.losses["test"].avg, self.epoch + 1) 276 | self.tbx_summary.add_scalar('test_accuracy', self.accuracy["test"].avg, self.epoch + 1) 277 | self.losses["test"].reset() 278 | self.accuracy["test"].reset() 279 | 280 | def train(self): 281 | for n in range(self.configer.get("epochs")): 282 | print("Starting epoch {}".format(self.epoch + 1)) 283 | self.__train() 284 | ret = self.__val() 285 | if ret < 0: 286 | print("Got no improvement for {} epochs, current epoch is {}." 
287 | .format(self.configer.get("checkpoints", "early_stop"), n)) 288 | break 289 | self.epoch += 1 290 | 291 | def update_metrics(self, split: str, loss, bs, accuracy=None): 292 | self.losses[split].update(loss, bs) 293 | if accuracy is not None: 294 | self.accuracy[split].update(accuracy, bs) 295 | if split == "train" and self.iters % self.save_iters == 0: 296 | self.tbx_summary.add_scalar('{}_loss'.format(split), self.losses[split].avg, self.iters) 297 | self.tbx_summary.add_scalar('{}_accuracy'.format(split), self.accuracy[split].avg, self.iters) 298 | self.losses[split].reset() 299 | self.accuracy[split].reset() -------------------------------------------------------------------------------- /src_gestformer/models/backbones/r3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from torchvision.models.utils import load_state_dict_from_url 5 | 6 | 7 | __all__ = ['r3d_18', 'mc3_18', 'r2plus1d_18'] 8 | 9 | model_urls = { 10 | 'r3d_18': 'https://download.pytorch.org/models/r3d_18-b3b3357e.pth', 11 | 'mc3_18': 'https://download.pytorch.org/models/mc3_18-a90a0ba3.pth', 12 | 'r2plus1d_18': 'https://download.pytorch.org/models/r2plus1d_18-91a641e6.pth', 13 | } 14 | 15 | 16 | class Conv3DSimple(nn.Conv3d): 17 | def __init__(self, 18 | in_planes, 19 | out_planes, 20 | midplanes=None, 21 | stride=1, 22 | padding=1): 23 | 24 | super(Conv3DSimple, self).__init__( 25 | in_channels=in_planes, 26 | out_channels=out_planes, 27 | kernel_size=(3, 3, 3), 28 | stride=stride, 29 | padding=padding, 30 | bias=False) 31 | 32 | @staticmethod 33 | def get_downsample_stride(stride): 34 | return (stride, stride, stride) 35 | 36 | 37 | class Conv2Plus1D(nn.Sequential): 38 | 39 | def __init__(self, 40 | in_planes, 41 | out_planes, 42 | midplanes, 43 | stride=1, 44 | padding=1): 45 | super(Conv2Plus1D, self).__init__( 46 | nn.Conv3d(in_planes, midplanes, kernel_size=(1, 3, 3), 47 | stride=(1, stride, stride), padding=(0, padding, padding), 48 | bias=False), 49 | nn.BatchNorm3d(midplanes), 50 | nn.ReLU(inplace=True), 51 | nn.Conv3d(midplanes, out_planes, kernel_size=(3, 1, 1), 52 | stride=(stride, 1, 1), padding=(padding, 0, 0), 53 | bias=False)) 54 | 55 | @staticmethod 56 | def get_downsample_stride(stride): 57 | return (stride, stride, stride) 58 | 59 | 60 | class Conv3DNoTemporal(nn.Conv3d): 61 | 62 | def __init__(self, 63 | in_planes, 64 | out_planes, 65 | midplanes=None, 66 | stride=1, 67 | padding=1): 68 | 69 | super(Conv3DNoTemporal, self).__init__( 70 | in_channels=in_planes, 71 | out_channels=out_planes, 72 | kernel_size=(1, 3, 3), 73 | stride=(1, stride, stride), 74 | padding=(0, padding, padding), 75 | bias=False) 76 | 77 | @staticmethod 78 | def get_downsample_stride(stride): 79 | return (1, stride, stride) 80 | 81 | 82 | class BasicBlock(nn.Module): 83 | 84 | expansion = 1 85 | 86 | def __init__(self, inplanes, planes, conv_builder, stride=1, downsample=None): 87 | midplanes = (inplanes * planes * 3 * 3 * 3) // (inplanes * 3 * 3 + 3 * planes) 88 | 89 | super(BasicBlock, self).__init__() 90 | self.dropout = nn.Dropout3d(p = 0.1) 91 | self.conv1 = nn.Sequential( 92 | conv_builder(inplanes, planes, midplanes, stride), 93 | nn.BatchNorm3d(planes), 94 | nn.ReLU(inplace=True) 95 | ) 96 | self.conv2 = nn.Sequential( 97 | conv_builder(planes, planes, midplanes), 98 | nn.BatchNorm3d(planes) 99 | ) 100 | self.relu = nn.ReLU(inplace=True) 101 | self.downsample = downsample 102 | self.stride = stride 103 | 104 | def forward(self, 
x): 105 | residual = x 106 | 107 | out = self.conv1(x) 108 | out = self.conv2(out) 109 | if self.downsample is not None: 110 | residual = self.downsample(x) 111 | 112 | out += residual 113 | out = self.relu(out) 114 | 115 | return out 116 | 117 | 118 | class Bottleneck(nn.Module): 119 | expansion = 4 120 | 121 | def __init__(self, inplanes, planes, conv_builder, stride=1, downsample=None): 122 | 123 | super(Bottleneck, self).__init__() 124 | midplanes = (inplanes * planes * 3 * 3 * 3) // (inplanes * 3 * 3 + 3 * planes) 125 | 126 | # 1x1x1 127 | self.conv1 = nn.Sequential( 128 | nn.Conv3d(inplanes, planes, kernel_size=1, bias=False), 129 | nn.BatchNorm3d(planes), 130 | nn.ReLU(inplace=True) 131 | ) 132 | # Second kernel 133 | self.conv2 = nn.Sequential( 134 | conv_builder(planes, planes, midplanes, stride), 135 | nn.BatchNorm3d(planes), 136 | nn.ReLU(inplace=True) 137 | ) 138 | 139 | # 1x1x1 140 | self.conv3 = nn.Sequential( 141 | nn.Conv3d(planes, planes * self.expansion, kernel_size=1, bias=False), 142 | nn.BatchNorm3d(planes * self.expansion) 143 | ) 144 | self.relu = nn.ReLU(inplace=True) 145 | self.downsample = downsample 146 | self.stride = stride 147 | 148 | def forward(self, x): 149 | residual = x 150 | 151 | out = self.conv1(x) 152 | out = self.conv2(out) 153 | out = self.conv3(out) 154 | 155 | if self.downsample is not None: 156 | residual = self.downsample(x) 157 | 158 | out += residual 159 | out = self.relu(out) 160 | 161 | return out 162 | 163 | 164 | class BasicStem(nn.Sequential): 165 | """The default conv-batchnorm-relu stem 166 | """ 167 | def __init__(self, pretrained): 168 | super(BasicStem, self).__init__( 169 | nn.Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), 170 | padding=(1, 3, 3), bias=False), 171 | nn.BatchNorm3d(64), 172 | nn.ReLU(inplace=True)) 173 | 174 | 175 | class R2Plus1dStem(nn.Sequential): 176 | """R(2+1)D stem is different than the default one as it uses separated 3D convolution 177 | """ 178 | def __init__(self, pretrained): 179 | super(R2Plus1dStem, self).__init__( 180 | nn.Conv3d(3, 45, kernel_size=(1, 7, 7), 181 | stride=(1, 2, 2), padding=(0, 3, 3), 182 | bias=False), 183 | nn.BatchNorm3d(45), 184 | nn.ReLU(inplace=True), 185 | nn.Conv3d(45, 64, kernel_size=(3, 1, 1), 186 | stride=(1, 1, 1), padding=(1, 0, 0), 187 | bias=False), 188 | nn.BatchNorm3d(64), 189 | nn.ReLU(inplace=True)) 190 | 191 | 192 | class VideoResNet(nn.Module): 193 | 194 | def __init__(self, pretrained, block, conv_makers, layers, 195 | stem, drop_prob, num_classes=400, 196 | zero_init_residual=False): 197 | """Generic resnet video generator. 198 | 199 | Args: 200 | block (nn.Module): resnet building block 201 | conv_makers (list(functions)): generator function for each layer 202 | layers (List[int]): number of blocks per layer 203 | stem (nn.Module, optional): Resnet stem, if None, defaults to conv-bn-relu. Defaults to None. 204 | num_classes (int, optional): Dimension of the final FC layer. Defaults to 400. 205 | zero_init_residual (bool, optional): Zero init bottleneck residual BN. Defaults to False. 
206 | """ 207 | super(VideoResNet, self).__init__() 208 | self.inplanes = 64 209 | 210 | self.stem = stem(pretrained) 211 | 212 | self.layer1 = self._make_layer(block, conv_makers[0], 64, layers[0], stride=1) 213 | self.layer2 = self._make_layer(block, conv_makers[1], 128, layers[1], stride=2) 214 | self.layer3 = self._make_layer(block, conv_makers[2], 256, layers[2], stride=2) 215 | self.layer4 = self._make_layer(block, conv_makers[3], 512, layers[3], stride=2) 216 | 217 | self.dropout = nn.Dropout3d(drop_prob) 218 | 219 | self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) 220 | self.fc = nn.Linear(512 * block.expansion, num_classes) 221 | 222 | # init weights 223 | self._initialize_weights() 224 | 225 | if zero_init_residual: 226 | for m in self.modules(): 227 | if isinstance(m, Bottleneck): 228 | nn.init.constant_(m.bn3.weight, 0) 229 | 230 | def forward(self, x): 231 | x = self.stem(x) 232 | 233 | x = self.layer1(x) 234 | x = self.dropout(x) 235 | x = self.layer2(x) 236 | x = self.dropout(x) 237 | x = self.layer3(x) 238 | x = self.dropout(x) 239 | x = self.layer4(x) 240 | x = self.dropout(x) 241 | 242 | x = self.avgpool(x) 243 | 244 | x = x.flatten(1) 245 | 246 | return x 247 | 248 | def _make_layer(self, block, conv_builder, planes, blocks, stride=1): 249 | downsample = None 250 | 251 | if stride != 1 or self.inplanes != planes * block.expansion: 252 | ds_stride = conv_builder.get_downsample_stride(stride) 253 | downsample = nn.Sequential( 254 | nn.Conv3d(self.inplanes, planes * block.expansion, 255 | kernel_size=1, stride=ds_stride, bias=False), 256 | nn.BatchNorm3d(planes * block.expansion) 257 | ) 258 | layers = [] 259 | layers.append(block(self.inplanes, planes, conv_builder, stride, downsample)) 260 | 261 | self.inplanes = planes * block.expansion 262 | for i in range(1, blocks): 263 | layers.append(block(self.inplanes, planes, conv_builder)) 264 | 265 | return nn.Sequential(*layers) 266 | 267 | def _initialize_weights(self): 268 | for m in self.modules(): 269 | if isinstance(m, nn.Conv3d): 270 | nn.init.kaiming_normal_(m.weight, mode='fan_out', 271 | nonlinearity='relu') 272 | if m.bias is not None: 273 | nn.init.constant_(m.bias, 0) 274 | elif isinstance(m, nn.BatchNorm3d): 275 | nn.init.constant_(m.weight, 1) 276 | nn.init.constant_(m.bias, 0) 277 | elif isinstance(m, nn.Linear): 278 | nn.init.normal_(m.weight, 0, 0.01) 279 | nn.init.constant_(m.bias, 0) 280 | 281 | 282 | def _video_resnet(arch, pretrained=False, in_planes=3, dropout=0., **kwargs): 283 | model = VideoResNet(pretrained, drop_prob=dropout, **kwargs) 284 | 285 | if pretrained: 286 | state_dict = load_state_dict_from_url(model_urls[arch], progress=True) 287 | model.load_state_dict(state_dict) 288 | if in_planes in [1, 2]: 289 | w = model.stem._modules['0']._parameters['weight'] 290 | if kwargs['stem'].__name__ == 'BasicStem': 291 | model.stem._modules['0'] = nn.Conv3d(in_planes, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), 292 | padding=(1, 3, 3), bias=False) 293 | elif kwargs['stem'].__name__ == 'R2Plus1dStem': 294 | model.stem._modules['0'] = nn.Conv3d(in_planes, 45, kernel_size=(1, 7, 7), stride=(1, 2, 2), 295 | padding=(0, 3, 3), bias=False) 296 | if in_planes == 1: 297 | model.stem._modules['0']._parameters['weight'].data = w.sum(dim=1, keepdim=True) 298 | else: 299 | model.stem._modules['0']._parameters['weight'].data = w[:, :-1] * 1.5 300 | else: 301 | if kwargs['stem'].__name__ == 'BasicStem': 302 | model.stem._modules['0'] = nn.Conv3d(in_planes, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), 303 | padding=(1, 
3, 3), bias=False) 304 | elif kwargs['stem'].__name__ == 'R2Plus1dStem': 305 | model.stem._modules['0'] = nn.Conv3d(in_planes, 45, kernel_size=(1, 7, 7), stride=(1, 2, 2), 306 | padding=(0, 3, 3), bias=False) 307 | model.fc = None 308 | return model 309 | 310 | 311 | def r3d_18(pretrained=False, in_planes: int=3, dropout: float=0., **kwargs): 312 | """Construct 18 layer Resnet3D model as in 313 | https://arxiv.org/abs/1711.11248 314 | 315 | Args: 316 | pretrained (bool): If True, returns a model pre-trained on Kinetics-400 317 | progress (bool): If True, displays a progress bar of the download to stderr 318 | 319 | Returns: 320 | nn.Module: R3D-18 network 321 | """ 322 | 323 | return _video_resnet('r3d_18', 324 | pretrained, in_planes, dropout, 325 | block=BasicBlock, 326 | conv_makers=[Conv3DSimple] * 4, 327 | layers=[2, 2, 2, 2], 328 | stem=BasicStem, **kwargs) 329 | 330 | 331 | def mc3_18(pretrained=False, in_planes: int=3, dropout: float=0., **kwargs): 332 | """Constructor for 18 layer Mixed Convolution network as in 333 | https://arxiv.org/abs/1711.11248 334 | 335 | Args: 336 | pretrained (bool): If True, returns a model pre-trained on Kinetics-400 337 | progress (bool): If True, displays a progress bar of the download to stderr 338 | 339 | Returns: 340 | nn.Module: MC3 Network definition 341 | """ 342 | return _video_resnet('mc3_18', 343 | pretrained, in_planes, dropout, 344 | block=BasicBlock, 345 | conv_makers=[Conv3DSimple] + [Conv3DNoTemporal] * 3, 346 | layers=[2, 2, 2, 2], 347 | stem=BasicStem, **kwargs) 348 | 349 | 350 | def r2plus1d_18(pretrained=False, in_planes: int=3, dropout: float=0., **kwargs): 351 | """Constructor for the 18 layer deep R(2+1)D network as in 352 | https://arxiv.org/abs/1711.11248 353 | 354 | Args: 355 | pretrained (bool): If True, returns a model pre-trained on Kinetics-400 356 | progress (bool): If True, displays a progress bar of the download to stderr 357 | 358 | Returns: 359 | nn.Module: R(2+1)D-18 network 360 | """ 361 | return _video_resnet('r2plus1d_18', 362 | pretrained, in_planes, dropout, 363 | block=BasicBlock, 364 | conv_makers=[Conv2Plus1D] * 4, 365 | layers=[2, 2, 2, 2], 366 | stem=R2Plus1dStem, **kwargs) 367 | 368 | if __name__ == "__main__": 369 | m = r3d_18(False, 1) --------------------------------------------------------------------------------
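A minimal usage sketch for the video backbones above (not part of the original sources). It assumes the repository's import layout, with src_gestformer as the working directory, and a torchvision version that still provides torchvision.models.utils, which r3d.py imports. The example only exercises the r3d_18 constructor defined in this file: in_planes rebuilds the stem convolution for depth/IR (1-channel) or optical-flow (2-channel) clips, and because the classification head is set to None the network returns pooled 512-dimensional clip features.

import torch

from models.backbones.r3d import r3d_18

# Optical-flow variant: two input channels, light 3D dropout. pretrained=False, so no
# Kinetics checkpoint is downloaded and the stem is simply rebuilt with in_planes=2.
net = r3d_18(pretrained=False, in_planes=2, dropout=0.1)
net.eval()
print(net.stem[0])   # Conv3d(2, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), ...)

# Dummy clip laid out as (batch, channels, frames, height, width).
clip = torch.randn(1, 2, 16, 112, 112)
with torch.no_grad():
    feats = net(clip)
print(feats.shape)   # torch.Size([1, 512]); fc is None, so only pooled features are returned

When pretrained=True is used instead, _video_resnet folds the Kinetics RGB stem weights into the smaller input: summed over channels for 1-channel data, or truncated to the first two channels and scaled by 1.5 for 2-channel data, exactly as written above.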