├── action_recognition ├── model │ ├── __init__.py │ ├── agcn.py │ ├── agcn_mult.py │ └── aagcn.py ├── feeders │ ├── __init__.py │ ├── tools.py │ └── feeder.py ├── graph │ ├── __init__.py │ ├── tools.py │ ├── ntu_rgb_d.py │ └── kinetics.py ├── config │ └── babel_v1.0 │ │ ├── test_60.yaml │ │ ├── test_120.yaml │ │ ├── test_60_wfl.yaml │ │ ├── test_120_wfl.yaml │ │ ├── train_60.yaml │ │ ├── train_120.yaml │ │ ├── train_60_wfl.yaml │ │ └── train_120_wfl.yaml ├── data_gen │ ├── rotation.py │ ├── preprocess.py │ ├── dutils.py │ ├── viz.py │ └── create_dataset.py ├── data │ └── action_label_2_idx.json ├── class_balanced_loss.py ├── challenge │ └── create_submission.py ├── Readme.md └── train_test.py ├── .gitignore ├── notebooks ├── Readme.md ├── BABEL_explore.ipynb └── BABEL_visualization.ipynb ├── requirements.txt └── Readme.md /action_recognition/model/__init__.py: -------------------------------------------------------------------------------- 1 | from . import agcn, aagcn 2 | -------------------------------------------------------------------------------- /action_recognition/feeders/__init__.py: -------------------------------------------------------------------------------- 1 | from . import tools 2 | from . import feeder 3 | -------------------------------------------------------------------------------- /action_recognition/graph/__init__.py: -------------------------------------------------------------------------------- 1 | from . import tools 2 | from . import ntu_rgb_d 3 | from . import kinetics 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # System 2 | babel-env 3 | 4 | # Temp files 5 | *.DS_Store 6 | *.swp 7 | __pycache__ 8 | notebooks/.ipynb_checkpoints 9 | 10 | # Data 11 | data/babel_v1.0_release 12 | action_recognition/data/release 13 | 14 | # Predictions 15 | action_recognition/challenge/*.pkl 16 | action_recognition/challenge/*.npz 17 | ckpts 18 | 19 | # Logging 20 | wandb 21 | runs 22 | work_dir 23 | -------------------------------------------------------------------------------- /notebooks/Readme.md: -------------------------------------------------------------------------------- 1 | ### Load and visualize BABEL 2 | 3 | [`BABEL_visualization.ipynb`](BABEL_visualization.ipynb) contains code that demonstrates how to: 4 | - Load the BABEL dataset 5 | - Visualize rendered videos of mocap sequences 6 | - Visualize their action labels 7 | 8 | 9 | ### Explore BABEL 10 | 11 | [`BABEL_explore.ipynb`](BABEL_explore.ipynb) contains code that shows how to: 12 | - Compute stats. from BABEL (e.g., duration of labeled mocap) 13 | - Search BABEL for mocap sequences containing a specific action, and retrieve their annotations. 
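As a quick orientation before opening the notebooks, here is a minimal sketch of the typical access pattern: load the released label files and filter sequences by an action category. Treat it as illustrative only; the download folder (`data/babel_v1.0_release/`, the path listed in `.gitignore`) and the JSON field names (`frame_ann`, `labels`, `act_cat`) are assumptions about the release format, and `BABEL_explore.ipynb` remains the authoritative reference.

```python
# Minimal sketch (not the notebook code): load BABEL label files and list the
# sequences whose frame-level annotations contain a given action category.
# Paths and field names ('frame_ann', 'labels', 'act_cat', and the val.json
# file name) are assumptions about the v1.0 release format.
import json
from os.path import join as ospj

d_folder = 'data/babel_v1.0_release'
babel = {}
for spl in ('train', 'val'):
    with open(ospj(d_folder, f'{spl}.json')) as f:
        babel.update(json.load(f))

def seqs_with_action(anns, query='walk'):
    """Return the IDs of sequences containing `query` in their frame labels."""
    matches = []
    for sid, ann in anns.items():
        frame_ann = ann.get('frame_ann') or {}
        for label in frame_ann.get('labels', []):
            if query in (label.get('act_cat') or []):
                matches.append(sid)
                break
    return matches

print(len(seqs_with_action(babel, 'walk')), 'sequences contain "walk"')
```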
-------------------------------------------------------------------------------- /action_recognition/config/babel_v1.0/test_60.yaml: -------------------------------------------------------------------------------- 1 | # feeder 2 | feeder: feeders.feeder.Feeder 3 | test_feeder_args: 4 | data_path: ./data/release/val_ntu_sk_60.npy 5 | label_path: ./data/release/val_label_60.pkl 6 | debug: False 7 | 8 | # model 9 | model: model.agcn.Model 10 | model_args: 11 | num_class: 60 12 | num_point: 25 13 | num_person: 1 14 | graph: graph.ntu_rgb_d.Graph 15 | graph_args: 16 | labeling_mode: 'spatial' 17 | 18 | # test 19 | phase: test 20 | device: [0] 21 | test_batch_size: 128 22 | weights: ./ckpts/ntu_sk_60_agcn_joint_const_lr_1e-3-17-6390.pt 23 | 24 | work_dir: ./work_dir/babel_v1.0/test_runs/test_ntu_sk_60_agcn_joint_const_lr_1e-3 25 | save_score: True 26 | -------------------------------------------------------------------------------- /action_recognition/config/babel_v1.0/test_120.yaml: -------------------------------------------------------------------------------- 1 | # feeder 2 | feeder: feeders.feeder.Feeder 3 | test_feeder_args: 4 | data_path: ./data/release/val_ntu_sk_120.npy 5 | label_path: ./data/release/val_label_120.pkl 6 | debug: False 7 | 8 | # model 9 | model: model.agcn.Model 10 | model_args: 11 | num_class: 120 12 | num_point: 25 13 | num_person: 1 14 | graph: graph.ntu_rgb_d.Graph 15 | graph_args: 16 | labeling_mode: 'spatial' 17 | 18 | # test 19 | phase: test 20 | device: [0] 21 | test_batch_size: 128 22 | weights: ./ckpts/ntu_sk_120_agcn_joint_const_lr_1e-3-15-12240.pt 23 | 24 | work_dir: ./work_dir/babel_v1.0/test_runs/test_ntu_sk_120_agcn_joint_const_lr_1e-3 25 | save_score: True 26 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backcall==0.2.0 2 | certifi==2020.12.5 3 | decorator==4.4.2 4 | ipdb==0.13.4 5 | ipython==7.19.0 6 | ipython-genutils==0.2.0 7 | jedi==0.18.0 8 | joblib==1.0.0 9 | networkx==2.5 10 | numpy==1.19.5 11 | parso==0.8.1 12 | pexpect==4.8.0 13 | pickleshare==0.7.5 14 | Pillow==8.1.0 15 | prompt-toolkit==3.0.10 16 | protobuf==3.14.0 17 | ptyprocess==0.7.0 18 | Pygments==2.7.4 19 | PyYAML==5.4 20 | scikit-learn==0.24.1 21 | scipy==1.6.0 22 | six==1.15.0 23 | tensorboardX==2.1 24 | threadpoolctl==2.1.0 25 | torch==1.7.1 26 | torchvision==0.8.2 27 | tqdm==4.56.0 28 | traitlets==5.0.5 29 | typing-extensions==3.7.4.3 30 | wcwidth==0.2.5 31 | pandas==1.3.4 32 | smplx==0.1.13 33 | matplotlib==3.1.3 34 | opencv-python==4.4.0.42 35 | -------------------------------------------------------------------------------- /action_recognition/graph/tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def edge2mat(link, num_node): 5 | A = np.zeros((num_node, num_node)) 6 | for i, j in link: 7 | A[j, i] = 1 8 | return A 9 | 10 | 11 | def normalize_digraph(A): # 除以每列的和 12 | Dl = np.sum(A, 0) 13 | h, w = A.shape 14 | Dn = np.zeros((w, w)) 15 | for i in range(w): 16 | if Dl[i] > 0: 17 | Dn[i, i] = Dl[i] ** (-1) 18 | AD = np.dot(A, Dn) 19 | return AD 20 | 21 | 22 | def get_spatial_graph(num_node, self_link, inward, outward): 23 | I = edge2mat(self_link, num_node) 24 | In = normalize_digraph(edge2mat(inward, num_node)) 25 | Out = normalize_digraph(edge2mat(outward, num_node)) 26 | A = np.stack((I, In, Out)) 27 | return A 28 | 
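For a concrete sense of what `get_spatial_graph` returns, the toy snippet below builds the three adjacency subsets for a 3-joint chain. It is only an illustration, and it assumes the script is run from `action_recognition/` so that `graph.tools` is importable.

```python
# Toy usage of graph.tools (illustration only; assumes the working directory is
# action_recognition/ so that `graph.tools` resolves).
from graph import tools

num_node = 3                                   # a 3-joint chain: 0 - 1 - 2
self_link = [(i, i) for i in range(num_node)]
inward = [(1, 0), (2, 1)]                      # child -> parent edges
outward = [(j, i) for (i, j) in inward]

A = tools.get_spatial_graph(num_node, self_link, inward, outward)
print(A.shape)   # (3, 3, 3): identity, column-normalized inward, outward
print(A[1])      # every non-zero column of the normalized subsets sums to 1
```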
-------------------------------------------------------------------------------- /action_recognition/config/babel_v1.0/test_60_wfl.yaml: -------------------------------------------------------------------------------- 1 | # feeder 2 | feeder: feeders.feeder.Feeder 3 | test_feeder_args: 4 | data_path: ./data/release/val_ntu_sk_60.npy 5 | label_path: ./data/release/val_label_60.pkl 6 | debug: False 7 | 8 | label_count_path: ./data/release/train_label_60_count.pkl 9 | 10 | # model 11 | model: model.agcn.Model 12 | model_args: 13 | num_class: 60 14 | num_point: 25 15 | num_person: 1 16 | graph: graph.ntu_rgb_d.Graph 17 | graph_args: 18 | labeling_mode: 'spatial' 19 | 20 | # test 21 | phase: test 22 | device: [0] 23 | test_batch_size: 32 24 | weights: ./ckpts/wfl_ntu_sk_60_agcn_joint_const_lr_1e-3-93-33370.pt 25 | 26 | work_dir: ./work_dir/babel_v1.0/test_runs/test_wfl_ntu_sk_60_agcn_joint_const_lr_1e-3 27 | save_score: True 28 | -------------------------------------------------------------------------------- /action_recognition/config/babel_v1.0/test_120_wfl.yaml: -------------------------------------------------------------------------------- 1 | # feeder 2 | feeder: feeders.feeder.Feeder 3 | test_feeder_args: 4 | data_path: ./data/release/val_ntu_sk_120.npy 5 | label_path: ./data/release/val_label_120.pkl 6 | debug: False 7 | 8 | label_count_path: ./data/release/train_label_120_count.pkl 9 | 10 | # model 11 | model: model.agcn.Model 12 | model_args: 13 | num_class: 120 14 | num_point: 25 15 | num_person: 1 16 | graph: graph.ntu_rgb_d.Graph 17 | graph_args: 18 | labeling_mode: 'spatial' 19 | 20 | # test 21 | phase: test 22 | device: [0] 23 | test_batch_size: 128 24 | weights: ./ckpts/wfl_ntu_sk_120_agcn_joint_const_lr_1e-3-157-60356.pt 25 | 26 | work_dir: ./work_dir/babel_v1.0/test_runs/test_wfl_ntu_sk_120_agcn_joint_const_lr_1e-3 27 | save_score: True 28 | -------------------------------------------------------------------------------- /action_recognition/config/babel_v1.0/train_60.yaml: -------------------------------------------------------------------------------- 1 | work_dir: ./work_dir/babel_v1.0/ntu_sk_60_agcn_joint_const_lr_1e-3 2 | model_saved_name: ./runs/babel_v1.0/ntu_sk_60_agcn_joint_const_lr_1e-3 3 | 4 | # feeder 5 | feeder: feeders.feeder.Feeder 6 | train_feeder_args: 7 | data_path: ./data/release/train_ntu_sk_60.npy 8 | label_path: ./data/release/train_label_60.pkl 9 | debug: False 10 | random_choose: False 11 | random_shift: False 12 | random_move: False 13 | window_size: -1 14 | normalization: False 15 | 16 | test_feeder_args: 17 | data_path: ./data/release/val_ntu_sk_60.npy 18 | label_path: ./data/release/val_label_60.pkl 19 | 20 | # model 21 | model: model.agcn.Model 22 | model_args: 23 | num_class: 60 24 | num_person: 1 25 | num_point: 25 26 | graph: graph.ntu_rgb_d.Graph 27 | graph_args: 28 | labeling_mode: 'spatial' 29 | 30 | #optim 31 | weight_decay: 0.0001 32 | base_lr: 0.001 33 | step: [] 34 | 35 | # training 36 | device: [0] 37 | optimizer: 'Adam' 38 | loss: 'CE' 39 | batch_size: 64 40 | test_batch_size: 64 41 | num_epoch: 250 42 | nesterov: True 43 | 44 | # weights: /ps/project/conditional_action_gen/2s_agcn/runs/babel_v1.0/ntu_sk_60_agcn_joint_const_lr_1e-3-49-23450.pt 45 | -------------------------------------------------------------------------------- /action_recognition/config/babel_v1.0/train_120.yaml: -------------------------------------------------------------------------------- 1 | work_dir: ./work_dir/babel_v1.0/ntu_sk_120_agcn_joint_const_lr_1e-3 2 | 
model_saved_name: ./runs/babel_v1.0/ntu_sk_120_agcn_joint_const_lr_1e-3 3 | 4 | # feeder 5 | feeder: feeders.feeder.Feeder 6 | train_feeder_args: 7 | data_path: ./data/release/train_ntu_sk_120.npy 8 | label_path: ./data/release/train_label_120.pkl 9 | debug: False 10 | random_choose: False 11 | random_shift: False 12 | random_move: False 13 | window_size: -1 14 | normalization: False 15 | 16 | test_feeder_args: 17 | data_path: ./data/release/val_ntu_sk_120.npy 18 | label_path: ./data/release/val_label_120.pkl 19 | 20 | # model 21 | model: model.agcn.Model 22 | model_args: 23 | num_class: 120 24 | num_person: 1 25 | num_point: 25 26 | graph: graph.ntu_rgb_d.Graph 27 | graph_args: 28 | labeling_mode: 'spatial' 29 | 30 | #optim 31 | weight_decay: 0.0001 32 | base_lr: 0.001 33 | step: [] 34 | 35 | # training 36 | device: [0] 37 | optimizer: 'Adam' 38 | loss: 'CE' 39 | batch_size: 64 40 | test_batch_size: 64 41 | num_epoch: 250 42 | nesterov: True 43 | 44 | # weights: /ps/project/conditional_action_gen/2s_agcn/runs/babel_v1.0/ntu_sk_120_agcn_joint_const_lr_1e-3-49-23450.pt 45 | -------------------------------------------------------------------------------- /action_recognition/config/babel_v1.0/train_60_wfl.yaml: -------------------------------------------------------------------------------- 1 | work_dir: ./work_dir/babel_v1.0/wfl_ntu_sk_60_agcn_joint_const_lr_1e-3 2 | model_saved_name: ./runs/babel_v1.0/wfl_ntu_sk_60_agcn_joint_const_lr_1e-3 3 | 4 | # feeder 5 | feeder: feeders.feeder.Feeder 6 | train_feeder_args: 7 | data_path: ./data/release/train_ntu_sk_60.npy 8 | label_path: ./data/release/train_label_60.pkl 9 | debug: False 10 | random_choose: False 11 | random_shift: False 12 | random_move: False 13 | window_size: -1 14 | normalization: False 15 | 16 | test_feeder_args: 17 | data_path: ./data/release/val_ntu_sk_60.npy 18 | label_path: ./data/release/val_label_60.pkl 19 | 20 | # model 21 | model: model.agcn.Model 22 | model_args: 23 | num_class: 60 24 | num_person: 1 25 | num_point: 25 26 | graph: graph.ntu_rgb_d.Graph 27 | graph_args: 28 | labeling_mode: 'spatial' 29 | 30 | #optim 31 | weight_decay: 0.0001 32 | base_lr: 0.001 33 | step: [] 34 | 35 | # training 36 | device: [0] 37 | optimizer: 'Adam' 38 | loss: 'focal' 39 | beta: 0.9999 40 | gamma: 1.0 41 | label_count_path: ./data/release/train_label_60_count.pkl 42 | batch_size: 64 43 | test_batch_size: 64 44 | num_epoch: 200 45 | nesterov: True 46 | 47 | # weights: /ps/project/conditional_action_gen/2s_agcn/runs/babel_v1.0/wfl_ntu_sk_60_agcn_joint_const_lr_1e-3-19-8760.pt 48 | -------------------------------------------------------------------------------- /action_recognition/config/babel_v1.0/train_120_wfl.yaml: -------------------------------------------------------------------------------- 1 | work_dir: ./work_dir/babel_v1.0/wfl_ntu_sk_120_agcn_joint_const_lr_1e-3 2 | model_saved_name: ./runs/babel_v1.0/wfl_ntu_sk_120_agcn_joint_const_lr_1e-3 3 | 4 | # feeder 5 | feeder: feeders.feeder.Feeder 6 | train_feeder_args: 7 | data_path: ./data/release/train_ntu_sk_120.npy 8 | label_path: ./data/release/train_label_120.pkl 9 | debug: False 10 | random_choose: False 11 | random_shift: False 12 | random_move: False 13 | window_size: -1 14 | normalization: False 15 | 16 | test_feeder_args: 17 | data_path: ./data/release/val_ntu_sk_120.npy 18 | label_path: ./data/release/val_label_120.pkl 19 | 20 | # model 21 | model: model.agcn.Model 22 | model_args: 23 | num_class: 120 24 | num_person: 1 25 | num_point: 25 26 | graph: graph.ntu_rgb_d.Graph 
27 | graph_args: 28 | labeling_mode: 'spatial' 29 | 30 | #optim 31 | weight_decay: 0.0001 32 | base_lr: 0.001 33 | step: [] 34 | 35 | # training 36 | device: [0] 37 | optimizer: 'Adam' 38 | loss: 'focal' 39 | beta: 0.9999 40 | gamma: 1.0 41 | label_count_path: ./data/release/train_label_120_count.pkl 42 | batch_size: 64 43 | test_batch_size: 64 44 | num_epoch: 200 45 | nesterov: True 46 | 47 | # weights: /ps/project/conditional_action_gen/2s_agcn/runs/babel_v1.0/wfl_ntu_sk_120_agcn_joint_const_lr_1e-3-19-8760.pt 48 | -------------------------------------------------------------------------------- /action_recognition/graph/ntu_rgb_d.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.extend(['../']) 4 | from graph import tools 5 | 6 | num_node = 25 7 | self_link = [(i, i) for i in range(num_node)] 8 | inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), (7, 6), 9 | (8, 7), (9, 21), (10, 9), (11, 10), (12, 11), (13, 1), 10 | (14, 13), (15, 14), (16, 15), (17, 1), (18, 17), (19, 18), 11 | (20, 19), (22, 23), (23, 8), (24, 25), (25, 12)] 12 | inward = [(i - 1, j - 1) for (i, j) in inward_ori_index] 13 | outward = [(j, i) for (i, j) in inward] 14 | neighbor = inward + outward 15 | 16 | 17 | class Graph: 18 | def __init__(self, labeling_mode='spatial'): 19 | self.A = self.get_adjacency_matrix(labeling_mode) 20 | self.num_node = num_node 21 | self.self_link = self_link 22 | self.inward = inward 23 | self.outward = outward 24 | self.neighbor = neighbor 25 | 26 | def get_adjacency_matrix(self, labeling_mode=None): 27 | if labeling_mode is None: 28 | return self.A 29 | if labeling_mode == 'spatial': 30 | A = tools.get_spatial_graph(num_node, self_link, inward, outward) 31 | else: 32 | raise ValueError() 33 | return A 34 | 35 | 36 | if __name__ == '__main__': 37 | import matplotlib.pyplot as plt 38 | import os 39 | 40 | # os.environ['DISPLAY'] = 'localhost:11.0' 41 | A = Graph('spatial').get_adjacency_matrix() 42 | for i in A: 43 | plt.imshow(i, cmap='gray') 44 | plt.show() 45 | print(A) 46 | -------------------------------------------------------------------------------- /action_recognition/graph/kinetics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | 4 | sys.path.extend(['../']) 5 | from graph import tools 6 | import networkx as nx 7 | 8 | # Joint index: 9 | # {0, "Nose"} 10 | # {1, "Neck"}, 11 | # {2, "RShoulder"}, 12 | # {3, "RElbow"}, 13 | # {4, "RWrist"}, 14 | # {5, "LShoulder"}, 15 | # {6, "LElbow"}, 16 | # {7, "LWrist"}, 17 | # {8, "RHip"}, 18 | # {9, "RKnee"}, 19 | # {10, "RAnkle"}, 20 | # {11, "LHip"}, 21 | # {12, "LKnee"}, 22 | # {13, "LAnkle"}, 23 | # {14, "REye"}, 24 | # {15, "LEye"}, 25 | # {16, "REar"}, 26 | # {17, "LEar"}, 27 | 28 | # Edge format: (origin, neighbor) 29 | num_node = 18 30 | self_link = [(i, i) for i in range(num_node)] 31 | inward = [(4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 11), (10, 9), (9, 8), 32 | (11, 5), (8, 2), (5, 1), (2, 1), (0, 1), (15, 0), (14, 0), (17, 15), 33 | (16, 14)] 34 | outward = [(j, i) for (i, j) in inward] 35 | neighbor = inward + outward 36 | 37 | 38 | class Graph: 39 | def __init__(self, labeling_mode='spatial'): 40 | self.A = self.get_adjacency_matrix(labeling_mode) 41 | self.num_node = num_node 42 | self.self_link = self_link 43 | self.inward = inward 44 | self.outward = outward 45 | self.neighbor = neighbor 46 | 47 | def get_adjacency_matrix(self, labeling_mode=None): 48 | if 
labeling_mode is None: 49 | return self.A 50 | if labeling_mode == 'spatial': 51 | A = tools.get_spatial_graph(num_node, self_link, inward, outward) 52 | else: 53 | raise ValueError() 54 | return A 55 | 56 | 57 | if __name__ == '__main__': 58 | A = Graph('spatial').get_adjacency_matrix() 59 | print('') 60 | -------------------------------------------------------------------------------- /action_recognition/data_gen/rotation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | 4 | 5 | def rotation_matrix(axis, theta): 6 | """ 7 | Return the rotation matrix associated with counterclockwise rotation about 8 | the given axis by theta radians. 9 | """ 10 | if np.abs(axis).sum() < 1e-6 or np.abs(theta) < 1e-6: 11 | return np.eye(3) 12 | axis = np.asarray(axis) 13 | axis = axis / math.sqrt(np.dot(axis, axis)) 14 | a = math.cos(theta / 2.0) 15 | b, c, d = -axis * math.sin(theta / 2.0) 16 | aa, bb, cc, dd = a * a, b * b, c * c, d * d 17 | bc, ad, ac, ab, bd, cd = b * c, a * d, a * c, a * b, b * d, c * d 18 | return np.array([[aa + bb - cc - dd, 2 * (bc + ad), 2 * (bd - ac)], 19 | [2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)], 20 | [2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc]]) 21 | 22 | 23 | def unit_vector(vector): 24 | """ Returns the unit vector of the vector. """ 25 | return vector / np.linalg.norm(vector) 26 | 27 | 28 | def angle_between(v1, v2): 29 | """ Returns the angle in radians between vectors 'v1' and 'v2':: 30 | 31 | >>> angle_between((1, 0, 0), (0, 1, 0)) 32 | 1.5707963267948966 33 | >>> angle_between((1, 0, 0), (1, 0, 0)) 34 | 0.0 35 | >>> angle_between((1, 0, 0), (-1, 0, 0)) 36 | 3.141592653589793 37 | """ 38 | if np.abs(v1).sum() < 1e-6 or np.abs(v2).sum() < 1e-6: 39 | return 0 40 | v1_u = unit_vector(v1) 41 | v2_u = unit_vector(v2) 42 | return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0)) 43 | 44 | 45 | def x_rotation(vector, theta): 46 | """Rotates 3-D vector around x-axis""" 47 | R = np.array([[1, 0, 0], [0, np.cos(theta), -np.sin(theta)], [0, np.sin(theta), np.cos(theta)]]) 48 | return np.dot(R, vector) 49 | 50 | 51 | def y_rotation(vector, theta): 52 | """Rotates 3-D vector around y-axis""" 53 | R = np.array([[np.cos(theta), 0, np.sin(theta)], [0, 1, 0], [-np.sin(theta), 0, np.cos(theta)]]) 54 | return np.dot(R, vector) 55 | 56 | 57 | def z_rotation(vector, theta): 58 | """Rotates 3-D vector around z-axis""" 59 | R = np.array([[np.cos(theta), -np.sin(theta), 0], [np.sin(theta), np.cos(theta), 0], [0, 0, 1]]) 60 | return np.dot(R, vector) 61 | -------------------------------------------------------------------------------- /action_recognition/data/action_label_2_idx.json: -------------------------------------------------------------------------------- 1 | { 2 | "walk": 0, 3 | "stand": 1, 4 | "hand movements": 2, 5 | "turn": 3, 6 | "interact with/use object": 4, 7 | "arm movements": 5, 8 | "t pose": 6, 9 | "step": 7, 10 | "backwards movement": 8, 11 | "raising body part": 9, 12 | "look": 10, 13 | "touch object": 11, 14 | "leg movements": 12, 15 | "forward movement": 13, 16 | "circular movement": 14, 17 | "stretch": 15, 18 | "jump": 16, 19 | "touching body part": 17, 20 | "sit": 18, 21 | "place something": 19, 22 | "take/pick something up": 20, 23 | "run": 21, 24 | "bend": 22, 25 | "throw": 23, 26 | "foot movements": 24, 27 | "a pose": 25, 28 | "stand up": 26, 29 | "lowering body part": 27, 30 | "sideways movement": 28, 31 | "move up/down incline": 29, 32 | "action with ball": 30, 33 | "kick": 
31, 34 | "gesture": 32, 35 | "head movements": 33, 36 | "jog": 34, 37 | "grasp object": 35, 38 | "waist movements": 36, 39 | "lift something": 37, 40 | "knee movement": 38, 41 | "wave": 39, 42 | "move something": 40, 43 | "swing body part": 41, 44 | "catch": 42, 45 | "dance": 43, 46 | "lean": 44, 47 | "greet": 45, 48 | "poses": 46, 49 | "touching face": 47, 50 | "sports move": 48, 51 | "exercise/training": 49, 52 | "clean something": 50, 53 | "punch": 51, 54 | "squat": 52, 55 | "scratch": 53, 56 | "hop": 54, 57 | "play sport": 55, 58 | "stumble": 56, 59 | "crossing limbs": 57, 60 | "perform": 58, 61 | "martial art": 59, 62 | "balance": 60, 63 | "kneel": 61, 64 | "shake": 62, 65 | "grab body part": 63, 66 | "clap": 64, 67 | "crouch": 65, 68 | "spin": 66, 69 | "upper body movements": 67, 70 | "knock": 68, 71 | "adjust": 69, 72 | "crawl": 70, 73 | "twist": 71, 74 | "move back to original position": 72, 75 | "bow": 73, 76 | "hit": 74, 77 | "touch ground": 75, 78 | "shoulder movements": 76, 79 | "telephone call": 77, 80 | "grab person": 78, 81 | "play instrument": 79, 82 | "tap": 80, 83 | "spread": 81, 84 | "skip": 82, 85 | "rolling movement": 83, 86 | "jump rope": 84, 87 | "play catch": 85, 88 | "drink": 86, 89 | "evade": 87, 90 | "support": 88, 91 | "point": 89, 92 | "side to side movement": 90, 93 | "stop": 91, 94 | "protect": 92, 95 | "wrist movements": 93, 96 | "stances": 94, 97 | "wait": 95, 98 | "shuffle": 96, 99 | "lunge": 97, 100 | "communicate (vocalise)": 98, 101 | "jumping jacks": 99, 102 | "rub": 100, 103 | "dribble": 101, 104 | "swim": 102, 105 | "sneak": 103, 106 | "to lower a body part": 104, 107 | "misc. abstract action": 105, 108 | "mix": 106, 109 | "limp": 107, 110 | "sway": 108, 111 | "slide": 109, 112 | "cartwheel": 110, 113 | "press something": 111, 114 | "shrug": 112, 115 | "open something": 113, 116 | "leap": 114, 117 | "trip": 115, 118 | "golf": 116, 119 | "move misc. 
body part": 117, 120 | "get injured": 118, 121 | "sudden movement": 119, 122 | "duck": 120, 123 | "flap": 121, 124 | "salute": 122, 125 | "stagger": 123, 126 | "draw": 124, 127 | "tie": 125, 128 | "eat": 126, 129 | "style hair": 127, 130 | "relax": 128, 131 | "pray": 129, 132 | "flip": 130, 133 | "shivering": 131, 134 | "interact with rope": 132, 135 | "march": 133, 136 | "zombie": 134, 137 | "check": 135, 138 | "wiggle": 136, 139 | "bump": 137, 140 | "give something": 138, 141 | "yoga": 139, 142 | "mime": 140, 143 | "wobble": 141, 144 | "release": 142, 145 | "wash": 143, 146 | "stroke": 144, 147 | "rocking movement": 145, 148 | "swipe": 146, 149 | "strafe": 147, 150 | "hang": 148, 151 | "flail arms": 149 152 | } -------------------------------------------------------------------------------- /action_recognition/data_gen/preprocess.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.extend(['../']) 4 | from data_gen.rotation import * 5 | #from rotation import * 6 | from tqdm import tqdm 7 | 8 | 9 | def pre_normalization(data, zaxis=[0, 1], xaxis=[8, 4]): 10 | N, C, T, V, M = data.shape 11 | s = np.transpose(data, [0, 4, 2, 3, 1]) # N, C, T, V, M to N, M, T, V, C 12 | l_m_sk = [] # List idxs of missing skeletons 13 | 14 | print('pad the null frames with the previous frames') 15 | for i_s, skeleton in enumerate(tqdm(s)): # pad 16 | if skeleton.sum() == 0: 17 | print(i_s, ' has no skeleton') 18 | l_m_sk.append(i_s) 19 | for i_p, person in enumerate(skeleton): 20 | if person.sum() == 0: 21 | continue 22 | if person[0].sum() == 0: 23 | index = (person.sum(-1).sum(-1) != 0) 24 | tmp = person[index].copy() 25 | person *= 0 26 | person[:len(tmp)] = tmp 27 | for i_f, frame in enumerate(person): 28 | if frame.sum() == 0: 29 | if person[i_f:].sum() == 0: 30 | rest = len(person) - i_f 31 | num = int(np.ceil(rest / i_f)) 32 | pad = np.concatenate([person[0:i_f] for _ in range(num)], 0)[:rest] 33 | s[i_s, i_p, i_f:] = pad 34 | break 35 | 36 | print('sub the center joint #1 (spine joint in ntu and neck joint in kinetics)') 37 | for i_s, skeleton in enumerate(tqdm(s)): 38 | if skeleton.sum() == 0: 39 | continue 40 | main_body_center = skeleton[0][:, 1:2, :].copy() 41 | for i_p, person in enumerate(skeleton): 42 | if person.sum() == 0: 43 | continue 44 | mask = (person.sum(-1) != 0).reshape(T, V, 1) 45 | s[i_s, i_p] = (s[i_s, i_p] - main_body_center) * mask 46 | 47 | print('parallel the bone between hip(jpt 0) and spine(jpt 1) of the first person to the z axis') 48 | for i_s, skeleton in enumerate(tqdm(s)): 49 | if skeleton.sum() == 0: 50 | continue 51 | joint_bottom = skeleton[0, 0, zaxis[0]] 52 | joint_top = skeleton[0, 0, zaxis[1]] 53 | axis = np.cross(joint_top - joint_bottom, [0, 0, 1]) 54 | angle = angle_between(joint_top - joint_bottom, [0, 0, 1]) 55 | matrix_z = rotation_matrix(axis, angle) 56 | for i_p, person in enumerate(skeleton): 57 | if person.sum() == 0: 58 | continue 59 | for i_f, frame in enumerate(person): 60 | if frame.sum() == 0: 61 | continue 62 | for i_j, joint in enumerate(frame): 63 | s[i_s, i_p, i_f, i_j] = np.dot(matrix_z, joint) 64 | 65 | print( 66 | 'parallel the bone between right shoulder(jpt 8) and left shoulder(jpt 4) of the first person to the x axis') 67 | for i_s, skeleton in enumerate(tqdm(s)): 68 | if skeleton.sum() == 0: 69 | continue 70 | joint_rshoulder = skeleton[0, 0, xaxis[0]] 71 | joint_lshoulder = skeleton[0, 0, xaxis[1]] 72 | axis = np.cross(joint_rshoulder - joint_lshoulder, [1, 0, 0]) 73 | angle 
= angle_between(joint_rshoulder - joint_lshoulder, [1, 0, 0]) 74 | matrix_x = rotation_matrix(axis, angle) 75 | for i_p, person in enumerate(skeleton): 76 | if person.sum() == 0: 77 | continue 78 | for i_f, frame in enumerate(person): 79 | if frame.sum() == 0: 80 | continue 81 | for i_j, joint in enumerate(frame): 82 | s[i_s, i_p, i_f, i_j] = np.dot(matrix_x, joint) 83 | 84 | data = np.transpose(s, [0, 4, 2, 3, 1]) 85 | return data, l_m_sk 86 | 87 | 88 | if __name__ == '__main__': 89 | data = np.load('../data/ntu/xview/val_data.npy') 90 | pre_normalization(data) 91 | np.save('../data/ntu/xview/data_val_pre.npy', data) 92 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | 2 | # BABEL: Bodies, Action and Behavior with English Labels [[CVPR 2021](http://cvpr2021.thecvf.com/)] 3 | 4 |

5 | 6 |

7 | 8 | > [Abhinanda R. Punnakkal\*](https://ps.is.tuebingen.mpg.de/person/apunnakkal), [Arjun Chandrasekaran\*](https://ps.is.tuebingen.mpg.de/person/achandrasekaran), [Nikos Athanasiou](https://ps.is.tuebingen.mpg.de/person/nathanasiou), [Alejandra Quiros-Ramirez](https://ps.is.tuebingen.mpg.de/person/aquiros), [Michael J. Black](https://ps.is.tuebingen.mpg.de/person/black). 9 | > \* denotes equal contribution 10 | 11 | [Project Website](https://babel.is.tue.mpg.de) | [Paper](https://arxiv.org/pdf/2106.09696.pdf) | [Video](https://www.youtube.com/watch?v=BYWxvjKpCqA) | [Poster](https://babel.is.tue.mpg.de/media/upload/CVPR_2021_BABEL_poster.pdf) 12 | 13 | --- 14 | 15 | BABEL is a large dataset with language labels describing the actions being performed in mocap sequences. BABEL labels about 43 hours of mocap sequences from [AMASS](https://amass.is.tue.mpg.de/) [1] with action labels. 16 | Sequences have action labels at two possible levels of abstraction: 17 | - **Sequence labels** which describe the overall action in the sequence 18 | - **Frame labels** which describe all actions in every frame of the sequence. Each frame label is precisely aligned with the duration of the corresponding action in the mocap sequence, and multiple actions can overlap. 19 | 20 | To download the BABEL action labels, visit our ['Data' page](https://babel.is.tue.mpg.de/data.html). You can download the mocap sequences from [AMASS](https://amass.is.tue.mpg.de/). 21 | 22 | 23 | ### Tutorials 24 | 25 | We release some helper code in Jupyter notebooks to load the BABEL dataset, visualize mocap sequences and their action labels, search BABEL for sequences containing specific actions, etc. 26 | 27 | See [`notebooks/`](notebooks/) for more details. 28 | 29 | 30 | ### Action Recognition 31 | 32 | We provide features, training and inference code, and pre-trained checkpoints for 3D skeleton-based action recognition. 33 | 34 | Please see [`action_recognition/`](action_recognition/) for more details. 35 | 36 | 37 | ### Acknowledgements 38 | 39 | We thank the [Software Workshop](https://is.mpg.de/en/software-workshop) at MPI for building the action recognition test set evaluation web server. 40 | The notebooks in this repo are inspired by the those provided by [AMASS](https://github.com/nghorbani/amass). 41 | The Action Recognition code is based on the [2s-AGCN](https://github.com/lshiwjx/2s-AGCN) [2] implementation. 42 | 43 | 44 | ### References 45 | 46 | [1] Mahmood, Naureen, et al. "AMASS: Archive of motion capture as surface shapes." Proceedings of the IEEE/CVF International Conference on Computer Vision. 2019.
47 | [2] Shi, Lei, et al. "Two-stream adaptive graph convolutional networks for skeleton-based action recognition." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019. 48 | 49 | ### License 50 | 51 | Software Copyright License for non-commercial scientific research purposes. Please read carefully the terms and conditions and any accompanying documentation before you download and/or use the BABEL dataset, and software, (the "Model & Software"). By downloading and/or using the Model & Software (including downloading, cloning, installing, and any other use of this GitHub repository), you acknowledge that you have read these terms and conditions, understand them, and agree to be bound by them. If you do not agree with these terms and conditions, you must not download and/or use the Model & Software. Any infringement of the terms of this agreement will automatically terminate your rights under this License. 52 | 53 | ### Contact 54 | 55 | The code in this repository is developed by [Abhinanda Punnakkal](https://www.is.mpg.de/person/apunnakkal) and [Arjun Chandrasekaran](https://www.is.mpg.de/person/achandrasekaran), and tested by [Nikos Athanasiou](https://www.is.mpg.de/person/nathanasiou). 56 | 57 | If you have any questions you can contact us at babel@tue.mpg.de. 58 | 59 | -------------------------------------------------------------------------------- /action_recognition/class_balanced_loss.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | 6 | """ 7 | Code from: 8 | https://raw.githubusercontent.com/vandit15/Class-balanced-loss-pytorch/master/class_balanced_loss.py 9 | 10 | Pytorch implementation of Class-Balanced-Loss 11 | Reference: "Class-Balanced Loss Based on Effective Number of Samples" 12 | Authors: Yin Cui and 13 | Menglin Jia and 14 | Tsung Yi Lin and 15 | Yang Song and 16 | Serge J. Belongie 17 | https://arxiv.org/abs/1901.05555, CVPR'19. 18 | """ 19 | 20 | 21 | import numpy as np 22 | import torch 23 | import torch.nn.functional as F 24 | 25 | 26 | 27 | def focal_loss(labels, logits, alpha, gamma): 28 | """Compute the focal loss between `logits` and the ground truth `labels`. 29 | 30 | Focal loss = -alpha_t * (1-pt)^gamma * log(pt) 31 | where pt is the probability of being classified to the true class. 32 | pt = p (if true class), otherwise pt = 1 - p. p = sigmoid(logit). 33 | 34 | Args: 35 | labels: A float tensor of size [batch, num_classes]. 36 | logits: A float tensor of size [batch, num_classes]. 37 | alpha: A float tensor of size [batch_size] 38 | specifying per-example weight for balanced cross entropy. 39 | gamma: A float scalar modulating loss from hard and easy examples. 40 | 41 | Returns: 42 | focal_loss: A float32 scalar representing normalized total loss. 43 | """ 44 | BCLoss = F.binary_cross_entropy_with_logits(input = logits, target = labels,reduction = "none") 45 | 46 | if gamma == 0.0: 47 | modulator = 1.0 48 | else: 49 | modulator = torch.exp(-gamma * labels * logits - gamma * torch.log(1 + 50 | torch.exp(-1.0 * logits))) 51 | 52 | loss = modulator * BCLoss 53 | 54 | weighted_loss = alpha * loss 55 | focal_loss = torch.sum(weighted_loss) 56 | 57 | focal_loss /= torch.sum(labels) 58 | return focal_loss 59 | 60 | 61 | def CB_loss(labels, logits, samples_per_cls, no_of_classes, loss_type, beta, gamma, device): 62 | """Compute the Class Balanced Loss between `logits` and the ground truth `labels`. 
63 | 64 | Class Balanced Loss: ((1-beta)/(1-beta^n))*Loss(labels, logits) 65 | where Loss is one of the standard losses used for Neural Networks. 66 | 67 | Args: 68 | labels: A int tensor of size [batch]. 69 | logits: A float tensor of size [batch, no_of_classes]. 70 | samples_per_cls: A python list of size [no_of_classes]. 71 | no_of_classes: total number of classes. int 72 | loss_type: string. One of "sigmoid", "focal", "softmax". 73 | beta: float. Hyperparameter for Class balanced loss. 74 | gamma: float. Hyperparameter for Focal loss. 75 | 76 | Returns: 77 | cb_loss: A float tensor representing class balanced loss 78 | """ 79 | effective_num = 1.0 - np.power(beta, samples_per_cls) 80 | weights = (1.0 - beta) / np.array(effective_num) 81 | weights = weights / np.sum(weights) * no_of_classes 82 | 83 | labels_one_hot = F.one_hot(labels, no_of_classes).float().cuda(device) 84 | 85 | weights = torch.tensor(weights).float().cuda(device) 86 | weights = weights.unsqueeze(0) 87 | weights = weights.repeat(labels_one_hot.shape[0],1) * labels_one_hot 88 | weights = weights.sum(1) 89 | weights = weights.unsqueeze(1) 90 | weights = weights.repeat(1,no_of_classes) 91 | 92 | if loss_type == "focal": 93 | cb_loss = focal_loss(labels_one_hot, logits, weights, gamma) 94 | elif loss_type == "sigmoid": 95 | cb_loss = F.binary_cross_entropy_with_logits(input = logits,target = labels_one_hot, weight = weights) 96 | elif loss_type == "softmax": 97 | pred = logits.softmax(dim = 1) 98 | cb_loss = F.binary_cross_entropy(input = pred, target = labels_one_hot, weight = weights) 99 | return cb_loss 100 | 101 | 102 | def test(): 103 | no_of_classes = 5 104 | logits = torch.rand(10,no_of_classes).float() 105 | labels = torch.randint(0,no_of_classes, size = (10,)) 106 | beta = 0.9999 107 | gamma = 2.0 108 | samples_per_cls = [2,3,1,2,2] 109 | loss_type = "focal" 110 | cb_loss = CB_loss(labels, logits, samples_per_cls, no_of_classes,loss_type, beta, gamma) 111 | print(cb_loss) 112 | -------------------------------------------------------------------------------- /action_recognition/challenge/create_submission.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2021 achandrasekaran 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | import sys, os, pdb, glob 10 | import uuid 11 | from os.path import join as ospj 12 | from os.path import dirname as ospd 13 | import json, pickle 14 | import argparse 15 | from tqdm import tqdm 16 | from collections import * 17 | 18 | import numpy as np 19 | import pandas as pd 20 | from pandas.core.common import flatten 21 | from fnmatch import fnmatch 22 | import re 23 | 24 | 25 | def load_test_scores(test_scores_fp): 26 | ''' 27 | Load the score prediction file, validate. 
28 | 29 | Format of data structure stored in prediction file: 30 | score_dict = list(zip( 31 | self.data_loader[ln].dataset.label[1], # sid 32 | self.data_loader[ln].dataset.sample_name, # seg_id 33 | self.data_loader[ln].dataset.label[2], # chunk_id 34 | score)) 35 | ''' 36 | # load test set predictions from model 37 | test_scores = pickle.load(open(test_scores_fp, 'rb')) 38 | 39 | # GT labels (-1 for test set), seg_id, chunk_id, score 40 | _, seg_ids, chunk_ids, scores = zip(*test_scores) 41 | 42 | # Validate the shape of predictions 43 | scores = np.array(scores) 44 | n_samples, n_classes = scores.shape 45 | assert n_classes in (60, 120) 46 | 47 | return list(zip(seg_ids, chunk_ids, scores)), n_classes 48 | 49 | 50 | def load_test_samples(n_classes): 51 | '''Load the GT samples corresponding to the BABEL subset (# classes) used. 52 | 53 | GT labels data structure format: 54 | List of seg_id, (label, sid, chunk_n, anntr_id) 55 | 56 | Arguments: 57 | scores: np.array (n_samples, n_classes) contains predicted scores for samples. 58 | ''' 59 | # load test set samples 60 | samples_filename = f'test_label_{n_classes}.pkl' 61 | test_samples = pickle.load(open(f'../data/release/{samples_filename}', 'rb')) 62 | 63 | # GT labels (-1 for test set), sid, chunk_id, anntr_id 64 | seg_ids, (_, _, chunk_ids, _) = test_samples 65 | 66 | return list(zip(seg_ids, chunk_ids)) 67 | 68 | 69 | def create_submission(test_samples, test_pred_scores, n_classes): 70 | '''Create a submission with the same ordering of samples 71 | as provided in the `test_label_{60, 120}.pkl` file. 72 | ''' 73 | submission = [] 74 | perfect_map = True 75 | 76 | # Ideal scenario -- 1:1 map between samples in two files 77 | for i, ((seg_id, chunk_id), (pred_seg_id, pred_chunk_id, _)) in \ 78 | enumerate(zip(test_samples, test_pred_scores)): 79 | if seg_id != pred_seg_id or chunk_id != pred_chunk_id: 80 | perfect_map = False 81 | 82 | if True == perfect_map: 83 | submission = np.array(list(zip(*test_pred_scores))[2]) 84 | else: 85 | # For each sample, find its predicted score 86 | for i, (seg_id, chunk_id) in enumerate(test_samples): 87 | for pred_seg_id, pred_chunk_id, score in test_pred_scores: 88 | if pred_seg_id == seg_id and pred_chunk_id == chunk_id: 89 | submission.append(score) 90 | break 91 | submission = np.array(submission) 92 | if 60 == n_classes: 93 | assert 15647 == submission.shape[0] 94 | elif 120 == n_classes: 95 | assert 16839 == submission.shape[0] 96 | 97 | return submission 98 | 99 | 100 | def save_submission(submission, filepath): 101 | '''Save predicted scores for test samples in .npz format for 102 | submission to BABEL Action Recognition Challenge. 
103 | ''' 104 | np.savez(filepath, submission) 105 | print(f'Successfully saved submission in: {filepath}') 106 | 107 | return None 108 | 109 | 110 | if __name__ == '__main__': 111 | # Add args 112 | parser = argparse.ArgumentParser( 113 | description='Predicted test scores --> Submission to server') 114 | parser.add_argument( 115 | '--pred_path', 116 | default='./epoch1_test_score.pkl', 117 | help='Path to file containing model predictions (saved to disk by train_test.py.') 118 | parser.add_argument( 119 | '--sub_path', 120 | default='./test_sub.npz', 121 | help='Path to write submission file.') 122 | 123 | # Parse args 124 | args = parser.parse_args() 125 | 126 | # Process scores into submission file 127 | test_pred_scores, n_classes = load_test_scores(args.pred_path) 128 | test_samples = load_test_samples(n_classes) 129 | submission = create_submission(test_samples, test_pred_scores, n_classes) 130 | save_submission(submission, args.sub_path) 131 | -------------------------------------------------------------------------------- /action_recognition/feeders/tools.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | def downsample(data_numpy, step, random_sample=True): 5 | # input: C,T,V,M 6 | begin = np.random.randint(step) if random_sample else 0 7 | return data_numpy[:, begin::step, :, :] 8 | 9 | 10 | def temporal_slice(data_numpy, step): 11 | # input: C,T,V,M 12 | C, T, V, M = data_numpy.shape 13 | return data_numpy.reshape(C, T / step, step, V, M).transpose( 14 | (0, 1, 3, 2, 4)).reshape(C, T / step, V, step * M) 15 | 16 | 17 | def mean_subtractor(data_numpy, mean): 18 | # input: C,T,V,M 19 | # naive version 20 | if mean == 0: 21 | return 22 | C, T, V, M = data_numpy.shape 23 | valid_frame = (data_numpy != 0).sum(axis=3).sum(axis=2).sum(axis=0) > 0 24 | begin = valid_frame.argmax() 25 | end = len(valid_frame) - valid_frame[::-1].argmax() 26 | data_numpy[:, :end, :, :] = data_numpy[:, :end, :, :] - mean 27 | return data_numpy 28 | 29 | 30 | def auto_pading(data_numpy, size, random_pad=False): 31 | C, T, V, M = data_numpy.shape 32 | if T < size: 33 | begin = random.randint(0, size - T) if random_pad else 0 34 | data_numpy_paded = np.zeros((C, size, V, M)) 35 | data_numpy_paded[:, begin:begin + T, :, :] = data_numpy 36 | return data_numpy_paded 37 | else: 38 | return data_numpy 39 | 40 | 41 | def random_choose(data_numpy, size, auto_pad=True): 42 | # input: C,T,V,M 随机选择其中一段,不是很合理。因为有0 43 | C, T, V, M = data_numpy.shape 44 | if T == size: 45 | return data_numpy 46 | elif T < size: 47 | if auto_pad: 48 | return auto_pading(data_numpy, size, random_pad=True) 49 | else: 50 | return data_numpy 51 | else: 52 | begin = random.randint(0, T - size) 53 | return data_numpy[:, begin:begin + size, :, :] 54 | 55 | 56 | def random_move(data_numpy, 57 | angle_candidate=[-10., -5., 0., 5., 10.], 58 | scale_candidate=[0.9, 1.0, 1.1], 59 | transform_candidate=[-0.2, -0.1, 0.0, 0.1, 0.2], 60 | move_time_candidate=[1]): 61 | # input: C,T,V,M 62 | C, T, V, M = data_numpy.shape 63 | move_time = random.choice(move_time_candidate) 64 | node = np.arange(0, T, T * 1.0 / move_time).round().astype(int) 65 | node = np.append(node, T) 66 | num_node = len(node) 67 | 68 | A = np.random.choice(angle_candidate, num_node) 69 | S = np.random.choice(scale_candidate, num_node) 70 | T_x = np.random.choice(transform_candidate, num_node) 71 | T_y = np.random.choice(transform_candidate, num_node) 72 | 73 | a = np.zeros(T) 74 | s = np.zeros(T) 75 | 
t_x = np.zeros(T) 76 | t_y = np.zeros(T) 77 | 78 | # linspace 79 | for i in range(num_node - 1): 80 | a[node[i]:node[i + 1]] = np.linspace( 81 | A[i], A[i + 1], node[i + 1] - node[i]) * np.pi / 180 82 | s[node[i]:node[i + 1]] = np.linspace(S[i], S[i + 1], 83 | node[i + 1] - node[i]) 84 | t_x[node[i]:node[i + 1]] = np.linspace(T_x[i], T_x[i + 1], 85 | node[i + 1] - node[i]) 86 | t_y[node[i]:node[i + 1]] = np.linspace(T_y[i], T_y[i + 1], 87 | node[i + 1] - node[i]) 88 | 89 | theta = np.array([[np.cos(a) * s, -np.sin(a) * s], 90 | [np.sin(a) * s, np.cos(a) * s]]) # xuanzhuan juzhen 91 | 92 | # perform transformation 93 | for i_frame in range(T): 94 | xy = data_numpy[0:2, i_frame, :, :] 95 | new_xy = np.dot(theta[:, :, i_frame], xy.reshape(2, -1)) 96 | new_xy[0] += t_x[i_frame] 97 | new_xy[1] += t_y[i_frame] # pingyi bianhuan 98 | data_numpy[0:2, i_frame, :, :] = new_xy.reshape(2, V, M) 99 | 100 | return data_numpy 101 | 102 | 103 | def random_shift(data_numpy): 104 | # input: C,T,V,M 偏移其中一段 105 | C, T, V, M = data_numpy.shape 106 | data_shift = np.zeros(data_numpy.shape) 107 | valid_frame = (data_numpy != 0).sum(axis=3).sum(axis=2).sum(axis=0) > 0 108 | begin = valid_frame.argmax() 109 | end = len(valid_frame) - valid_frame[::-1].argmax() 110 | 111 | size = end - begin 112 | bias = random.randint(0, T - size) 113 | data_shift[:, bias:bias + size, :, :] = data_numpy[:, begin:end, :, :] 114 | 115 | return data_shift 116 | 117 | 118 | def openpose_match(data_numpy): 119 | C, T, V, M = data_numpy.shape 120 | assert (C == 3) 121 | score = data_numpy[2, :, :, :].sum(axis=1) 122 | # the rank of body confidence in each frame (shape: T-1, M) 123 | rank = (-score[0:T - 1]).argsort(axis=1).reshape(T - 1, M) 124 | 125 | # data of frame 1 126 | xy1 = data_numpy[0:2, 0:T - 1, :, :].reshape(2, T - 1, V, M, 1) 127 | # data of frame 2 128 | xy2 = data_numpy[0:2, 1:T, :, :].reshape(2, T - 1, V, 1, M) 129 | # square of distance between frame 1&2 (shape: T-1, M, M) 130 | distance = ((xy2 - xy1) ** 2).sum(axis=2).sum(axis=0) 131 | 132 | # match pose 133 | forward_map = np.zeros((T, M), dtype=int) - 1 134 | forward_map[0] = range(M) 135 | for m in range(M): 136 | choose = (rank == m) 137 | forward = distance[choose].argmin(axis=1) 138 | for t in range(T - 1): 139 | distance[t, :, forward[t]] = np.inf 140 | forward_map[1:][choose] = forward 141 | assert (np.all(forward_map >= 0)) 142 | 143 | # string data 144 | for t in range(T - 1): 145 | forward_map[t + 1] = forward_map[t + 1][forward_map[t]] 146 | 147 | # generate data 148 | new_data_numpy = np.zeros(data_numpy.shape) 149 | for t in range(T): 150 | new_data_numpy[:, t, :, :] = data_numpy[:, t, :, forward_map[ 151 | t]].transpose(1, 2, 0) 152 | data_numpy = new_data_numpy 153 | 154 | # score sort 155 | trace_score = data_numpy[2, :, :, :].sum(axis=1).sum(axis=0) 156 | rank = (-trace_score).argsort() 157 | data_numpy = data_numpy[:, :, :, rank] 158 | 159 | return data_numpy 160 | -------------------------------------------------------------------------------- /action_recognition/model/agcn.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | from torch.autograd import Variable 7 | 8 | def import_class(name): 9 | components = name.split('.') 10 | mod = __import__(components[0]) 11 | for comp in components[1:]: 12 | mod = getattr(mod, comp) 13 | return mod 14 | 15 | 16 | def conv_branch_init(conv, branches): 17 | weight = conv.weight 18 | n = 
weight.size(0) 19 | k1 = weight.size(1) 20 | k2 = weight.size(2) 21 | nn.init.normal_(weight, 0, math.sqrt(2. / (n * k1 * k2 * branches))) 22 | nn.init.constant_(conv.bias, 0) 23 | 24 | 25 | def conv_init(conv): 26 | nn.init.kaiming_normal_(conv.weight, mode='fan_out') 27 | nn.init.constant_(conv.bias, 0) 28 | 29 | 30 | def bn_init(bn, scale): 31 | nn.init.constant_(bn.weight, scale) 32 | nn.init.constant_(bn.bias, 0) 33 | 34 | 35 | class unit_tcn(nn.Module): 36 | def __init__(self, in_channels, out_channels, kernel_size=9, stride=1): 37 | super(unit_tcn, self).__init__() 38 | pad = int((kernel_size - 1) / 2) 39 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel_size, 1), padding=(pad, 0), 40 | stride=(stride, 1)) 41 | 42 | self.bn = nn.BatchNorm2d(out_channels) 43 | self.relu = nn.ReLU() 44 | conv_init(self.conv) 45 | bn_init(self.bn, 1) 46 | 47 | def forward(self, x): 48 | x = self.bn(self.conv(x)) 49 | return x 50 | 51 | 52 | class unit_gcn(nn.Module): 53 | def __init__(self, in_channels, out_channels, A, coff_embedding=4, num_subset=3): 54 | super(unit_gcn, self).__init__() 55 | inter_channels = out_channels // coff_embedding 56 | self.inter_c = inter_channels 57 | self.PA = nn.Parameter(torch.from_numpy(A.astype(np.float32))) 58 | nn.init.constant_(self.PA, 1e-6) 59 | self.A = Variable(torch.from_numpy(A.astype(np.float32)), requires_grad=False) 60 | self.num_subset = num_subset 61 | 62 | self.conv_a = nn.ModuleList() 63 | self.conv_b = nn.ModuleList() 64 | self.conv_d = nn.ModuleList() 65 | for i in range(self.num_subset): 66 | self.conv_a.append(nn.Conv2d(in_channels, inter_channels, 1)) 67 | self.conv_b.append(nn.Conv2d(in_channels, inter_channels, 1)) 68 | self.conv_d.append(nn.Conv2d(in_channels, out_channels, 1)) 69 | 70 | if in_channels != out_channels: 71 | self.down = nn.Sequential( 72 | nn.Conv2d(in_channels, out_channels, 1), 73 | nn.BatchNorm2d(out_channels) 74 | ) 75 | else: 76 | self.down = lambda x: x 77 | 78 | self.bn = nn.BatchNorm2d(out_channels) 79 | self.soft = nn.Softmax(-2) 80 | self.relu = nn.ReLU() 81 | 82 | for m in self.modules(): 83 | if isinstance(m, nn.Conv2d): 84 | conv_init(m) 85 | elif isinstance(m, nn.BatchNorm2d): 86 | bn_init(m, 1) 87 | bn_init(self.bn, 1e-6) 88 | for i in range(self.num_subset): 89 | conv_branch_init(self.conv_d[i], self.num_subset) 90 | 91 | def forward(self, x): 92 | N, C, T, V = x.size() 93 | A = self.A 94 | if -1 != x.get_device(): 95 | A = A.cuda(x.get_device()) 96 | A = A + self.PA 97 | 98 | y = None 99 | for i in range(self.num_subset): 100 | A1 = self.conv_a[i](x).permute(0, 3, 1, 2).contiguous().view(N, V, self.inter_c * T) 101 | A2 = self.conv_b[i](x).view(N, self.inter_c * T, V) 102 | A1 = self.soft(torch.matmul(A1, A2) / A1.size(-1)) # N V V 103 | A1 = A1 + A[i] 104 | A2 = x.view(N, C * T, V) 105 | z = self.conv_d[i](torch.matmul(A2, A1).view(N, C, T, V)) 106 | y = z + y if y is not None else z 107 | 108 | y = self.bn(y) 109 | y += self.down(x) 110 | return self.relu(y) 111 | 112 | 113 | class TCN_GCN_unit(nn.Module): 114 | def __init__(self, in_channels, out_channels, A, stride=1, residual=True): 115 | super(TCN_GCN_unit, self).__init__() 116 | self.gcn1 = unit_gcn(in_channels, out_channels, A) 117 | self.tcn1 = unit_tcn(out_channels, out_channels, stride=stride) 118 | self.relu = nn.ReLU() 119 | if not residual: 120 | self.residual = lambda x: 0 121 | 122 | elif (in_channels == out_channels) and (stride == 1): 123 | self.residual = lambda x: x 124 | 125 | else: 126 | self.residual = 
unit_tcn(in_channels, out_channels, kernel_size=1, stride=stride) 127 | 128 | def forward(self, x): 129 | x = self.tcn1(self.gcn1(x)) + self.residual(x) 130 | return self.relu(x) 131 | 132 | 133 | class Model(nn.Module): 134 | def __init__(self, num_class=60, num_point=25, num_person=2, graph=None, graph_args=dict(), in_channels=3): 135 | super(Model, self).__init__() 136 | 137 | if graph is None: 138 | raise ValueError() 139 | else: 140 | Graph = import_class(graph) 141 | self.graph = Graph(**graph_args) 142 | 143 | A = self.graph.A 144 | self.data_bn = nn.BatchNorm1d(num_person * in_channels * num_point) 145 | 146 | self.l1 = TCN_GCN_unit(3, 64, A, residual=False) 147 | self.l2 = TCN_GCN_unit(64, 64, A) 148 | self.l3 = TCN_GCN_unit(64, 64, A) 149 | self.l4 = TCN_GCN_unit(64, 64, A) 150 | self.l5 = TCN_GCN_unit(64, 128, A, stride=2) 151 | self.l6 = TCN_GCN_unit(128, 128, A) 152 | self.l7 = TCN_GCN_unit(128, 128, A) 153 | self.l8 = TCN_GCN_unit(128, 256, A, stride=2) 154 | self.l9 = TCN_GCN_unit(256, 256, A) 155 | self.l10 = TCN_GCN_unit(256, 256, A) 156 | 157 | self.fc = nn.Linear(256, num_class) 158 | nn.init.normal_(self.fc.weight, 0, math.sqrt(2. / num_class)) 159 | bn_init(self.data_bn, 1) 160 | 161 | def forward(self, x): 162 | N, C, T, V, M = x.size() 163 | 164 | x = x.permute(0, 4, 3, 1, 2).contiguous().view(N, M * V * C, T) 165 | x = self.data_bn(x) 166 | x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, 2).contiguous().view(N * M, C, T, V) 167 | 168 | x = self.l1(x) 169 | x = self.l2(x) 170 | x = self.l3(x) 171 | x = self.l4(x) 172 | x = self.l5(x) 173 | x = self.l6(x) 174 | x = self.l7(x) 175 | x = self.l8(x) 176 | x = self.l9(x) 177 | x = self.l10(x) 178 | 179 | # N*M,C,T,V 180 | c_new = x.size(1) 181 | x = x.view(N, M, c_new, -1) 182 | x = x.mean(3).mean(1) 183 | 184 | return self.fc(x) 185 | -------------------------------------------------------------------------------- /action_recognition/model/agcn_mult.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | from torch.autograd import Variable 7 | 8 | 9 | def import_class(name): 10 | components = name.split('.') 11 | mod = __import__(components[0]) 12 | for comp in components[1:]: 13 | mod = getattr(mod, comp) 14 | return mod 15 | 16 | 17 | def conv_branch_init(conv, branches): 18 | weight = conv.weight 19 | n = weight.size(0) 20 | k1 = weight.size(1) 21 | k2 = weight.size(2) 22 | nn.init.normal_(weight, 0, math.sqrt(2. 
/ (n * k1 * k2 * branches))) 23 | nn.init.constant_(conv.bias, 0) 24 | 25 | 26 | def conv_init(conv): 27 | nn.init.kaiming_normal_(conv.weight, mode='fan_out') 28 | nn.init.constant_(conv.bias, 0) 29 | 30 | 31 | def bn_init(bn, scale): 32 | nn.init.constant_(bn.weight, scale) 33 | nn.init.constant_(bn.bias, 0) 34 | 35 | 36 | class unit_tcn(nn.Module): 37 | def __init__(self, in_channels, out_channels, kernel_size=9, stride=1): 38 | super(unit_tcn, self).__init__() 39 | pad = int((kernel_size - 1) / 2) 40 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel_size, 1), padding=(pad, 0), 41 | stride=(stride, 1)) 42 | 43 | self.bn = nn.BatchNorm2d(out_channels) 44 | self.relu = nn.ReLU() 45 | conv_init(self.conv) 46 | bn_init(self.bn, 1) 47 | 48 | def forward(self, x): 49 | x = self.bn(self.conv(x)) 50 | return x 51 | 52 | 53 | class unit_gcn(nn.Module): 54 | def __init__(self, in_channels, out_channels, A, coff_embedding=4, num_subset=3): 55 | super(unit_gcn, self).__init__() 56 | inter_channels = out_channels // coff_embedding 57 | self.inter_c = inter_channels 58 | self.PA = nn.Parameter(torch.from_numpy(A.astype(np.float32))) 59 | nn.init.constant_(self.PA, 1e-6) 60 | self.A = Variable(torch.from_numpy(A.astype(np.float32)), requires_grad=False) 61 | self.num_subset = num_subset 62 | 63 | self.conv_a = nn.ModuleList() 64 | self.conv_b = nn.ModuleList() 65 | self.conv_d = nn.ModuleList() 66 | for i in range(self.num_subset): 67 | self.conv_a.append(nn.Conv2d(in_channels, inter_channels, 1)) 68 | self.conv_b.append(nn.Conv2d(in_channels, inter_channels, 1)) 69 | self.conv_d.append(nn.Conv2d(in_channels, out_channels, 1)) 70 | 71 | if in_channels != out_channels: 72 | self.down = nn.Sequential( 73 | nn.Conv2d(in_channels, out_channels, 1), 74 | nn.BatchNorm2d(out_channels) 75 | ) 76 | else: 77 | self.down = lambda x: x 78 | 79 | self.bn = nn.BatchNorm2d(out_channels) 80 | self.soft = nn.Softmax(-2) 81 | self.relu = nn.ReLU() 82 | 83 | for m in self.modules(): 84 | if isinstance(m, nn.Conv2d): 85 | conv_init(m) 86 | elif isinstance(m, nn.BatchNorm2d): 87 | bn_init(m, 1) 88 | bn_init(self.bn, 1e-6) 89 | for i in range(self.num_subset): 90 | conv_branch_init(self.conv_d[i], self.num_subset) 91 | 92 | def forward(self, x): 93 | N, C, T, V = x.size() 94 | A = self.A.cuda(x.get_device()) 95 | A = A + self.PA 96 | 97 | y = None 98 | for i in range(self.num_subset): 99 | A1 = self.conv_a[i](x).permute(0, 3, 1, 2).contiguous().view(N, V, self.inter_c * T) 100 | A2 = self.conv_b[i](x).view(N, self.inter_c * T, V) 101 | A1 = self.soft(torch.matmul(A1, A2) / A1.size(-1)) # N V V 102 | A1 = A1 + A[i] 103 | A2 = x.view(N, C * T, V) 104 | z = self.conv_d[i](torch.matmul(A2, A1).view(N, C, T, V)) 105 | y = z + y if y is not None else z 106 | 107 | y = self.bn(y) 108 | y += self.down(x) 109 | return self.relu(y) 110 | 111 | 112 | class TCN_GCN_unit(nn.Module): 113 | def __init__(self, in_channels, out_channels, A, stride=1, residual=True): 114 | super(TCN_GCN_unit, self).__init__() 115 | self.gcn1 = unit_gcn(in_channels, out_channels, A) 116 | self.tcn1 = unit_tcn(out_channels, out_channels, stride=stride) 117 | self.relu = nn.ReLU() 118 | if not residual: 119 | self.residual = lambda x: 0 120 | 121 | elif (in_channels == out_channels) and (stride == 1): 122 | self.residual = lambda x: x 123 | 124 | else: 125 | self.residual = unit_tcn(in_channels, out_channels, kernel_size=1, stride=stride) 126 | 127 | def forward(self, x): 128 | x = self.tcn1(self.gcn1(x)) + self.residual(x) 129 | return 
self.relu(x) 130 | 131 | 132 | class Model(nn.Module): 133 | def __init__(self, num_class=60, num_point=25, loss_type='softmax', num_person=2, graph=None, graph_args=dict(), in_channels=3): 134 | super(Model, self).__init__() 135 | 136 | if graph is None: 137 | raise ValueError() 138 | else: 139 | Graph = import_class(graph) 140 | self.graph = Graph(**graph_args) 141 | 142 | A = self.graph.A 143 | self.data_bn = nn.BatchNorm1d(num_person * in_channels * num_point) 144 | 145 | self.l1 = TCN_GCN_unit(3, 64, A, residual=False) 146 | self.l2 = TCN_GCN_unit(64, 64, A) 147 | self.l3 = TCN_GCN_unit(64, 64, A) 148 | self.l4 = TCN_GCN_unit(64, 64, A) 149 | self.l5 = TCN_GCN_unit(64, 128, A, stride=2) 150 | self.l6 = TCN_GCN_unit(128, 128, A) 151 | self.l7 = TCN_GCN_unit(128, 128, A) 152 | self.l8 = TCN_GCN_unit(128, 256, A, stride=2) 153 | self.l9 = TCN_GCN_unit(256, 256, A) 154 | self.l10 = TCN_GCN_unit(256, 256, A) 155 | 156 | 157 | self.fc = nn.Linear(256, num_class) 158 | self.sig = nn.Sigmoid() 159 | nn.init.normal_(self.fc.weight, 0, math.sqrt(2. / num_class)) 160 | if loss_type == 'sigmoid' or loss_type == 'focal' or loss_type=='focal2': 161 | nn.init.constant(self.fc.bias, -np.log(num_class - 1)) 162 | # self.sof = nn.Softmax(-1) 163 | bn_init(self.data_bn, 1) 164 | 165 | def forward(self, x): 166 | N, C, T, V, M = x.size() 167 | 168 | x = x.permute(0, 4, 3, 1, 2).contiguous().view(N, M * V * C, T) 169 | x = self.data_bn(x) 170 | x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, 2).contiguous().view(N * M, C, T, V) 171 | 172 | x = self.l1(x) 173 | x = self.l2(x) 174 | x = self.l3(x) 175 | x = self.l4(x) 176 | x = self.l5(x) 177 | x = self.l6(x) 178 | x = self.l7(x) 179 | x = self.l8(x) 180 | x = self.l9(x) 181 | x = self.l10(x) 182 | 183 | # N*M,C,T,V 184 | c_new = x.size(1) 185 | x = x.view(N, M, c_new, -1) 186 | x = x.mean(3).mean(1) 187 | 188 | x = self.fc(x) 189 | # import pdb 190 | # pdb.set_trace() 191 | x = self.sig(x) 192 | return x 193 | -------------------------------------------------------------------------------- /action_recognition/Readme.md: -------------------------------------------------------------------------------- 1 | ## Action Recognition 2 | 3 | We follow the 3D skeleton-based action recognition setup and [implementation](https://github.com/lshiwjx/2s-AGCN) from Shi et al. [2] 4 | 5 | ### Task 6 | 7 | **Sample** `(n_frames, feat_dim)`: Each action segment (start-end span) from BABEL is divided into contiguous 5-second chunks. See the [paper](https://arxiv.org/pdf/2106.09696.pdf) for more details. 8 | **Label** ``: Index of the ground-truth action label of the segment that the current chunk belongs to. 9 | 10 | 11 | ### Features 12 | 13 | We extract the joint positions (in `x, y, z` co-ordinates) from the AMASS mocap sequences in NTU RGB+D [1] skeleton format. There are 25 joints, resulting in `feat_dim=25*3=75`. 14 | 15 | Each sample is a 5-second chunk @ 30fps, resulting in `n_frames=150`. 16 | 17 | Pre-preprocessing of the skeleton joints follows Shi et al. [2]. Download the pre-processed sample features and corresponding labels: 18 | 19 | ``` 20 | # BABEL Dense 21 | cd data/ 22 | wget https://human-movement.is.tue.mpg.de/babel_feats_labels.tar.gz 23 | tar -xzvf babel_feats_labels.tar.gz -C ./ 24 | 25 | # BABEL Dense+Extra 26 | wget https://human-movement.is.tue.mpg.de/babel_dense_and_extra_feats_labels.tar.gz 27 | tar -xzvf babel_dense_and_extra_feats_labels.tar.gz -C ./ 28 | ``` 29 | 30 | Note: We only train and test with Dense annotations. 
For details regarding Dense and Extra annotations, please see BABEL's [Data page](https://babel.is.tue.mpg.de/data.html). 31 |  32 |  33 | ### Training and Inference 34 |  35 | Set up and activate a virtual environment: 36 |  37 | ``` 38 | python3 -m venv babel-env 39 | source $PWD/babel-env/bin/activate 40 | $PWD/babel-env/bin/pip install --upgrade pip setuptools 41 | $PWD/babel-env/bin/pip install -r requirements.txt 42 | ``` 43 |  44 | #### Model 45 |  46 | We use [this](https://github.com/lshiwjx/2s-AGCN) implementation of the 2S-AGCN [2] model for 3D skeleton-based action recognition. Note that we use only the Joint stream. 47 |  48 |  49 | #### Training 50 |  51 | To train a model with the Cross-Entropy (CE) loss: 52 |  53 | From the top directory `babel/`, enter the following: 54 |  55 | ```python action_recognition/train_test.py --config action_recognition/config/babel_v1.0/train_60.yaml``` 56 |  57 | To train a model with the Focal loss [3] and class-balancing [4]: 58 |  59 | ```python action_recognition/train_test.py --config action_recognition/config/babel_v1.0/train_60_wfl.yaml``` 60 |  61 | Use the respective configuration files inside `config/babel_v1.0` to train models on `120` classes with either loss. 62 |  63 |  64 | #### Inference 65 |  66 | Provide the path to the trained model in the `weights` key of the respective config file. 67 |  68 | To perform inference, use the same command as for training, and pass the test config file as an argument. E.g.: 69 |  70 | ```python action_recognition/train_test.py --config action_recognition/config/babel_v1.0/test_60.yaml``` 71 |  72 | or 73 |  74 | ```python action_recognition/train_test.py --config action_recognition/config/babel_v1.0/test_60_wfl.yaml``` 75 |  76 | To save the predicted scores to disk, set `save_score: True` in the config file. 77 |  78 | ### Pre-trained models 79 |  80 | Download the checkpoints from the links below and place them in `action_recognition/ckpts/`. 81 |  82 | Performing inference on the validation set with these checkpoints should result in the following performance. 83 |  84 | | \# Classes | Loss type | Ckpt | Top-5 | Top-1 | Top-1-norm | 85 | |---|---|---|---|---|---| 86 | | BABEL-60 | CE | [ntu_sk_60_agcn_joint_const_lr_1e-3-17-6390.pt](https://human-movement.is.tue.mpg.de/release/ckpts/ntu_sk_60_agcn_joint_const_lr_1e-3-17-6390.pt) | 0.74 | 0.42 | 0.24 | 87 | | BABEL-60 | Focal | [wfl_ntu_sk_60_agcn_joint_const_lr_1e-3-93-33370.pt](https://human-movement.is.tue.mpg.de/release/ckpts/wfl_ntu_sk_60_agcn_joint_const_lr_1e-3-93-33370.pt) | 0.69 | 0.34 | 0.30 | 88 | | BABEL-120 | CE | [ntu_sk_120_agcn_joint_const_lr_1e-3-15-12240.pt](https://human-movement.is.tue.mpg.de/release/ckpts/ntu_sk_120_agcn_joint_const_lr_1e-3-15-12240.pt) | 0.72 | 0.4 | 0.16 | 89 | | BABEL-120 | Focal | [wfl_ntu_sk_120_agcn_joint_const_lr_1e-3-157-60356.pt](https://human-movement.is.tue.mpg.de/release/ckpts/wfl_ntu_sk_120_agcn_joint_const_lr_1e-3-157-60356.pt) | 0.59 | 0.29 | 0.23 | 90 |  91 | **Note:** The models are *only* trained with dense labels from `train.json` (see the [project webpage](https://babel.is.tue.mpg.de/data.html) for more details about the data). 92 |  93 |  94 | ### Metrics 95 |  96 | **Description** 97 |  98 | 1. **Top-1** measures the accuracy of the highest-scoring prediction. 99 | 2. **Top-5** evaluates whether the ground-truth category is present among the top 5 highest-scoring predictions. 100 | 1. It accounts for labeling noise and inherent label ambiguity. 101 | 2. 
It also accounts for the possible association of multiple action categories with a single input movement sequence. For instance, a person `walking in a circle` is mapped to the two action categories `walk` and `circular movement`. 102 | Ideal models will predict high scores for all the categories relevant to the movement sample. 103 | 3. **Top-1-norm** is the mean `Top-1` across categories. The magnitude of the difference between `Top-1` and `Top-1-norm` illustrates the class-specific bias in the model's performance. In BABEL, it reflects the impact of class imbalance on learning. (A minimal sketch of how these metrics can be computed from a score matrix is included after the references below.) 104 |  105 |  106 | ### Challenge 107 |  108 | To make a submission: 109 |  110 | 1. Store the predictions (variable `pred_scores` in [L591](https://github.com/abhinanda-punnakkal/BABEL/blob/6454163e196fc6400e1b8232dffb651341ed7c14/action_recognition/train_test.py#L591) of `train_test.py`) as a Python pickle. 111 | - `pred_scores` is a list of tuples, each containing the following 4 elements: (sequence ID, segment ID, chunk ID, score). Here, `score` is an `np.array` of size `(N, C)`, where `N` is the # samples in the test set and `C` is the # classes. 112 | - By default, `train_test.py` stores this pickle file as `<work_dir>/epoch1_test_score.pkl` (see [L604](https://github.com/abhinanda-punnakkal/BABEL/blob/6454163e196fc6400e1b8232dffb651341ed7c14/action_recognition/train_test.py#L606)). 113 | 2. In the command line, type the following commands: 114 | 1. `cd action_recognition/challenge/` 115 | 2. `python create_submission.py --pred_path <work_dir>/epoch1_test_score.pkl --sub_path <path to output submission file>` 116 | - Note: This code assumes that the GT test samples (`test_label_{60, 120}.pkl`) are present in the following path: `action_recognition/data/release/` 117 | 3. Submit the `.npz` submission file to the BABEL Action Recognition Challenge [evaluation server](https://babel-evaluation.is.tuebingen.mpg.de/). 118 |  119 |  120 | ### References 121 |  122 | [1] Shahroudy, Amir, et al. "NTU RGB+D: A large scale dataset for 3D human activity analysis." Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2016.
123 | [2] Shi, Lei, et al. "Two-stream adaptive graph convolutional networks for skeleton-based action recognition." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.
124 | [3] Lin, Tsung-Yi, et al. "Focal loss for dense object detection." Proceedings of the IEEE international conference on computer vision. 2017.
125 | [4] Cui, Yin, et al. "Class-balanced loss based on effective number of samples." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.
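**Metric computation (sketch):** The snippet below is a minimal sketch of how the metrics above can be computed; it is not part of the released code. It assumes `scores` is an `np.array` of predicted scores with shape `(N, C)` and `labels` is a length-`N` array of ground-truth class indices (both names are illustrative only).

```
import numpy as np

def compute_metrics(scores, labels, k=5):
    '''Return (Top-1, Top-k, Top-1-norm) given (N, C) scores and N ground-truth labels.'''
    preds = scores.argmax(axis=1)                    # highest-scoring class per sample
    top1 = float((preds == labels).mean())
    # Top-k: is the ground-truth class among the k highest-scoring classes?
    topk_classes = np.argsort(scores, axis=1)[:, -k:]
    topk = float(np.mean([labels[i] in topk_classes[i] for i in range(len(labels))]))
    # Top-1-norm: mean of the per-class Top-1 accuracies
    per_class = [float((preds[labels == c] == c).mean()) for c in np.unique(labels)]
    top1_norm = float(np.mean(per_class))
    return top1, topk, top1_norm
```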
126 | -------------------------------------------------------------------------------- /action_recognition/data_gen/dutils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2021 achandrasekaran 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | import sys, os, pdb 10 | import os.path as osp 11 | from os.path import join as ospj 12 | from os.path import basename as ospb 13 | from os.path import dirname as ospd 14 | 15 | import numpy as np 16 | import torch 17 | 18 | import json, pickle, csv 19 | from collections import Counter 20 | from tqdm import tqdm 21 | 22 | from smplx import SMPLH 23 | 24 | import viz 25 | 26 | 27 | def read_json(json_filename): 28 | '''Return contents of JSON file''' 29 | jc = None 30 | with open(json_filename) as infile: 31 | jc = json.load(infile) 32 | return jc 33 | 34 | def read_pkl(pkl_filename): 35 | '''Return contents of pikcle file''' 36 | pklc = None 37 | with open(pkl_filename, 'rb') as infile: 38 | pklc = pickle.load(infile) 39 | return pklc 40 | 41 | def write_json(contents, filename): 42 | with open(filename, 'w') as outfile: 43 | json.dump(contents, outfile, indent=2) 44 | 45 | def write_pkl(contents, filename): 46 | with open(filename, 'wb') as outfile: 47 | pickle.dump(contents, outfile) 48 | 49 | def smpl_to_nturgbd(model_type='smplh', out_format='nturgbd'): 50 | ''' Borrowed from https://gitlab.tuebingen.mpg.de/apunnakkal/2s_agcn/-/blob/master/data_gen/smpl_data_utils.py 51 | NTU mapping 52 | ----------- 53 | 0 --> ? 54 | 1-base of the spine 55 | 2-middle of the spine 56 | 3-neck 57 | 4-head 58 | 5-left shoulder 59 | 6-left elbow 60 | 7-left wrist 61 | 8-left hand 62 | 9-right shoulder 63 | 10-right elbow 64 | 11-right wrist 65 | 12-right hand 66 | 13-left hip 67 | 14-left knee 68 | 15-left ankle 69 | 16-left foot 70 | 17-right hip 71 | 18-right knee 72 | 19-right ankle 73 | 20-right foot 74 | 21-spine 75 | 22-tip of the left hand 76 | 23-left thumb 77 | 24-tip of the right hand 78 | 25-right thumb 79 | 80 | :param model_type: 81 | :param out_format: 82 | :return: 83 | ''' 84 | if model_type == 'smplh' and out_format == 'nturgbd': 85 | '22 and 37 are approximation for hand (base of index finger)' 86 | return np.array([0, 3, 12, 15, 87 | 16, 18, 20, 22, #left hand 88 | 17, 19, 21, 37, # right hand 89 | 1, 4, 7, 10, #left leg 90 | 2, 5, 8, 11, #right hand 91 | 9, 92 | 63, 64 , 68, 69 93 | ], 94 | dtype=np.int32) 95 | 96 | class dotdict(dict): 97 | """dot.notation access to dictionary attributes""" 98 | __getattr__ = dict.get 99 | __setattr__ = dict.__setitem__ 100 | __delattr__ = dict.__delitem__ 101 | 102 | def store_counts(label_fp): 103 | """Compute # samples per class, from stored labels 104 | 105 | Args: 106 | label_fp : Path to label file 107 | 108 | Writes (to same path as label file): 109 | out_fp : # samples per class = {: , ...} 110 | """ 111 | Y_tup = read_pkl(label_fp) 112 | Y_idxs = Y_tup[1][0] 113 | print('# Samples in set = ', len(Y_idxs)) 114 | 115 | label_count = Counter(Y_idxs) 116 | print('File ', label_fp, 'len',len(label_count)) 117 | 118 | out_fp = label_fp.replace('.pkl', '_count.pkl') 119 | write_pkl(label_count, out_fp) 120 | 121 | def load_babel_dataset(d_folder='../../data/babel_v1.0_release'): 122 | '''Load the BABEL dataset''' 123 | # Data folder 124 | l_babel_dense_files = ['train', 'val', 'test'] 125 | l_babel_extra_files = ['extra_train', 'extra_val'] 126 | 127 | # BABEL 
Dataset 128 | babel = {} 129 | for fn in l_babel_dense_files + l_babel_extra_files: 130 | babel[fn] = json.load(open(ospj(d_folder, fn+'.json'))) 131 | 132 | return babel 133 | 134 | def store_seq_fps(amass_p): 135 | '''Get fps for each seq. in BABEL 136 | Arguments: 137 | --------- 138 | amass_p : Path where you download AMASS to. 139 | Save: 140 | ----- 141 | featp_2_fps.json : Key: feat path , value: orig. fps 142 | in AMASS . E.g.,: {'KIT/KIT/4/RightTurn01_poses.npz': 100.0, ...} 143 | ''' 144 | # Get BABEL dataset 145 | babel = load_babel_dataset() 146 | 147 | # Loop over each BABEL seq, store frame-rate 148 | ft_p_2_fps = {} 149 | for fn in babel: 150 | for sid in tqdm(babel[fn]): 151 | ann = babel[fn][sid] 152 | if ann['feat_p'] not in ft_p_2_fps: 153 | fps = np.load(ospj(amass_p, ann['feat_p']))['mocap_framerate'] 154 | ft_p_2_fps[ann['feat_p']] = float(fps) 155 | dest_fp = '../data/featp_2_fps.json' 156 | write_json(ft_p_2_fps, dest_fp) 157 | return None 158 | 159 | def store_ntu_jpos(smplh_model_p, dest_jpos_p, amass_p): 160 | '''Store joint positions of kfor NTU-RGBD skeleton 161 | ''' 162 | # Model to forward-pass through, to store joint positions 163 | smplh = SMPLH(smplh_model_p, create_transl=False, ext='pkl', 164 | gender='male', use_pca=False, batch_size=1) 165 | 166 | # Load paths to all BABEL features 167 | featp_2_fps = read_json('../data/featp_2_fps.json') 168 | 169 | # Loop over all BABEL data, verify that joint positions are stored on disk 170 | l_m_ft_p = [] 171 | for ft_p in featp_2_fps: 172 | 173 | # Get the correct dataset folder name 174 | ddir_n = ospb(ospd(ospd(ft_p))) 175 | ddir_map = {'BioMotionLab_NTroje': 'BMLrub', 'DFaust_67': 'DFaust'} 176 | ddir_n = ddir_map[ddir_n] if ddir_n in ddir_map else ddir_n 177 | # Get the subject folder name 178 | sub_fol_n = ospb(ospd(ft_p)) 179 | 180 | # Sanity check 181 | fft_p = ospj(dest_jpos_p, ddir_n, sub_fol_n, ospb(ft_p)) 182 | if not os.path.exists(fft_p): 183 | l_m_ft_p.append((ft_p, fft_p)) 184 | print('Total # missing NTU RGBD skeleton features = ', len(l_m_ft_p)) 185 | 186 | # Loop over missing joint positions and store them on disk 187 | for i, (ft_p, ntu_jpos_p) in enumerate(tqdm(l_m_ft_p)): 188 | jrot_smplh = np.load(ospj(amass_p, ft_p))['poses'] 189 | # Break joints down into body parts 190 | smpl_body_jrot = jrot_smplh[:, 3:66] 191 | left_hand_jrot = jrot_smplh[:, 66:111] 192 | right_hand_jrot = jrot_smplh[:, 111:] 193 | root_orient = jrot_smplh[:, 0:3].reshape(-1, 3) 194 | 195 | # Forward through model to get a superset of required joints 196 | T = jrot_smplh.shape[0] 197 | ntu_jpos = np.zeros((T, 219)) 198 | for t in range(T): 199 | res = smplh(body_pose=torch.Tensor(smpl_body_jrot[t:t+1, :]), 200 | global_orient=torch.Tensor(root_orient[t: t+1, :]), 201 | left_hand_pose = torch.Tensor(left_hand_jrot[t: t+1, :]), 202 | right_hand_pose=torch.Tensor(right_hand_jrot[t: t+1, :]), 203 | # transl=torch.Tensor(transl) 204 | ) 205 | jpos = res.joints.detach().cpu().numpy()[:, :, :].reshape(-1) 206 | ntu_jpos[t, :] = jpos 207 | 208 | # Save to disk 209 | if not os.path.exists(ospd(ntu_jpos_p)): 210 | os.makedirs(ospd(ntu_jpos_p)) 211 | np.savez(ntu_jpos_p, joint_pos=ntu_jpos, allow_pickle=True) 212 | 213 | return 214 | 215 | def viz_ntu_jpos(jpos_p, l_ft_p): 216 | '''Visualize sequences of NTU-skeleton joint positions''' 217 | # Load paths to all BABEL features 218 | featp_2_fps = read_json('../data/featp_2_fps.json') 219 | # Indices that are in the NTU RGBD skeleton 220 | smpl2nturgbd = smpl_to_nturgbd() 221 | # 
Iterate over each 222 | for ft_p in l_ft_p: 223 | x = np.load(ospj(jpos_p, ft_p))['joint_pos'] 224 | T, ft_sz = x.shape 225 | x = x.reshape(T, ft_sz//3, 3) 226 | # print('Data shape = {0}'.format(x.shape)) 227 | x = x[:, smpl2nturgbd, :] 228 | # print('Data shape = {0}'.format(x.shape)) 229 | # x = x[:,:,:, 0].transpose(1, 2, 0) # (3, 150, 22, 1) --> (150, 22, 3) 230 | print('Data shape = {0}'.format(x.shape)) 231 | viz.viz_seq(seq=x, folder_p='test_viz/test_ntu_w_axis', sk_type='nturgbd', debug=True) 232 | print('-'*50) 233 | 234 | 235 | def main(): 236 | '''Store preliminary stuff''' 237 | amass_p= '/ps/project/conditional_action_gen/data/AMASS_March2021/' 238 | 239 | # Save feature paths --> fps (released in babel/action_recognition/data/) 240 | # store_seq_fps(amass_p) 241 | 242 | # Save joint positions in NTU-RGBD skeleton format 243 | smplh_model_p = '/ps/project/conditional_action_gen/body_models/mano_v1_2/models_cleaned_merged/SMPLH_male.pkl' 244 | jpos_p = '/ps/project/conditional_action_gen/amass/babel_joint_pos' 245 | # store_ntu_jpos(smplh_model_p, jpos_p, amass_p) 246 | 247 | # Viz. saved seqs. 248 | # l_ft_p = ['KIT/917/Experiment3a_09_poses.npz'] 249 | # viz_ntu_jpos(jpos_p, l_ft_p) 250 | 251 | if __name__ == '__main__': 252 | main() 253 | 254 | -------------------------------------------------------------------------------- /action_recognition/feeders/feeder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -*- coding: utf-8 -*- 4 | # 5 | # Adapted from https://github.com/lshiwjx/2s-AGCN for BABEL (https://babel.is.tue.mpg.de/) 6 | 7 | import numpy as np 8 | import pickle 9 | import torch 10 | from torch.utils.data import Dataset 11 | import sys 12 | import pdb 13 | 14 | sys.path.extend(['../']) 15 | from feeders import tools 16 | 17 | 18 | class Feeder(Dataset): 19 | def __init__(self, data_path, label_path, 20 | random_choose=False, random_shift=False, random_move=False, 21 | window_size=-1, normalization=False, debug=False, use_mmap=True): 22 | """ 23 | 24 | :param data_path: 25 | :param label_path: 26 | :param random_choose: If true, randomly choose a portion of the input sequence 27 | :param random_shift: If true, randomly pad zeros at the begining or end of sequence 28 | :param random_move: 29 | :param window_size: The length of the output sequence 30 | :param normalization: If true, normalize input sequence 31 | :param debug: If true, only use the first 100 samples 32 | :param use_mmap: If true, use mmap mode to load data, which can save the running memory 33 | """ 34 | 35 | self.debug = debug 36 | self.data_path = data_path 37 | self.label_path = label_path 38 | self.random_choose = random_choose 39 | self.random_shift = random_shift 40 | self.random_move = random_move 41 | self.window_size = window_size 42 | self.normalization = normalization 43 | self.use_mmap = use_mmap 44 | self.load_data() 45 | if normalization: 46 | self.get_mean_map() 47 | 48 | def load_data(self): 49 | # data: N C V T M 50 | try: 51 | with open(self.label_path) as f: 52 | self.sample_name, self.label = pickle.load(f) 53 | except: 54 | # for pickle file from python2 55 | with open(self.label_path, 'rb') as f: 56 | self.sample_name, self.label = pickle.load(f, encoding='latin1') 57 | 58 | # load data 59 | if self.use_mmap: 60 | self.data = np.load(self.data_path, mmap_mode='r') 61 | else: 62 | self.data = np.load(self.data_path) 63 | if self.debug: 64 | self.label = self.label[0:1000] 65 | self.data = self.data[0:1000] 66 | 
self.sample_name = self.sample_name[0:1000] 67 | 68 | 69 | def get_mean_map(self): 70 | data = self.data 71 | N, C, T, V, M = data.shape 72 | self.mean_map = data.mean(axis=2, keepdims=True).mean(axis=4, keepdims=True).mean(axis=0) 73 | self.std_map = data.transpose((0, 2, 4, 1, 3)).reshape((N * T * M, C * V)).std(axis=0).reshape((C, 1, V, 1)) 74 | 75 | def __len__(self): 76 | return len(self.sample_name) 77 | 78 | def __iter__(self): 79 | return self 80 | 81 | def __getitem__(self, index): 82 | data_numpy = self.data[index] 83 | data_numpy = np.array(data_numpy) 84 | 85 | seg_id = self.sample_name[index] 86 | label = self.label[0][index] 87 | sid = self.label[1][index] 88 | chunk_n = self.label[2][index] 89 | anntr_id = self.label[3][index] 90 | 91 | if self.normalization: 92 | data_numpy = (data_numpy - self.mean_map) / self.std_map 93 | if self.random_shift: 94 | data_numpy = tools.random_shift(data_numpy) 95 | if self.random_choose: 96 | data_numpy = tools.random_choose(data_numpy, self.window_size) 97 | elif self.window_size > 0: 98 | data_numpy = tools.auto_pading(data_numpy, self.window_size) 99 | if self.random_move: 100 | data_numpy = tools.random_move(data_numpy) 101 | 102 | return data_numpy, label, sid, seg_id, chunk_n, anntr_id, index 103 | 104 | def top_k(self, score, top_k): 105 | rank = score.argsort() 106 | hit_top_k = [l in rank[i, -top_k:] for i, l in enumerate(self.label[0])] 107 | return sum(hit_top_k) * 1.0 / len(hit_top_k) 108 | 109 | 110 | def import_class(name): 111 | components = name.split('.') 112 | mod = __import__(components[0]) 113 | for comp in components[1:]: 114 | mod = getattr(mod, comp) 115 | return mod 116 | 117 | 118 | def test(data_path, label_path, vid=None, graph=None, is_3d=False): 119 | ''' 120 | vis the samples using matplotlib 121 | :param data_path: 122 | :param label_path: 123 | :param vid: the id of sample 124 | :param graph: 125 | :param is_3d: when vis NTU, set it True 126 | :return: 127 | ''' 128 | import matplotlib.pyplot as plt 129 | loader = torch.utils.data.DataLoader( 130 | dataset=Feeder(data_path, label_path), 131 | batch_size=64, 132 | shuffle=False, 133 | num_workers=2) 134 | 135 | if vid is not None: 136 | sample_name = loader.dataset.sample_name 137 | sample_id = [name.split('.')[0] for name in sample_name] 138 | index = sample_id.index(vid) 139 | data, label, index = loader.dataset[index] 140 | data = data.reshape((1,) + data.shape) 141 | 142 | # for batch_idx, (data, label) in enumerate(loader): 143 | N, C, T, V, M = data.shape 144 | 145 | plt.ion() 146 | fig = plt.figure() 147 | if is_3d: 148 | from mpl_toolkits.mplot3d import Axes3D 149 | ax = fig.add_subplot(111, projection='3d') 150 | else: 151 | ax = fig.add_subplot(111) 152 | 153 | if graph is None: 154 | p_type = ['b.', 'g.', 'r.', 'c.', 'm.', 'y.', 'k.', 'k.', 'k.', 'k.'] 155 | pose = [ 156 | ax.plot(np.zeros(V), np.zeros(V), p_type[m])[0] for m in range(M) 157 | ] 158 | ax.axis([-1, 1, -1, 1]) 159 | for t in range(T): 160 | for m in range(M): 161 | pose[m].set_xdata(data[0, 0, t, :, m]) 162 | pose[m].set_ydata(data[0, 1, t, :, m]) 163 | fig.canvas.draw() 164 | plt.pause(0.001) 165 | else: 166 | p_type = ['b-', 'g-', 'r-', 'c-', 'm-', 'y-', 'k-', 'k-', 'k-', 'k-'] 167 | import sys 168 | from os import path 169 | sys.path.append( 170 | path.dirname(path.dirname(path.dirname(path.abspath(__file__))))) 171 | G = import_class(graph)() 172 | edge = G.inward 173 | pose = [] 174 | for m in range(M): 175 | a = [] 176 | for i in range(len(edge)): 177 | if is_3d: 178 | 
a.append(ax.plot(np.zeros(3), np.zeros(3), p_type[m])[0]) 179 | else: 180 | a.append(ax.plot(np.zeros(2), np.zeros(2), p_type[m])[0]) 181 | pose.append(a) 182 | ax.axis([-1, 1, -1, 1]) 183 | if is_3d: 184 | ax.set_zlim3d(-1, 1) 185 | for t in range(T): 186 | for m in range(M): 187 | for i, (v1, v2) in enumerate(edge): 188 | x1 = data[0, :2, t, v1, m] 189 | x2 = data[0, :2, t, v2, m] 190 | if (x1.sum() != 0 and x2.sum() != 0) or v1 == 1 or v2 == 1: 191 | pose[m][i].set_xdata(data[0, 0, t, [v1, v2], m]) 192 | pose[m][i].set_ydata(data[0, 1, t, [v1, v2], m]) 193 | if is_3d: 194 | pose[m][i].set_3d_properties(data[0, 2, t, [v1, v2], m]) 195 | fig.canvas.draw() 196 | # plt.savefig('/home/lshi/Desktop/skeleton_sequence/' + str(t) + '.jpg') 197 | plt.pause(0.01) 198 | 199 | 200 | if __name__ == '__main__': 201 | import os 202 | 203 | os.environ['DISPLAY'] = 'localhost:10.0' 204 | data_path = "../data/ntu/xview/val_data_joint.npy" 205 | label_path = "../data/ntu/xview/val_label.pkl" 206 | graph = 'graph.ntu_rgb_d.Graph' 207 | test(data_path, label_path, vid='S004C001P003R001A032', graph=graph, is_3d=True) 208 | # data_path = "../data/kinetics/val_data.npy" 209 | # label_path = "../data/kinetics/val_label.pkl" 210 | # graph = 'graph.Kinetics' 211 | # test(data_path, label_path, vid='UOD7oll3Kqo', graph=graph) 212 | -------------------------------------------------------------------------------- /notebooks/BABEL_explore.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Explore BABEL\n", 8 | "\n", 9 | "We present some code to explore BABEL by computing stats., and searching for specific actions." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Preparing the environment\n", 19 | "%load_ext autoreload\n", 20 | "%autoreload 2\n", 21 | "%matplotlib notebook\n", 22 | "%matplotlib inline\n", 23 | "\n", 24 | "import sys, os, pdb\n", 25 | "from os.path import join as ospj\n", 26 | "import json\n", 27 | "from collections import *\n", 28 | "\n", 29 | "import numpy as np\n", 30 | "import pandas as pd\n", 31 | "from pandas.core.common import flatten\n", 32 | "\n", 33 | "import pprint\n", 34 | "pp = pprint.PrettyPrinter()" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Load BABEL \n", 42 | "Note that we are not loading the test set " 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "d_folder = '../data/babel_v1.0_release' # Data folder\n", 52 | "l_babel_dense_files = ['train', 'val'] \n", 53 | "l_babel_extra_files = ['extra_train', 'extra_val']\n", 54 | "\n", 55 | "# BABEL Dataset \n", 56 | "babel = {}\n", 57 | "for file in l_babel_dense_files:\n", 58 | " babel[file] = json.load(open(ospj(d_folder, file+'.json')))\n", 59 | " \n", 60 | "for file in l_babel_extra_files:\n", 61 | " babel[file] = json.load(open(ospj(d_folder, file+'.json'))) " 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Duration of mocap for which BABEL action labels are available" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "Total duration = 30.0 hours 2.0 min. 
32 sec.\n", 81 | "Total # seqs. = 8808\n", 82 | "------------------------------\n", 83 | "Total duration = 34.0 hours 43.0 min. 39 sec.\n", 84 | "Total # seqs. = 10576\n", 85 | "------------------------------\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "for babel_set in [l_babel_dense_files, l_babel_dense_files+l_babel_extra_files]:\n", 91 | " dur = 0.0\n", 92 | " list_sids = [] \n", 93 | " for spl in babel_set:\n", 94 | " for sid in babel[spl]:\n", 95 | " if sid not in list_sids:\n", 96 | " list_sids.append(sid)\n", 97 | " dur += babel[spl][sid]['dur'] \n", 98 | " \n", 99 | " # Duration of each set\n", 100 | " minutes = dur//60\n", 101 | " print('Total duration = {0} hours {1} min. {2:.0f} sec.'.format(\n", 102 | " minutes//60, minutes%60, dur%60))\n", 103 | " print('Total # seqs. = ', len(list_sids))\n", 104 | " print('-'*30)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### Search BABEL for action" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 4, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "def get_cats(ann, file):\n", 121 | " # Get sequence labels and frame labels if they exist\n", 122 | " seq_l, frame_l = [], []\n", 123 | " if 'extra' not in file:\n", 124 | " if ann['seq_ann'] is not None:\n", 125 | " seq_l = flatten([seg['act_cat'] for seg in ann['seq_ann']['labels']])\n", 126 | " if ann['frame_ann'] is not None:\n", 127 | " frame_l = flatten([seg['act_cat'] for seg in ann['frame_ann']['labels']])\n", 128 | " else:\n", 129 | " # Load all labels from (possibly) multiple annotators\n", 130 | " if ann['seq_anns'] is not None:\n", 131 | " seq_l = flatten([seg['act_cat'] for seq_ann in ann['seq_anns'] for seg in seq_ann['labels']])\n", 132 | " if ann['frame_anns'] is not None: \n", 133 | " frame_l = flatten([seg['act_cat'] for frame_ann in ann['frame_anns'] for seg in frame_ann['labels']])\n", 134 | " \n", 135 | " return list(seq_l), list(frame_l)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 5, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "# Seqs. containing action jump = 746\n", 148 | "# Segments containing action jump = 1597\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "action = 'jump'\n", 154 | "act_anns = defaultdict(list) # { seq_id_1: [ann_1_1, ann_1_2], seq_id_2: [ann_2_1], ...} \n", 155 | "n_act_spans = 0\n", 156 | "\n", 157 | "for spl in babel:\n", 158 | " for sid in babel[spl]:\n", 159 | " \n", 160 | " seq_l, frame_l = get_cats(babel[spl][sid], spl)\n", 161 | " # print(seq_l + frame_l)\n", 162 | " \n", 163 | " if action in seq_l + frame_l:\n", 164 | " \n", 165 | " # Store all relevant mocap sequence annotations\n", 166 | " act_anns[sid].append(babel[spl][sid])\n", 167 | " \n", 168 | " # # Individual spans of the action in the sequence\n", 169 | " n_act_spans += Counter(seq_l+frame_l)[action]\n", 170 | " \n", 171 | "print('# Seqs. 
containing action {0} = {1}'.format(action, len(act_anns)))\n", 172 | "print('# Segments containing action {0} = {1}'.format(action, n_act_spans))" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 6, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "[{'babel_sid': 7692,\n", 185 | " 'dur': 3.83,\n", 186 | " 'feat_p': 'CMU/CMU/141/141_05_poses.npz',\n", 187 | " 'frame_ann': {'anntr_id': 'eab5b72f-7399-43a7-a752-e4ee2807faaf',\n", 188 | " 'babel_lid': '59ad905d-f378-4d2b-90a7-4e3222bbc1f7',\n", 189 | " 'labels': [{'act_cat': ['hop'],\n", 190 | " 'end_t': 2,\n", 191 | " 'proc_label': 'hop left',\n", 192 | " 'raw_label': 'hopping left',\n", 193 | " 'seg_id': 'daf942ad-7cbe-4387-b6a0-0fc391c702ea',\n", 194 | " 'start_t': 1},\n", 195 | " {'act_cat': ['hop'],\n", 196 | " 'end_t': 3,\n", 197 | " 'proc_label': 'hop right',\n", 198 | " 'raw_label': 'hopping right',\n", 199 | " 'seg_id': '7b17f75e-3da9-4e56-aca1-9bbb6b8d5dd9',\n", 200 | " 'start_t': 2},\n", 201 | " {'act_cat': ['stand'],\n", 202 | " 'end_t': 1,\n", 203 | " 'proc_label': 'stand',\n", 204 | " 'raw_label': 'standing',\n", 205 | " 'seg_id': '70687891-613e-42f7-87f4-5760f18a3548',\n", 206 | " 'start_t': 0},\n", 207 | " {'act_cat': ['stand'],\n", 208 | " 'end_t': 3.834,\n", 209 | " 'proc_label': 'stand',\n", 210 | " 'raw_label': 'standing',\n", 211 | " 'seg_id': 'f0cdfd79-5dad-43f3-b2d1-8a0ce8668010',\n", 212 | " 'start_t': 3}],\n", 213 | " 'mul_act': True},\n", 214 | " 'seq_ann': {'anntr_id': '30bf91ac-e0c1-4298-814f-7811fe634bac',\n", 215 | " 'babel_lid': 'da9d959f-f5b6-434f-a927-35effc7b5afe',\n", 216 | " 'labels': [{'act_cat': ['jump'],\n", 217 | " 'proc_label': 'jump',\n", 218 | " 'raw_label': 'jump',\n", 219 | " 'seg_id': '082c172b-3883-4231-9c81-fcee4cf1a999'}],\n", 220 | " 'mul_act': True},\n", 221 | " 'url': 'https://babel-renders.s3.eu-central-1.amazonaws.com/007692.mp4'}]\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "# View a random annotation \n", 227 | "key = np.random.choice(list(act_anns.keys()))\n", 228 | "pp.pprint(act_anns[key])" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "kernelspec": { 241 | "display_name": "Python 3", 242 | "language": "python", 243 | "name": "python3" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 3 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython3", 255 | "version": "3.8.3" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 5 260 | } 261 | -------------------------------------------------------------------------------- /action_recognition/model/aagcn.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | from torch.autograd import Variable 7 | 8 | 9 | def import_class(name): 10 | components = name.split('.') 11 | mod = __import__(components[0]) 12 | for comp in components[1:]: 13 | mod = getattr(mod, comp) 14 | return mod 15 | 16 | 17 | def conv_branch_init(conv, branches): 18 | weight = conv.weight 19 | n = weight.size(0) 20 | k1 = weight.size(1) 21 | k2 = weight.size(2) 22 | nn.init.normal_(weight, 0, math.sqrt(2. 
/ (n * k1 * k2 * branches))) 23 | nn.init.constant_(conv.bias, 0) 24 | 25 | 26 | def conv_init(conv): 27 | nn.init.kaiming_normal_(conv.weight, mode='fan_out') 28 | nn.init.constant_(conv.bias, 0) 29 | 30 | 31 | def bn_init(bn, scale): 32 | nn.init.constant_(bn.weight, scale) 33 | nn.init.constant_(bn.bias, 0) 34 | 35 | 36 | class unit_tcn(nn.Module): 37 | def __init__(self, in_channels, out_channels, kernel_size=9, stride=1): 38 | super(unit_tcn, self).__init__() 39 | pad = int((kernel_size - 1) / 2) 40 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel_size, 1), padding=(pad, 0), 41 | stride=(stride, 1)) 42 | 43 | self.bn = nn.BatchNorm2d(out_channels) 44 | self.relu = nn.ReLU(inplace=True) 45 | conv_init(self.conv) 46 | bn_init(self.bn, 1) 47 | 48 | def forward(self, x): 49 | x = self.bn(self.conv(x)) 50 | return x 51 | 52 | 53 | class unit_gcn(nn.Module): 54 | def __init__(self, in_channels, out_channels, A, coff_embedding=4, num_subset=3, adaptive=True, attention=True): 55 | super(unit_gcn, self).__init__() 56 | inter_channels = out_channels // coff_embedding 57 | self.inter_c = inter_channels 58 | self.out_c = out_channels 59 | self.in_c = in_channels 60 | self.num_subset = num_subset 61 | num_jpts = A.shape[-1] 62 | 63 | self.conv_d = nn.ModuleList() 64 | for i in range(self.num_subset): 65 | self.conv_d.append(nn.Conv2d(in_channels, out_channels, 1)) 66 | 67 | if adaptive: 68 | self.PA = nn.Parameter(torch.from_numpy(A.astype(np.float32))) 69 | self.alpha = nn.Parameter(torch.zeros(1)) 70 | # self.beta = nn.Parameter(torch.ones(1)) 71 | # nn.init.constant_(self.PA, 1e-6) 72 | # self.A = Variable(torch.from_numpy(A.astype(np.float32)), requires_grad=False) 73 | # self.A = self.PA 74 | self.conv_a = nn.ModuleList() 75 | self.conv_b = nn.ModuleList() 76 | for i in range(self.num_subset): 77 | self.conv_a.append(nn.Conv2d(in_channels, inter_channels, 1)) 78 | self.conv_b.append(nn.Conv2d(in_channels, inter_channels, 1)) 79 | else: 80 | self.A = Variable(torch.from_numpy(A.astype(np.float32)), requires_grad=False) 81 | self.adaptive = adaptive 82 | 83 | if attention: 84 | # self.beta = nn.Parameter(torch.zeros(1)) 85 | # self.gamma = nn.Parameter(torch.zeros(1)) 86 | # unified attention 87 | # self.Attention = nn.Parameter(torch.ones(num_jpts)) 88 | 89 | # temporal attention 90 | self.conv_ta = nn.Conv1d(out_channels, 1, 9, padding=4) 91 | nn.init.constant_(self.conv_ta.weight, 0) 92 | nn.init.constant_(self.conv_ta.bias, 0) 93 | 94 | # s attention 95 | ker_jpt = num_jpts - 1 if not num_jpts % 2 else num_jpts 96 | pad = (ker_jpt - 1) // 2 97 | self.conv_sa = nn.Conv1d(out_channels, 1, ker_jpt, padding=pad) 98 | nn.init.xavier_normal_(self.conv_sa.weight) 99 | nn.init.constant_(self.conv_sa.bias, 0) 100 | 101 | # channel attention 102 | rr = 2 103 | self.fc1c = nn.Linear(out_channels, out_channels // rr) 104 | self.fc2c = nn.Linear(out_channels // rr, out_channels) 105 | nn.init.kaiming_normal_(self.fc1c.weight) 106 | nn.init.constant_(self.fc1c.bias, 0) 107 | nn.init.constant_(self.fc2c.weight, 0) 108 | nn.init.constant_(self.fc2c.bias, 0) 109 | 110 | # self.bn = nn.BatchNorm2d(out_channels) 111 | # bn_init(self.bn, 1) 112 | self.attention = attention 113 | 114 | if in_channels != out_channels: 115 | self.down = nn.Sequential( 116 | nn.Conv2d(in_channels, out_channels, 1), 117 | nn.BatchNorm2d(out_channels) 118 | ) 119 | else: 120 | self.down = lambda x: x 121 | 122 | self.bn = nn.BatchNorm2d(out_channels) 123 | self.soft = nn.Softmax(-2) 124 | self.tan = nn.Tanh() 
125 | self.sigmoid = nn.Sigmoid() 126 | self.relu = nn.ReLU(inplace=True) 127 | 128 | for m in self.modules(): 129 | if isinstance(m, nn.Conv2d): 130 | conv_init(m) 131 | elif isinstance(m, nn.BatchNorm2d): 132 | bn_init(m, 1) 133 | bn_init(self.bn, 1e-6) 134 | for i in range(self.num_subset): 135 | conv_branch_init(self.conv_d[i], self.num_subset) 136 | 137 | def forward(self, x): 138 | N, C, T, V = x.size() 139 | 140 | y = None 141 | if self.adaptive: 142 | A = self.PA 143 | # A = A + self.PA 144 | for i in range(self.num_subset): 145 | A1 = self.conv_a[i](x).permute(0, 3, 1, 2).contiguous().view(N, V, self.inter_c * T) 146 | A2 = self.conv_b[i](x).view(N, self.inter_c * T, V) 147 | A1 = self.tan(torch.matmul(A1, A2) / A1.size(-1)) # N V V 148 | A1 = A[i] + A1 * self.alpha 149 | A2 = x.view(N, C * T, V) 150 | z = self.conv_d[i](torch.matmul(A2, A1).view(N, C, T, V)) 151 | y = z + y if y is not None else z 152 | else: 153 | A = self.A.cuda(x.get_device()) * self.mask 154 | for i in range(self.num_subset): 155 | A1 = A[i] 156 | A2 = x.view(N, C * T, V) 157 | z = self.conv_d[i](torch.matmul(A2, A1).view(N, C, T, V)) 158 | y = z + y if y is not None else z 159 | 160 | y = self.bn(y) 161 | y += self.down(x) 162 | y = self.relu(y) 163 | 164 | if self.attention: 165 | # spatial attention 166 | se = y.mean(-2) # N C V 167 | se1 = self.sigmoid(self.conv_sa(se)) 168 | y = y * se1.unsqueeze(-2) + y 169 | # a1 = se1.unsqueeze(-2) 170 | 171 | # temporal attention 172 | se = y.mean(-1) 173 | se1 = self.sigmoid(self.conv_ta(se)) 174 | y = y * se1.unsqueeze(-1) + y 175 | # a2 = se1.unsqueeze(-1) 176 | 177 | # channel attention 178 | se = y.mean(-1).mean(-1) 179 | se1 = self.relu(self.fc1c(se)) 180 | se2 = self.sigmoid(self.fc2c(se1)) 181 | y = y * se2.unsqueeze(-1).unsqueeze(-1) + y 182 | # a3 = se2.unsqueeze(-1).unsqueeze(-1) 183 | 184 | # unified attention 185 | # y = y * self.Attention + y 186 | # y = y + y * ((a2 + a3) / 2) 187 | # y = self.bn(y) 188 | return y 189 | 190 | 191 | class TCN_GCN_unit(nn.Module): 192 | def __init__(self, in_channels, out_channels, A, stride=1, residual=True, adaptive=True, attention=True): 193 | super(TCN_GCN_unit, self).__init__() 194 | self.gcn1 = unit_gcn(in_channels, out_channels, A, adaptive=adaptive, attention=attention) 195 | self.tcn1 = unit_tcn(out_channels, out_channels, stride=stride) 196 | self.relu = nn.ReLU(inplace=True) 197 | # if attention: 198 | # self.alpha = nn.Parameter(torch.zeros(1)) 199 | # self.beta = nn.Parameter(torch.ones(1)) 200 | # temporal attention 201 | # self.conv_ta1 = nn.Conv1d(out_channels, out_channels//rt, 9, padding=4) 202 | # self.bn = nn.BatchNorm2d(out_channels) 203 | # bn_init(self.bn, 1) 204 | # self.conv_ta2 = nn.Conv1d(out_channels, 1, 9, padding=4) 205 | # nn.init.kaiming_normal_(self.conv_ta1.weight) 206 | # nn.init.constant_(self.conv_ta1.bias, 0) 207 | # nn.init.constant_(self.conv_ta2.weight, 0) 208 | # nn.init.constant_(self.conv_ta2.bias, 0) 209 | 210 | # rt = 4 211 | # self.inter_c = out_channels // rt 212 | # self.conv_ta1 = nn.Conv2d(out_channels, out_channels // rt, 1) 213 | # self.conv_ta2 = nn.Conv2d(out_channels, out_channels // rt, 1) 214 | # nn.init.constant_(self.conv_ta1.weight, 0) 215 | # nn.init.constant_(self.conv_ta1.bias, 0) 216 | # nn.init.constant_(self.conv_ta2.weight, 0) 217 | # nn.init.constant_(self.conv_ta2.bias, 0) 218 | # s attention 219 | # num_jpts = A.shape[-1] 220 | # ker_jpt = num_jpts - 1 if not num_jpts % 2 else num_jpts 221 | # pad = (ker_jpt - 1) // 2 222 | # self.conv_sa = 
nn.Conv1d(out_channels, 1, ker_jpt, padding=pad) 223 | # nn.init.constant_(self.conv_sa.weight, 0) 224 | # nn.init.constant_(self.conv_sa.bias, 0) 225 | 226 | # channel attention 227 | # rr = 16 228 | # self.fc1c = nn.Linear(out_channels, out_channels // rr) 229 | # self.fc2c = nn.Linear(out_channels // rr, out_channels) 230 | # nn.init.kaiming_normal_(self.fc1c.weight) 231 | # nn.init.constant_(self.fc1c.bias, 0) 232 | # nn.init.constant_(self.fc2c.weight, 0) 233 | # nn.init.constant_(self.fc2c.bias, 0) 234 | # 235 | # self.softmax = nn.Softmax(-2) 236 | # self.sigmoid = nn.Sigmoid() 237 | self.attention = attention 238 | 239 | if not residual: 240 | self.residual = lambda x: 0 241 | 242 | elif (in_channels == out_channels) and (stride == 1): 243 | self.residual = lambda x: x 244 | 245 | else: 246 | self.residual = unit_tcn(in_channels, out_channels, kernel_size=1, stride=stride) 247 | 248 | def forward(self, x): 249 | if self.attention: 250 | y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x)) 251 | 252 | # spatial attention 253 | # se = y.mean(-2) # N C V 254 | # se1 = self.sigmoid(self.conv_sa(se)) 255 | # y = y * se1.unsqueeze(-2) + y 256 | # a1 = se1.unsqueeze(-2) 257 | 258 | # temporal attention 259 | # se = y.mean(-1) # N C T 260 | # # se1 = self.relu(self.bn(self.conv_ta1(se))) 261 | # se2 = self.sigmoid(self.conv_ta2(se)) 262 | # # y = y * se1.unsqueeze(-1) + y 263 | # a2 = se2.unsqueeze(-1) 264 | 265 | # se = y # NCTV 266 | # N, C, T, V = y.shape 267 | # se1 = self.conv_ta1(se).permute(0, 2, 1, 3).contiguous().view(N, T, self.inter_c * V) # NTCV 268 | # se2 = self.conv_ta2(se).permute(0, 1, 3, 2).contiguous().view(N, self.inter_c * V, T) # NCVT 269 | # a2 = self.softmax(torch.matmul(se1, se2) / np.sqrt(se1.size(-1))) # N T T 270 | # y = torch.matmul(y.permute(0, 1, 3, 2).contiguous().view(N, C * V, T), a2) \ 271 | # .view(N, C, V, T).permute(0, 1, 3, 2) * self.alpha + y 272 | 273 | # channel attention 274 | # se = y.mean(-1).mean(-1) 275 | # se1 = self.relu(self.fc1c(se)) 276 | # se2 = self.sigmoid(self.fc2c(se1)) 277 | # # y = y * se2.unsqueeze(-1).unsqueeze(-1) + y 278 | # a3 = se2.unsqueeze(-1).unsqueeze(-1) 279 | # 280 | # y = y * ((a2 + a3) / 2) + y 281 | # y = self.bn(y) 282 | else: 283 | y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x)) 284 | return y 285 | 286 | 287 | class Model(nn.Module): 288 | def __init__(self, num_class=60, num_point=25, num_person=2, graph=None, graph_args=dict(), in_channels=3, 289 | drop_out=0, adaptive=True, attention=True): 290 | super(Model, self).__init__() 291 | 292 | if graph is None: 293 | raise ValueError() 294 | else: 295 | Graph = import_class(graph) 296 | self.graph = Graph(**graph_args) 297 | 298 | A = self.graph.A 299 | self.num_class = num_class 300 | 301 | self.data_bn = nn.BatchNorm1d(num_person * in_channels * num_point) 302 | 303 | self.l1 = TCN_GCN_unit(3, 64, A, residual=False, adaptive=adaptive, attention=attention) 304 | self.l2 = TCN_GCN_unit(64, 64, A, adaptive=adaptive, attention=attention) 305 | self.l3 = TCN_GCN_unit(64, 64, A, adaptive=adaptive, attention=attention) 306 | self.l4 = TCN_GCN_unit(64, 64, A, adaptive=adaptive, attention=attention) 307 | self.l5 = TCN_GCN_unit(64, 128, A, stride=2, adaptive=adaptive, attention=attention) 308 | self.l6 = TCN_GCN_unit(128, 128, A, adaptive=adaptive, attention=attention) 309 | self.l7 = TCN_GCN_unit(128, 128, A, adaptive=adaptive, attention=attention) 310 | self.l8 = TCN_GCN_unit(128, 256, A, stride=2, adaptive=adaptive, attention=attention) 311 | self.l9 = 
TCN_GCN_unit(256, 256, A, adaptive=adaptive, attention=attention) 312 | self.l10 = TCN_GCN_unit(256, 256, A, adaptive=adaptive, attention=attention) 313 | 314 | self.fc = nn.Linear(256, num_class) 315 | nn.init.normal_(self.fc.weight, 0, math.sqrt(2. / num_class)) 316 | bn_init(self.data_bn, 1) 317 | if drop_out: 318 | self.drop_out = nn.Dropout(drop_out) 319 | else: 320 | self.drop_out = lambda x: x 321 | 322 | def forward(self, x): 323 | N, C, T, V, M = x.size() 324 | 325 | x = x.permute(0, 4, 3, 1, 2).contiguous().view(N, M * V * C, T) 326 | x = self.data_bn(x) 327 | x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, 2).contiguous().view(N * M, C, T, V) 328 | 329 | x = self.l1(x) 330 | x = self.l2(x) 331 | x = self.l3(x) 332 | x = self.l4(x) 333 | x = self.l5(x) 334 | x = self.l6(x) 335 | x = self.l7(x) 336 | x = self.l8(x) 337 | x = self.l9(x) 338 | x = self.l10(x) 339 | 340 | # N*M,C,T,V 341 | c_new = x.size(1) 342 | x = x.view(N, M, c_new, -1) 343 | x = x.mean(3).mean(1) 344 | x = self.drop_out(x) 345 | 346 | return self.fc(x) 347 | -------------------------------------------------------------------------------- /action_recognition/data_gen/viz.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2020 achandrasekaran 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | import os, sys 10 | import os.path as osp 11 | 12 | import random 13 | import numpy as np 14 | import math 15 | import torch 16 | from torch.nn.functional import interpolate as intrp 17 | 18 | import subprocess 19 | import shutil 20 | import uuid 21 | import cv2 22 | from matplotlib import pyplot as plt 23 | from mpl_toolkits.mplot3d import Axes3D 24 | 25 | import pdb 26 | 27 | import dutils 28 | 29 | 30 | """ 31 | Visualize input and output motion sequences and labels 32 | """ 33 | 34 | def get_smpl_skeleton(): 35 | '''Skeleton ordering so that you traverse joints in this order: 36 | Left lower, Left upper, Spine, Neck, Head, Right lower, Right upper. 
37 | ''' 38 | return np.array( 39 | [ 40 | # Left lower 41 | [ 0, 1 ], 42 | [ 1, 4 ], 43 | [ 4, 7 ], 44 | [ 7, 10], 45 | 46 | # Left upper 47 | [ 9, 13], 48 | [13, 16], 49 | [16, 18], 50 | [18, 20], 51 | # [20, 22], 52 | 53 | # Spinal column 54 | [ 0, 3 ], 55 | [ 3, 6 ], 56 | [ 6, 9 ], 57 | [ 9, 12], 58 | [12, 15], 59 | 60 | # Right lower 61 | [ 0, 2 ], 62 | [ 2, 5 ], 63 | [ 5, 8 ], 64 | [ 8, 11], 65 | 66 | # Right upper 67 | [ 9, 14], 68 | [14, 17], 69 | [17, 19], 70 | [19, 21], 71 | # [21, 23], 72 | ]) 73 | 74 | def get_nturgbd_joint_names(): 75 | '''From paper: 76 | 1-base of the spine 2-middle of the spine 3-neck 4-head 5-left shoulder 6-left elbow 7-left wrist 8- left hand 9-right shoulder 10-right elbow 11-right wrist 12- right hand 13-left hip 14-left knee 15-left ankle 16-left foot 17- right hip 18-right knee 19-right ankle 20-right foot 21-spine 22- tip of the left hand 23-left thumb 24-tip of the right hand 25- right thumb 77 | ''' 78 | # Joint names by AC, based on SMPL names 79 | joint_names_map = { 80 | 0: 'Pelvis', 81 | 82 | 12: 'L_Hip', 83 | 13: 'L_Knee', 84 | 14: 'L_Ankle', 85 | 15: 'L_Foot', 86 | 87 | 16: 'R_Hip', 88 | 17: 'R_Knee', 89 | 18: 'R_Ankle', 90 | 19: 'R_Foot', 91 | 92 | 1: 'Spine1', 93 | # 'Spine2', 94 | 20: 'Spine3', 95 | 2: 'Neck', 96 | 3: 'Head', 97 | 98 | # 'L_Collar', 99 | 4: 'L_Shoulder', 100 | 5: 'L_Elbow', 101 | 6: 'L_Wrist', 102 | 7: 'L_Hand', 103 | 21: 'L_HandTip', # Not in SMPL 104 | 22: 'L_Thumb', # Not in SMPL 105 | 106 | # 'R_Collar', 107 | 8: 'R_Shoulder', 108 | 9: 'R_Elbow', 109 | 10: 'R_Wrist', 110 | 11: 'R_Hand', 111 | 23: 'R_HandTip', # Not in SMPL 112 | 24: 'R_Thumb', # Not in SMPL 113 | } 114 | 115 | return [joint_names_map[idx] for idx in range(len(joint_names_map))] 116 | 117 | def get_smpl_joint_names(): 118 | # Joint names from SMPL Wiki 119 | joint_names_map = { 120 | 0: 'Pelvis', 121 | 122 | 1: 'L_Hip', 123 | 4: 'L_Knee', 124 | 7: 'L_Ankle', 125 | 10: 'L_Foot', 126 | 127 | 2: 'R_Hip', 128 | 5: 'R_Knee', 129 | 8: 'R_Ankle', 130 | 11: 'R_Foot', 131 | 132 | 3: 'Spine1', 133 | 6: 'Spine2', 134 | 9: 'Spine3', 135 | 12: 'Neck', 136 | 15: 'Head', 137 | 138 | 13: 'L_Collar', 139 | 16: 'L_Shoulder', 140 | 18: 'L_Elbow', 141 | 20: 'L_Wrist', 142 | 22: 'L_Hand', 143 | 14: 'R_Collar', 144 | 17: 'R_Shoulder', 145 | 19: 'R_Elbow', 146 | 21: 'R_Wrist', 147 | 23: 'R_Hand'} 148 | 149 | # Return all joints except indices 22 (L_Hand), 23 (R_Hand) 150 | return [joint_names_map[idx] for idx in range(len(joint_names_map)-2)] 151 | 152 | def get_nturgbd_skeleton(): 153 | ''' Skeleton ordering such that you traverse joints in this order: 154 | Left lower, Left upper, Spine, Neck, Head, Right lower, Right upper. 155 | ''' 156 | return np.array( 157 | [ 158 | # Left lower 159 | [0, 12], 160 | [12, 13], 161 | [13, 14], 162 | [14, 15], 163 | 164 | # Left upper 165 | [4, 20], 166 | [4, 5], 167 | [5, 6], 168 | [6, 7], 169 | [7, 21], 170 | [7, 22], # --> L Thumb 171 | 172 | # Spinal column 173 | [0, 1], 174 | [1, 20], 175 | [20, 2], 176 | [2, 3], 177 | 178 | # Right lower 179 | [0, 16], 180 | [16, 17], 181 | [17, 18], 182 | [18, 19], 183 | 184 | # Right upper 185 | [20, 8], 186 | [8, 9], 187 | [9, 10], 188 | [10, 11], 189 | [11, 24], 190 | # [24, 11] --> R Thumb 191 | 192 | [21, 22], 193 | 194 | [23, 24], 195 | 196 | ] 197 | ) 198 | 199 | def get_joint_colors(joint_names): 200 | '''Return joints based on a color spectrum. Also, joints on 201 | L and R should have distinctly different colors. 
202 | ''' 203 | # Convert from plt 0-1 RGBA colors to 0-255 BGR colors for opencv. 204 | cmap = plt.get_cmap('rainbow') 205 | colors = [cmap(i) for i in np.linspace(0, 1, len(joint_names))] 206 | colors = [np.array((c[2], c[1], c[0])) for c in colors] 207 | return colors 208 | 209 | def calc_angle_from_x(sk): 210 | '''Given skeleton, calc. angle from x-axis''' 211 | # Hip bone 212 | id_l_hip = get_smpl_joint_names().index('L_Hip') 213 | id_r_hip = get_smpl_joint_names().index('R_Hip') 214 | pl, pr = sk[id_l_hip], sk[id_r_hip] 215 | bone = np.array(pr-pl) 216 | unit_v = bone / np.linalg.norm(bone) 217 | # Angle with x-axis 218 | pdb.set_trace() 219 | x_ax = np.array([1, 0, 0]) 220 | x_angle = math.degrees(np.arccos(np.dot(x_ax, unit_v))) 221 | 222 | ''' 223 | l_hip_z = seq[0, joint_names.index('L_Hip'), 2] 224 | r_hip_z = seq[0, joint_names.index('R_Hip'), 2] 225 | az = 0 if (l_hip_z > zroot and zroot > r_hip_z) else 180 226 | ''' 227 | if bone[1] > 0: 228 | x_angle = - x_angle 229 | 230 | return x_angle 231 | 232 | def calc_angle_from_y(sk): 233 | '''Given skeleton, calc. angle from x-axis''' 234 | # Hip bone 235 | id_l_hip = get_smpl_joint_names().index('L_Hip') 236 | id_r_hip = get_smpl_joint_names().index('R_Hip') 237 | pl, pr = sk[id_l_hip], sk[id_r_hip] 238 | bone = np.array(pl-pr) 239 | unit_v = bone / np.linalg.norm(bone) 240 | print(unit_v) 241 | # Angle with x-axis 242 | pdb.set_trace() 243 | y_ax = np.array([0, 1, 0]) 244 | y_angle = math.degrees(np.arccos(np.dot(y_ax, unit_v))) 245 | 246 | ''' 247 | l_hip_z = seq[0, joint_names.index('L_Hip'), 2] 248 | r_hip_z = seq[0, joint_names.index('R_Hip'), 2] 249 | az = 0 if (l_hip_z > zroot and zroot > r_hip_z) else 180 250 | ''' 251 | # if bone[1] > 0: 252 | # y_angle = - y_angle 253 | seq_y_proj = bone * np.cos(np.deg2rad(y_angle)) 254 | print('Bone projected onto y-axis: ', seq_y_proj) 255 | 256 | return y_angle 257 | 258 | def viz_skeleton(seq, folder_p, sk_type='smpl', radius=1, lcolor='#ff0000', rcolor='#0000ff', action='', debug=False): 259 | ''' Visualize skeletons for given sequence and store as images. 260 | 261 | Args: 262 | seq (np.array): Array (frames) of joint positions. 263 | Size depends on sk_type (see below). 264 | if sk_type is 'smpl' then assume: 265 | 1. first 3 dims = translation. 266 | 2. Size = (# frames, 69) 267 | elif sk_type is 'nturgbd', then assume: 268 | 1. no translation. 269 | 2. Size = (# frames, 25, 3) 270 | folder_p (str): Path to root folder containing visualized frames. 271 | Frames are dumped to the path: folder_p/frames/*.jpg 272 | radius (float): Space around the subject? 273 | 274 | Returns: 275 | Stores skeleton sequence as jpg frames. 276 | ''' 277 | joint_names = get_nturgbd_joint_names() if 'nturgbd' == sk_type \ 278 | else get_smpl_joint_names() 279 | n_j = n_j = len(joint_names) 280 | 281 | az = 90 282 | if 'smpl' == sk_type: 283 | # SMPL kinematic chain, joint list. 284 | # NOTE that hands are skipped. 
285 | kin_chain = get_smpl_skeleton() 286 | # Reshape flat pose features into (frames, joints, (x,y,z)) (skip trans) 287 | seq = seq[:, 3:].reshape(-1, n_j, 3).cpu().detach().numpy() 288 | 289 | elif 'nturgbd' == sk_type: 290 | kin_chain = get_nturgbd_skeleton() 291 | az = 0 292 | 293 | # Get color-spectrum for skeleton 294 | colors = get_joint_colors(joint_names) 295 | labels = [(joint_names[jidx[0]], joint_names[jidx[1]]) for jidx in kin_chain] 296 | 297 | # xroot, yroot, zroot = 0.0, 0.0, 0.0 298 | xroot, yroot, zroot = seq[0, 0, 0], seq[0, 0, 1], seq[0, 0, 2] 299 | # seq = seq - seq[0, :, :] 300 | 301 | # Change viewing angle so that first frame is in frontal pose 302 | # az = calc_angle_from_x(seq[0]-np.array([xroot, yroot, zroot])) 303 | # az = calc_angle_from_y(seq[0]-np.array([xroot, yroot, zroot])) 304 | 305 | # Viz. skeleton for each frame 306 | for t in range(seq.shape[0]): 307 | 308 | # Fig. settings 309 | fig = plt.figure(figsize=(7, 6)) if debug else \ 310 | plt.figure(figsize=(5, 5)) 311 | ax = fig.add_subplot(111, projection='3d') 312 | 313 | for i, (j1, j2) in enumerate(kin_chain): 314 | # Store bones 315 | x = np.array([seq[t, j1, 0], seq[t, j2, 0]]) 316 | y = np.array([seq[t, j1, 1], seq[t, j2, 1]]) 317 | z = np.array([seq[t, j1, 2], seq[t, j2, 2]]) 318 | # Plot bones in skeleton 319 | ax.plot(x, y, z, c=colors[i], marker='o', linewidth=2, label=labels[i]) 320 | 321 | # More figure settings 322 | ax.set_title(action) 323 | ax.set_xlabel('X') 324 | ax.set_ylabel('Y') 325 | ax.set_zlabel('Z') 326 | # xroot, yroot, zroot = seq[t, 0, 0], seq[t, 0, 1], seq[t, 0, 2] 327 | 328 | # pdb.set_trace() 329 | ax.set_xlim3d(-radius + xroot, radius + xroot) 330 | ax.set_ylim3d([-radius + yroot, radius + yroot]) 331 | ax.set_zlim3d([-radius + zroot, radius + zroot]) 332 | 333 | if True==debug: 334 | ax.axis('on') 335 | ax.grid(b=True) 336 | else: 337 | ax.axis('off') 338 | ax.grid(b=None) 339 | # Turn off tick labels 340 | ax.set_yticklabels([]) 341 | ax.set_xticklabels([]) 342 | ax.set_zticklabels([]) 343 | 344 | cv2.waitKey(0) 345 | 346 | # ax.view_init(-75, 90) 347 | # ax.view_init(elev=20, azim=90+az) 348 | ax.view_init(elev=20, azim=az) 349 | 350 | if True==debug: 351 | ax.legend(bbox_to_anchor=(1.1, 1), loc='upper right') 352 | pass 353 | 354 | fig.savefig(osp.join(folder_p, 'frames', '{0}.jpg'.format(t))) 355 | plt.close(fig) 356 | 357 | # break 358 | 359 | def write_vid_from_imgs(folder_p, fps): 360 | '''Collate frames into a video sequence. 361 | 362 | Args: 363 | folder_p (str): Frame images are in the path: folder_p/frames/.jpg 364 | fps (float): Output frame rate. 365 | 366 | Returns: 367 | Output video is stored in the path: folder_p/video.mp4 368 | ''' 369 | vid_p = osp.join(folder_p, 'video.mp4') 370 | cmd = ['ffmpeg', '-r', str(int(fps)), '-i', 371 | osp.join(folder_p, 'frames', '%d.jpg'), '-y', vid_p] 372 | FNULL = open(os.devnull, 'w') 373 | retcode = subprocess.call(cmd, stdout=FNULL, stderr=subprocess.STDOUT) 374 | if not 0 == retcode: 375 | print('*******ValueError(Error {0} executing command: {1}*********'.format(retcode, ' '.join(cmd))) 376 | shutil.rmtree(osp.join(folder_p, 'frames')) 377 | 378 | def viz_seq(seq, folder_p, sk_type, orig_fps=30.0, debug=False): 379 | '''1. Dumps sequence of skeleton images for the given sequence of joints. 380 | 2. Collates the sequence of images into an mp4 video. 381 | 382 | Args: 383 | seq (np.array): Array of joint positions. 384 | folder_p (str): Path to root folder that will contain frames folder. 
385 | sk_type (str): {'smpl', 'nturgbd'} 386 | 387 | Return: 388 | None. Path of mp4 video: folder_p/video.mp4 389 | ''' 390 | # Delete folder if exists 391 | if osp.exists(folder_p): 392 | print('Deleting existing folder ', folder_p) 393 | shutil.rmtree(folder_p) 394 | 395 | # Create folder for frames 396 | os.makedirs(osp.join(folder_p, 'frames')) 397 | 398 | # Dump frames into folder. Args: (data, radius, frames path) 399 | viz_skeleton(seq, folder_p=folder_p, sk_type=sk_type, radius=1.2, debug=debug) 400 | write_vid_from_imgs(folder_p, orig_fps) 401 | 402 | return None 403 | 404 | def viz_rand_seq(X, Y, dtype, epoch, wb, urls=None, 405 | k=3, pred_labels=None): 406 | ''' 407 | Args: 408 | X (np.array): Array (frames) of SMPL joint positions. 409 | Y (np.array): Multiple labels for each frame in x \in X. 410 | dtype (str): {'input', 'pred'} 411 | k (int): # samples to viz. 412 | urls (tuple): Tuple of URLs of the rendered videos from original mocap. 413 | wb (dict): Wandb log dict. 414 | Returns: 415 | viz_ds (dict): Data structure containing all viz. info so far. 416 | ''' 417 | import wandb 418 | # `idx2al`: idx --> action label string 419 | al2idx = dutils.read_json('data/action_label_to_idx.json') 420 | idx2al = {al2idx[k]: k for k in al2idx} 421 | 422 | # Sample k random seqs. to viz. 423 | for s_idx in random.sample(list(range(X.shape[0])), k): 424 | # Visualize a single seq. in path `folder_p` 425 | folder_p = osp.join('viz', str(uuid.uuid4())) 426 | viz_seq(seq=X[s_idx], folder_p=folder_p) 427 | title='{0} seq. {1}: '.format(dtype, s_idx) 428 | acts_str = ', '.join([idx2al[l] for l in torch.unique(Y[s_idx])]) 429 | wb[title+urls[s_idx]] = wandb.Video(osp.join(folder_p, 'video.mp4'), 430 | caption='Actions: '+acts_str) 431 | 432 | if 'pred' == dtype or 'preds'==dtype: 433 | raise NotImplementedError 434 | 435 | print('Done viz. {0} seqs.'.format(k)) 436 | return wb 437 | -------------------------------------------------------------------------------- /notebooks/BABEL_visualization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Visualizing BABEL labels\n", 8 | "[BABEL](https://babel.is.tue.mpg.de/) labels mocap sequences from [AMASS](https://amass.is.tue.mpg.de) with action labels. \n", 9 | "A single sequence in BABEL can have multiple action labels associated with it, from multiple annotators. \n", 10 | "Here, we present code to load data from BABEL, visualize the mocap sequence rendered as a 2D video, and view the action labels corresponding to the sequence. 
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Preparing the environment\n", 20 | "%load_ext autoreload\n", 21 | "%autoreload 2\n", 22 | "%matplotlib notebook\n", 23 | "%matplotlib inline" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import json\n", 33 | "from os.path import join as ospj\n", 34 | "\n", 35 | "import numpy as np\n", 36 | "\n", 37 | "import pprint\n", 38 | "pp = pprint.PrettyPrinter()\n", 39 | "\n", 40 | "from IPython.display import HTML" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Load BABEL\n", 48 | "We assume that you have downloaded BABEL annotations from the [website](https://babel.is.tue.mpg.de/data.html) and placed the downloaded `babel_v1.0_release` folder in `data/`. The BABEL data is provided as two sets -- BABEL dense and BABEL extra. " 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "d_folder = '../data/babel_v1.0_release' # Data folder\n", 58 | "l_babel_dense_files = ['train', 'val', 'test']\n", 59 | "l_babel_extra_files = ['extra_train', 'extra_val']\n", 60 | "\n", 61 | "# BABEL Dataset \n", 62 | "babel = {}\n", 63 | "for file in l_babel_dense_files:\n", 64 | " babel[file] = json.load(open(ospj(d_folder, file+'.json')))\n", 65 | " \n", 66 | "for file in l_babel_extra_files:\n", 67 | " babel[file] = json.load(open(ospj(d_folder, file+'.json'))) " 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### View random annotation" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Now, let us view an annotation data structure from the BABEL. \n", 82 | "The overall data structure is a dictionary, with a unique sequence ID as key and the annotation as value. 
" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "def get_random_babel_ann():\n", 92 | " '''Get annotation from random sequence from a random file'''\n", 93 | " file = np.random.choice(l_babel_dense_files + l_babel_extra_files)\n", 94 | " seq_id = np.random.choice(list(babel[file].keys()))\n", 95 | " print('We are visualizing annotations for seq ID: {0} in \"{1}.json\"'.format(seq_id, file))\n", 96 | " ann = babel[file][seq_id]\n", 97 | " return ann, file" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "We are visualizing annotations for seq ID: 3312 in \"test.json\"\n", 110 | "{'babel_sid': 3312,\n", 111 | " 'dur': 76.73,\n", 112 | " 'feat_p': 'CMU/CMU/86/86_08_poses.npz',\n", 113 | " 'frame_ann': {'anntr_id': 'c6065e9c-1652-46df-a45f-fe8b8158428f',\n", 114 | " 'babel_lid': 'a642048f-7fa9-402f-a4c1-d7e9e7f696d1',\n", 115 | " 'labels': [{'act_cat': None,\n", 116 | " 'end_t': 68.093,\n", 117 | " 'proc_label': None,\n", 118 | " 'raw_label': None,\n", 119 | " 'seg_id': 'ad703788-bd17-42d4-854b-2b64cb58ee16',\n", 120 | " 'start_t': 59.51},\n", 121 | " {'act_cat': None,\n", 122 | " 'end_t': 32.82,\n", 123 | " 'proc_label': None,\n", 124 | " 'raw_label': None,\n", 125 | " 'seg_id': '1785aeca-53ce-4a33-a249-8a5d3466ea95',\n", 126 | " 'start_t': 27.445},\n", 127 | " {'act_cat': None,\n", 128 | " 'end_t': 52.426,\n", 129 | " 'proc_label': None,\n", 130 | " 'raw_label': None,\n", 131 | " 'seg_id': '12768b82-b342-46ee-ae60-e158f8b1dd47',\n", 132 | " 'start_t': 47.843},\n", 133 | " {'act_cat': None,\n", 134 | " 'end_t': 59.51,\n", 135 | " 'proc_label': None,\n", 136 | " 'raw_label': None,\n", 137 | " 'seg_id': '435bd5a6-01e9-4fc4-abee-642954466832',\n", 138 | " 'start_t': 53.26},\n", 139 | " {'act_cat': None,\n", 140 | " 'end_t': 40.007,\n", 141 | " 'proc_label': None,\n", 142 | " 'raw_label': None,\n", 143 | " 'seg_id': 'd3911406-ad83-4438-941c-919bf296d5e1',\n", 144 | " 'start_t': 33.382},\n", 145 | " {'act_cat': None,\n", 146 | " 'end_t': 76.733,\n", 147 | " 'proc_label': None,\n", 148 | " 'raw_label': None,\n", 149 | " 'seg_id': 'f222a4d9-a8d5-4002-893b-4df102e1e0fa',\n", 150 | " 'start_t': 70.593},\n", 151 | " {'act_cat': None,\n", 152 | " 'end_t': 2.252,\n", 153 | " 'proc_label': None,\n", 154 | " 'raw_label': None,\n", 155 | " 'seg_id': '35e605ec-c9f8-4c9d-8320-680de71837ce',\n", 156 | " 'start_t': 0.294},\n", 157 | " {'act_cat': None,\n", 158 | " 'end_t': 6.961,\n", 159 | " 'proc_label': None,\n", 160 | " 'raw_label': None,\n", 161 | " 'seg_id': 'fdaead4c-0a37-4579-a42a-4a94145570b9',\n", 162 | " 'start_t': 4.232},\n", 163 | " {'act_cat': None,\n", 164 | " 'end_t': 70.593,\n", 165 | " 'proc_label': None,\n", 166 | " 'raw_label': None,\n", 167 | " 'seg_id': '52d3c3e9-102b-4cf0-b082-cd416a7b5f64',\n", 168 | " 'start_t': 68.093},\n", 169 | " {'act_cat': None,\n", 170 | " 'end_t': 4.232,\n", 171 | " 'proc_label': None,\n", 172 | " 'raw_label': None,\n", 173 | " 'seg_id': 'f524e2df-36e2-45ce-a54e-892fdb7353d0',\n", 174 | " 'start_t': 2.252},\n", 175 | " {'act_cat': None,\n", 176 | " 'end_t': 9.336,\n", 177 | " 'proc_label': None,\n", 178 | " 'raw_label': None,\n", 179 | " 'seg_id': '7f265bed-f445-4b6b-a41f-c62106d7be3b',\n", 180 | " 'start_t': 6.961},\n", 181 | " {'act_cat': None,\n", 182 | " 'end_t': 47.843,\n", 183 | " 'proc_label': None,\n", 184 | " 
'raw_label': None,\n", 185 | " 'seg_id': '1aa33355-a669-45a6-86a9-19ae862a47e9',\n", 186 | " 'start_t': 40.007},\n", 187 | " {'act_cat': None,\n", 188 | " 'end_t': 15.523,\n", 189 | " 'proc_label': None,\n", 190 | " 'raw_label': None,\n", 191 | " 'seg_id': 'd9c310f5-fc1e-47d8-b2f7-075c31a2eb6d',\n", 192 | " 'start_t': 9.523},\n", 193 | " {'act_cat': None,\n", 194 | " 'end_t': 22.507,\n", 195 | " 'proc_label': None,\n", 196 | " 'raw_label': None,\n", 197 | " 'seg_id': 'f7a71a16-2807-49f7-8a66-7df3e678e161',\n", 198 | " 'start_t': 15.523},\n", 199 | " {'act_cat': None,\n", 200 | " 'end_t': 0.294,\n", 201 | " 'proc_label': None,\n", 202 | " 'raw_label': None,\n", 203 | " 'seg_id': '3f57a657-2c8f-4995-87a4-965bcf8ea2a6',\n", 204 | " 'start_t': 0},\n", 205 | " {'act_cat': None,\n", 206 | " 'end_t': 9.523,\n", 207 | " 'proc_label': None,\n", 208 | " 'raw_label': None,\n", 209 | " 'seg_id': 'c9f97199-97eb-463c-a04e-a511413ad5ba',\n", 210 | " 'start_t': 9.336},\n", 211 | " {'act_cat': None,\n", 212 | " 'end_t': 33.382,\n", 213 | " 'proc_label': None,\n", 214 | " 'raw_label': None,\n", 215 | " 'seg_id': 'dac4fabe-e96c-411c-ad2e-29211e8c212a',\n", 216 | " 'start_t': 32.82},\n", 217 | " {'act_cat': None,\n", 218 | " 'end_t': 53.26,\n", 219 | " 'proc_label': None,\n", 220 | " 'raw_label': None,\n", 221 | " 'seg_id': 'ed99bf22-3ea5-45a6-9df3-17e67e49f119',\n", 222 | " 'start_t': 52.426},\n", 223 | " {'act_cat': None,\n", 224 | " 'end_t': 27.445,\n", 225 | " 'proc_label': None,\n", 226 | " 'raw_label': None,\n", 227 | " 'seg_id': '5c459b13-35e6-4c36-8ec4-9eb1536bfe95',\n", 228 | " 'start_t': 22.507}],\n", 229 | " 'mul_act': True},\n", 230 | " 'seq_ann': {'anntr_id': 'a217bb6b-93ae-4611-8e53-d4318ed5be00',\n", 231 | " 'babel_lid': '037dc092-28d5-4537-9632-9a91fc9f7fb9',\n", 232 | " 'labels': [{'act_cat': None,\n", 233 | " 'proc_label': None,\n", 234 | " 'raw_label': None,\n", 235 | " 'seg_id': 'f7d4b8fa-de77-487f-a08c-84bbc05c3148'}],\n", 236 | " 'mul_act': True},\n", 237 | " 'url': 'https://babel-renders.s3.eu-central-1.amazonaws.com/003312.mp4'}\n" 238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "ann, _ = get_random_babel_ann()\n", 243 | "pp.pprint(ann)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "Note that the action labels from `test.json` are not available publicly. \n", 251 | "Also note that the internal data structures of BABEL dense and BABEL extra differ slightly. \n", 252 | "For a detailed description of the annotation, see [BABEL's data page](https://babel.is.tue.mpg.de/data.html)." 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "### Visualize a mocap seq. and its action labels " 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 6, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "def get_vid_html(url):\n", 269 | " '''Helper code to embed a URL in a notebook'''\n", 270 | " html_code = '
'\n", 273 | " return html_code" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 7, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "def get_labels(ann, file):\n", 283 | " # Get sequence labels and frame labels if they exist\n", 284 | " seq_l, frame_l = None, None\n", 285 | " if 'extra' not in file:\n", 286 | " if ann['seq_ann'] is not None:\n", 287 | " seq_l = [seg['raw_label'] for seg in ann['seq_ann']['labels']]\n", 288 | " if ann['frame_ann'] is not None:\n", 289 | " frame_l = [(seg['raw_label'], seg['start_t'], seg['end_t']) for seg in ann['frame_ann']['labels']]\n", 290 | " else:\n", 291 | " # Load labels from 1st annotator (random) if there are multiple annotators\n", 292 | " if ann['seq_anns'] is not None:\n", 293 | " seq_l = [seg['raw_label'] for seg in ann['seq_anns'][0]['labels']]\n", 294 | " if ann['frame_anns'] is not None:\n", 295 | " frame_l = [(seg['raw_label'], seg['start_t'], seg['end_t']) for seg in ann['frame_anns'][0]['labels']]\n", 296 | " return seq_l, frame_l" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "#### Visualize a random mocap and its annotation from BABEL, by running the cell below. " 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 8, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "We are visualizing annotations for seq ID: 7536 in \"train.json\"\n", 316 | "Sequence labels: ['pace and shake hand']\n", 317 | "Frame labels: (action label, start time, end time)\n", 318 | "[('walk', 0, 2.106),\n", 319 | " ('transition', 2.106, 2.845),\n", 320 | " ('make a knocking gesture', 2.845, 3.507),\n", 321 | " ('transition', 3.466, 4.6),\n", 322 | " ('turn around', 4.519, 5.519),\n", 323 | " ('walk back', 5.424, 7.734)]\n" 324 | ] 325 | }, 326 | { 327 | "data": { 328 | "text/html": [ 329 | "
" 330 | ], 331 | "text/plain": [ 332 | "" 333 | ] 334 | }, 335 | "execution_count": 8, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "ann, file = get_random_babel_ann()\n", 342 | "seq_l, frame_l = get_labels(ann, file)\n", 343 | "print('Sequence labels: ', seq_l)\n", 344 | "print('Frame labels: (action label, start time, end time)')\n", 345 | "pp.pprint(frame_l) \n", 346 | "HTML(get_vid_html(ann['url']))" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "- If you are interested in loading the mocap sequence in 3D, please refer to the tutorials in [AMASS](https://github.com/nghorbani/amass/tree/master/notebooks)" 354 | ] 355 | } 356 | ], 357 | "metadata": { 358 | "kernelspec": { 359 | "display_name": "Python 3", 360 | "language": "python", 361 | "name": "python3" 362 | }, 363 | "language_info": { 364 | "codemirror_mode": { 365 | "name": "ipython", 366 | "version": 3 367 | }, 368 | "file_extension": ".py", 369 | "mimetype": "text/x-python", 370 | "name": "python", 371 | "nbconvert_exporter": "python", 372 | "pygments_lexer": "ipython3", 373 | "version": "3.8.3" 374 | } 375 | }, 376 | "nbformat": 4, 377 | "nbformat_minor": 4 378 | } 379 | -------------------------------------------------------------------------------- /action_recognition/data_gen/create_dataset.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2021 achandrasekaran 6 | # 7 | # Distributed under terms of the MIT license. 8 | 9 | 10 | import sys, os, pdb 11 | from os.path import join as ospj 12 | from os.path import basename as ospb 13 | from os.path import dirname as ospd 14 | import numpy as np 15 | import torch 16 | from collections import * 17 | from itertools import * 18 | import pandas as pd 19 | import pickle, json, csv 20 | from tqdm import tqdm 21 | from pandas.core.common import flatten 22 | import ipdb 23 | import pickle 24 | 25 | # Custom 26 | import preprocess 27 | import dutils 28 | import viz 29 | 30 | """ 31 | Script to load BABEL segments with NTU skeleton format and pre-process. 32 | """ 33 | 34 | 35 | def ntu_style_preprocessing(b_dset_path): 36 | ''' 37 | ''' 38 | pdb.set_trace() 39 | print('Load BABEL v1.0 dataset subset', b_dset_path) 40 | b_dset = dutils.read_pkl(b_dset_path) 41 | # Get unnormalized 5-sec. samples 42 | X = np.array(b_dset['X']) 43 | print('X (old) = ', np.shape(X)) # N, T, V, C 44 | 45 | # Prep. data for normalization 46 | X = X.transpose(0, 3, 1, 2) # N, C, T, V 47 | X = X[:, :, :, :, np.newaxis] # N, C, T, V, M 48 | print('Shape of prepped X: ', X.shape) 49 | 50 | # Normalize (pre-process) in NTU RGBD-style 51 | ntu_sk_spine_bone = np.array([0, 1]) 52 | ntu_sk_shoulder_bone = np.array([8, 4]) 53 | X, l_m_sk = preprocess.pre_normalization(X, zaxis=ntu_sk_spine_bone, 54 | xaxis=ntu_sk_shoulder_bone) 55 | print('Shape of normalized X: ', X.shape) 56 | print('Skipping {0} samples because "skeleton is missing"'.format(len(l_m_sk))) 57 | print('Skipped idxs = ', l_m_sk) 58 | 59 | # Dataset w/ processed seg. chunks. 
(Skip samples w/ missing skeletons) 60 | b_AR_dset = {k: np.delete(b_dset[k], l_m_sk) for k in b_dset if k!='X'} 61 | b_AR_dset['X'] = np.delete(X, l_m_sk, axis=0) 62 | print('Shape of dataset = ', b_AR_dset['X'].shape) 63 | 64 | fp = b_dset_path.replace('samples', 'ntu_sk_ntu-style_preprocessed' ) 65 | # fp = '../data/babel_v1.0/babel_v1.0_ntu_sk_ntu-style_preprocessed.pkl' 66 | # dutils.write_pkl(b_AR_dset, fp) 67 | with open(fp, 'wb') as of: 68 | pickle.dump(b_AR_dset, of, protocol=4) 69 | 70 | def get_act_idx(y, act2idx, n_classes): 71 | ''' 72 | ''' 73 | if y in act2idx: 74 | return act2idx[y] 75 | else: 76 | return n_classes 77 | 78 | def store_splits_subsets(n_classes, spl, plus_extra = True, w_folder = '../data/babel_v1.0/'): 79 | ''' 80 | ''' 81 | # Get splits 82 | splits = dutils.read_json('../data/amass_splits.json') 83 | sid2split = {int(ospb(u).replace('.mp4', '')): spl for spl in splits \ 84 | for u in splits[spl] } 85 | 86 | # In labels, act. cat. --> idx 87 | act2idx_150 = dutils.read_json('../data/action_label_2_idx.json') 88 | act2idx = {k: act2idx_150[k] for k in act2idx_150 if act2idx_150[k] < n_classes} 89 | print('{0} actions in label set: {1}'.format(len(act2idx), act2idx)) 90 | 91 | if plus_extra : 92 | fp = w_folder + 'babel_v1.0_'+spl+'_extra_ntu_sk_ntu-style_preprocessed.pkl' 93 | else: 94 | fp = w_folder + 'babel_v1.0_'+spl+'_ntu_sk_ntu-style_preprocessed.pkl' 95 | 96 | # Get full dataset 97 | b_AR_dset = dutils.read_pkl(fp) 98 | 99 | # Store idxs of samples to include in learning 100 | split_idxs = defaultdict(list) 101 | for i, y1 in enumerate(b_AR_dset['Y1']): 102 | 103 | # Check if action category in list of classes 104 | if y1 not in act2idx: 105 | continue 106 | 107 | sid = b_AR_dset['sid'][i] 108 | split_idxs[sid2split[sid]].append(i) # Include idx in dataset 109 | 110 | # Save features that'll be loaded by dataloader 111 | ar_idxs = np.array(split_idxs[spl]) 112 | X = b_AR_dset['X'][ar_idxs] 113 | if plus_extra: 114 | fn = w_folder + f'{spl}_extra_ntu_sk_{n_classes}.npy' 115 | else: 116 | fn = w_folder + f'{spl}_ntu_sk_{n_classes}.npy' 117 | np.save(fn, X) 118 | 119 | # labels 120 | labels = {k: np.array(b_AR_dset[k])[ar_idxs] for k in b_AR_dset if k!='X'} 121 | 122 | # Create, save label data structure that'll be loaded by dataloader 123 | label_idxs = defaultdict(list) 124 | for i, y1 in enumerate(labels['Y1']): 125 | # y1 126 | label_idxs['Y1'].append(act2idx[y1]) 127 | # yk 128 | yk = [get_act_idx(y, act2idx, n_classes) for y in labels['Yk'][i]] 129 | label_idxs['Yk'].append(yk) 130 | # yov 131 | yov_o = labels['Yov'][i] 132 | yov = {get_act_idx(y, act2idx, n_classes): yov_o[y] for y in yov_o} 133 | label_idxs['Yov'].append(yov) 134 | # 135 | label_idxs['seg_id'].append(labels['seg_id'][i]) 136 | label_idxs['sid'].append(labels['sid'][i]) 137 | label_idxs['chunk_n'].append(labels['chunk_n'][i]) 138 | label_idxs['anntr_id'].append(labels['anntr_id'][i]) 139 | 140 | if plus_extra: 141 | wr_f = w_folder + f'{spl}_extra_label_{n_classes}.pkl' 142 | else: 143 | wr_f = w_folder + f'{spl}_label_{n_classes}.pkl' 144 | dutils.write_pkl(\ 145 | (label_idxs['seg_id'], (label_idxs['Y1'], label_idxs['sid'], 146 | label_idxs['chunk_n'], label_idxs['anntr_id'])), \ 147 | wr_f) 148 | 149 | class Babel_AR: 150 | '''Object containing data, methods for Action Recognition. 151 | 152 | Task 153 | ----- 154 | Given: x (Segment from Babel) 155 | Predict: \hat{p}(x) (Distribution over action categories) 156 | 157 | GT 158 | --- 159 | How to compute GT for a given segment? 
160 | - yk: All action categories that are labeled for the entirety of segment 161 | - y1: One of yk 162 | - yov: Any y that belongs to part of a segment is considered to be GT. 163 | Fraction of segment covered by an action: {'walk': 1.0, 'wave': 0.5} 164 | 165 | ''' 166 | def __init__(self, dataset, dense=True, seq_dense_ann_type={}): 167 | '''Dataset with (samples, different GTs) 168 | ''' 169 | # Load dataset 170 | self.babel = dataset 171 | self.dense = dense 172 | self.seq_dense_ann_type = seq_dense_ann_type 173 | self.jpos_p = '../../../../../amass/' 174 | 175 | # Get frame-rate for each seq. in AMASS 176 | f_p = '../data/featp_2_fps.json' 177 | self.ft_p_2_fps = dutils.read_json(f_p) 178 | 179 | # Dataset w/ keys = {'X', 'Y1', 'Yk', 'Yov', 'seg_id', 'sid', 180 | # 'seg_dur'} 181 | self.d = defaultdict(list) 182 | for ann in tqdm(self.babel): 183 | self._update_dataset(ann) 184 | 185 | def _subsample_to_30fps(self, orig_ft, orig_fps): 186 | '''Get features at 30fps frame-rate 187 | Args: 188 | orig_ft (T, 25*3): Feats. @ `orig_fps` frame-rate 189 | orig_fps : Frame-rate in original (ft) seq. 190 | Return: 191 | ft (T', 25*3): Feats. @ 30fps 192 | ''' 193 | T, n_j, _ = orig_ft.shape 194 | out_fps = 30.0 195 | # Matching the sub-sampling used for rendering 196 | if int(orig_fps)%int(out_fps): 197 | sel_fr = np.floor(orig_fps / out_fps * np.arange(int(out_fps))).astype(int) 198 | n_duration = int(T/int(orig_fps)) 199 | t_idxs = [] 200 | for i in range(n_duration): 201 | t_idxs += list(i * int(orig_fps) + sel_fr) 202 | if int(T % int(orig_fps)): 203 | last_sec_frame_idx = n_duration*int(orig_fps) 204 | t_idxs += [x+ last_sec_frame_idx for x in sel_fr if x + last_sec_frame_idx < T ] 205 | else: 206 | t_idxs = np.arange(0, T, orig_fps/out_fps, dtype=int) 207 | 208 | ft = orig_ft[t_idxs, :, :] 209 | return ft 210 | 211 | def _viz_x(self, ft, fn='test_sample'): 212 | '''Wraper to Viz. the given sample (w/ NTU RGBD skeleton)''' 213 | viz.viz_seq(seq=ft, folder_p=f'test_viz/{fn}', sk_type='nturgbd', 214 | debug=True) 215 | return None 216 | 217 | def _load_seq_feats(self, ft_p, sk_type): 218 | '''Given path to joint position features, return them in 30fps''' 219 | # Identify appropriate feature directory path on disk 220 | if 'smpl_wo_hands' == sk_type: # SMPL w/o hands (T, 22*3) 221 | jpos_p = ospj(self.jpos_p, 'joint_pos') 222 | if 'nturgbd' == sk_type: # NTU (T, 219) 223 | jpos_p = ospj(self.jpos_p, 'babel_joint_pos') 224 | 225 | # Get the correct dataset folder name 226 | ddir_n = ospb(ospd(ospd(ft_p))) 227 | ddir_map = {'BioMotionLab_NTroje': 'BMLrub', 'DFaust_67': 'DFaust'} 228 | ddir_n = ddir_map[ddir_n] if ddir_n in ddir_map else ddir_n 229 | # Get the subject folder name 230 | sub_fol_n = ospb(ospd(ft_p)) 231 | 232 | # Sanity check 233 | fft_p = ospj(jpos_p, ddir_n, sub_fol_n, ospb(ft_p)) 234 | assert os.path.exists(fft_p) 235 | 236 | # Load seq. fts. 237 | ft = np.load(fft_p)['joint_pos'] 238 | T, ft_sz = ft.shape 239 | 240 | # Get NTU skeleton joints 241 | ntu_js = dutils.smpl_to_nturgbd(model_type='smplh', out_format='nturgbd') 242 | ft = ft.reshape(T, -1, 3) 243 | ft = ft[:, ntu_js, :] 244 | 245 | # Sub-sample to 30fps 246 | orig_fps = self.ft_p_2_fps[ft_p] 247 | ft = self._subsample_to_30fps(ft, orig_fps) 248 | # print(f'Feat. shape = {ft.shape}, fps = {orig_fps}') 249 | # if orig_fps != 30.0: 250 | # self._viz_x(ft) 251 | return ft 252 | 253 | def _get_per_f_labels(self, ann, ann_type, seq_dur): 254 | ''' ''' 255 | # Per-frame labels: {0: ['walk'], 1: ['walk', 'wave'], ... 
T: ['stand']} 256 | yf = defaultdict(list) 257 | T = int(30.0*seq_dur) 258 | for n_f in range(T): 259 | cur_t = float(n_f/30.0) 260 | for seg in ann['labels']: 261 | 262 | if seg['act_cat'] is None: 263 | continue 264 | 265 | if 'seq_ann' == ann_type: 266 | seg['start_t'] = 0.0 267 | seg['end_t'] = seq_dur 268 | 269 | if cur_t >= float(seg['start_t']) and cur_t < float(seg['end_t']): 270 | yf[n_f] += seg['act_cat'] 271 | return yf 272 | 273 | def _compute_dur_samples(self, ann, ann_type, seq_ft, seq_dur, dur=5.0): 274 | '''Return each GT action, corresponding to the fraction of the 275 | segment that it overlaps with. 276 | There are 2 conditions that we need to handle: 277 | 1. Multiple action categories in 'act_cat' 278 | 2. Simultaneous (overlapping action segments). 279 | 280 | Example Input: 281 | Seq. => frames [0, 1, 2, 3, 4, 5] 282 | GT acts. => [[2,3], [2,3], [2], [0], [0,1], [0,1]] 283 | 284 | Segs, GT: 285 | 1. seg_x = seq[0: 3], y1 = 2, yall = {2: 1.0, 3: 0.66} 286 | 2. seg_x = seq[0: 2], y1 = 3, yall = {2: 1.0, 3: 1.0} 287 | 3. seg_x = seq[3: ], y1 = 0, yall = {0: 1.0, 1: 0.66} 288 | 4. seg_x = seq[4: ], y1 = 1, yall = {0: 1.0, 1: 1.0} 289 | 290 | - Note that we should do the above for each chunk in a segment, 291 | each of duration = seconds. 292 | 293 | Return: 294 | [ { 'x': [st_t, end_t], 295 | 'y1': , 296 | 'yall': { : , ...}}, 297 | { ... }, ... 298 | ] 299 | ''' 300 | # 301 | yf = self._get_per_f_labels(ann, ann_type, seq_dur) 302 | 303 | # Compute, store all samples for each segment 304 | seq_samples = [] 305 | for seg in ann['labels']: 306 | 307 | # If no labeled act. cats. for current seg., skip it 308 | if seg['act_cat'] is None or 0 == len(seg['act_cat']): 309 | continue 310 | 311 | # Handle stage 1 missing durs. 312 | if 'seq_ann' == ann_type: 313 | seg['start_t'] = 0.0 314 | seg['end_t'] = seq_dur 315 | 316 | # Get segment feats. 317 | seg_st_f, seg_end_f = int(30.0*seg['start_t']), int(30.0*seg['end_t']) 318 | seg_x = seq_ft[seg_st_f: seg_end_f, :, :] 319 | 320 | # Split segment into -second chunks 321 | n_f_pc = 30.0 * dur 322 | n_chunks = int(np.ceil(seg_x.shape[0]/n_f_pc)) 323 | for n_ch in range(n_chunks): 324 | 325 | # Single -sec. chunk in segment 326 | ch_st_f = int(n_f_pc * n_ch) 327 | ch_end_f = int(min(ch_st_f + n_f_pc, seg_x.shape[0])) 328 | x = seg_x[ch_st_f: ch_end_f, :, :] 329 | 330 | # Handle case where chunk_T < n_f_pc 331 | x_T, nj, xyz = x.shape 332 | x_ch = np.concatenate((x, np.zeros((int(n_f_pc)- x_T, nj, xyz))), axis=0) 333 | 334 | # Labels for this chunk 335 | yov = Counter(flatten([yf[seg_st_f + n_f] for n_f in range(ch_st_f, ch_end_f)])) 336 | 337 | # Sanity check -- is segment smaller than 1 frame? 338 | if seg['act_cat'][0] not in yov: 339 | # print('Skipping seg:', seg) 340 | # print(f'Chunk # {n_ch}, Yov: ', yov) 341 | continue 342 | 343 | yov = {k: round(yov[k]/x_T, 3) for k in yov} 344 | 345 | # For each act_cat in segment, create a separate sample 346 | for cat in seg['act_cat']: 347 | # Add to samples GTs 348 | seq_samples.append({'seg_id': seg['seg_id'], 349 | 'chunk_n': n_ch, 350 | 'chunk_dur': round(x_T/n_f_pc, 3), 351 | 'x': x_ch, 352 | 'y1': cat, 353 | 'yk': seg['act_cat'], 354 | 'yov': yov, 355 | 'anntr_id': ann['anntr_id'] 356 | }) 357 | return seq_samples 358 | 359 | def _sample_at_seg_chunk_level(self, ann, seq_samples): 360 | # Samples at segment-chunk-level 361 | for i, sample in enumerate(seq_samples): 362 | 363 | self.d['sid'].append(ann['babel_sid']) # Seq. info 364 | self.d['seg_id'].append(sample['seg_id']) # Seg. 
info 365 | self.d['chunk_n'].append(sample['chunk_n']) # Seg. chunk info 366 | self.d['anntr_id'].append(sample['anntr_id']) # Annotator id (useful in rebuttal exp.) 367 | self.d['chunk_dur'].append(sample['chunk_dur']) # Seg. chunk info 368 | self.d['X'].append(sample['x']) # Seg. chunk feats. 369 | self.d['Y1'].append(sample['y1']) # 1 out of k GT act. cats. 370 | self.d['Yk'].append(sample['yk']) # List of k GT act. cats. 371 | # : fractions of overlapping act. cats. 372 | self.d['Yov'].append(sample['yov']) 373 | return 374 | 375 | def _update_dataset(self, ann): 376 | '''Return one sample (one segment) = (X, Y1, Yall)''' 377 | 378 | # Get feats. for seq. 379 | seq_ft = self._load_seq_feats(ann['feat_p'], 'nturgbd') 380 | 381 | # To keep track of type of annotation for loading 'extra' 382 | # Compute all GT labels for this seq. 383 | seq_samples = None 384 | if self.dense: 385 | if ann['frame_ann'] is not None: 386 | ann_ar = ann['frame_ann'] 387 | self.seq_dense_ann_type[ann['babel_sid']] = 'frame_ann' 388 | seq_samples = self._compute_dur_samples(ann_ar, 'frame_ann', seq_ft, ann['dur']) 389 | else: 390 | ann_ar = ann['seq_ann'] 391 | self.seq_dense_ann_type[ann['babel_sid']] = 'seq_ann' 392 | seq_samples = self._compute_dur_samples(ann_ar, 'seq_ann', seq_ft, ann['dur']) 393 | self._sample_at_seg_chunk_level(ann, seq_samples) 394 | else: 395 | # check if extra exists 396 | if 'frame_anns' in ann.keys() or 'seq_anns' in ann.keys(): 397 | ann_type = None 398 | if ann['babel_sid'] in self.seq_dense_ann_type: 399 | ann_type = self.seq_dense_ann_type[ann['babel_sid']] 400 | else: 401 | if ann['frame_anns'] is not None: 402 | ann_type = 'frame_ann' 403 | elif ann['seq_anns'] is not None: 404 | ann_type = 'seq_ann' 405 | else: 406 | ipdb.set_trace() 407 | self.seq_dense_ann_type['babel_sid'] = ann_type 408 | ann_ar = None 409 | if ann_type == 'frame_ann': 410 | if ann['frame_anns'] is not None: 411 | ann_ar = ann['frame_anns'] 412 | elif ann_type == 'seq_ann': 413 | if ann['seq_anns'] is not None: 414 | ann_ar = ann['seq_anns'] 415 | else: 416 | ipdb.set_trace() 417 | if ann_ar: 418 | for an in ann_ar: 419 | seq_samples = self._compute_dur_samples(an, ann_type, \ 420 | seq_ft, ann['dur']) 421 | self._sample_at_seg_chunk_level(ann, seq_samples) 422 | else: 423 | print('Unexpected format for extra!') 424 | return 425 | 426 | 427 | # Create dataset 428 | # -------------------------- 429 | d_folder = '../../data/babel_v1.0_release/' 430 | w_folder = '../data/babel_v1.0/' 431 | for spl in ['train', 'val']: 432 | 433 | # Load Dense BABEL 434 | data = dutils.read_json(ospj(d_folder, f'{spl}.json')) 435 | dataset = [data[sid] for sid in data] 436 | dense_babel = Babel_AR(dataset, dense=True) 437 | # Store Dense BABEL 438 | d_filename = w_folder + 'babel_v1.0_'+ spl + '_samples.pkl' 439 | dutils.write_pkl(dense_babel.d, d_filename) 440 | 441 | # Load Extra BABEL 442 | data = dutils.read_json(ospj(d_folder, f'extra_{spl}.json')) 443 | dataset = [data[sid] for sid in data] 444 | extra_babel = Babel_AR(dataset, dense=False, 445 | seq_dense_ann_type=dense_babel.seq_dense_ann_type) 446 | # Store Dense + Extra 447 | de = {} 448 | for k in dense_babel.d.keys(): 449 | de[k] = dense_babel.d[k] + extra_babel.d[k] 450 | ex_filename = w_folder + 'babel_v1.0_' + spl + '_extra_samples.pkl' 451 | dutils.write_pkl(de, ex_filename) 452 | 453 | # Pre-process, Store data in dataset 454 | print('NTU-style preprocessing') 455 | babel_dataset_AR = ntu_style_preprocessing(d_filename) 456 | babel_dataset_AR = 
ntu_style_preprocessing(ex_filename) 457 | 458 | for ex, C in product(('', '_extra'), (120, 60)): 459 | 460 | # Split, store data in npy file, labels in pkl 461 | store_splits_subsets(n_classes=C, spl=spl, plus_extra=True) 462 | store_splits_subsets(n_classes=C, spl=spl, plus_extra=False) 463 | 464 | # Store counts of samples for training with class-balanced focal loss 465 | label_fp = ospj(w_folder, f'{spl}{ex}_label_{C}.pkl') 466 | dutils.store_counts(label_fp) 467 | 468 | -------------------------------------------------------------------------------- /action_recognition/train_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -*- coding: utf-8 -*- 4 | # 5 | # Adapted from https://github.com/lshiwjx/2s-AGCN for BABEL (https://babel.is.tue.mpg.de/) 6 | 7 | from __future__ import print_function 8 | 9 | import argparse 10 | import inspect 11 | import os 12 | import pickle 13 | import random 14 | import shutil 15 | import time 16 | from collections import * 17 | import numpy as np 18 | 19 | # torch 20 | import torch 21 | import torch.backends.cudnn as cudnn 22 | import torch.nn as nn 23 | import torch.optim as optim 24 | import torch.nn.functional as F 25 | 26 | import yaml 27 | from tensorboardX import SummaryWriter 28 | from torch.autograd import Variable 29 | from torch.optim.lr_scheduler import _LRScheduler 30 | from tqdm import tqdm 31 | 32 | import pdb 33 | import ipdb 34 | 35 | # Custom 36 | from class_balanced_loss import CB_loss 37 | 38 | 39 | # class GradualWarmupScheduler(_LRScheduler): 40 | # def __init__(self, optimizer, total_epoch, after_scheduler=None): 41 | # self.total_epoch = total_epoch 42 | # self.after_scheduler = after_scheduler 43 | # self.finished = False 44 | # self.last_epoch = -1 45 | # super().__init__(optimizer) 46 | 47 | # def get_lr(self): 48 | # return [base_lr * (self.last_epoch + 1) / self.total_epoch for base_lr in self.base_lrs] 49 | 50 | # def step(self, epoch=None, metric=None): 51 | # if self.last_epoch >= self.total_epoch - 1: 52 | # if metric is None: 53 | # return self.after_scheduler.step(epoch) 54 | # else: 55 | # return self.after_scheduler.step(metric, epoch) 56 | # else: 57 | # return super(GradualWarmupScheduler, self).step(epoch) 58 | 59 | 60 | def init_seed(_): 61 | torch.cuda.manual_seed_all(1) 62 | torch.manual_seed(1) 63 | np.random.seed(1) 64 | random.seed(1) 65 | # torch.backends.cudnn.enabled = False 66 | torch.backends.cudnn.deterministic = True 67 | torch.backends.cudnn.benchmark = False 68 | 69 | 70 | def get_parser(): 71 | # parameter priority: command line > config > default 72 | parser = argparse.ArgumentParser( 73 | description='Spatial Temporal Graph Convolution Network') 74 | parser.add_argument( 75 | '--work-dir', 76 | default='./work_dir/temp', 77 | help='the work folder for storing results') 78 | 79 | parser.add_argument('-model_saved_name', default='') 80 | parser.add_argument( 81 | '--config', 82 | default='./config/nturgbd-cross-view/test_bone.yaml', 83 | help='path to the configuration file') 84 | 85 | # processor 86 | parser.add_argument( 87 | '--phase', default='train', help='must be train or test') 88 | parser.add_argument( 89 | '--save-score', 90 | type=str2bool, 91 | default=True, 92 | help='if ture, the classification score will be stored') 93 | 94 | # visulize and debug 95 | parser.add_argument( 96 | '--seed', type=int, default=1, help='random seed for pytorch') 97 | parser.add_argument( 98 | '--log-interval', 99 | type=int, 100 | 
default=100, 101 | help='the interval for printing messages (#iteration)') 102 | parser.add_argument( 103 | '--save-interval', 104 | type=int, 105 | default=2, 106 | help='the interval for storing models (#iteration)') 107 | parser.add_argument( 108 | '--eval-interval', 109 | type=int, 110 | default=5, 111 | help='the interval for evaluating models (#iteration)') 112 | parser.add_argument( 113 | '--print-log', 114 | type=str2bool, 115 | default=True, 116 | help='print logging or not') 117 | parser.add_argument( 118 | '--show-topk', 119 | type=int, 120 | default=[1, 5], 121 | nargs='+', 122 | help='which Top K accuracy will be shown') 123 | 124 | # feeder 125 | parser.add_argument( 126 | '--feeder', default='feeder.feeder', help='data loader will be used') 127 | parser.add_argument( 128 | '--num-worker', 129 | type=int, 130 | default=32, 131 | help='the number of worker for data loader') 132 | parser.add_argument( 133 | '--train-feeder-args', 134 | default=dict(), 135 | help='the arguments of data loader for training') 136 | parser.add_argument( 137 | '--test-feeder-args', 138 | default=dict(), 139 | help='the arguments of data loader for test') 140 | 141 | # model 142 | parser.add_argument('--model', default=None, help='the model will be used') 143 | parser.add_argument( 144 | '--model-args', 145 | type=dict, 146 | default=dict(), 147 | help='the arguments of model') 148 | parser.add_argument( 149 | '--weights', 150 | default=None, 151 | help='the weights for network initialization') 152 | parser.add_argument( 153 | '--ignore-weights', 154 | type=str, 155 | default=[], 156 | nargs='+', 157 | help='the name of weights which will be ignored in the initialization') 158 | 159 | # optim 160 | parser.add_argument( 161 | '--base-lr', type=float, default=0.01, help='initial learning rate') 162 | parser.add_argument( 163 | '--step', 164 | type=int, 165 | default=[20, 40, 60], 166 | nargs='+', 167 | help='the epoch where optimizer reduce the learning rate') 168 | 169 | #training 170 | parser.add_argument( 171 | '--device', 172 | type=int, 173 | default=0, 174 | nargs='+', 175 | help='the indexes of GPUs for training or testing') 176 | parser.add_argument('--optimizer', default='SGD', help='type of optimizer') 177 | parser.add_argument( 178 | '--nesterov', type=str2bool, default=False, help='use nesterov or not') 179 | parser.add_argument( 180 | '--batch-size', type=int, default=256, help='training batch size') 181 | parser.add_argument( 182 | '--test-batch-size', type=int, default=256, help='test batch size') 183 | parser.add_argument( 184 | '--start-epoch', 185 | type=int, 186 | default=0, 187 | help='start training from which epoch') 188 | parser.add_argument( 189 | '--num-epoch', 190 | type=int, 191 | default=80, 192 | help='stop training in which epoch') 193 | parser.add_argument( 194 | '--weight-decay', 195 | type=float, 196 | default=0.0005, 197 | help='weight decay for optimizer') 198 | # loss 199 | parser.add_argument( 200 | '--loss', 201 | type=str, 202 | default='CE', 203 | help='loss type(CE or focal)') 204 | parser.add_argument( 205 | '--label_count_path', 206 | default=None, 207 | type=str, 208 | help='Path to label counts (used in loss weighting)') 209 | parser.add_argument( 210 | '---beta', 211 | type=float, 212 | default=0.9999, 213 | help='Hyperparameter for Class balanced loss') 214 | parser.add_argument( 215 | '--gamma', 216 | type=float, 217 | default=2.0, 218 | help='Hyperparameter for Focal loss') 219 | 220 | parser.add_argument('--only_train_part', default=False) 221 | 
parser.add_argument('--only_train_epoch', default=0) 222 | parser.add_argument('--warm_up_epoch', default=0) 223 | return parser 224 | 225 | 226 | class Processor(): 227 | """ 228 | Processor for Skeleton-based Action Recgnition 229 | """ 230 | def __init__(self, arg): 231 | self.arg = arg 232 | self.save_arg() 233 | if arg.phase == 'train': 234 | if not arg.train_feeder_args['debug']: 235 | if os.path.isdir(arg.model_saved_name): 236 | print('log_dir: ', arg.model_saved_name, 'already exist') 237 | # answer = input('delete it? y/n:') 238 | answer = 'y' 239 | if answer == 'y': 240 | print('Deleting dir...') 241 | shutil.rmtree(arg.model_saved_name) 242 | print('Dir removed: ', arg.model_saved_name) 243 | # input('Refresh the website of tensorboard by pressing any keys') 244 | else: 245 | print('Dir not removed: ', arg.model_saved_name) 246 | self.train_writer = SummaryWriter(os.path.join(arg.model_saved_name, 'train'), 'train') 247 | self.val_writer = SummaryWriter(os.path.join(arg.model_saved_name, 'val'), 'val') 248 | else: 249 | self.train_writer = self.val_writer = SummaryWriter(os.path.join(arg.model_saved_name, 'test'), 'test') 250 | self.global_step = 0 251 | self.load_model() 252 | self.load_optimizer() 253 | self.load_data() 254 | self.lr = self.arg.base_lr 255 | self.best_acc = 0 256 | self.best_per_class_acc = 0 257 | 258 | def load_data(self): 259 | Feeder = import_class(self.arg.feeder) 260 | self.data_loader = dict() 261 | if self.arg.phase == 'train': 262 | self.data_loader['train'] = torch.utils.data.DataLoader( 263 | dataset=Feeder(**self.arg.train_feeder_args), 264 | batch_size=self.arg.batch_size, 265 | shuffle=True, 266 | num_workers=self.arg.num_worker, 267 | drop_last=True, 268 | worker_init_fn=init_seed) 269 | self.data_loader['test'] = torch.utils.data.DataLoader( 270 | dataset=Feeder(**self.arg.test_feeder_args), 271 | batch_size=self.arg.test_batch_size, 272 | shuffle=False, 273 | num_workers=self.arg.num_worker, 274 | drop_last=False, 275 | worker_init_fn=init_seed) 276 | 277 | def load_class_weights(self): 278 | if arg.label_count_path == None: 279 | raise Exception('No label count path..!!!') 280 | with open(arg.label_count_path, 'rb') as f: 281 | label_count = pickle.load(f) 282 | img_num_per_cls = [] 283 | # ipdb.set_trace() 284 | for cls_idx in range(len(label_count)): 285 | img_num_per_cls.append(int(label_count[cls_idx])) 286 | self.samples_per_class = img_num_per_cls 287 | 288 | def load_model(self): 289 | output_device = self.arg.device[0] if type(self.arg.device) is list else self.arg.device 290 | self.output_device = output_device 291 | Model = import_class(self.arg.model) 292 | shutil.copy2(inspect.getfile(Model), self.arg.work_dir) 293 | print(Model) 294 | self.model = Model(**self.arg.model_args).cuda(output_device) 295 | print(self.model) 296 | self.loss_type = arg.loss 297 | if self.loss_type != 'CE': 298 | self.load_class_weights() 299 | 300 | if self.arg.weights: 301 | self.global_step = int(arg.weights[:-3].split('-')[-1]) 302 | self.print_log('Load weights from {}.'.format(self.arg.weights)) 303 | if '.pkl' in self.arg.weights: 304 | with open(self.arg.weights, 'r') as f: 305 | weights = pickle.load(f) 306 | else: 307 | weights = torch.load(self.arg.weights) 308 | 309 | weights = OrderedDict( 310 | [[k.split('module.')[-1], 311 | v.cuda(output_device)] for k, v in weights.items()]) 312 | 313 | keys = list(weights.keys()) 314 | for w in self.arg.ignore_weights: 315 | for key in keys: 316 | if w in key: 317 | if weights.pop(key, None) is not 
None: 318 | self.print_log('Sucessfully Remove Weights: {}.'.format(key)) 319 | else: 320 | self.print_log('Can Not Remove Weights: {}.'.format(key)) 321 | 322 | try: 323 | self.model.load_state_dict(weights) 324 | except: 325 | state = self.model.state_dict() 326 | diff = list(set(state.keys()).difference(set(weights.keys()))) 327 | print('Can not find these weights:') 328 | for d in diff: 329 | print(' ' + d) 330 | state.update(weights) 331 | self.model.load_state_dict(state) 332 | 333 | if type(self.arg.device) is list: 334 | if len(self.arg.device) > 1: 335 | self.model = nn.DataParallel( 336 | self.model, 337 | device_ids=self.arg.device, 338 | output_device=output_device) 339 | 340 | def load_optimizer(self): 341 | if self.arg.optimizer == 'SGD': 342 | self.optimizer = optim.SGD( 343 | self.model.parameters(), 344 | lr=self.arg.base_lr, 345 | momentum=0.9, 346 | nesterov=self.arg.nesterov, 347 | weight_decay=self.arg.weight_decay) 348 | elif self.arg.optimizer == 'Adam': 349 | self.optimizer = optim.Adam( 350 | self.model.parameters(), 351 | lr=self.arg.base_lr, 352 | weight_decay=self.arg.weight_decay) 353 | else: 354 | raise ValueError() 355 | 356 | def save_arg(self): 357 | # save arg 358 | arg_dict = vars(self.arg) 359 | if not os.path.exists(self.arg.work_dir): 360 | os.makedirs(self.arg.work_dir) 361 | with open('{}/config.yaml'.format(self.arg.work_dir), 'w') as f: 362 | yaml.dump(arg_dict, f) 363 | 364 | def adjust_learning_rate(self, epoch): 365 | if self.arg.optimizer == 'SGD' or self.arg.optimizer == 'Adam': 366 | if epoch < self.arg.warm_up_epoch: 367 | lr = self.arg.base_lr * (epoch + 1) / self.arg.warm_up_epoch 368 | else: 369 | lr = self.arg.base_lr * ( 370 | 0.1 ** np.sum(epoch >= np.array(self.arg.step))) 371 | for param_group in self.optimizer.param_groups: 372 | param_group['lr'] = lr 373 | 374 | return lr 375 | else: 376 | raise ValueError() 377 | 378 | def print_time(self): 379 | localtime = time.asctime(time.localtime(time.time())) 380 | self.print_log("Local current time : " + localtime) 381 | 382 | def print_log(self, str, print_time=True): 383 | if print_time: 384 | localtime = time.asctime(time.localtime(time.time())) 385 | str = "[ " + localtime + ' ] ' + str 386 | print(str) 387 | if self.arg.print_log: 388 | with open('{}/log.txt'.format(self.arg.work_dir), 'a') as f: 389 | print(str, file=f) 390 | 391 | def record_time(self): 392 | self.cur_time = time.time() 393 | return self.cur_time 394 | 395 | def split_time(self): 396 | split_time = time.time() - self.cur_time 397 | self.record_time() 398 | return split_time 399 | 400 | def train(self, epoch, wb_dict, save_model=False): 401 | self.model.train() 402 | self.print_log('Training epoch: {}'.format(epoch + 1)) 403 | loader = self.data_loader['train'] 404 | self.adjust_learning_rate(epoch) 405 | 406 | loss_value, batch_acc, batch_per_class_acc = [], [], [] 407 | self.train_writer.add_scalar('epoch', epoch, self.global_step) 408 | self.record_time() 409 | timer = dict(dataloader=0.001, model=0.001, statistics=0.001) 410 | process = tqdm(loader) 411 | if self.arg.only_train_part: 412 | if epoch > self.arg.only_train_epoch: 413 | print('only train part, require grad') 414 | for key, value in self.model.named_parameters(): 415 | if 'PA' in key: 416 | value.requires_grad = True 417 | else: 418 | print('only train part, do not require grad') 419 | for key, value in self.model.named_parameters(): 420 | if 'PA' in key: 421 | value.requires_grad = False 422 | 423 | nb_classes = self.arg.model_args['num_class'] 424 
| confusion_matrix = torch.zeros(nb_classes, nb_classes) 425 | for batch_idx, (data, label, sid, seg_id, chunk_n, anntr_id, index) in enumerate(process): 426 | 427 | self.global_step += 1 428 | # get data 429 | data = Variable(data.float().cuda(self.output_device), requires_grad=False) 430 | label = Variable(label.long().cuda(self.output_device), requires_grad=False) 431 | timer['dataloader'] += self.split_time() 432 | 433 | # forward 434 | output = self.model(data) 435 | 436 | if self.loss_type == "CE": 437 | l_type = nn.CrossEntropyLoss() 438 | loss = l_type(output, label) 439 | else: 440 | loss = CB_loss(label, output, 441 | self.samples_per_class, 442 | nb_classes, self.loss_type, 443 | self.arg.beta, 444 | self.arg.gamma, 445 | self.arg.device[0] 446 | ) 447 | 448 | # backward 449 | self.optimizer.zero_grad() 450 | loss.backward() 451 | self.optimizer.step() 452 | loss_value.append(loss.data.item()) 453 | timer['model'] += self.split_time() 454 | 455 | # Compute per-class acc. 456 | value, predict_label = torch.max(output.data, 1) 457 | for t, p in zip(label.view(-1), predict_label.view(-1)): 458 | confusion_matrix[t.long(), p.long()] += 1 459 | 460 | # Acc. 461 | acc = torch.mean((predict_label == label.data).float()) 462 | batch_acc.append(acc.item()) 463 | self.train_writer.add_scalar('acc', acc, self.global_step) 464 | self.train_writer.add_scalar('loss', loss.data.item(), self.global_step) 465 | 466 | # statistics 467 | self.lr = self.optimizer.param_groups[0]['lr'] 468 | self.train_writer.add_scalar('lr', self.lr, self.global_step) 469 | # if self.global_step % self.arg.log_interval == 0: 470 | # self.print_log( 471 | # '\tBatch({}/{}) done. Loss: {:.4f} lr:{:.6f}'.format( 472 | # batch_idx, len(loader), loss.data[0], lr)) 473 | timer['statistics'] += self.split_time() 474 | 475 | per_class_acc_vals = confusion_matrix.diag()/confusion_matrix.sum(1) 476 | per_class_acc = torch.mean(per_class_acc_vals).float() 477 | 478 | # statistics of time consumption and loss 479 | proportion = { 480 | k: '{:02d}%'.format(int(round(v * 100 / sum(timer.values())))) 481 | for k, v in timer.items() 482 | } 483 | self.print_log( 484 | '\tMean training loss: {:.4f}.'.format(np.mean(loss_value))) 485 | self.print_log('\tTop-1-norm: {:.3f}%'.format(100*per_class_acc)) 486 | 487 | # Log 488 | wb_dict['train loss'] = np.mean(loss_value) 489 | wb_dict['train acc'] = np.mean(batch_acc) 490 | 491 | if save_model: 492 | state_dict = self.model.state_dict() 493 | weights = OrderedDict([[k.split('module.')[-1], 494 | v.cpu()] for k, v in state_dict.items()]) 495 | 496 | torch.save(weights, self.arg.model_saved_name + '-' + str(epoch) + '-' + str(int(self.global_step)) + '.pt') 497 | 498 | return wb_dict 499 | 500 | @torch.no_grad() 501 | def eval(self, epoch, 502 | wb_dict, 503 | save_score=True, 504 | loader_name=['test'], 505 | wrong_file=None, 506 | result_file=None 507 | ): 508 | if wrong_file is not None: 509 | f_w = open(wrong_file, 'w') 510 | if result_file is not None: 511 | f_r = open(result_file, 'w') 512 | self.model.eval() 513 | self.print_log('Eval epoch: {}'.format(epoch + 1)) 514 | for ln in loader_name: 515 | loss_value = [] 516 | score_frag = [] 517 | pred_label_list = [] 518 | step = 0 519 | nb_classes = self.arg.model_args['num_class'] 520 | confusion_matrix = torch.zeros(nb_classes, nb_classes) 521 | process = tqdm(self.data_loader[ln]) 522 | for batch_idx, (data, label, sid, seg_id, chunk_n, anntr_id, index) in enumerate(process): 523 | data = Variable( 524 | 
data.float().cuda(self.output_device), 525 | requires_grad=False) 526 | # volatile=True) 527 | label = Variable( 528 | label.long().cuda(self.output_device), 529 | requires_grad=False) 530 | # volatile=True) 531 | output = self.model(data) 532 | 533 | if self.loss_type == "CE": 534 | l_type = nn.CrossEntropyLoss() 535 | loss = l_type(output, label) 536 | else: 537 | loss = CB_loss(label, output, 538 | self.samples_per_class, 539 | nb_classes, self.loss_type, 540 | self.arg.beta, 541 | self.arg.gamma, 542 | self.arg.device[0] 543 | ) 544 | # Store outputs 545 | logits = output.data.cpu().numpy() 546 | score_frag.append(logits) 547 | loss_value.append(loss.data.item()) 548 | 549 | _, predict_label = torch.max(output.data, 1) 550 | pred_label_list.append(predict_label) 551 | 552 | step += 1 553 | 554 | # Compute per-class acc. 555 | for t, p in zip(label.view(-1), predict_label.view(-1)): 556 | confusion_matrix[t.long(), p.long()] += 1 557 | if wrong_file is not None or result_file is not None: 558 | predict = list(predict_label.cpu().numpy()) 559 | true = list(label.data.cpu().numpy()) 560 | for i, x in enumerate(predict): 561 | if result_file is not None: 562 | f_r.write(str(x) + ',' + str(true[i]) + '\n') 563 | if x != true[i] and wrong_file is not None: 564 | f_w.write(str(index[i]) + ',' + str(x) + ',' + str(true[i]) + '\n') 565 | per_class_acc_vals = confusion_matrix.diag()/confusion_matrix.sum(1) 566 | per_class_acc = torch.mean(per_class_acc_vals).float() 567 | score = np.concatenate(score_frag) 568 | loss = np.mean(loss_value) 569 | 570 | accuracy = self.data_loader[ln].dataset.top_k(score, 1) 571 | topk_scores = { k: self.data_loader[ln].dataset.top_k(score, k) \ 572 | for k in self.arg.show_topk } 573 | 574 | wb_dict['val loss'] = loss 575 | wb_dict['val acc'] = accuracy 576 | wb_dict['val per class acc'] = per_class_acc 577 | for k in topk_scores: 578 | wb_dict['val top{0} score'.format(k)] = topk_scores[k] 579 | 580 | if accuracy > self.best_acc: 581 | self.best_acc = accuracy 582 | if per_class_acc > self.best_per_class_acc: 583 | self.best_per_class_acc = per_class_acc 584 | 585 | print('Accuracy: ', accuracy, ' model: ', self.arg.model_saved_name) 586 | if self.arg.phase == 'train': 587 | self.val_writer.add_scalar('loss', loss, self.global_step) 588 | self.val_writer.add_scalar('acc', accuracy, self.global_step) 589 | self.val_writer.add_scalar('per_class_acc', per_class_acc , self.global_step) 590 | 591 | pred_scores = list(zip( 592 | self.data_loader[ln].dataset.label[1], # sid 593 | self.data_loader[ln].dataset.sample_name, # seg_id 594 | self.data_loader[ln].dataset.label[2], # chunk_id 595 | score)) 596 | 597 | self.print_log('\tMean {} loss of {} batches: {}.'.format( 598 | ln, len(self.data_loader[ln]), np.mean(loss_value))) 599 | self.print_log('\tTop-1-norm: {:.3f}%'.format(100*per_class_acc)) 600 | for k in topk_scores: 601 | self.print_log('\tTop{}: {:.3f}%'.format(k, 100*topk_scores[k])) 602 | 603 | if save_score: 604 | with open('{}/epoch{}_{}_score.pkl'.format( 605 | self.arg.work_dir, epoch + 1, ln), 'wb') as f: 606 | pickle.dump(pred_scores, f) 607 | return wb_dict 608 | 609 | def start(self): 610 | wb_dict = {} 611 | if self.arg.phase == 'train': 612 | self.print_log('Parameters:\n{}\n'.format(str(vars(self.arg)))) 613 | self.global_step = self.arg.start_epoch * len(self.data_loader['train']) / self.arg.batch_size 614 | 615 | for epoch in range(self.arg.start_epoch, self.arg.num_epoch): 616 | 617 | save_model = ((epoch + 1) % self.arg.save_interval == 0) or ( 
618 | epoch + 1 == self.arg.num_epoch) 619 | 620 | # Wandb logging 621 | wb_dict = {'lr': self.lr} 622 | 623 | # Train 624 | wb_dict = self.train(epoch, wb_dict, save_model=save_model) 625 | 626 | # Eval. on val set 627 | wb_dict = self.eval( 628 | epoch, 629 | wb_dict, 630 | save_score=self.arg.save_score, 631 | loader_name=['test']) 632 | # Log stats. for this epoch 633 | print('Epoch: {0}\nMetrics: {1}'.format(epoch, wb_dict)) 634 | 635 | print('best accuracy: ', self.best_acc, ' model_name: ', self.arg.model_saved_name) 636 | 637 | elif self.arg.phase == 'test': 638 | if not self.arg.test_feeder_args['debug']: 639 | wf = self.arg.model_saved_name + '_wrong.txt' 640 | rf = self.arg.model_saved_name + '_right.txt' 641 | else: 642 | wf = rf = None 643 | if self.arg.weights is None: 644 | raise ValueError('Please appoint --weights.') 645 | self.arg.print_log = False 646 | self.print_log('Model: {}.'.format(self.arg.model)) 647 | self.print_log('Weights: {}.'.format(self.arg.weights)) 648 | 649 | wb_dict = self.eval(epoch=0, wb_dict=wb_dict, 650 | save_score=self.arg.save_score, 651 | loader_name=['test'], 652 | wrong_file=wf, 653 | result_file=rf 654 | ) 655 | print('Inference metrics: ', wb_dict) 656 | self.print_log('Done.\n') 657 | 658 | 659 | def str2bool(v): 660 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 661 | return True 662 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 663 | return False 664 | else: 665 | raise argparse.ArgumentTypeError('Boolean value expected.') 666 | 667 | 668 | def import_class(name): 669 | components = name.split('.') 670 | mod = __import__(components[0]) 671 | for comp in components[1:]: 672 | mod = getattr(mod, comp) 673 | return mod 674 | 675 | 676 | if __name__ == '__main__': 677 | parser = get_parser() 678 | 679 | # load arg form config file 680 | p = parser.parse_args() 681 | if p.config is not None: 682 | with open(p.config, 'r') as f: 683 | default_arg = yaml.load(f) 684 | key = vars(p).keys() 685 | for k in default_arg.keys(): 686 | if k not in key: 687 | print('WRONG ARG: {}'.format(k)) 688 | assert (k in key) 689 | parser.set_defaults(**default_arg) 690 | 691 | arg = parser.parse_args() 692 | print('BABEL Action Recognition') 693 | print('Config: ', arg) 694 | init_seed(0) 695 | processor = Processor(arg) 696 | processor.start() 697 | --------------------------------------------------------------------------------
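For reference, a minimal sketch of how the files written by store_splits_subsets() in create_dataset.py above are laid out, assuming the default w_folder ('../data/babel_v1.0/') and the 60-class validation split. The .npy file holds the pre-processed joint features with shape (N, C, T, V, M); the companion .pkl holds a tuple (seg_ids, (Y1, sids, chunk_ns, anntr_ids)) whose lists are indexed consistently with the rows of the .npy array, which is also the ordering that Processor.eval() in train_test.py relies on via dataset.sample_name and dataset.label:

import pickle
import numpy as np

# Paths follow the default w_folder in create_dataset.py; adjust to your setup.
X = np.load('../data/babel_v1.0/val_ntu_sk_60.npy')           # (N, C, T, V, M)
with open('../data/babel_v1.0/val_label_60.pkl', 'rb') as f:
    seg_ids, (Y1, sids, chunk_ns, anntr_ids) = pickle.load(f)

assert X.shape[0] == len(seg_ids) == len(Y1)
print('Sample 0: seg_id={0}, class idx={1}, babel_sid={2}, chunk={3}'.format(
    seg_ids[0], Y1[0], sids[0], chunk_ns[0]))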
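The --beta and --gamma arguments in train_test.py above parameterize the class-balanced / focal loss computed by CB_loss. Since class_balanced_loss.py is not reproduced here, the snippet below is only an illustration of the standard class-balanced weighting of Cui et al. (CVPR 2019) that such a loss is typically built on: each class is weighted by the inverse of its "effective number" of samples, so rare classes contribute more to the loss. The per-class counts are made up, and the exact normalization in the repo's implementation may differ.

import numpy as np

beta = 0.9999                                    # --beta in train_test.py
samples_per_class = np.array([5000, 500, 50])    # hypothetical label counts
effective_num = (1.0 - np.power(beta, samples_per_class)) / (1.0 - beta)
weights = 1.0 / effective_num
weights = weights / weights.sum() * len(samples_per_class)   # common normalization
print(weights)   # the 50-sample class receives the largest weight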
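The "Top-1-norm" metric logged by train() and eval() above is the mean per-class accuracy computed from a confusion matrix (rows = ground truth, columns = predictions). A small self-contained example with a hypothetical 3-class matrix shows how it differs from overall accuracy on class-imbalanced data:

import torch

confusion_matrix = torch.tensor([[90., 10., 0.],    # class 0: 100 samples
                                 [ 5.,  5., 0.],    # class 1:  10 samples
                                 [ 2.,  0., 8.]])   # class 2:  10 samples

overall_acc = confusion_matrix.diag().sum() / confusion_matrix.sum()
per_class_acc = torch.mean(confusion_matrix.diag() / confusion_matrix.sum(1))
print('Overall accuracy: {:.3f}'.format(overall_acc.item()))     # 0.858
print('Top-1-norm      : {:.3f}'.format(per_class_acc.item()))   # 0.733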
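When save_score is enabled, Processor.eval() above pickles, for every evaluated sample, the tuple (babel_sid, seg_id, chunk_n, class score vector) to work_dir/epoch{N}_test_score.pkl. A minimal sketch of reading such a file back for downstream analysis; the path assumes the default --work-dir ('./work_dir/temp') and an evaluation run at epoch 0, so adjust it to your own run:

import pickle
import numpy as np

score_path = './work_dir/temp/epoch1_test_score.pkl'   # hypothetical run output
with open(score_path, 'rb') as f:
    pred_scores = pickle.load(f)

# Each entry: (babel_sid, seg_id, chunk_n, class-score vector)
for sid, seg_id, chunk_n, scores in pred_scores[:5]:
    print(sid, seg_id, chunk_n, 'predicted class idx:', int(np.argmax(scores)))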