├── action_recognition
│   ├── model
│   │   ├── __init__.py
│   │   ├── agcn.py
│   │   ├── agcn_mult.py
│   │   └── aagcn.py
│   ├── feeders
│   │   ├── __init__.py
│   │   ├── tools.py
│   │   └── feeder.py
│   ├── graph
│   │   ├── __init__.py
│   │   ├── tools.py
│   │   ├── ntu_rgb_d.py
│   │   └── kinetics.py
│   ├── config
│   │   └── babel_v1.0
│   │       ├── test_60.yaml
│   │       ├── test_120.yaml
│   │       ├── test_60_wfl.yaml
│   │       ├── test_120_wfl.yaml
│   │       ├── train_60.yaml
│   │       ├── train_120.yaml
│   │       ├── train_60_wfl.yaml
│   │       └── train_120_wfl.yaml
│   ├── data_gen
│   │   ├── rotation.py
│   │   ├── preprocess.py
│   │   ├── dutils.py
│   │   ├── viz.py
│   │   └── create_dataset.py
│   ├── data
│   │   └── action_label_2_idx.json
│   ├── class_balanced_loss.py
│   ├── challenge
│   │   └── create_submission.py
│   ├── Readme.md
│   └── train_test.py
├── .gitignore
├── notebooks
│   ├── Readme.md
│   ├── BABEL_explore.ipynb
│   └── BABEL_visualization.ipynb
├── requirements.txt
└── Readme.md
/action_recognition/model/__init__.py:
--------------------------------------------------------------------------------
1 | from . import agcn, aagcn
2 |
--------------------------------------------------------------------------------
/action_recognition/feeders/__init__.py:
--------------------------------------------------------------------------------
1 | from . import tools
2 | from . import feeder
3 |
--------------------------------------------------------------------------------
/action_recognition/graph/__init__.py:
--------------------------------------------------------------------------------
1 | from . import tools
2 | from . import ntu_rgb_d
3 | from . import kinetics
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # System
2 | babel-env
3 |
4 | # Temp files
5 | *.DS_Store
6 | *.swp
7 | __pycache__
8 | notebooks/.ipynb_checkpoints
9 |
10 | # Data
11 | data/babel_v1.0_release
12 | action_recognition/data/release
13 |
14 | # Predictions
15 | action_recognition/challenge/*.pkl
16 | action_recognition/challenge/*.npz
17 | ckpts
18 |
19 | # Logging
20 | wandb
21 | runs
22 | work_dir
23 |
--------------------------------------------------------------------------------
/notebooks/Readme.md:
--------------------------------------------------------------------------------
1 | ### Load and visualize BABEL
2 |
3 | [`BABEL_visualization.ipynb`](BABEL_visualization.ipynb) contains code that demonstrates how to:
4 | - Load the BABEL dataset
5 | - Visualize rendered videos of mocap sequences
6 | - Visualize their action labels
7 |
8 |
9 | ### Explore BABEL
10 |
11 | [`BABEL_explore.ipynb`](BABEL_explore.ipynb) contains code that shows how to:
12 | - Compute statistics from BABEL (e.g., the duration of labeled mocap)
13 | - Search BABEL for mocap sequences containing a specific action, and retrieve their annotations
--------------------------------------------------------------------------------
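For quick reference outside the notebooks, here is a minimal sketch of the load-and-search workflow described above. It assumes the v1.0 release layout suggested by `.gitignore` (per-split JSON files under `data/babel_v1.0_release/`) and the `frame_ann`/`labels`/`act_cat` field names of the release; `BABEL_explore.ipynb` is the authoritative reference if these differ.

```python
import json
from os.path import join as ospj

# Assumed location and file names of the BABEL v1.0 label release.
d_babel = 'data/babel_v1.0_release'
babel = {spl: json.load(open(ospj(d_babel, spl + '.json')))
         for spl in ('train', 'val')}

# Find frame-level segments annotated with a given action category.
query = 'jump'
hits = []
for spl, anns in babel.items():
    for sid, ann in anns.items():
        frame_ann = ann.get('frame_ann') or {}
        for seg in frame_ann.get('labels', []):
            if query in seg.get('act_cat', []):
                hits.append((spl, sid, seg.get('start_t'), seg.get('end_t')))

print(f'Found {len(hits)} "{query}" segments')
```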
/action_recognition/config/babel_v1.0/test_60.yaml:
--------------------------------------------------------------------------------
1 | # feeder
2 | feeder: feeders.feeder.Feeder
3 | test_feeder_args:
4 | data_path: ./data/release/val_ntu_sk_60.npy
5 | label_path: ./data/release/val_label_60.pkl
6 | debug: False
7 |
8 | # model
9 | model: model.agcn.Model
10 | model_args:
11 | num_class: 60
12 | num_point: 25
13 | num_person: 1
14 | graph: graph.ntu_rgb_d.Graph
15 | graph_args:
16 | labeling_mode: 'spatial'
17 |
18 | # test
19 | phase: test
20 | device: [0]
21 | test_batch_size: 128
22 | weights: ./ckpts/ntu_sk_60_agcn_joint_const_lr_1e-3-17-6390.pt
23 |
24 | work_dir: ./work_dir/babel_v1.0/test_runs/test_ntu_sk_60_agcn_joint_const_lr_1e-3
25 | save_score: True
26 |
--------------------------------------------------------------------------------
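The config above is consumed by `train_test.py`, which is not included in this excerpt. As a rough sketch of the intended wiring (an assumption, mirroring `import_class` in `model/agcn.py`), the dotted `model` and `feeder` strings resolve to classes that are instantiated with the corresponding `*_args` blocks:

```python
import yaml


def import_class(name):
    # Same dotted-path resolution as model/agcn.py::import_class.
    components = name.split('.')
    mod = __import__(components[0])
    for comp in components[1:]:
        mod = getattr(mod, comp)
    return mod


# Run from within action_recognition/ so `model`, `feeders`, `graph` are importable.
with open('config/babel_v1.0/test_60.yaml') as f:
    cfg = yaml.safe_load(f)

Model = import_class(cfg['model'])            # -> model.agcn.Model
net = Model(**cfg['model_args'])              # num_class=60, num_point=25, ...

Feeder = import_class(cfg['feeder'])          # -> feeders.feeder.Feeder
test_set = Feeder(**cfg['test_feeder_args'])  # expects the downloaded .npy/.pkl files
```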
/action_recognition/config/babel_v1.0/test_120.yaml:
--------------------------------------------------------------------------------
1 | # feeder
2 | feeder: feeders.feeder.Feeder
3 | test_feeder_args:
4 | data_path: ./data/release/val_ntu_sk_120.npy
5 | label_path: ./data/release/val_label_120.pkl
6 | debug: False
7 |
8 | # model
9 | model: model.agcn.Model
10 | model_args:
11 | num_class: 120
12 | num_point: 25
13 | num_person: 1
14 | graph: graph.ntu_rgb_d.Graph
15 | graph_args:
16 | labeling_mode: 'spatial'
17 |
18 | # test
19 | phase: test
20 | device: [0]
21 | test_batch_size: 128
22 | weights: ./ckpts/ntu_sk_120_agcn_joint_const_lr_1e-3-15-12240.pt
23 |
24 | work_dir: ./work_dir/babel_v1.0/test_runs/test_ntu_sk_120_agcn_joint_const_lr_1e-3
25 | save_score: True
26 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | backcall==0.2.0
2 | certifi==2020.12.5
3 | decorator==4.4.2
4 | ipdb==0.13.4
5 | ipython==7.19.0
6 | ipython-genutils==0.2.0
7 | jedi==0.18.0
8 | joblib==1.0.0
9 | networkx==2.5
10 | numpy==1.19.5
11 | parso==0.8.1
12 | pexpect==4.8.0
13 | pickleshare==0.7.5
14 | Pillow==8.1.0
15 | prompt-toolkit==3.0.10
16 | protobuf==3.14.0
17 | ptyprocess==0.7.0
18 | Pygments==2.7.4
19 | PyYAML==5.4
20 | scikit-learn==0.24.1
21 | scipy==1.6.0
22 | six==1.15.0
23 | tensorboardX==2.1
24 | threadpoolctl==2.1.0
25 | torch==1.7.1
26 | torchvision==0.8.2
27 | tqdm==4.56.0
28 | traitlets==5.0.5
29 | typing-extensions==3.7.4.3
30 | wcwidth==0.2.5
31 | pandas==1.3.4
32 | smplx==0.1.13
33 | matplotlib==3.1.3
34 | opencv-python==4.4.0.42
35 |
--------------------------------------------------------------------------------
/action_recognition/graph/tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def edge2mat(link, num_node):
5 | A = np.zeros((num_node, num_node))
6 | for i, j in link:
7 | A[j, i] = 1
8 | return A
9 |
10 |
11 | def normalize_digraph(A):  # normalize by dividing each column by its sum
12 | Dl = np.sum(A, 0)
13 | h, w = A.shape
14 | Dn = np.zeros((w, w))
15 | for i in range(w):
16 | if Dl[i] > 0:
17 | Dn[i, i] = Dl[i] ** (-1)
18 | AD = np.dot(A, Dn)
19 | return AD
20 |
21 |
22 | def get_spatial_graph(num_node, self_link, inward, outward):
23 | I = edge2mat(self_link, num_node)
24 | In = normalize_digraph(edge2mat(inward, num_node))
25 | Out = normalize_digraph(edge2mat(outward, num_node))
26 | A = np.stack((I, In, Out))
27 | return A
28 |
--------------------------------------------------------------------------------
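A small usage example for `get_spatial_graph` on a toy 3-joint chain (joint 1 as the center): the result stacks an identity matrix with the column-normalized inward and outward adjacency matrices, which is exactly what `graph/ntu_rgb_d.py` builds for the 25-joint skeleton.

```python
import numpy as np
from graph import tools  # run from within action_recognition/

num_node = 3
self_link = [(i, i) for i in range(num_node)]
inward = [(0, 1), (2, 1)]                 # child -> parent edges, towards joint 1
outward = [(j, i) for (i, j) in inward]

A = tools.get_spatial_graph(num_node, self_link, inward, outward)
print(A.shape)  # (3, 3, 3): one V x V matrix per subset (self, inward, outward)
```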
/action_recognition/config/babel_v1.0/test_60_wfl.yaml:
--------------------------------------------------------------------------------
1 | # feeder
2 | feeder: feeders.feeder.Feeder
3 | test_feeder_args:
4 | data_path: ./data/release/val_ntu_sk_60.npy
5 | label_path: ./data/release/val_label_60.pkl
6 | debug: False
7 |
8 | label_count_path: ./data/release/train_label_60_count.pkl
9 |
10 | # model
11 | model: model.agcn.Model
12 | model_args:
13 | num_class: 60
14 | num_point: 25
15 | num_person: 1
16 | graph: graph.ntu_rgb_d.Graph
17 | graph_args:
18 | labeling_mode: 'spatial'
19 |
20 | # test
21 | phase: test
22 | device: [0]
23 | test_batch_size: 32
24 | weights: ./ckpts/wfl_ntu_sk_60_agcn_joint_const_lr_1e-3-93-33370.pt
25 |
26 | work_dir: ./work_dir/babel_v1.0/test_runs/test_wfl_ntu_sk_60_agcn_joint_const_lr_1e-3
27 | save_score: True
28 |
--------------------------------------------------------------------------------
/action_recognition/config/babel_v1.0/test_120_wfl.yaml:
--------------------------------------------------------------------------------
1 | # feeder
2 | feeder: feeders.feeder.Feeder
3 | test_feeder_args:
4 | data_path: ./data/release/val_ntu_sk_120.npy
5 | label_path: ./data/release/val_label_120.pkl
6 | debug: False
7 |
8 | label_count_path: ./data/release/train_label_120_count.pkl
9 |
10 | # model
11 | model: model.agcn.Model
12 | model_args:
13 | num_class: 120
14 | num_point: 25
15 | num_person: 1
16 | graph: graph.ntu_rgb_d.Graph
17 | graph_args:
18 | labeling_mode: 'spatial'
19 |
20 | # test
21 | phase: test
22 | device: [0]
23 | test_batch_size: 128
24 | weights: ./ckpts/wfl_ntu_sk_120_agcn_joint_const_lr_1e-3-157-60356.pt
25 |
26 | work_dir: ./work_dir/babel_v1.0/test_runs/test_wfl_ntu_sk_120_agcn_joint_const_lr_1e-3
27 | save_score: True
28 |
--------------------------------------------------------------------------------
/action_recognition/config/babel_v1.0/train_60.yaml:
--------------------------------------------------------------------------------
1 | work_dir: ./work_dir/babel_v1.0/ntu_sk_60_agcn_joint_const_lr_1e-3
2 | model_saved_name: ./runs/babel_v1.0/ntu_sk_60_agcn_joint_const_lr_1e-3
3 |
4 | # feeder
5 | feeder: feeders.feeder.Feeder
6 | train_feeder_args:
7 | data_path: ./data/release/train_ntu_sk_60.npy
8 | label_path: ./data/release/train_label_60.pkl
9 | debug: False
10 | random_choose: False
11 | random_shift: False
12 | random_move: False
13 | window_size: -1
14 | normalization: False
15 |
16 | test_feeder_args:
17 | data_path: ./data/release/val_ntu_sk_60.npy
18 | label_path: ./data/release/val_label_60.pkl
19 |
20 | # model
21 | model: model.agcn.Model
22 | model_args:
23 | num_class: 60
24 | num_person: 1
25 | num_point: 25
26 | graph: graph.ntu_rgb_d.Graph
27 | graph_args:
28 | labeling_mode: 'spatial'
29 |
30 | #optim
31 | weight_decay: 0.0001
32 | base_lr: 0.001
33 | step: []
34 |
35 | # training
36 | device: [0]
37 | optimizer: 'Adam'
38 | loss: 'CE'
39 | batch_size: 64
40 | test_batch_size: 64
41 | num_epoch: 250
42 | nesterov: True
43 |
44 | # weights: /ps/project/conditional_action_gen/2s_agcn/runs/babel_v1.0/ntu_sk_60_agcn_joint_const_lr_1e-3-49-23450.pt
45 |
--------------------------------------------------------------------------------
/action_recognition/config/babel_v1.0/train_120.yaml:
--------------------------------------------------------------------------------
1 | work_dir: ./work_dir/babel_v1.0/ntu_sk_120_agcn_joint_const_lr_1e-3
2 | model_saved_name: ./runs/babel_v1.0/ntu_sk_120_agcn_joint_const_lr_1e-3
3 |
4 | # feeder
5 | feeder: feeders.feeder.Feeder
6 | train_feeder_args:
7 | data_path: ./data/release/train_ntu_sk_120.npy
8 | label_path: ./data/release/train_label_120.pkl
9 | debug: False
10 | random_choose: False
11 | random_shift: False
12 | random_move: False
13 | window_size: -1
14 | normalization: False
15 |
16 | test_feeder_args:
17 | data_path: ./data/release/val_ntu_sk_120.npy
18 | label_path: ./data/release/val_label_120.pkl
19 |
20 | # model
21 | model: model.agcn.Model
22 | model_args:
23 | num_class: 120
24 | num_person: 1
25 | num_point: 25
26 | graph: graph.ntu_rgb_d.Graph
27 | graph_args:
28 | labeling_mode: 'spatial'
29 |
30 | #optim
31 | weight_decay: 0.0001
32 | base_lr: 0.001
33 | step: []
34 |
35 | # training
36 | device: [0]
37 | optimizer: 'Adam'
38 | loss: 'CE'
39 | batch_size: 64
40 | test_batch_size: 64
41 | num_epoch: 250
42 | nesterov: True
43 |
44 | # weights: /ps/project/conditional_action_gen/2s_agcn/runs/babel_v1.0/ntu_sk_120_agcn_joint_const_lr_1e-3-49-23450.pt
45 |
--------------------------------------------------------------------------------
/action_recognition/config/babel_v1.0/train_60_wfl.yaml:
--------------------------------------------------------------------------------
1 | work_dir: ./work_dir/babel_v1.0/wfl_ntu_sk_60_agcn_joint_const_lr_1e-3
2 | model_saved_name: ./runs/babel_v1.0/wfl_ntu_sk_60_agcn_joint_const_lr_1e-3
3 |
4 | # feeder
5 | feeder: feeders.feeder.Feeder
6 | train_feeder_args:
7 | data_path: ./data/release/train_ntu_sk_60.npy
8 | label_path: ./data/release/train_label_60.pkl
9 | debug: False
10 | random_choose: False
11 | random_shift: False
12 | random_move: False
13 | window_size: -1
14 | normalization: False
15 |
16 | test_feeder_args:
17 | data_path: ./data/release/val_ntu_sk_60.npy
18 | label_path: ./data/release/val_label_60.pkl
19 |
20 | # model
21 | model: model.agcn.Model
22 | model_args:
23 | num_class: 60
24 | num_person: 1
25 | num_point: 25
26 | graph: graph.ntu_rgb_d.Graph
27 | graph_args:
28 | labeling_mode: 'spatial'
29 |
30 | #optim
31 | weight_decay: 0.0001
32 | base_lr: 0.001
33 | step: []
34 |
35 | # training
36 | device: [0]
37 | optimizer: 'Adam'
38 | loss: 'focal'
39 | beta: 0.9999
40 | gamma: 1.0
41 | label_count_path: ./data/release/train_label_60_count.pkl
42 | batch_size: 64
43 | test_batch_size: 64
44 | num_epoch: 200
45 | nesterov: True
46 |
47 | # weights: /ps/project/conditional_action_gen/2s_agcn/runs/babel_v1.0/wfl_ntu_sk_60_agcn_joint_const_lr_1e-3-19-8760.pt
48 |
--------------------------------------------------------------------------------
/action_recognition/config/babel_v1.0/train_120_wfl.yaml:
--------------------------------------------------------------------------------
1 | work_dir: ./work_dir/babel_v1.0/wfl_ntu_sk_120_agcn_joint_const_lr_1e-3
2 | model_saved_name: ./runs/babel_v1.0/wfl_ntu_sk_120_agcn_joint_const_lr_1e-3
3 |
4 | # feeder
5 | feeder: feeders.feeder.Feeder
6 | train_feeder_args:
7 | data_path: ./data/release/train_ntu_sk_120.npy
8 | label_path: ./data/release/train_label_120.pkl
9 | debug: False
10 | random_choose: False
11 | random_shift: False
12 | random_move: False
13 | window_size: -1
14 | normalization: False
15 |
16 | test_feeder_args:
17 | data_path: ./data/release/val_ntu_sk_120.npy
18 | label_path: ./data/release/val_label_120.pkl
19 |
20 | # model
21 | model: model.agcn.Model
22 | model_args:
23 | num_class: 120
24 | num_person: 1
25 | num_point: 25
26 | graph: graph.ntu_rgb_d.Graph
27 | graph_args:
28 | labeling_mode: 'spatial'
29 |
30 | #optim
31 | weight_decay: 0.0001
32 | base_lr: 0.001
33 | step: []
34 |
35 | # training
36 | device: [0]
37 | optimizer: 'Adam'
38 | loss: 'focal'
39 | beta: 0.9999
40 | gamma: 1.0
41 | label_count_path: ./data/release/train_label_120_count.pkl
42 | batch_size: 64
43 | test_batch_size: 64
44 | num_epoch: 200
45 | nesterov: True
46 |
47 | # weights: /ps/project/conditional_action_gen/2s_agcn/runs/babel_v1.0/wfl_ntu_sk_120_agcn_joint_const_lr_1e-3-19-8760.pt
48 |
--------------------------------------------------------------------------------
/action_recognition/graph/ntu_rgb_d.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.extend(['../'])
4 | from graph import tools
5 |
6 | num_node = 25
7 | self_link = [(i, i) for i in range(num_node)]
8 | inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), (7, 6),
9 | (8, 7), (9, 21), (10, 9), (11, 10), (12, 11), (13, 1),
10 | (14, 13), (15, 14), (16, 15), (17, 1), (18, 17), (19, 18),
11 | (20, 19), (22, 23), (23, 8), (24, 25), (25, 12)]
12 | inward = [(i - 1, j - 1) for (i, j) in inward_ori_index]
13 | outward = [(j, i) for (i, j) in inward]
14 | neighbor = inward + outward
15 |
16 |
17 | class Graph:
18 | def __init__(self, labeling_mode='spatial'):
19 | self.A = self.get_adjacency_matrix(labeling_mode)
20 | self.num_node = num_node
21 | self.self_link = self_link
22 | self.inward = inward
23 | self.outward = outward
24 | self.neighbor = neighbor
25 |
26 | def get_adjacency_matrix(self, labeling_mode=None):
27 | if labeling_mode is None:
28 | return self.A
29 | if labeling_mode == 'spatial':
30 | A = tools.get_spatial_graph(num_node, self_link, inward, outward)
31 | else:
32 | raise ValueError()
33 | return A
34 |
35 |
36 | if __name__ == '__main__':
37 | import matplotlib.pyplot as plt
38 | import os
39 |
40 | # os.environ['DISPLAY'] = 'localhost:11.0'
41 | A = Graph('spatial').get_adjacency_matrix()
42 | for i in A:
43 | plt.imshow(i, cmap='gray')
44 | plt.show()
45 | print(A)
46 |
--------------------------------------------------------------------------------
/action_recognition/graph/kinetics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 |
4 | sys.path.extend(['../'])
5 | from graph import tools
6 | import networkx as nx
7 |
8 | # Joint index:
9 | # {0, "Nose"}
10 | # {1, "Neck"},
11 | # {2, "RShoulder"},
12 | # {3, "RElbow"},
13 | # {4, "RWrist"},
14 | # {5, "LShoulder"},
15 | # {6, "LElbow"},
16 | # {7, "LWrist"},
17 | # {8, "RHip"},
18 | # {9, "RKnee"},
19 | # {10, "RAnkle"},
20 | # {11, "LHip"},
21 | # {12, "LKnee"},
22 | # {13, "LAnkle"},
23 | # {14, "REye"},
24 | # {15, "LEye"},
25 | # {16, "REar"},
26 | # {17, "LEar"},
27 |
28 | # Edge format: (origin, neighbor)
29 | num_node = 18
30 | self_link = [(i, i) for i in range(num_node)]
31 | inward = [(4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 11), (10, 9), (9, 8),
32 | (11, 5), (8, 2), (5, 1), (2, 1), (0, 1), (15, 0), (14, 0), (17, 15),
33 | (16, 14)]
34 | outward = [(j, i) for (i, j) in inward]
35 | neighbor = inward + outward
36 |
37 |
38 | class Graph:
39 | def __init__(self, labeling_mode='spatial'):
40 | self.A = self.get_adjacency_matrix(labeling_mode)
41 | self.num_node = num_node
42 | self.self_link = self_link
43 | self.inward = inward
44 | self.outward = outward
45 | self.neighbor = neighbor
46 |
47 | def get_adjacency_matrix(self, labeling_mode=None):
48 | if labeling_mode is None:
49 | return self.A
50 | if labeling_mode == 'spatial':
51 | A = tools.get_spatial_graph(num_node, self_link, inward, outward)
52 | else:
53 | raise ValueError()
54 | return A
55 |
56 |
57 | if __name__ == '__main__':
58 | A = Graph('spatial').get_adjacency_matrix()
59 | print('')
60 |
--------------------------------------------------------------------------------
/action_recognition/data_gen/rotation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import math
3 |
4 |
5 | def rotation_matrix(axis, theta):
6 | """
7 | Return the rotation matrix associated with counterclockwise rotation about
8 | the given axis by theta radians.
9 | """
10 | if np.abs(axis).sum() < 1e-6 or np.abs(theta) < 1e-6:
11 | return np.eye(3)
12 | axis = np.asarray(axis)
13 | axis = axis / math.sqrt(np.dot(axis, axis))
14 | a = math.cos(theta / 2.0)
15 | b, c, d = -axis * math.sin(theta / 2.0)
16 | aa, bb, cc, dd = a * a, b * b, c * c, d * d
17 | bc, ad, ac, ab, bd, cd = b * c, a * d, a * c, a * b, b * d, c * d
18 | return np.array([[aa + bb - cc - dd, 2 * (bc + ad), 2 * (bd - ac)],
19 | [2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)],
20 | [2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc]])
21 |
22 |
23 | def unit_vector(vector):
24 | """ Returns the unit vector of the vector. """
25 | return vector / np.linalg.norm(vector)
26 |
27 |
28 | def angle_between(v1, v2):
29 | """ Returns the angle in radians between vectors 'v1' and 'v2'::
30 |
31 | >>> angle_between((1, 0, 0), (0, 1, 0))
32 | 1.5707963267948966
33 | >>> angle_between((1, 0, 0), (1, 0, 0))
34 | 0.0
35 | >>> angle_between((1, 0, 0), (-1, 0, 0))
36 | 3.141592653589793
37 | """
38 | if np.abs(v1).sum() < 1e-6 or np.abs(v2).sum() < 1e-6:
39 | return 0
40 | v1_u = unit_vector(v1)
41 | v2_u = unit_vector(v2)
42 | return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0))
43 |
44 |
45 | def x_rotation(vector, theta):
46 | """Rotates 3-D vector around x-axis"""
47 | R = np.array([[1, 0, 0], [0, np.cos(theta), -np.sin(theta)], [0, np.sin(theta), np.cos(theta)]])
48 | return np.dot(R, vector)
49 |
50 |
51 | def y_rotation(vector, theta):
52 | """Rotates 3-D vector around y-axis"""
53 | R = np.array([[np.cos(theta), 0, np.sin(theta)], [0, 1, 0], [-np.sin(theta), 0, np.cos(theta)]])
54 | return np.dot(R, vector)
55 |
56 |
57 | def z_rotation(vector, theta):
58 | """Rotates 3-D vector around z-axis"""
59 | R = np.array([[np.cos(theta), -np.sin(theta), 0], [np.sin(theta), np.cos(theta), 0], [0, 0, 1]])
60 | return np.dot(R, vector)
61 |
--------------------------------------------------------------------------------
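These helpers are combined in `data_gen/preprocess.py` to rotate skeletons into a canonical frame. A minimal example of that pattern, aligning an arbitrary bone vector with the z-axis:

```python
import numpy as np
from data_gen.rotation import rotation_matrix, angle_between  # run from action_recognition/

bone = np.array([0.3, 0.4, 1.2])            # e.g., the hip -> spine bone
axis = np.cross(bone, [0, 0, 1])            # axis perpendicular to the bone and z
angle = angle_between(bone, [0, 0, 1])      # angle between them, in radians
R = rotation_matrix(axis, angle)

aligned = R @ bone
print(np.round(aligned, 6))                 # x and y components are ~0 after alignment
```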
/action_recognition/data/action_label_2_idx.json:
--------------------------------------------------------------------------------
1 | {
2 | "walk": 0,
3 | "stand": 1,
4 | "hand movements": 2,
5 | "turn": 3,
6 | "interact with/use object": 4,
7 | "arm movements": 5,
8 | "t pose": 6,
9 | "step": 7,
10 | "backwards movement": 8,
11 | "raising body part": 9,
12 | "look": 10,
13 | "touch object": 11,
14 | "leg movements": 12,
15 | "forward movement": 13,
16 | "circular movement": 14,
17 | "stretch": 15,
18 | "jump": 16,
19 | "touching body part": 17,
20 | "sit": 18,
21 | "place something": 19,
22 | "take/pick something up": 20,
23 | "run": 21,
24 | "bend": 22,
25 | "throw": 23,
26 | "foot movements": 24,
27 | "a pose": 25,
28 | "stand up": 26,
29 | "lowering body part": 27,
30 | "sideways movement": 28,
31 | "move up/down incline": 29,
32 | "action with ball": 30,
33 | "kick": 31,
34 | "gesture": 32,
35 | "head movements": 33,
36 | "jog": 34,
37 | "grasp object": 35,
38 | "waist movements": 36,
39 | "lift something": 37,
40 | "knee movement": 38,
41 | "wave": 39,
42 | "move something": 40,
43 | "swing body part": 41,
44 | "catch": 42,
45 | "dance": 43,
46 | "lean": 44,
47 | "greet": 45,
48 | "poses": 46,
49 | "touching face": 47,
50 | "sports move": 48,
51 | "exercise/training": 49,
52 | "clean something": 50,
53 | "punch": 51,
54 | "squat": 52,
55 | "scratch": 53,
56 | "hop": 54,
57 | "play sport": 55,
58 | "stumble": 56,
59 | "crossing limbs": 57,
60 | "perform": 58,
61 | "martial art": 59,
62 | "balance": 60,
63 | "kneel": 61,
64 | "shake": 62,
65 | "grab body part": 63,
66 | "clap": 64,
67 | "crouch": 65,
68 | "spin": 66,
69 | "upper body movements": 67,
70 | "knock": 68,
71 | "adjust": 69,
72 | "crawl": 70,
73 | "twist": 71,
74 | "move back to original position": 72,
75 | "bow": 73,
76 | "hit": 74,
77 | "touch ground": 75,
78 | "shoulder movements": 76,
79 | "telephone call": 77,
80 | "grab person": 78,
81 | "play instrument": 79,
82 | "tap": 80,
83 | "spread": 81,
84 | "skip": 82,
85 | "rolling movement": 83,
86 | "jump rope": 84,
87 | "play catch": 85,
88 | "drink": 86,
89 | "evade": 87,
90 | "support": 88,
91 | "point": 89,
92 | "side to side movement": 90,
93 | "stop": 91,
94 | "protect": 92,
95 | "wrist movements": 93,
96 | "stances": 94,
97 | "wait": 95,
98 | "shuffle": 96,
99 | "lunge": 97,
100 | "communicate (vocalise)": 98,
101 | "jumping jacks": 99,
102 | "rub": 100,
103 | "dribble": 101,
104 | "swim": 102,
105 | "sneak": 103,
106 | "to lower a body part": 104,
107 | "misc. abstract action": 105,
108 | "mix": 106,
109 | "limp": 107,
110 | "sway": 108,
111 | "slide": 109,
112 | "cartwheel": 110,
113 | "press something": 111,
114 | "shrug": 112,
115 | "open something": 113,
116 | "leap": 114,
117 | "trip": 115,
118 | "golf": 116,
119 | "move misc. body part": 117,
120 | "get injured": 118,
121 | "sudden movement": 119,
122 | "duck": 120,
123 | "flap": 121,
124 | "salute": 122,
125 | "stagger": 123,
126 | "draw": 124,
127 | "tie": 125,
128 | "eat": 126,
129 | "style hair": 127,
130 | "relax": 128,
131 | "pray": 129,
132 | "flip": 130,
133 | "shivering": 131,
134 | "interact with rope": 132,
135 | "march": 133,
136 | "zombie": 134,
137 | "check": 135,
138 | "wiggle": 136,
139 | "bump": 137,
140 | "give something": 138,
141 | "yoga": 139,
142 | "mime": 140,
143 | "wobble": 141,
144 | "release": 142,
145 | "wash": 143,
146 | "stroke": 144,
147 | "rocking movement": 145,
148 | "swipe": 146,
149 | "strafe": 147,
150 | "hang": 148,
151 | "flail arms": 149
152 | }
--------------------------------------------------------------------------------
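This file maps each of the 150 processed action categories to an integer index. A small sketch of how it can be used to decode model scores back into action names; which subset of indices the 60- and 120-class splits use is not spelled out here, so the example simply inverts the full map:

```python
import json
import numpy as np

with open('action_recognition/data/action_label_2_idx.json') as f:
    label2idx = json.load(f)
idx2label = {v: k for k, v in label2idx.items()}

scores = np.random.rand(len(label2idx))   # stand-in for one row of predicted scores
print(idx2label[int(scores.argmax())])    # predicted action name
```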
/action_recognition/data_gen/preprocess.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.extend(['../'])
4 | from data_gen.rotation import *
5 | #from rotation import *
6 | from tqdm import tqdm
7 |
8 |
9 | def pre_normalization(data, zaxis=[0, 1], xaxis=[8, 4]):
10 | N, C, T, V, M = data.shape
11 | s = np.transpose(data, [0, 4, 2, 3, 1]) # N, C, T, V, M to N, M, T, V, C
12 | l_m_sk = [] # List idxs of missing skeletons
13 |
14 | print('pad the null frames with the previous frames')
15 | for i_s, skeleton in enumerate(tqdm(s)): # pad
16 | if skeleton.sum() == 0:
17 | print(i_s, ' has no skeleton')
18 | l_m_sk.append(i_s)
19 | for i_p, person in enumerate(skeleton):
20 | if person.sum() == 0:
21 | continue
22 | if person[0].sum() == 0:
23 | index = (person.sum(-1).sum(-1) != 0)
24 | tmp = person[index].copy()
25 | person *= 0
26 | person[:len(tmp)] = tmp
27 | for i_f, frame in enumerate(person):
28 | if frame.sum() == 0:
29 | if person[i_f:].sum() == 0:
30 | rest = len(person) - i_f
31 | num = int(np.ceil(rest / i_f))
32 | pad = np.concatenate([person[0:i_f] for _ in range(num)], 0)[:rest]
33 | s[i_s, i_p, i_f:] = pad
34 | break
35 |
36 | print('sub the center joint #1 (spine joint in ntu and neck joint in kinetics)')
37 | for i_s, skeleton in enumerate(tqdm(s)):
38 | if skeleton.sum() == 0:
39 | continue
40 | main_body_center = skeleton[0][:, 1:2, :].copy()
41 | for i_p, person in enumerate(skeleton):
42 | if person.sum() == 0:
43 | continue
44 | mask = (person.sum(-1) != 0).reshape(T, V, 1)
45 | s[i_s, i_p] = (s[i_s, i_p] - main_body_center) * mask
46 |
47 | print('parallel the bone between hip(jpt 0) and spine(jpt 1) of the first person to the z axis')
48 | for i_s, skeleton in enumerate(tqdm(s)):
49 | if skeleton.sum() == 0:
50 | continue
51 | joint_bottom = skeleton[0, 0, zaxis[0]]
52 | joint_top = skeleton[0, 0, zaxis[1]]
53 | axis = np.cross(joint_top - joint_bottom, [0, 0, 1])
54 | angle = angle_between(joint_top - joint_bottom, [0, 0, 1])
55 | matrix_z = rotation_matrix(axis, angle)
56 | for i_p, person in enumerate(skeleton):
57 | if person.sum() == 0:
58 | continue
59 | for i_f, frame in enumerate(person):
60 | if frame.sum() == 0:
61 | continue
62 | for i_j, joint in enumerate(frame):
63 | s[i_s, i_p, i_f, i_j] = np.dot(matrix_z, joint)
64 |
65 | print(
66 | 'parallel the bone between right shoulder(jpt 8) and left shoulder(jpt 4) of the first person to the x axis')
67 | for i_s, skeleton in enumerate(tqdm(s)):
68 | if skeleton.sum() == 0:
69 | continue
70 | joint_rshoulder = skeleton[0, 0, xaxis[0]]
71 | joint_lshoulder = skeleton[0, 0, xaxis[1]]
72 | axis = np.cross(joint_rshoulder - joint_lshoulder, [1, 0, 0])
73 | angle = angle_between(joint_rshoulder - joint_lshoulder, [1, 0, 0])
74 | matrix_x = rotation_matrix(axis, angle)
75 | for i_p, person in enumerate(skeleton):
76 | if person.sum() == 0:
77 | continue
78 | for i_f, frame in enumerate(person):
79 | if frame.sum() == 0:
80 | continue
81 | for i_j, joint in enumerate(frame):
82 | s[i_s, i_p, i_f, i_j] = np.dot(matrix_x, joint)
83 |
84 | data = np.transpose(s, [0, 4, 2, 3, 1])
85 | return data, l_m_sk
86 |
87 |
88 | if __name__ == '__main__':
89 | data = np.load('../data/ntu/xview/val_data.npy')
90 | pre_normalization(data)
91 | np.save('../data/ntu/xview/data_val_pre.npy', data)
92 |
--------------------------------------------------------------------------------
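A shape-level sanity check for `pre_normalization`: it expects `(N, C, T, V, M)` arrays and returns the padded, centered, and rotated data plus the indices of sequences that contain no skeleton at all. Random data stands in for real chunks here.

```python
import numpy as np
from data_gen.preprocess import pre_normalization  # run from action_recognition/

# 2 sequences, 3D joints, 150 frames, 25 NTU joints, 1 person.
data = np.random.randn(2, 3, 150, 25, 1).astype(np.float32)
data, missing = pre_normalization(data)
print(data.shape, missing)  # (2, 3, 150, 25, 1), [] when every sequence has a skeleton
```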
/Readme.md:
--------------------------------------------------------------------------------
1 |
2 | # BABEL: Bodies, Action and Behavior with English Labels [[CVPR 2021](http://cvpr2021.thecvf.com/)]
3 |
4 |
5 |
6 |
7 |
8 | > [Abhinanda R. Punnakkal\*](https://ps.is.tuebingen.mpg.de/person/apunnakkal), [Arjun Chandrasekaran\*](https://ps.is.tuebingen.mpg.de/person/achandrasekaran), [Nikos Athanasiou](https://ps.is.tuebingen.mpg.de/person/nathanasiou), [Alejandra Quiros-Ramirez](https://ps.is.tuebingen.mpg.de/person/aquiros), [Michael J. Black](https://ps.is.tuebingen.mpg.de/person/black).
9 | > \* denotes equal contribution
10 |
11 | [Project Website](https://babel.is.tue.mpg.de) | [Paper](https://arxiv.org/pdf/2106.09696.pdf) | [Video](https://www.youtube.com/watch?v=BYWxvjKpCqA) | [Poster](https://babel.is.tue.mpg.de/media/upload/CVPR_2021_BABEL_poster.pdf)
12 |
13 | ---
14 |
15 | BABEL is a large dataset of language labels describing the actions performed in mocap sequences. It annotates about 43 hours of mocap sequences from [AMASS](https://amass.is.tue.mpg.de/) [1] with action labels.
16 | Sequences have action labels at two possible levels of abstraction:
17 | - **Sequence labels** which describe the overall action in the sequence
18 | - **Frame labels** which describe all actions in every frame of the sequence. Each frame label is precisely aligned with the duration of the corresponding action in the mocap sequence, and multiple actions can overlap.
19 |
20 | To download the BABEL action labels, visit our ['Data' page](https://babel.is.tue.mpg.de/data.html). You can download the mocap sequences from [AMASS](https://amass.is.tue.mpg.de/).
21 |
22 |
23 | ### Tutorials
24 |
25 | We release some helper code in Jupyter notebooks to load the BABEL dataset, visualize mocap sequences and their action labels, search BABEL for sequences containing specific actions, etc.
26 |
27 | See [`notebooks/`](notebooks/) for more details.
28 |
29 |
30 | ### Action Recognition
31 |
32 | We provide features, training and inference code, and pre-trained checkpoints for 3D skeleton-based action recognition.
33 |
34 | Please see [`action_recognition/`](action_recognition/) for more details.
35 |
36 |
37 | ### Acknowledgements
38 |
39 | We thank the [Software Workshop](https://is.mpg.de/en/software-workshop) at MPI for building the action recognition test set evaluation web server.
40 | The notebooks in this repo are inspired by those provided by [AMASS](https://github.com/nghorbani/amass).
41 | The Action Recognition code is based on the [2s-AGCN](https://github.com/lshiwjx/2s-AGCN) [2] implementation.
42 |
43 |
44 | ### References
45 |
46 | [1] Mahmood, Naureen, et al. "AMASS: Archive of motion capture as surface shapes." Proceedings of the IEEE/CVF International Conference on Computer Vision. 2019.
47 | [2] Shi, Lei, et al. "Two-stream adaptive graph convolutional networks for skeleton-based action recognition." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.
48 |
49 | ### License
50 |
51 | Software Copyright License for non-commercial scientific research purposes. Please read carefully the terms and conditions and any accompanying documentation before you download and/or use the BABEL dataset, and software, (the "Model & Software"). By downloading and/or using the Model & Software (including downloading, cloning, installing, and any other use of this GitHub repository), you acknowledge that you have read these terms and conditions, understand them, and agree to be bound by them. If you do not agree with these terms and conditions, you must not download and/or use the Model & Software. Any infringement of the terms of this agreement will automatically terminate your rights under this License.
52 |
53 | ### Contact
54 |
55 | The code in this repository is developed by [Abhinanda Punnakkal](https://www.is.mpg.de/person/apunnakkal) and [Arjun Chandrasekaran](https://www.is.mpg.de/person/achandrasekaran), and tested by [Nikos Athanasiou](https://www.is.mpg.de/person/nathanasiou).
56 |
57 | If you have any questions you can contact us at babel@tue.mpg.de.
58 |
59 |
--------------------------------------------------------------------------------
/action_recognition/class_balanced_loss.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:fenc=utf-8
4 | #
5 |
6 | """
7 | Code from:
8 | https://raw.githubusercontent.com/vandit15/Class-balanced-loss-pytorch/master/class_balanced_loss.py
9 |
10 | Pytorch implementation of Class-Balanced-Loss
11 | Reference: "Class-Balanced Loss Based on Effective Number of Samples"
12 | Authors: Yin Cui and
13 | Menglin Jia and
14 | Tsung Yi Lin and
15 | Yang Song and
16 | Serge J. Belongie
17 | https://arxiv.org/abs/1901.05555, CVPR'19.
18 | """
19 |
20 |
21 | import numpy as np
22 | import torch
23 | import torch.nn.functional as F
24 |
25 |
26 |
27 | def focal_loss(labels, logits, alpha, gamma):
28 | """Compute the focal loss between `logits` and the ground truth `labels`.
29 |
30 | Focal loss = -alpha_t * (1-pt)^gamma * log(pt)
31 | where pt is the probability of being classified to the true class.
32 | pt = p (if true class), otherwise pt = 1 - p. p = sigmoid(logit).
33 |
34 | Args:
35 | labels: A float tensor of size [batch, num_classes].
36 | logits: A float tensor of size [batch, num_classes].
37 | alpha: A float tensor of size [batch_size]
38 | specifying per-example weight for balanced cross entropy.
39 | gamma: A float scalar modulating loss from hard and easy examples.
40 |
41 | Returns:
42 | focal_loss: A float32 scalar representing normalized total loss.
43 | """
44 | BCLoss = F.binary_cross_entropy_with_logits(input = logits, target = labels,reduction = "none")
45 |
46 | if gamma == 0.0:
47 | modulator = 1.0
48 | else:
49 | modulator = torch.exp(-gamma * labels * logits - gamma * torch.log(1 +
50 | torch.exp(-1.0 * logits)))
51 |
52 | loss = modulator * BCLoss
53 |
54 | weighted_loss = alpha * loss
55 | focal_loss = torch.sum(weighted_loss)
56 |
57 | focal_loss /= torch.sum(labels)
58 | return focal_loss
59 |
60 |
61 | def CB_loss(labels, logits, samples_per_cls, no_of_classes, loss_type, beta, gamma, device):
62 | """Compute the Class Balanced Loss between `logits` and the ground truth `labels`.
63 |
64 | Class Balanced Loss: ((1-beta)/(1-beta^n))*Loss(labels, logits)
65 | where Loss is one of the standard losses used for Neural Networks.
66 |
67 | Args:
68 | labels: A int tensor of size [batch].
69 | logits: A float tensor of size [batch, no_of_classes].
70 | samples_per_cls: A python list of size [no_of_classes].
71 | no_of_classes: total number of classes. int
72 | loss_type: string. One of "sigmoid", "focal", "softmax".
73 | beta: float. Hyperparameter for Class balanced loss.
74 | gamma: float. Hyperparameter for Focal loss.
75 |
76 | Returns:
77 | cb_loss: A float tensor representing class balanced loss
78 | """
79 | effective_num = 1.0 - np.power(beta, samples_per_cls)
80 | weights = (1.0 - beta) / np.array(effective_num)
81 | weights = weights / np.sum(weights) * no_of_classes
82 |
83 | labels_one_hot = F.one_hot(labels, no_of_classes).float().cuda(device)
84 |
85 | weights = torch.tensor(weights).float().cuda(device)
86 | weights = weights.unsqueeze(0)
87 | weights = weights.repeat(labels_one_hot.shape[0],1) * labels_one_hot
88 | weights = weights.sum(1)
89 | weights = weights.unsqueeze(1)
90 | weights = weights.repeat(1,no_of_classes)
91 |
92 | if loss_type == "focal":
93 | cb_loss = focal_loss(labels_one_hot, logits, weights, gamma)
94 | elif loss_type == "sigmoid":
95 | cb_loss = F.binary_cross_entropy_with_logits(input = logits,target = labels_one_hot, weight = weights)
96 | elif loss_type == "softmax":
97 | pred = logits.softmax(dim = 1)
98 | cb_loss = F.binary_cross_entropy(input = pred, target = labels_one_hot, weight = weights)
99 | return cb_loss
100 |
101 |
102 | def test():
103 | no_of_classes = 5
104 | logits = torch.rand(10,no_of_classes).float()
105 | labels = torch.randint(0,no_of_classes, size = (10,))
106 | beta = 0.9999
107 | gamma = 2.0
108 | samples_per_cls = [2,3,1,2,2]
109 | loss_type = "focal"
110 |     cb_loss = CB_loss(labels, logits, samples_per_cls, no_of_classes, loss_type, beta, gamma, device=0)  # the device arg was missing; assumes a CUDA device is available
111 | print(cb_loss)
112 |
--------------------------------------------------------------------------------
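The `train_*_wfl.yaml` configs point `label_count_path` at per-class training counts and set `beta`/`gamma`; those counts become `samples_per_cls` here. The class-balancing weights on their own (mirroring the weight computation at the top of `CB_loss`) look like this, shown on toy counts:

```python
import numpy as np

samples_per_cls = [500, 50, 5]   # toy per-class counts; in practice from train_label_*_count.pkl
beta = 0.9999                    # same beta as the *_wfl.yaml configs
no_of_classes = len(samples_per_cls)

# "Effective number of samples" weighting (Cui et al., CVPR'19).
effective_num = 1.0 - np.power(beta, samples_per_cls)
weights = (1.0 - beta) / effective_num
weights = weights / weights.sum() * no_of_classes
print(np.round(weights, 3))      # rare classes receive the largest weights
```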
/action_recognition/challenge/create_submission.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:fenc=utf-8
4 | #
5 | # Copyright © 2021 achandrasekaran
6 | #
7 | # Distributed under terms of the MIT license.
8 |
9 | import sys, os, pdb, glob
10 | import uuid
11 | from os.path import join as ospj
12 | from os.path import dirname as ospd
13 | import json, pickle
14 | import argparse
15 | from tqdm import tqdm
16 | from collections import *
17 |
18 | import numpy as np
19 | import pandas as pd
20 | from pandas.core.common import flatten
21 | from fnmatch import fnmatch
22 | import re
23 |
24 |
25 | def load_test_scores(test_scores_fp):
26 | '''
27 | Load the score prediction file, validate.
28 |
29 | Format of data structure stored in prediction file:
30 | score_dict = list(zip(
31 | self.data_loader[ln].dataset.label[1], # sid
32 | self.data_loader[ln].dataset.sample_name, # seg_id
33 | self.data_loader[ln].dataset.label[2], # chunk_id
34 | score))
35 | '''
36 | # load test set predictions from model
37 | test_scores = pickle.load(open(test_scores_fp, 'rb'))
38 |
39 | # GT labels (-1 for test set), seg_id, chunk_id, score
40 | _, seg_ids, chunk_ids, scores = zip(*test_scores)
41 |
42 | # Validate the shape of predictions
43 | scores = np.array(scores)
44 | n_samples, n_classes = scores.shape
45 | assert n_classes in (60, 120)
46 |
47 | return list(zip(seg_ids, chunk_ids, scores)), n_classes
48 |
49 |
50 | def load_test_samples(n_classes):
51 | '''Load the GT samples corresponding to the BABEL subset (# classes) used.
52 |
53 | GT labels data structure format:
54 | List of seg_id, (label, sid, chunk_n, anntr_id)
55 |
56 | Arguments:
57 |         n_classes: int. Number of classes (60 or 120); selects which GT label file to load.
58 | '''
59 | # load test set samples
60 | samples_filename = f'test_label_{n_classes}.pkl'
61 | test_samples = pickle.load(open(f'../data/release/{samples_filename}', 'rb'))
62 |
63 | # GT labels (-1 for test set), sid, chunk_id, anntr_id
64 | seg_ids, (_, _, chunk_ids, _) = test_samples
65 |
66 | return list(zip(seg_ids, chunk_ids))
67 |
68 |
69 | def create_submission(test_samples, test_pred_scores, n_classes):
70 | '''Create a submission with the same ordering of samples
71 | as provided in the `test_label_{60, 120}.pkl` file.
72 | '''
73 | submission = []
74 | perfect_map = True
75 |
76 | # Ideal scenario -- 1:1 map between samples in two files
77 | for i, ((seg_id, chunk_id), (pred_seg_id, pred_chunk_id, _)) in \
78 | enumerate(zip(test_samples, test_pred_scores)):
79 | if seg_id != pred_seg_id or chunk_id != pred_chunk_id:
80 | perfect_map = False
81 |
82 | if True == perfect_map:
83 | submission = np.array(list(zip(*test_pred_scores))[2])
84 | else:
85 | # For each sample, find its predicted score
86 | for i, (seg_id, chunk_id) in enumerate(test_samples):
87 | for pred_seg_id, pred_chunk_id, score in test_pred_scores:
88 | if pred_seg_id == seg_id and pred_chunk_id == chunk_id:
89 | submission.append(score)
90 | break
91 | submission = np.array(submission)
92 | if 60 == n_classes:
93 | assert 15647 == submission.shape[0]
94 | elif 120 == n_classes:
95 | assert 16839 == submission.shape[0]
96 |
97 | return submission
98 |
99 |
100 | def save_submission(submission, filepath):
101 | '''Save predicted scores for test samples in .npz format for
102 | submission to BABEL Action Recognition Challenge.
103 | '''
104 | np.savez(filepath, submission)
105 | print(f'Successfully saved submission in: {filepath}')
106 |
107 | return None
108 |
109 |
110 | if __name__ == '__main__':
111 | # Add args
112 | parser = argparse.ArgumentParser(
113 | description='Predicted test scores --> Submission to server')
114 | parser.add_argument(
115 | '--pred_path',
116 | default='./epoch1_test_score.pkl',
117 | help='Path to file containing model predictions (saved to disk by train_test.py.')
118 | parser.add_argument(
119 | '--sub_path',
120 | default='./test_sub.npz',
121 | help='Path to write submission file.')
122 |
123 | # Parse args
124 | args = parser.parse_args()
125 |
126 | # Process scores into submission file
127 | test_pred_scores, n_classes = load_test_scores(args.pred_path)
128 | test_samples = load_test_samples(n_classes)
129 | submission = create_submission(test_samples, test_pred_scores, n_classes)
130 | save_submission(submission, args.sub_path)
131 |
--------------------------------------------------------------------------------
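After running the script, the saved submission can be sanity-checked before uploading; `np.savez` with a positional argument stores the array under the key `arr_0`:

```python
import numpy as np

sub = np.load('test_sub.npz')   # path passed via --sub_path
scores = sub['arr_0']
print(scores.shape)             # (15647, 60) or (16839, 120) depending on the subset
```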
/action_recognition/feeders/tools.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 |
4 | def downsample(data_numpy, step, random_sample=True):
5 | # input: C,T,V,M
6 | begin = np.random.randint(step) if random_sample else 0
7 | return data_numpy[:, begin::step, :, :]
8 |
9 |
10 | def temporal_slice(data_numpy, step):
11 | # input: C,T,V,M
12 | C, T, V, M = data_numpy.shape
13 |     return data_numpy.reshape(C, T // step, step, V, M).transpose(
14 |         (0, 1, 3, 2, 4)).reshape(C, T // step, V, step * M)
15 |
16 |
17 | def mean_subtractor(data_numpy, mean):
18 | # input: C,T,V,M
19 | # naive version
20 | if mean == 0:
21 | return
22 | C, T, V, M = data_numpy.shape
23 | valid_frame = (data_numpy != 0).sum(axis=3).sum(axis=2).sum(axis=0) > 0
24 | begin = valid_frame.argmax()
25 | end = len(valid_frame) - valid_frame[::-1].argmax()
26 | data_numpy[:, :end, :, :] = data_numpy[:, :end, :, :] - mean
27 | return data_numpy
28 |
29 |
30 | def auto_pading(data_numpy, size, random_pad=False):
31 | C, T, V, M = data_numpy.shape
32 | if T < size:
33 | begin = random.randint(0, size - T) if random_pad else 0
34 | data_numpy_paded = np.zeros((C, size, V, M))
35 | data_numpy_paded[:, begin:begin + T, :, :] = data_numpy
36 | return data_numpy_paded
37 | else:
38 | return data_numpy
39 |
40 |
41 | def random_choose(data_numpy, size, auto_pad=True):
42 |     # input: C,T,V,M. Randomly crop a temporal segment; not ideal, since zero-padded frames can be picked.
43 | C, T, V, M = data_numpy.shape
44 | if T == size:
45 | return data_numpy
46 | elif T < size:
47 | if auto_pad:
48 | return auto_pading(data_numpy, size, random_pad=True)
49 | else:
50 | return data_numpy
51 | else:
52 | begin = random.randint(0, T - size)
53 | return data_numpy[:, begin:begin + size, :, :]
54 |
55 |
56 | def random_move(data_numpy,
57 | angle_candidate=[-10., -5., 0., 5., 10.],
58 | scale_candidate=[0.9, 1.0, 1.1],
59 | transform_candidate=[-0.2, -0.1, 0.0, 0.1, 0.2],
60 | move_time_candidate=[1]):
61 | # input: C,T,V,M
62 | C, T, V, M = data_numpy.shape
63 | move_time = random.choice(move_time_candidate)
64 | node = np.arange(0, T, T * 1.0 / move_time).round().astype(int)
65 | node = np.append(node, T)
66 | num_node = len(node)
67 |
68 | A = np.random.choice(angle_candidate, num_node)
69 | S = np.random.choice(scale_candidate, num_node)
70 | T_x = np.random.choice(transform_candidate, num_node)
71 | T_y = np.random.choice(transform_candidate, num_node)
72 |
73 | a = np.zeros(T)
74 | s = np.zeros(T)
75 | t_x = np.zeros(T)
76 | t_y = np.zeros(T)
77 |
78 | # linspace
79 | for i in range(num_node - 1):
80 | a[node[i]:node[i + 1]] = np.linspace(
81 | A[i], A[i + 1], node[i + 1] - node[i]) * np.pi / 180
82 | s[node[i]:node[i + 1]] = np.linspace(S[i], S[i + 1],
83 | node[i + 1] - node[i])
84 | t_x[node[i]:node[i + 1]] = np.linspace(T_x[i], T_x[i + 1],
85 | node[i + 1] - node[i])
86 | t_y[node[i]:node[i + 1]] = np.linspace(T_y[i], T_y[i + 1],
87 | node[i + 1] - node[i])
88 |
89 | theta = np.array([[np.cos(a) * s, -np.sin(a) * s],
90 |                       [np.sin(a) * s, np.cos(a) * s]])  # per-frame rotation (+ scale) matrices
91 |
92 | # perform transformation
93 | for i_frame in range(T):
94 | xy = data_numpy[0:2, i_frame, :, :]
95 | new_xy = np.dot(theta[:, :, i_frame], xy.reshape(2, -1))
96 | new_xy[0] += t_x[i_frame]
97 |         new_xy[1] += t_y[i_frame]  # translation
98 | data_numpy[0:2, i_frame, :, :] = new_xy.reshape(2, V, M)
99 |
100 | return data_numpy
101 |
102 |
103 | def random_shift(data_numpy):
104 |     # input: C,T,V,M. Shift the valid frames to a random temporal offset.
105 | C, T, V, M = data_numpy.shape
106 | data_shift = np.zeros(data_numpy.shape)
107 | valid_frame = (data_numpy != 0).sum(axis=3).sum(axis=2).sum(axis=0) > 0
108 | begin = valid_frame.argmax()
109 | end = len(valid_frame) - valid_frame[::-1].argmax()
110 |
111 | size = end - begin
112 | bias = random.randint(0, T - size)
113 | data_shift[:, bias:bias + size, :, :] = data_numpy[:, begin:end, :, :]
114 |
115 | return data_shift
116 |
117 |
118 | def openpose_match(data_numpy):
119 | C, T, V, M = data_numpy.shape
120 | assert (C == 3)
121 | score = data_numpy[2, :, :, :].sum(axis=1)
122 | # the rank of body confidence in each frame (shape: T-1, M)
123 | rank = (-score[0:T - 1]).argsort(axis=1).reshape(T - 1, M)
124 |
125 | # data of frame 1
126 | xy1 = data_numpy[0:2, 0:T - 1, :, :].reshape(2, T - 1, V, M, 1)
127 | # data of frame 2
128 | xy2 = data_numpy[0:2, 1:T, :, :].reshape(2, T - 1, V, 1, M)
129 | # square of distance between frame 1&2 (shape: T-1, M, M)
130 | distance = ((xy2 - xy1) ** 2).sum(axis=2).sum(axis=0)
131 |
132 | # match pose
133 | forward_map = np.zeros((T, M), dtype=int) - 1
134 | forward_map[0] = range(M)
135 | for m in range(M):
136 | choose = (rank == m)
137 | forward = distance[choose].argmin(axis=1)
138 | for t in range(T - 1):
139 | distance[t, :, forward[t]] = np.inf
140 | forward_map[1:][choose] = forward
141 | assert (np.all(forward_map >= 0))
142 |
143 | # string data
144 | for t in range(T - 1):
145 | forward_map[t + 1] = forward_map[t + 1][forward_map[t]]
146 |
147 | # generate data
148 | new_data_numpy = np.zeros(data_numpy.shape)
149 | for t in range(T):
150 | new_data_numpy[:, t, :, :] = data_numpy[:, t, :, forward_map[
151 | t]].transpose(1, 2, 0)
152 | data_numpy = new_data_numpy
153 |
154 | # score sort
155 | trace_score = data_numpy[2, :, :, :].sum(axis=1).sum(axis=0)
156 | rank = (-trace_score).argsort()
157 | data_numpy = data_numpy[:, :, :, rank]
158 |
159 | return data_numpy
160 |
--------------------------------------------------------------------------------
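Typical use of the augmentation helpers above on a dummy `(C, T, V, M)` clip; these are the transforms toggled by the `random_choose` / `random_move` flags in the training configs:

```python
import numpy as np
from feeders import tools  # run from within action_recognition/

clip = np.random.randn(3, 150, 25, 1)        # C=3, T=150 frames, V=25 joints, M=1 person

crop = tools.random_choose(clip, size=100)   # random 100-frame temporal crop (pads if too short)
moved = tools.random_move(crop.copy())       # random per-frame rotation, scale and translation
print(crop.shape, moved.shape)               # (3, 100, 25, 1) for both
```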
/action_recognition/model/agcn.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 | from torch.autograd import Variable
7 |
8 | def import_class(name):
9 | components = name.split('.')
10 | mod = __import__(components[0])
11 | for comp in components[1:]:
12 | mod = getattr(mod, comp)
13 | return mod
14 |
15 |
16 | def conv_branch_init(conv, branches):
17 | weight = conv.weight
18 | n = weight.size(0)
19 | k1 = weight.size(1)
20 | k2 = weight.size(2)
21 | nn.init.normal_(weight, 0, math.sqrt(2. / (n * k1 * k2 * branches)))
22 | nn.init.constant_(conv.bias, 0)
23 |
24 |
25 | def conv_init(conv):
26 | nn.init.kaiming_normal_(conv.weight, mode='fan_out')
27 | nn.init.constant_(conv.bias, 0)
28 |
29 |
30 | def bn_init(bn, scale):
31 | nn.init.constant_(bn.weight, scale)
32 | nn.init.constant_(bn.bias, 0)
33 |
34 |
35 | class unit_tcn(nn.Module):
36 | def __init__(self, in_channels, out_channels, kernel_size=9, stride=1):
37 | super(unit_tcn, self).__init__()
38 | pad = int((kernel_size - 1) / 2)
39 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel_size, 1), padding=(pad, 0),
40 | stride=(stride, 1))
41 |
42 | self.bn = nn.BatchNorm2d(out_channels)
43 | self.relu = nn.ReLU()
44 | conv_init(self.conv)
45 | bn_init(self.bn, 1)
46 |
47 | def forward(self, x):
48 | x = self.bn(self.conv(x))
49 | return x
50 |
51 |
52 | class unit_gcn(nn.Module):
53 | def __init__(self, in_channels, out_channels, A, coff_embedding=4, num_subset=3):
54 | super(unit_gcn, self).__init__()
55 | inter_channels = out_channels // coff_embedding
56 | self.inter_c = inter_channels
57 | self.PA = nn.Parameter(torch.from_numpy(A.astype(np.float32)))
58 | nn.init.constant_(self.PA, 1e-6)
59 | self.A = Variable(torch.from_numpy(A.astype(np.float32)), requires_grad=False)
60 | self.num_subset = num_subset
61 |
62 | self.conv_a = nn.ModuleList()
63 | self.conv_b = nn.ModuleList()
64 | self.conv_d = nn.ModuleList()
65 | for i in range(self.num_subset):
66 | self.conv_a.append(nn.Conv2d(in_channels, inter_channels, 1))
67 | self.conv_b.append(nn.Conv2d(in_channels, inter_channels, 1))
68 | self.conv_d.append(nn.Conv2d(in_channels, out_channels, 1))
69 |
70 | if in_channels != out_channels:
71 | self.down = nn.Sequential(
72 | nn.Conv2d(in_channels, out_channels, 1),
73 | nn.BatchNorm2d(out_channels)
74 | )
75 | else:
76 | self.down = lambda x: x
77 |
78 | self.bn = nn.BatchNorm2d(out_channels)
79 | self.soft = nn.Softmax(-2)
80 | self.relu = nn.ReLU()
81 |
82 | for m in self.modules():
83 | if isinstance(m, nn.Conv2d):
84 | conv_init(m)
85 | elif isinstance(m, nn.BatchNorm2d):
86 | bn_init(m, 1)
87 | bn_init(self.bn, 1e-6)
88 | for i in range(self.num_subset):
89 | conv_branch_init(self.conv_d[i], self.num_subset)
90 |
91 | def forward(self, x):
92 | N, C, T, V = x.size()
93 | A = self.A
94 | if -1 != x.get_device():
95 | A = A.cuda(x.get_device())
96 | A = A + self.PA
97 |
98 | y = None
99 | for i in range(self.num_subset):
100 | A1 = self.conv_a[i](x).permute(0, 3, 1, 2).contiguous().view(N, V, self.inter_c * T)
101 | A2 = self.conv_b[i](x).view(N, self.inter_c * T, V)
102 | A1 = self.soft(torch.matmul(A1, A2) / A1.size(-1)) # N V V
103 | A1 = A1 + A[i]
104 | A2 = x.view(N, C * T, V)
105 | z = self.conv_d[i](torch.matmul(A2, A1).view(N, C, T, V))
106 | y = z + y if y is not None else z
107 |
108 | y = self.bn(y)
109 | y += self.down(x)
110 | return self.relu(y)
111 |
112 |
113 | class TCN_GCN_unit(nn.Module):
114 | def __init__(self, in_channels, out_channels, A, stride=1, residual=True):
115 | super(TCN_GCN_unit, self).__init__()
116 | self.gcn1 = unit_gcn(in_channels, out_channels, A)
117 | self.tcn1 = unit_tcn(out_channels, out_channels, stride=stride)
118 | self.relu = nn.ReLU()
119 | if not residual:
120 | self.residual = lambda x: 0
121 |
122 | elif (in_channels == out_channels) and (stride == 1):
123 | self.residual = lambda x: x
124 |
125 | else:
126 | self.residual = unit_tcn(in_channels, out_channels, kernel_size=1, stride=stride)
127 |
128 | def forward(self, x):
129 | x = self.tcn1(self.gcn1(x)) + self.residual(x)
130 | return self.relu(x)
131 |
132 |
133 | class Model(nn.Module):
134 | def __init__(self, num_class=60, num_point=25, num_person=2, graph=None, graph_args=dict(), in_channels=3):
135 | super(Model, self).__init__()
136 |
137 | if graph is None:
138 | raise ValueError()
139 | else:
140 | Graph = import_class(graph)
141 | self.graph = Graph(**graph_args)
142 |
143 | A = self.graph.A
144 | self.data_bn = nn.BatchNorm1d(num_person * in_channels * num_point)
145 |
146 | self.l1 = TCN_GCN_unit(3, 64, A, residual=False)
147 | self.l2 = TCN_GCN_unit(64, 64, A)
148 | self.l3 = TCN_GCN_unit(64, 64, A)
149 | self.l4 = TCN_GCN_unit(64, 64, A)
150 | self.l5 = TCN_GCN_unit(64, 128, A, stride=2)
151 | self.l6 = TCN_GCN_unit(128, 128, A)
152 | self.l7 = TCN_GCN_unit(128, 128, A)
153 | self.l8 = TCN_GCN_unit(128, 256, A, stride=2)
154 | self.l9 = TCN_GCN_unit(256, 256, A)
155 | self.l10 = TCN_GCN_unit(256, 256, A)
156 |
157 | self.fc = nn.Linear(256, num_class)
158 | nn.init.normal_(self.fc.weight, 0, math.sqrt(2. / num_class))
159 | bn_init(self.data_bn, 1)
160 |
161 | def forward(self, x):
162 | N, C, T, V, M = x.size()
163 |
164 | x = x.permute(0, 4, 3, 1, 2).contiguous().view(N, M * V * C, T)
165 | x = self.data_bn(x)
166 | x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, 2).contiguous().view(N * M, C, T, V)
167 |
168 | x = self.l1(x)
169 | x = self.l2(x)
170 | x = self.l3(x)
171 | x = self.l4(x)
172 | x = self.l5(x)
173 | x = self.l6(x)
174 | x = self.l7(x)
175 | x = self.l8(x)
176 | x = self.l9(x)
177 | x = self.l10(x)
178 |
179 | # N*M,C,T,V
180 | c_new = x.size(1)
181 | x = x.view(N, M, c_new, -1)
182 | x = x.mean(3).mean(1)
183 |
184 | return self.fc(x)
185 |
--------------------------------------------------------------------------------
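A minimal smoke test for `Model`, assuming it is run from within `action_recognition/` (so `graph.ntu_rgb_d.Graph` resolves) and a PyTorch version where `Tensor.get_device()` returns `-1` for CPU tensors, as the check in `unit_gcn.forward` expects; otherwise move the model and input to GPU first. The input layout matches the BABEL feeder: `(N, C, T, V, M)` with 3 coordinates, 150 frames, 25 joints, and 1 person.

```python
import torch
from model.agcn import Model

net = Model(num_class=60, num_point=25, num_person=1,
            graph='graph.ntu_rgb_d.Graph',
            graph_args={'labeling_mode': 'spatial'})
net.eval()

x = torch.randn(2, 3, 150, 25, 1)  # N, C, T, V, M
with torch.no_grad():
    logits = net(x)
print(logits.shape)                # torch.Size([2, 60])
```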
/action_recognition/model/agcn_mult.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 | from torch.autograd import Variable
7 |
8 |
9 | def import_class(name):
10 | components = name.split('.')
11 | mod = __import__(components[0])
12 | for comp in components[1:]:
13 | mod = getattr(mod, comp)
14 | return mod
15 |
16 |
17 | def conv_branch_init(conv, branches):
18 | weight = conv.weight
19 | n = weight.size(0)
20 | k1 = weight.size(1)
21 | k2 = weight.size(2)
22 | nn.init.normal_(weight, 0, math.sqrt(2. / (n * k1 * k2 * branches)))
23 | nn.init.constant_(conv.bias, 0)
24 |
25 |
26 | def conv_init(conv):
27 | nn.init.kaiming_normal_(conv.weight, mode='fan_out')
28 | nn.init.constant_(conv.bias, 0)
29 |
30 |
31 | def bn_init(bn, scale):
32 | nn.init.constant_(bn.weight, scale)
33 | nn.init.constant_(bn.bias, 0)
34 |
35 |
36 | class unit_tcn(nn.Module):
37 | def __init__(self, in_channels, out_channels, kernel_size=9, stride=1):
38 | super(unit_tcn, self).__init__()
39 | pad = int((kernel_size - 1) / 2)
40 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel_size, 1), padding=(pad, 0),
41 | stride=(stride, 1))
42 |
43 | self.bn = nn.BatchNorm2d(out_channels)
44 | self.relu = nn.ReLU()
45 | conv_init(self.conv)
46 | bn_init(self.bn, 1)
47 |
48 | def forward(self, x):
49 | x = self.bn(self.conv(x))
50 | return x
51 |
52 |
53 | class unit_gcn(nn.Module):
54 | def __init__(self, in_channels, out_channels, A, coff_embedding=4, num_subset=3):
55 | super(unit_gcn, self).__init__()
56 | inter_channels = out_channels // coff_embedding
57 | self.inter_c = inter_channels
58 | self.PA = nn.Parameter(torch.from_numpy(A.astype(np.float32)))
59 | nn.init.constant_(self.PA, 1e-6)
60 | self.A = Variable(torch.from_numpy(A.astype(np.float32)), requires_grad=False)
61 | self.num_subset = num_subset
62 |
63 | self.conv_a = nn.ModuleList()
64 | self.conv_b = nn.ModuleList()
65 | self.conv_d = nn.ModuleList()
66 | for i in range(self.num_subset):
67 | self.conv_a.append(nn.Conv2d(in_channels, inter_channels, 1))
68 | self.conv_b.append(nn.Conv2d(in_channels, inter_channels, 1))
69 | self.conv_d.append(nn.Conv2d(in_channels, out_channels, 1))
70 |
71 | if in_channels != out_channels:
72 | self.down = nn.Sequential(
73 | nn.Conv2d(in_channels, out_channels, 1),
74 | nn.BatchNorm2d(out_channels)
75 | )
76 | else:
77 | self.down = lambda x: x
78 |
79 | self.bn = nn.BatchNorm2d(out_channels)
80 | self.soft = nn.Softmax(-2)
81 | self.relu = nn.ReLU()
82 |
83 | for m in self.modules():
84 | if isinstance(m, nn.Conv2d):
85 | conv_init(m)
86 | elif isinstance(m, nn.BatchNorm2d):
87 | bn_init(m, 1)
88 | bn_init(self.bn, 1e-6)
89 | for i in range(self.num_subset):
90 | conv_branch_init(self.conv_d[i], self.num_subset)
91 |
92 | def forward(self, x):
93 | N, C, T, V = x.size()
94 | A = self.A.cuda(x.get_device())
95 | A = A + self.PA
96 |
97 | y = None
98 | for i in range(self.num_subset):
99 | A1 = self.conv_a[i](x).permute(0, 3, 1, 2).contiguous().view(N, V, self.inter_c * T)
100 | A2 = self.conv_b[i](x).view(N, self.inter_c * T, V)
101 | A1 = self.soft(torch.matmul(A1, A2) / A1.size(-1)) # N V V
102 | A1 = A1 + A[i]
103 | A2 = x.view(N, C * T, V)
104 | z = self.conv_d[i](torch.matmul(A2, A1).view(N, C, T, V))
105 | y = z + y if y is not None else z
106 |
107 | y = self.bn(y)
108 | y += self.down(x)
109 | return self.relu(y)
110 |
111 |
112 | class TCN_GCN_unit(nn.Module):
113 | def __init__(self, in_channels, out_channels, A, stride=1, residual=True):
114 | super(TCN_GCN_unit, self).__init__()
115 | self.gcn1 = unit_gcn(in_channels, out_channels, A)
116 | self.tcn1 = unit_tcn(out_channels, out_channels, stride=stride)
117 | self.relu = nn.ReLU()
118 | if not residual:
119 | self.residual = lambda x: 0
120 |
121 | elif (in_channels == out_channels) and (stride == 1):
122 | self.residual = lambda x: x
123 |
124 | else:
125 | self.residual = unit_tcn(in_channels, out_channels, kernel_size=1, stride=stride)
126 |
127 | def forward(self, x):
128 | x = self.tcn1(self.gcn1(x)) + self.residual(x)
129 | return self.relu(x)
130 |
131 |
132 | class Model(nn.Module):
133 | def __init__(self, num_class=60, num_point=25, loss_type='softmax', num_person=2, graph=None, graph_args=dict(), in_channels=3):
134 | super(Model, self).__init__()
135 |
136 | if graph is None:
137 | raise ValueError()
138 | else:
139 | Graph = import_class(graph)
140 | self.graph = Graph(**graph_args)
141 |
142 | A = self.graph.A
143 | self.data_bn = nn.BatchNorm1d(num_person * in_channels * num_point)
144 |
145 | self.l1 = TCN_GCN_unit(3, 64, A, residual=False)
146 | self.l2 = TCN_GCN_unit(64, 64, A)
147 | self.l3 = TCN_GCN_unit(64, 64, A)
148 | self.l4 = TCN_GCN_unit(64, 64, A)
149 | self.l5 = TCN_GCN_unit(64, 128, A, stride=2)
150 | self.l6 = TCN_GCN_unit(128, 128, A)
151 | self.l7 = TCN_GCN_unit(128, 128, A)
152 | self.l8 = TCN_GCN_unit(128, 256, A, stride=2)
153 | self.l9 = TCN_GCN_unit(256, 256, A)
154 | self.l10 = TCN_GCN_unit(256, 256, A)
155 |
156 |
157 | self.fc = nn.Linear(256, num_class)
158 | self.sig = nn.Sigmoid()
159 | nn.init.normal_(self.fc.weight, 0, math.sqrt(2. / num_class))
160 | if loss_type == 'sigmoid' or loss_type == 'focal' or loss_type=='focal2':
161 |             nn.init.constant_(self.fc.bias, -np.log(num_class - 1))
162 | # self.sof = nn.Softmax(-1)
163 | bn_init(self.data_bn, 1)
164 |
165 | def forward(self, x):
166 | N, C, T, V, M = x.size()
167 |
168 | x = x.permute(0, 4, 3, 1, 2).contiguous().view(N, M * V * C, T)
169 | x = self.data_bn(x)
170 | x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, 2).contiguous().view(N * M, C, T, V)
171 |
172 | x = self.l1(x)
173 | x = self.l2(x)
174 | x = self.l3(x)
175 | x = self.l4(x)
176 | x = self.l5(x)
177 | x = self.l6(x)
178 | x = self.l7(x)
179 | x = self.l8(x)
180 | x = self.l9(x)
181 | x = self.l10(x)
182 |
183 | # N*M,C,T,V
184 | c_new = x.size(1)
185 | x = x.view(N, M, c_new, -1)
186 | x = x.mean(3).mean(1)
187 |
188 | x = self.fc(x)
189 | # import pdb
190 | # pdb.set_trace()
191 | x = self.sig(x)
192 | return x
193 |
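# ----------------------------------------------------------------------------
# Illustrative usage sketch (an editorial addition, not part of the upstream
# 2s-AGCN file). The graph class and `graph_args` below are assumptions; adjust
# them to your setup and run from `action_recognition/` so `graph` is importable.
# Note that `unit_gcn.forward` moves the adjacency to the input's GPU, so this
# example needs a CUDA device.
if __name__ == '__main__':
    model = Model(num_class=60, num_point=25, num_person=1,
                  graph='graph.ntu_rgb_d.Graph',
                  graph_args={'labeling_mode': 'spatial'}).cuda()
    x = torch.zeros(2, 3, 150, 25, 1).cuda()   # (N, C, T, V, M)
    print(model(x).shape)                      # torch.Size([2, 60]) sigmoid scores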
--------------------------------------------------------------------------------
/action_recognition/Readme.md:
--------------------------------------------------------------------------------
1 | ## Action Recognition
2 |
3 | We follow the 3D skeleton-based action recognition setup and [implementation](https://github.com/lshiwjx/2s-AGCN) of Shi et al. [2].
4 |
5 | ### Task
6 |
7 | **Sample** `(n_frames, feat_dim)`: Each action segment (start-end span) from BABEL is divided into contiguous 5-second chunks; each chunk is one sample (see the sketch below). See the [paper](https://arxiv.org/pdf/2106.09696.pdf) for more details.  
8 | **Label**: Index of the ground-truth action category of the segment to which the current chunk belongs.
9 |
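A minimal sketch of this chunking, assuming a 30 fps sequence and no padding of the final partial chunk (the released features below are already pre-processed, so this is only for illustration):

```
import numpy as np

fps, chunk_s = 30, 5
seg = np.zeros((400, 75))                 # hypothetical segment: 400 frames x feat_dim
chunk_size = fps * chunk_s                # 150 frames per chunk
chunks = [seg[f:f + chunk_size] for f in range(0, len(seg), chunk_size)]
print([c.shape for c in chunks])          # [(150, 75), (150, 75), (100, 75)]
```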
10 |
11 | ### Features
12 |
13 | We extract the joint positions (in `x, y, z` co-ordinates) from the AMASS mocap sequences in NTU RGB+D [1] skeleton format. There are 25 joints, resulting in `feat_dim=25*3=75`.
14 |
15 | Each sample is a 5-second chunk @ 30fps, resulting in `n_frames=150`.
16 |
17 | Pre-processing of the skeleton joints follows Shi et al. [2]. Download the pre-processed sample features and corresponding labels:
18 |
19 | ```
20 | # BABEL Dense
21 | cd data/
22 | wget https://human-movement.is.tue.mpg.de/babel_feats_labels.tar.gz
23 | tar -xzvf babel_feats_labels.tar.gz -C ./
24 |
25 | # BABEL Dense+Extra
26 | wget https://human-movement.is.tue.mpg.de/babel_dense_and_extra_feats_labels.tar.gz
27 | tar -xzvf babel_dense_and_extra_feats_labels.tar.gz -C ./
28 | ```
29 |
30 | Note: We only train and test with Dense annotations. For details regarding Dense and Extra annotations, please see BABEL's [Data page](https://babel.is.tue.mpg.de/data.html).
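To sanity-check the downloaded features and labels, something like the following should work (a minimal sketch; the exact file names depend on the split and the number of classes, and the label-pickle layout is inferred from `feeders/feeder.py`):

```
import pickle
import numpy as np

X = np.load('data/release/val_ntu_sk_60.npy', mmap_mode='r')
with open('data/release/val_label_60.pkl', 'rb') as f:
    sample_names, labels = pickle.load(f, encoding='latin1')   # labels[0] holds the class indices

print(X.shape)                              # roughly (N, 3, 150, 25, 1)
print(len(sample_names), len(labels[0]))
```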
31 |
32 |
33 | ### Training and Inference
34 |
35 | Set up and activate a virtual environment:
36 |
37 | ```
38 | python3 -m venv babel-env
39 | source $PWD/babel-env/bin/activate
40 | $PWD/babel-env/bin/pip install --upgrade pip setuptools
41 | $PWD/babel-env/bin/pip install -r requirements.txt
42 | ```
43 |
44 | #### Model
45 |
46 | We use [this](https://github.com/lshiwjx/2s-AGCN) implementation of the 2S-AGCN [2] model for 3D skeleton-based action recognition. Note that we use only the Joint stream.
47 |
48 |
49 | #### Training
50 |
51 | To train a model with the Cross-Entropy (CE) loss:
52 |
53 | From the top directory `babel/`, enter:
54 |
55 | ```python action_recognition/train_test.py --config action_recognition/config/babel_v1.0/train_60.yaml```
56 |
57 | To train a model with Focal loss [3] with class-balancing [4]:
58 |
59 | ```python action_recognition/train_test.py --config action_recognition/config/babel_v1.0/train_60_wfl.yaml```
60 |
61 | Use the respective configuration files inside `config/babel_v1.0` to train models on `120` classes with either loss.
62 |
63 |
64 | #### Inference
65 |
66 | Provide the path to the trained model in the `weights` key in the respective config file.
67 |
68 | To perform inference, use the same command as for training, and pass the corresponding test config file as the argument. E.g.:
69 |
70 | ```python action_recognition/train_test.py --config action_recognition/config/babel_v1.0/test_60.yaml```
71 |
72 | or
73 |
74 | ```python action_recognition/train_test.py --config action_recognition/config/babel_v1.0/test_60_wfl.yaml```
75 |
76 | To save the predicted scores to disk, set `save_score: True` in the config file.
77 |
78 | ### Pre-trained models
79 |
80 | Download the checkpoints from the links below and place them in `action_recognition/ckpts/`.
81 |
82 | Performing inference on the validation set should result in the following performance.
83 |
84 | | Benchmark | Loss type | Ckpt | Top-5 | Top-1 | Top-1-norm |
85 | |---|---|---|---|---|---|
86 | | BABEL-60 | CE | [ntu_sk_60_agcn_joint_const_lr_1e-3-17-6390.pt](https://human-movement.is.tue.mpg.de/release/ckpts/ntu_sk_60_agcn_joint_const_lr_1e-3-17-6390.pt) | 0.74 | 0.42 | 0.24 |
87 | | BABEL-60 | Focal | [wfl_ntu_sk_60_agcn_joint_const_lr_1e-3-93-33370.pt](https://human-movement.is.tue.mpg.de/release/ckpts/wfl_ntu_sk_60_agcn_joint_const_lr_1e-3-93-33370.pt) | 0.69 | 0.34 | 0.30 |
88 | | BABEL-120 | CE | [ntu_sk_120_agcn_joint_const_lr_1e-3-15-12240.pt](https://human-movement.is.tue.mpg.de/release/ckpts/ntu_sk_120_agcn_joint_const_lr_1e-3-15-12240.pt) | 0.72 | 0.4 | 0.16 |
89 | | BABEL-120 | Focal | [wfl_ntu_sk_120_agcn_joint_const_lr_1e-3-157-60356.pt](https://human-movement.is.tue.mpg.de/release/ckpts/wfl_ntu_sk_120_agcn_joint_const_lr_1e-3-157-60356.pt) | 0.59 | 0.29 | 0.23 |
90 |
91 | **Note:** The models are *only* trained with dense labels from `train.json` (See [project webpage](https://babel.is.tue.mpg.de/data.html) for more details about the data).
92 |
93 |
94 | ### Metrics
95 |
96 | **Description**
97 |
98 | 1. **Top-1** measures the accuracy of the highest-scoring prediction.
99 | 2. **Top-5** evaluates whether the ground-truth category is present among the top 5 highest-scoring predictions.
100 | 1. It accounts for labeling noise and inherent label ambiguity.
101 | 2. It also accounts for the possible association of multiple action categories with a single input movement sequence. For instance, a person `walking in a circle` is mapped to the two action categories `walk` and `circular movement`.
102 | Ideal models will predict high scores for all the categories relevant to the movement sample.
103 | 3. **Top-1-norm** is the mean `Top-1` computed per category. The magnitude of the gap between `Top-1` and `Top-1-norm` indicates the class-specific bias in model performance. In BABEL, it reflects the impact of class imbalance on learning. (A sketch for computing these metrics is shown below.)
104 |
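A minimal sketch (not the challenge server's exact implementation) of computing these metrics from a score matrix and ground-truth class indices:

```
import numpy as np

def compute_metrics(scores, y, k=5):
    '''scores: (N, C) prediction scores; y: (N,) ground-truth class indices.'''
    top1_pred = scores.argmax(axis=1)
    top1 = (top1_pred == y).mean()
    # Top-k: is the GT class among the k highest-scoring classes?
    topk = np.mean([yi in np.argsort(s)[-k:] for s, yi in zip(scores, y)])
    # Top-1-norm: mean of the per-class Top-1 accuracies
    top1_norm = np.mean([(top1_pred[y == c] == c).mean() for c in np.unique(y)])
    return top1, topk, top1_norm
```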
105 |
106 | ### Challenge
107 |
108 | To make a submission:
109 |
110 | 1. Store the predictions (variable `pred_scores` in [L591](https://github.com/abhinanda-punnakkal/BABEL/blob/6454163e196fc6400e1b8232dffb651341ed7c14/action_recognition/train_test.py#L591) of `train_test.py`) as a Python pickle (see the sanity-check sketch after this list).
111 | - `pred_scores` is a list of tuples, each containing the following 4 elements: (sequence ID, segment ID, chunk ID, score). Here, `score` is an `np.array` of size `(N, C)`, where `N` is the # samples in the test set and `C` is the # classes.
112 | - By default, `train_test.py` stores this pickle file as `/epoch1_test_score.pkl` (see [L604](https://github.com/abhinanda-punnakkal/BABEL/blob/6454163e196fc6400e1b8232dffb651341ed7c14/action_recognition/train_test.py#L606)).
113 | 2. In the command line, type the following commands:
114 | 1. `cd action_recognition/challenge/`
115 | 2. `python create_submission.py --pred_path /epoch1_test_score.pkl --sub_path `
116 | - Note: This code assumes that the GT test samples (`test_label_{60, 120}.pkl`) are present in the following path: `action_recognition/data/release/`
117 | 3. Submit the `.npz` submission file to the BABEL Action Recognition Challenge [evaluation server](https://babel-evaluation.is.tuebingen.mpg.de/).
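For example, a quick sanity check of the prediction pickle before creating a submission (the file name and shapes here are illustrative):

```
import pickle
import numpy as np

with open('epoch1_test_score.pkl', 'rb') as f:
    pred_scores = pickle.load(f)

seq_id, seg_id, chunk_id, score = pred_scores[0]     # each entry is a 4-tuple
print(len(pred_scores), np.asarray(score).shape)
```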
118 |
119 |
120 | ### References
121 |
122 | [1] Shahroudy, Amir, et al. "NTU RGB+D: A large scale dataset for 3d human activity analysis." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
123 | [2] Shi, Lei, et al. "Two-stream adaptive graph convolutional networks for skeleton-based action recognition." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.
124 | [3] Lin, Tsung-Yi, et al. "Focal loss for dense object detection." Proceedings of the IEEE international conference on computer vision. 2017.
125 | [4] Cui, Yin, et al. "Class-balanced loss based on effective number of samples." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.
126 |
--------------------------------------------------------------------------------
/action_recognition/data_gen/dutils.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:fenc=utf-8
4 | #
5 | # Copyright © 2021 achandrasekaran
6 | #
7 | # Distributed under terms of the MIT license.
8 |
9 | import sys, os, pdb
10 | import os.path as osp
11 | from os.path import join as ospj
12 | from os.path import basename as ospb
13 | from os.path import dirname as ospd
14 |
15 | import numpy as np
16 | import torch
17 |
18 | import json, pickle, csv
19 | from collections import Counter
20 | from tqdm import tqdm
21 |
22 | from smplx import SMPLH
23 |
24 | import viz
25 |
26 |
27 | def read_json(json_filename):
28 | '''Return contents of JSON file'''
29 | jc = None
30 | with open(json_filename) as infile:
31 | jc = json.load(infile)
32 | return jc
33 |
34 | def read_pkl(pkl_filename):
35 |     '''Return contents of pickle file'''
36 | pklc = None
37 | with open(pkl_filename, 'rb') as infile:
38 | pklc = pickle.load(infile)
39 | return pklc
40 |
41 | def write_json(contents, filename):
42 | with open(filename, 'w') as outfile:
43 | json.dump(contents, outfile, indent=2)
44 |
45 | def write_pkl(contents, filename):
46 | with open(filename, 'wb') as outfile:
47 | pickle.dump(contents, outfile)
48 |
49 | def smpl_to_nturgbd(model_type='smplh', out_format='nturgbd'):
50 | ''' Borrowed from https://gitlab.tuebingen.mpg.de/apunnakkal/2s_agcn/-/blob/master/data_gen/smpl_data_utils.py
51 | NTU mapping
52 | -----------
53 | 0 --> ?
54 | 1-base of the spine
55 | 2-middle of the spine
56 | 3-neck
57 | 4-head
58 | 5-left shoulder
59 | 6-left elbow
60 | 7-left wrist
61 | 8-left hand
62 | 9-right shoulder
63 | 10-right elbow
64 | 11-right wrist
65 | 12-right hand
66 | 13-left hip
67 | 14-left knee
68 | 15-left ankle
69 | 16-left foot
70 | 17-right hip
71 | 18-right knee
72 | 19-right ankle
73 | 20-right foot
74 | 21-spine
75 | 22-tip of the left hand
76 | 23-left thumb
77 | 24-tip of the right hand
78 | 25-right thumb
79 |
80 | :param model_type:
81 | :param out_format:
82 | :return:
83 | '''
84 | if model_type == 'smplh' and out_format == 'nturgbd':
85 | '22 and 37 are approximation for hand (base of index finger)'
86 | return np.array([0, 3, 12, 15,
87 | 16, 18, 20, 22, #left hand
88 | 17, 19, 21, 37, # right hand
89 | 1, 4, 7, 10, #left leg
90 |                 2, 5, 8, 11, #right leg
91 | 9,
92 | 63, 64 , 68, 69
93 | ],
94 | dtype=np.int32)
95 |
96 | class dotdict(dict):
97 | """dot.notation access to dictionary attributes"""
98 | __getattr__ = dict.get
99 | __setattr__ = dict.__setitem__
100 | __delattr__ = dict.__delitem__
101 |
102 | def store_counts(label_fp):
103 | """Compute # samples per class, from stored labels
104 |
105 | Args:
106 | label_fp : Path to label file
107 |
108 | Writes (to same path as label file):
109 | out_fp : # samples per class = {: , ...}
110 | """
111 | Y_tup = read_pkl(label_fp)
112 | Y_idxs = Y_tup[1][0]
113 | print('# Samples in set = ', len(Y_idxs))
114 |
115 | label_count = Counter(Y_idxs)
116 | print('File ', label_fp, 'len',len(label_count))
117 |
118 | out_fp = label_fp.replace('.pkl', '_count.pkl')
119 | write_pkl(label_count, out_fp)
120 |
121 | def load_babel_dataset(d_folder='../../data/babel_v1.0_release'):
122 | '''Load the BABEL dataset'''
123 | # Data folder
124 | l_babel_dense_files = ['train', 'val', 'test']
125 | l_babel_extra_files = ['extra_train', 'extra_val']
126 |
127 | # BABEL Dataset
128 | babel = {}
129 | for fn in l_babel_dense_files + l_babel_extra_files:
130 | babel[fn] = json.load(open(ospj(d_folder, fn+'.json')))
131 |
132 | return babel
133 |
134 | def store_seq_fps(amass_p):
135 | '''Get fps for each seq. in BABEL
136 | Arguments:
137 | ---------
138 | amass_p : Path where you download AMASS to.
139 | Save:
140 | -----
141 |         featp_2_fps.json : Key: feat path, value: orig. fps
142 |         in AMASS. E.g.: {'KIT/KIT/4/RightTurn01_poses.npz': 100.0, ...}
143 | '''
144 | # Get BABEL dataset
145 | babel = load_babel_dataset()
146 |
147 | # Loop over each BABEL seq, store frame-rate
148 | ft_p_2_fps = {}
149 | for fn in babel:
150 | for sid in tqdm(babel[fn]):
151 | ann = babel[fn][sid]
152 | if ann['feat_p'] not in ft_p_2_fps:
153 | fps = np.load(ospj(amass_p, ann['feat_p']))['mocap_framerate']
154 | ft_p_2_fps[ann['feat_p']] = float(fps)
155 | dest_fp = '../data/featp_2_fps.json'
156 | write_json(ft_p_2_fps, dest_fp)
157 | return None
158 |
159 | def store_ntu_jpos(smplh_model_p, dest_jpos_p, amass_p):
160 |     '''Store joint positions for the NTU-RGBD skeleton
161 | '''
162 | # Model to forward-pass through, to store joint positions
163 | smplh = SMPLH(smplh_model_p, create_transl=False, ext='pkl',
164 | gender='male', use_pca=False, batch_size=1)
165 |
166 | # Load paths to all BABEL features
167 | featp_2_fps = read_json('../data/featp_2_fps.json')
168 |
169 | # Loop over all BABEL data, verify that joint positions are stored on disk
170 | l_m_ft_p = []
171 | for ft_p in featp_2_fps:
172 |
173 | # Get the correct dataset folder name
174 | ddir_n = ospb(ospd(ospd(ft_p)))
175 | ddir_map = {'BioMotionLab_NTroje': 'BMLrub', 'DFaust_67': 'DFaust'}
176 | ddir_n = ddir_map[ddir_n] if ddir_n in ddir_map else ddir_n
177 | # Get the subject folder name
178 | sub_fol_n = ospb(ospd(ft_p))
179 |
180 | # Sanity check
181 | fft_p = ospj(dest_jpos_p, ddir_n, sub_fol_n, ospb(ft_p))
182 | if not os.path.exists(fft_p):
183 | l_m_ft_p.append((ft_p, fft_p))
184 | print('Total # missing NTU RGBD skeleton features = ', len(l_m_ft_p))
185 |
186 | # Loop over missing joint positions and store them on disk
187 | for i, (ft_p, ntu_jpos_p) in enumerate(tqdm(l_m_ft_p)):
188 | jrot_smplh = np.load(ospj(amass_p, ft_p))['poses']
189 | # Break joints down into body parts
190 | smpl_body_jrot = jrot_smplh[:, 3:66]
191 | left_hand_jrot = jrot_smplh[:, 66:111]
192 | right_hand_jrot = jrot_smplh[:, 111:]
193 | root_orient = jrot_smplh[:, 0:3].reshape(-1, 3)
194 |
195 | # Forward through model to get a superset of required joints
196 | T = jrot_smplh.shape[0]
197 | ntu_jpos = np.zeros((T, 219))
198 | for t in range(T):
199 | res = smplh(body_pose=torch.Tensor(smpl_body_jrot[t:t+1, :]),
200 | global_orient=torch.Tensor(root_orient[t: t+1, :]),
201 | left_hand_pose = torch.Tensor(left_hand_jrot[t: t+1, :]),
202 | right_hand_pose=torch.Tensor(right_hand_jrot[t: t+1, :]),
203 | # transl=torch.Tensor(transl)
204 | )
205 | jpos = res.joints.detach().cpu().numpy()[:, :, :].reshape(-1)
206 | ntu_jpos[t, :] = jpos
207 |
208 | # Save to disk
209 | if not os.path.exists(ospd(ntu_jpos_p)):
210 | os.makedirs(ospd(ntu_jpos_p))
211 |         np.savez(ntu_jpos_p, joint_pos=ntu_jpos)  # np.savez has no allow_pickle kwarg
212 |
213 | return
214 |
215 | def viz_ntu_jpos(jpos_p, l_ft_p):
216 | '''Visualize sequences of NTU-skeleton joint positions'''
217 | # Load paths to all BABEL features
218 | featp_2_fps = read_json('../data/featp_2_fps.json')
219 | # Indices that are in the NTU RGBD skeleton
220 | smpl2nturgbd = smpl_to_nturgbd()
221 | # Iterate over each
222 | for ft_p in l_ft_p:
223 | x = np.load(ospj(jpos_p, ft_p))['joint_pos']
224 | T, ft_sz = x.shape
225 | x = x.reshape(T, ft_sz//3, 3)
226 | # print('Data shape = {0}'.format(x.shape))
227 | x = x[:, smpl2nturgbd, :]
228 | # print('Data shape = {0}'.format(x.shape))
229 | # x = x[:,:,:, 0].transpose(1, 2, 0) # (3, 150, 22, 1) --> (150, 22, 3)
230 | print('Data shape = {0}'.format(x.shape))
231 | viz.viz_seq(seq=x, folder_p='test_viz/test_ntu_w_axis', sk_type='nturgbd', debug=True)
232 | print('-'*50)
233 |
234 |
235 | def main():
236 | '''Store preliminary stuff'''
237 | amass_p= '/ps/project/conditional_action_gen/data/AMASS_March2021/'
238 |
239 | # Save feature paths --> fps (released in babel/action_recognition/data/)
240 | # store_seq_fps(amass_p)
241 |
242 | # Save joint positions in NTU-RGBD skeleton format
243 | smplh_model_p = '/ps/project/conditional_action_gen/body_models/mano_v1_2/models_cleaned_merged/SMPLH_male.pkl'
244 | jpos_p = '/ps/project/conditional_action_gen/amass/babel_joint_pos'
245 | # store_ntu_jpos(smplh_model_p, jpos_p, amass_p)
246 |
247 | # Viz. saved seqs.
248 | # l_ft_p = ['KIT/917/Experiment3a_09_poses.npz']
249 | # viz_ntu_jpos(jpos_p, l_ft_p)
250 |
251 | if __name__ == '__main__':
252 | main()
253 |
254 |
--------------------------------------------------------------------------------
/action_recognition/feeders/feeder.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # -*- coding: utf-8 -*-
4 | #
5 | # Adapted from https://github.com/lshiwjx/2s-AGCN for BABEL (https://babel.is.tue.mpg.de/)
6 |
7 | import numpy as np
8 | import pickle
9 | import torch
10 | from torch.utils.data import Dataset
11 | import sys
12 | import pdb
13 |
14 | sys.path.extend(['../'])
15 | from feeders import tools
16 |
17 |
18 | class Feeder(Dataset):
19 | def __init__(self, data_path, label_path,
20 | random_choose=False, random_shift=False, random_move=False,
21 | window_size=-1, normalization=False, debug=False, use_mmap=True):
22 | """
23 |
24 | :param data_path:
25 | :param label_path:
26 | :param random_choose: If true, randomly choose a portion of the input sequence
27 |         :param random_shift: If true, randomly pad zeros at the beginning or end of the sequence
28 | :param random_move:
29 | :param window_size: The length of the output sequence
30 | :param normalization: If true, normalize input sequence
31 | :param debug: If true, only use the first 100 samples
32 |         :param use_mmap: If true, use mmap mode to load data, which can reduce memory usage
33 | """
34 |
35 | self.debug = debug
36 | self.data_path = data_path
37 | self.label_path = label_path
38 | self.random_choose = random_choose
39 | self.random_shift = random_shift
40 | self.random_move = random_move
41 | self.window_size = window_size
42 | self.normalization = normalization
43 | self.use_mmap = use_mmap
44 | self.load_data()
45 | if normalization:
46 | self.get_mean_map()
47 |
48 | def load_data(self):
49 |         # data: N C T V M
50 | try:
51 | with open(self.label_path) as f:
52 | self.sample_name, self.label = pickle.load(f)
53 | except:
54 | # for pickle file from python2
55 | with open(self.label_path, 'rb') as f:
56 | self.sample_name, self.label = pickle.load(f, encoding='latin1')
57 |
58 | # load data
59 | if self.use_mmap:
60 | self.data = np.load(self.data_path, mmap_mode='r')
61 | else:
62 | self.data = np.load(self.data_path)
63 | if self.debug:
64 | self.label = self.label[0:1000]
65 | self.data = self.data[0:1000]
66 | self.sample_name = self.sample_name[0:1000]
67 |
68 |
69 | def get_mean_map(self):
70 | data = self.data
71 | N, C, T, V, M = data.shape
72 | self.mean_map = data.mean(axis=2, keepdims=True).mean(axis=4, keepdims=True).mean(axis=0)
73 | self.std_map = data.transpose((0, 2, 4, 1, 3)).reshape((N * T * M, C * V)).std(axis=0).reshape((C, 1, V, 1))
74 |
75 | def __len__(self):
76 | return len(self.sample_name)
77 |
78 | def __iter__(self):
79 | return self
80 |
81 | def __getitem__(self, index):
82 | data_numpy = self.data[index]
83 | data_numpy = np.array(data_numpy)
84 |
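        # Label pickle layout (see the unpacking below): `self.sample_name` holds segment IDs,
        # and `self.label` is a tuple of (class index, sequence ID, chunk number, annotator ID) lists.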
85 | seg_id = self.sample_name[index]
86 | label = self.label[0][index]
87 | sid = self.label[1][index]
88 | chunk_n = self.label[2][index]
89 | anntr_id = self.label[3][index]
90 |
91 | if self.normalization:
92 | data_numpy = (data_numpy - self.mean_map) / self.std_map
93 | if self.random_shift:
94 | data_numpy = tools.random_shift(data_numpy)
95 | if self.random_choose:
96 | data_numpy = tools.random_choose(data_numpy, self.window_size)
97 | elif self.window_size > 0:
98 | data_numpy = tools.auto_pading(data_numpy, self.window_size)
99 | if self.random_move:
100 | data_numpy = tools.random_move(data_numpy)
101 |
102 | return data_numpy, label, sid, seg_id, chunk_n, anntr_id, index
103 |
104 | def top_k(self, score, top_k):
105 | rank = score.argsort()
106 | hit_top_k = [l in rank[i, -top_k:] for i, l in enumerate(self.label[0])]
107 | return sum(hit_top_k) * 1.0 / len(hit_top_k)
108 |
109 |
110 | def import_class(name):
111 | components = name.split('.')
112 | mod = __import__(components[0])
113 | for comp in components[1:]:
114 | mod = getattr(mod, comp)
115 | return mod
116 |
117 |
118 | def test(data_path, label_path, vid=None, graph=None, is_3d=False):
119 | '''
120 |     Visualize the samples using matplotlib.
121 |     :param data_path:
122 |     :param label_path:
123 |     :param vid: the ID of the sample to visualize
124 |     :param graph:
125 |     :param is_3d: set to True when visualizing 3D (e.g., NTU) skeletons
126 |     :return:
127 | '''
128 | import matplotlib.pyplot as plt
129 | loader = torch.utils.data.DataLoader(
130 | dataset=Feeder(data_path, label_path),
131 | batch_size=64,
132 | shuffle=False,
133 | num_workers=2)
134 |
135 | if vid is not None:
136 | sample_name = loader.dataset.sample_name
137 | sample_id = [name.split('.')[0] for name in sample_name]
138 | index = sample_id.index(vid)
139 |         data, label, sid, seg_id, chunk_n, anntr_id, index = loader.dataset[index]  # match Feeder.__getitem__
140 | data = data.reshape((1,) + data.shape)
141 |
142 | # for batch_idx, (data, label) in enumerate(loader):
143 | N, C, T, V, M = data.shape
144 |
145 | plt.ion()
146 | fig = plt.figure()
147 | if is_3d:
148 | from mpl_toolkits.mplot3d import Axes3D
149 | ax = fig.add_subplot(111, projection='3d')
150 | else:
151 | ax = fig.add_subplot(111)
152 |
153 | if graph is None:
154 | p_type = ['b.', 'g.', 'r.', 'c.', 'm.', 'y.', 'k.', 'k.', 'k.', 'k.']
155 | pose = [
156 | ax.plot(np.zeros(V), np.zeros(V), p_type[m])[0] for m in range(M)
157 | ]
158 | ax.axis([-1, 1, -1, 1])
159 | for t in range(T):
160 | for m in range(M):
161 | pose[m].set_xdata(data[0, 0, t, :, m])
162 | pose[m].set_ydata(data[0, 1, t, :, m])
163 | fig.canvas.draw()
164 | plt.pause(0.001)
165 | else:
166 | p_type = ['b-', 'g-', 'r-', 'c-', 'm-', 'y-', 'k-', 'k-', 'k-', 'k-']
167 | import sys
168 | from os import path
169 | sys.path.append(
170 | path.dirname(path.dirname(path.dirname(path.abspath(__file__)))))
171 | G = import_class(graph)()
172 | edge = G.inward
173 | pose = []
174 | for m in range(M):
175 | a = []
176 | for i in range(len(edge)):
177 | if is_3d:
178 | a.append(ax.plot(np.zeros(3), np.zeros(3), p_type[m])[0])
179 | else:
180 | a.append(ax.plot(np.zeros(2), np.zeros(2), p_type[m])[0])
181 | pose.append(a)
182 | ax.axis([-1, 1, -1, 1])
183 | if is_3d:
184 | ax.set_zlim3d(-1, 1)
185 | for t in range(T):
186 | for m in range(M):
187 | for i, (v1, v2) in enumerate(edge):
188 | x1 = data[0, :2, t, v1, m]
189 | x2 = data[0, :2, t, v2, m]
190 | if (x1.sum() != 0 and x2.sum() != 0) or v1 == 1 or v2 == 1:
191 | pose[m][i].set_xdata(data[0, 0, t, [v1, v2], m])
192 | pose[m][i].set_ydata(data[0, 1, t, [v1, v2], m])
193 | if is_3d:
194 | pose[m][i].set_3d_properties(data[0, 2, t, [v1, v2], m])
195 | fig.canvas.draw()
196 | # plt.savefig('/home/lshi/Desktop/skeleton_sequence/' + str(t) + '.jpg')
197 | plt.pause(0.01)
198 |
199 |
200 | if __name__ == '__main__':
201 | import os
202 |
203 | os.environ['DISPLAY'] = 'localhost:10.0'
204 | data_path = "../data/ntu/xview/val_data_joint.npy"
205 | label_path = "../data/ntu/xview/val_label.pkl"
206 | graph = 'graph.ntu_rgb_d.Graph'
207 | test(data_path, label_path, vid='S004C001P003R001A032', graph=graph, is_3d=True)
208 | # data_path = "../data/kinetics/val_data.npy"
209 | # label_path = "../data/kinetics/val_label.pkl"
210 | # graph = 'graph.Kinetics'
211 | # test(data_path, label_path, vid='UOD7oll3Kqo', graph=graph)
212 |
--------------------------------------------------------------------------------
/notebooks/BABEL_explore.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Explore BABEL\n",
8 | "\n",
9 | "We present some code to explore BABEL by computing stats., and searching for specific actions."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Preparing the environment\n",
19 | "%load_ext autoreload\n",
20 | "%autoreload 2\n",
21 | "%matplotlib notebook\n",
22 | "%matplotlib inline\n",
23 | "\n",
24 | "import sys, os, pdb\n",
25 | "from os.path import join as ospj\n",
26 | "import json\n",
27 | "from collections import *\n",
28 | "\n",
29 | "import numpy as np\n",
30 | "import pandas as pd\n",
31 | "from pandas.core.common import flatten\n",
32 | "\n",
33 | "import pprint\n",
34 | "pp = pprint.PrettyPrinter()"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "### Load BABEL \n",
42 | "Note that we are not loading the test set "
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "d_folder = '../data/babel_v1.0_release' # Data folder\n",
52 | "l_babel_dense_files = ['train', 'val'] \n",
53 | "l_babel_extra_files = ['extra_train', 'extra_val']\n",
54 | "\n",
55 | "# BABEL Dataset \n",
56 | "babel = {}\n",
57 | "for file in l_babel_dense_files:\n",
58 | " babel[file] = json.load(open(ospj(d_folder, file+'.json')))\n",
59 | " \n",
60 | "for file in l_babel_extra_files:\n",
61 | " babel[file] = json.load(open(ospj(d_folder, file+'.json'))) "
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "### Duration of mocap for which BABEL action labels are available"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 3,
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "name": "stdout",
78 | "output_type": "stream",
79 | "text": [
80 | "Total duration = 30.0 hours 2.0 min. 32 sec.\n",
81 | "Total # seqs. = 8808\n",
82 | "------------------------------\n",
83 | "Total duration = 34.0 hours 43.0 min. 39 sec.\n",
84 | "Total # seqs. = 10576\n",
85 | "------------------------------\n"
86 | ]
87 | }
88 | ],
89 | "source": [
90 | "for babel_set in [l_babel_dense_files, l_babel_dense_files+l_babel_extra_files]:\n",
91 | " dur = 0.0\n",
92 | " list_sids = [] \n",
93 | " for spl in babel_set:\n",
94 | " for sid in babel[spl]:\n",
95 | " if sid not in list_sids:\n",
96 | " list_sids.append(sid)\n",
97 | " dur += babel[spl][sid]['dur'] \n",
98 | " \n",
99 | " # Duration of each set\n",
100 | " minutes = dur//60\n",
101 | " print('Total duration = {0} hours {1} min. {2:.0f} sec.'.format(\n",
102 | " minutes//60, minutes%60, dur%60))\n",
103 | " print('Total # seqs. = ', len(list_sids))\n",
104 | " print('-'*30)"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "### Search BABEL for action"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 4,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "def get_cats(ann, file):\n",
121 | " # Get sequence labels and frame labels if they exist\n",
122 | " seq_l, frame_l = [], []\n",
123 | " if 'extra' not in file:\n",
124 | " if ann['seq_ann'] is not None:\n",
125 | " seq_l = flatten([seg['act_cat'] for seg in ann['seq_ann']['labels']])\n",
126 | " if ann['frame_ann'] is not None:\n",
127 | " frame_l = flatten([seg['act_cat'] for seg in ann['frame_ann']['labels']])\n",
128 | " else:\n",
129 | " # Load all labels from (possibly) multiple annotators\n",
130 | " if ann['seq_anns'] is not None:\n",
131 | " seq_l = flatten([seg['act_cat'] for seq_ann in ann['seq_anns'] for seg in seq_ann['labels']])\n",
132 | " if ann['frame_anns'] is not None: \n",
133 | " frame_l = flatten([seg['act_cat'] for frame_ann in ann['frame_anns'] for seg in frame_ann['labels']])\n",
134 | " \n",
135 | " return list(seq_l), list(frame_l)"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 5,
141 | "metadata": {},
142 | "outputs": [
143 | {
144 | "name": "stdout",
145 | "output_type": "stream",
146 | "text": [
147 | "# Seqs. containing action jump = 746\n",
148 | "# Segments containing action jump = 1597\n"
149 | ]
150 | }
151 | ],
152 | "source": [
153 | "action = 'jump'\n",
154 | "act_anns = defaultdict(list) # { seq_id_1: [ann_1_1, ann_1_2], seq_id_2: [ann_2_1], ...} \n",
155 | "n_act_spans = 0\n",
156 | "\n",
157 | "for spl in babel:\n",
158 | " for sid in babel[spl]:\n",
159 | " \n",
160 | " seq_l, frame_l = get_cats(babel[spl][sid], spl)\n",
161 | " # print(seq_l + frame_l)\n",
162 | " \n",
163 | " if action in seq_l + frame_l:\n",
164 | " \n",
165 | " # Store all relevant mocap sequence annotations\n",
166 | " act_anns[sid].append(babel[spl][sid])\n",
167 | " \n",
168 | " # # Individual spans of the action in the sequence\n",
169 | " n_act_spans += Counter(seq_l+frame_l)[action]\n",
170 | " \n",
171 | "print('# Seqs. containing action {0} = {1}'.format(action, len(act_anns)))\n",
172 | "print('# Segments containing action {0} = {1}'.format(action, n_act_spans))"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 6,
178 | "metadata": {},
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "[{'babel_sid': 7692,\n",
185 | " 'dur': 3.83,\n",
186 | " 'feat_p': 'CMU/CMU/141/141_05_poses.npz',\n",
187 | " 'frame_ann': {'anntr_id': 'eab5b72f-7399-43a7-a752-e4ee2807faaf',\n",
188 | " 'babel_lid': '59ad905d-f378-4d2b-90a7-4e3222bbc1f7',\n",
189 | " 'labels': [{'act_cat': ['hop'],\n",
190 | " 'end_t': 2,\n",
191 | " 'proc_label': 'hop left',\n",
192 | " 'raw_label': 'hopping left',\n",
193 | " 'seg_id': 'daf942ad-7cbe-4387-b6a0-0fc391c702ea',\n",
194 | " 'start_t': 1},\n",
195 | " {'act_cat': ['hop'],\n",
196 | " 'end_t': 3,\n",
197 | " 'proc_label': 'hop right',\n",
198 | " 'raw_label': 'hopping right',\n",
199 | " 'seg_id': '7b17f75e-3da9-4e56-aca1-9bbb6b8d5dd9',\n",
200 | " 'start_t': 2},\n",
201 | " {'act_cat': ['stand'],\n",
202 | " 'end_t': 1,\n",
203 | " 'proc_label': 'stand',\n",
204 | " 'raw_label': 'standing',\n",
205 | " 'seg_id': '70687891-613e-42f7-87f4-5760f18a3548',\n",
206 | " 'start_t': 0},\n",
207 | " {'act_cat': ['stand'],\n",
208 | " 'end_t': 3.834,\n",
209 | " 'proc_label': 'stand',\n",
210 | " 'raw_label': 'standing',\n",
211 | " 'seg_id': 'f0cdfd79-5dad-43f3-b2d1-8a0ce8668010',\n",
212 | " 'start_t': 3}],\n",
213 | " 'mul_act': True},\n",
214 | " 'seq_ann': {'anntr_id': '30bf91ac-e0c1-4298-814f-7811fe634bac',\n",
215 | " 'babel_lid': 'da9d959f-f5b6-434f-a927-35effc7b5afe',\n",
216 | " 'labels': [{'act_cat': ['jump'],\n",
217 | " 'proc_label': 'jump',\n",
218 | " 'raw_label': 'jump',\n",
219 | " 'seg_id': '082c172b-3883-4231-9c81-fcee4cf1a999'}],\n",
220 | " 'mul_act': True},\n",
221 | " 'url': 'https://babel-renders.s3.eu-central-1.amazonaws.com/007692.mp4'}]\n"
222 | ]
223 | }
224 | ],
225 | "source": [
226 | "# View a random annotation \n",
227 | "key = np.random.choice(list(act_anns.keys()))\n",
228 | "pp.pprint(act_anns[key])"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {},
235 | "outputs": [],
236 | "source": []
237 | }
238 | ],
239 | "metadata": {
240 | "kernelspec": {
241 | "display_name": "Python 3",
242 | "language": "python",
243 | "name": "python3"
244 | },
245 | "language_info": {
246 | "codemirror_mode": {
247 | "name": "ipython",
248 | "version": 3
249 | },
250 | "file_extension": ".py",
251 | "mimetype": "text/x-python",
252 | "name": "python",
253 | "nbconvert_exporter": "python",
254 | "pygments_lexer": "ipython3",
255 | "version": "3.8.3"
256 | }
257 | },
258 | "nbformat": 4,
259 | "nbformat_minor": 5
260 | }
261 |
--------------------------------------------------------------------------------
/action_recognition/model/aagcn.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 | from torch.autograd import Variable
7 |
8 |
9 | def import_class(name):
10 | components = name.split('.')
11 | mod = __import__(components[0])
12 | for comp in components[1:]:
13 | mod = getattr(mod, comp)
14 | return mod
15 |
16 |
17 | def conv_branch_init(conv, branches):
18 | weight = conv.weight
19 | n = weight.size(0)
20 | k1 = weight.size(1)
21 | k2 = weight.size(2)
22 | nn.init.normal_(weight, 0, math.sqrt(2. / (n * k1 * k2 * branches)))
23 | nn.init.constant_(conv.bias, 0)
24 |
25 |
26 | def conv_init(conv):
27 | nn.init.kaiming_normal_(conv.weight, mode='fan_out')
28 | nn.init.constant_(conv.bias, 0)
29 |
30 |
31 | def bn_init(bn, scale):
32 | nn.init.constant_(bn.weight, scale)
33 | nn.init.constant_(bn.bias, 0)
34 |
35 |
36 | class unit_tcn(nn.Module):
37 | def __init__(self, in_channels, out_channels, kernel_size=9, stride=1):
38 | super(unit_tcn, self).__init__()
39 | pad = int((kernel_size - 1) / 2)
40 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel_size, 1), padding=(pad, 0),
41 | stride=(stride, 1))
42 |
43 | self.bn = nn.BatchNorm2d(out_channels)
44 | self.relu = nn.ReLU(inplace=True)
45 | conv_init(self.conv)
46 | bn_init(self.bn, 1)
47 |
48 | def forward(self, x):
49 | x = self.bn(self.conv(x))
50 | return x
51 |
52 |
53 | class unit_gcn(nn.Module):
54 | def __init__(self, in_channels, out_channels, A, coff_embedding=4, num_subset=3, adaptive=True, attention=True):
55 | super(unit_gcn, self).__init__()
56 | inter_channels = out_channels // coff_embedding
57 | self.inter_c = inter_channels
58 | self.out_c = out_channels
59 | self.in_c = in_channels
60 | self.num_subset = num_subset
61 | num_jpts = A.shape[-1]
62 |
63 | self.conv_d = nn.ModuleList()
64 | for i in range(self.num_subset):
65 | self.conv_d.append(nn.Conv2d(in_channels, out_channels, 1))
66 |
67 | if adaptive:
68 | self.PA = nn.Parameter(torch.from_numpy(A.astype(np.float32)))
69 | self.alpha = nn.Parameter(torch.zeros(1))
70 | # self.beta = nn.Parameter(torch.ones(1))
71 | # nn.init.constant_(self.PA, 1e-6)
72 | # self.A = Variable(torch.from_numpy(A.astype(np.float32)), requires_grad=False)
73 | # self.A = self.PA
74 | self.conv_a = nn.ModuleList()
75 | self.conv_b = nn.ModuleList()
76 | for i in range(self.num_subset):
77 | self.conv_a.append(nn.Conv2d(in_channels, inter_channels, 1))
78 | self.conv_b.append(nn.Conv2d(in_channels, inter_channels, 1))
79 | else:
80 | self.A = Variable(torch.from_numpy(A.astype(np.float32)), requires_grad=False)
81 | self.adaptive = adaptive
82 |
83 | if attention:
84 | # self.beta = nn.Parameter(torch.zeros(1))
85 | # self.gamma = nn.Parameter(torch.zeros(1))
86 | # unified attention
87 | # self.Attention = nn.Parameter(torch.ones(num_jpts))
88 |
89 | # temporal attention
90 | self.conv_ta = nn.Conv1d(out_channels, 1, 9, padding=4)
91 | nn.init.constant_(self.conv_ta.weight, 0)
92 | nn.init.constant_(self.conv_ta.bias, 0)
93 |
94 | # s attention
95 | ker_jpt = num_jpts - 1 if not num_jpts % 2 else num_jpts
96 | pad = (ker_jpt - 1) // 2
97 | self.conv_sa = nn.Conv1d(out_channels, 1, ker_jpt, padding=pad)
98 | nn.init.xavier_normal_(self.conv_sa.weight)
99 | nn.init.constant_(self.conv_sa.bias, 0)
100 |
101 | # channel attention
102 | rr = 2
103 | self.fc1c = nn.Linear(out_channels, out_channels // rr)
104 | self.fc2c = nn.Linear(out_channels // rr, out_channels)
105 | nn.init.kaiming_normal_(self.fc1c.weight)
106 | nn.init.constant_(self.fc1c.bias, 0)
107 | nn.init.constant_(self.fc2c.weight, 0)
108 | nn.init.constant_(self.fc2c.bias, 0)
109 |
110 | # self.bn = nn.BatchNorm2d(out_channels)
111 | # bn_init(self.bn, 1)
112 | self.attention = attention
113 |
114 | if in_channels != out_channels:
115 | self.down = nn.Sequential(
116 | nn.Conv2d(in_channels, out_channels, 1),
117 | nn.BatchNorm2d(out_channels)
118 | )
119 | else:
120 | self.down = lambda x: x
121 |
122 | self.bn = nn.BatchNorm2d(out_channels)
123 | self.soft = nn.Softmax(-2)
124 | self.tan = nn.Tanh()
125 | self.sigmoid = nn.Sigmoid()
126 | self.relu = nn.ReLU(inplace=True)
127 |
128 | for m in self.modules():
129 | if isinstance(m, nn.Conv2d):
130 | conv_init(m)
131 | elif isinstance(m, nn.BatchNorm2d):
132 | bn_init(m, 1)
133 | bn_init(self.bn, 1e-6)
134 | for i in range(self.num_subset):
135 | conv_branch_init(self.conv_d[i], self.num_subset)
136 |
137 | def forward(self, x):
138 | N, C, T, V = x.size()
139 |
140 | y = None
141 | if self.adaptive:
142 | A = self.PA
143 | # A = A + self.PA
144 | for i in range(self.num_subset):
145 | A1 = self.conv_a[i](x).permute(0, 3, 1, 2).contiguous().view(N, V, self.inter_c * T)
146 | A2 = self.conv_b[i](x).view(N, self.inter_c * T, V)
147 | A1 = self.tan(torch.matmul(A1, A2) / A1.size(-1)) # N V V
148 | A1 = A[i] + A1 * self.alpha
149 | A2 = x.view(N, C * T, V)
150 | z = self.conv_d[i](torch.matmul(A2, A1).view(N, C, T, V))
151 | y = z + y if y is not None else z
152 | else:
153 |             A = self.A.cuda(x.get_device())  # `self.mask` is never defined; use the fixed adjacency directly
154 | for i in range(self.num_subset):
155 | A1 = A[i]
156 | A2 = x.view(N, C * T, V)
157 | z = self.conv_d[i](torch.matmul(A2, A1).view(N, C, T, V))
158 | y = z + y if y is not None else z
159 |
160 | y = self.bn(y)
161 | y += self.down(x)
162 | y = self.relu(y)
163 |
164 | if self.attention:
165 | # spatial attention
166 | se = y.mean(-2) # N C V
167 | se1 = self.sigmoid(self.conv_sa(se))
168 | y = y * se1.unsqueeze(-2) + y
169 | # a1 = se1.unsqueeze(-2)
170 |
171 | # temporal attention
172 | se = y.mean(-1)
173 | se1 = self.sigmoid(self.conv_ta(se))
174 | y = y * se1.unsqueeze(-1) + y
175 | # a2 = se1.unsqueeze(-1)
176 |
177 | # channel attention
178 | se = y.mean(-1).mean(-1)
179 | se1 = self.relu(self.fc1c(se))
180 | se2 = self.sigmoid(self.fc2c(se1))
181 | y = y * se2.unsqueeze(-1).unsqueeze(-1) + y
182 | # a3 = se2.unsqueeze(-1).unsqueeze(-1)
183 |
184 | # unified attention
185 | # y = y * self.Attention + y
186 | # y = y + y * ((a2 + a3) / 2)
187 | # y = self.bn(y)
188 | return y
189 |
190 |
191 | class TCN_GCN_unit(nn.Module):
192 | def __init__(self, in_channels, out_channels, A, stride=1, residual=True, adaptive=True, attention=True):
193 | super(TCN_GCN_unit, self).__init__()
194 | self.gcn1 = unit_gcn(in_channels, out_channels, A, adaptive=adaptive, attention=attention)
195 | self.tcn1 = unit_tcn(out_channels, out_channels, stride=stride)
196 | self.relu = nn.ReLU(inplace=True)
197 | # if attention:
198 | # self.alpha = nn.Parameter(torch.zeros(1))
199 | # self.beta = nn.Parameter(torch.ones(1))
200 | # temporal attention
201 | # self.conv_ta1 = nn.Conv1d(out_channels, out_channels//rt, 9, padding=4)
202 | # self.bn = nn.BatchNorm2d(out_channels)
203 | # bn_init(self.bn, 1)
204 | # self.conv_ta2 = nn.Conv1d(out_channels, 1, 9, padding=4)
205 | # nn.init.kaiming_normal_(self.conv_ta1.weight)
206 | # nn.init.constant_(self.conv_ta1.bias, 0)
207 | # nn.init.constant_(self.conv_ta2.weight, 0)
208 | # nn.init.constant_(self.conv_ta2.bias, 0)
209 |
210 | # rt = 4
211 | # self.inter_c = out_channels // rt
212 | # self.conv_ta1 = nn.Conv2d(out_channels, out_channels // rt, 1)
213 | # self.conv_ta2 = nn.Conv2d(out_channels, out_channels // rt, 1)
214 | # nn.init.constant_(self.conv_ta1.weight, 0)
215 | # nn.init.constant_(self.conv_ta1.bias, 0)
216 | # nn.init.constant_(self.conv_ta2.weight, 0)
217 | # nn.init.constant_(self.conv_ta2.bias, 0)
218 | # s attention
219 | # num_jpts = A.shape[-1]
220 | # ker_jpt = num_jpts - 1 if not num_jpts % 2 else num_jpts
221 | # pad = (ker_jpt - 1) // 2
222 | # self.conv_sa = nn.Conv1d(out_channels, 1, ker_jpt, padding=pad)
223 | # nn.init.constant_(self.conv_sa.weight, 0)
224 | # nn.init.constant_(self.conv_sa.bias, 0)
225 |
226 | # channel attention
227 | # rr = 16
228 | # self.fc1c = nn.Linear(out_channels, out_channels // rr)
229 | # self.fc2c = nn.Linear(out_channels // rr, out_channels)
230 | # nn.init.kaiming_normal_(self.fc1c.weight)
231 | # nn.init.constant_(self.fc1c.bias, 0)
232 | # nn.init.constant_(self.fc2c.weight, 0)
233 | # nn.init.constant_(self.fc2c.bias, 0)
234 | #
235 | # self.softmax = nn.Softmax(-2)
236 | # self.sigmoid = nn.Sigmoid()
237 | self.attention = attention
238 |
239 | if not residual:
240 | self.residual = lambda x: 0
241 |
242 | elif (in_channels == out_channels) and (stride == 1):
243 | self.residual = lambda x: x
244 |
245 | else:
246 | self.residual = unit_tcn(in_channels, out_channels, kernel_size=1, stride=stride)
247 |
248 | def forward(self, x):
249 | if self.attention:
250 | y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x))
251 |
252 | # spatial attention
253 | # se = y.mean(-2) # N C V
254 | # se1 = self.sigmoid(self.conv_sa(se))
255 | # y = y * se1.unsqueeze(-2) + y
256 | # a1 = se1.unsqueeze(-2)
257 |
258 | # temporal attention
259 | # se = y.mean(-1) # N C T
260 | # # se1 = self.relu(self.bn(self.conv_ta1(se)))
261 | # se2 = self.sigmoid(self.conv_ta2(se))
262 | # # y = y * se1.unsqueeze(-1) + y
263 | # a2 = se2.unsqueeze(-1)
264 |
265 | # se = y # NCTV
266 | # N, C, T, V = y.shape
267 | # se1 = self.conv_ta1(se).permute(0, 2, 1, 3).contiguous().view(N, T, self.inter_c * V) # NTCV
268 | # se2 = self.conv_ta2(se).permute(0, 1, 3, 2).contiguous().view(N, self.inter_c * V, T) # NCVT
269 | # a2 = self.softmax(torch.matmul(se1, se2) / np.sqrt(se1.size(-1))) # N T T
270 | # y = torch.matmul(y.permute(0, 1, 3, 2).contiguous().view(N, C * V, T), a2) \
271 | # .view(N, C, V, T).permute(0, 1, 3, 2) * self.alpha + y
272 |
273 | # channel attention
274 | # se = y.mean(-1).mean(-1)
275 | # se1 = self.relu(self.fc1c(se))
276 | # se2 = self.sigmoid(self.fc2c(se1))
277 | # # y = y * se2.unsqueeze(-1).unsqueeze(-1) + y
278 | # a3 = se2.unsqueeze(-1).unsqueeze(-1)
279 | #
280 | # y = y * ((a2 + a3) / 2) + y
281 | # y = self.bn(y)
282 | else:
283 | y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x))
284 | return y
285 |
286 |
287 | class Model(nn.Module):
288 | def __init__(self, num_class=60, num_point=25, num_person=2, graph=None, graph_args=dict(), in_channels=3,
289 | drop_out=0, adaptive=True, attention=True):
290 | super(Model, self).__init__()
291 |
292 | if graph is None:
293 |             raise ValueError('A graph class must be specified, e.g., graph.ntu_rgb_d.Graph')
294 | else:
295 | Graph = import_class(graph)
296 | self.graph = Graph(**graph_args)
297 |
298 | A = self.graph.A
299 | self.num_class = num_class
300 |
301 | self.data_bn = nn.BatchNorm1d(num_person * in_channels * num_point)
302 |
303 |         self.l1 = TCN_GCN_unit(in_channels, 64, A, residual=False, adaptive=adaptive, attention=attention)
304 | self.l2 = TCN_GCN_unit(64, 64, A, adaptive=adaptive, attention=attention)
305 | self.l3 = TCN_GCN_unit(64, 64, A, adaptive=adaptive, attention=attention)
306 | self.l4 = TCN_GCN_unit(64, 64, A, adaptive=adaptive, attention=attention)
307 | self.l5 = TCN_GCN_unit(64, 128, A, stride=2, adaptive=adaptive, attention=attention)
308 | self.l6 = TCN_GCN_unit(128, 128, A, adaptive=adaptive, attention=attention)
309 | self.l7 = TCN_GCN_unit(128, 128, A, adaptive=adaptive, attention=attention)
310 | self.l8 = TCN_GCN_unit(128, 256, A, stride=2, adaptive=adaptive, attention=attention)
311 | self.l9 = TCN_GCN_unit(256, 256, A, adaptive=adaptive, attention=attention)
312 | self.l10 = TCN_GCN_unit(256, 256, A, adaptive=adaptive, attention=attention)
313 |
314 | self.fc = nn.Linear(256, num_class)
315 | nn.init.normal_(self.fc.weight, 0, math.sqrt(2. / num_class))
316 | bn_init(self.data_bn, 1)
317 | if drop_out:
318 | self.drop_out = nn.Dropout(drop_out)
319 | else:
320 | self.drop_out = lambda x: x
321 |
322 | def forward(self, x):
323 | N, C, T, V, M = x.size()
324 |
325 | x = x.permute(0, 4, 3, 1, 2).contiguous().view(N, M * V * C, T)
326 | x = self.data_bn(x)
327 | x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, 2).contiguous().view(N * M, C, T, V)
328 |
329 | x = self.l1(x)
330 | x = self.l2(x)
331 | x = self.l3(x)
332 | x = self.l4(x)
333 | x = self.l5(x)
334 | x = self.l6(x)
335 | x = self.l7(x)
336 | x = self.l8(x)
337 | x = self.l9(x)
338 | x = self.l10(x)
339 |
340 | # N*M,C,T,V
341 | c_new = x.size(1)
342 | x = x.view(N, M, c_new, -1)
343 | x = x.mean(3).mean(1)
344 | x = self.drop_out(x)
345 |
346 | return self.fc(x)
347 |
--------------------------------------------------------------------------------
/action_recognition/data_gen/viz.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:fenc=utf-8
4 | #
5 | # Copyright © 2020 achandrasekaran
6 | #
7 | # Distributed under terms of the MIT license.
8 |
9 | import os, sys
10 | import os.path as osp
11 |
12 | import random
13 | import numpy as np
14 | import math
15 | import torch
16 | from torch.nn.functional import interpolate as intrp
17 |
18 | import subprocess
19 | import shutil
20 | import uuid
21 | import cv2
22 | from matplotlib import pyplot as plt
23 | from mpl_toolkits.mplot3d import Axes3D
24 |
25 | import pdb
26 |
27 | import dutils
28 |
29 |
30 | """
31 | Visualize input and output motion sequences and labels
32 | """
33 |
34 | def get_smpl_skeleton():
35 | '''Skeleton ordering so that you traverse joints in this order:
36 | Left lower, Left upper, Spine, Neck, Head, Right lower, Right upper.
37 | '''
38 | return np.array(
39 | [
40 | # Left lower
41 | [ 0, 1 ],
42 | [ 1, 4 ],
43 | [ 4, 7 ],
44 | [ 7, 10],
45 |
46 | # Left upper
47 | [ 9, 13],
48 | [13, 16],
49 | [16, 18],
50 | [18, 20],
51 | # [20, 22],
52 |
53 | # Spinal column
54 | [ 0, 3 ],
55 | [ 3, 6 ],
56 | [ 6, 9 ],
57 | [ 9, 12],
58 | [12, 15],
59 |
60 | # Right lower
61 | [ 0, 2 ],
62 | [ 2, 5 ],
63 | [ 5, 8 ],
64 | [ 8, 11],
65 |
66 | # Right upper
67 | [ 9, 14],
68 | [14, 17],
69 | [17, 19],
70 | [19, 21],
71 | # [21, 23],
72 | ])
73 |
74 | def get_nturgbd_joint_names():
75 | '''From paper:
76 | 1-base of the spine 2-middle of the spine 3-neck 4-head 5-left shoulder 6-left elbow 7-left wrist 8- left hand 9-right shoulder 10-right elbow 11-right wrist 12- right hand 13-left hip 14-left knee 15-left ankle 16-left foot 17- right hip 18-right knee 19-right ankle 20-right foot 21-spine 22- tip of the left hand 23-left thumb 24-tip of the right hand 25- right thumb
77 | '''
78 | # Joint names by AC, based on SMPL names
79 | joint_names_map = {
80 | 0: 'Pelvis',
81 |
82 | 12: 'L_Hip',
83 | 13: 'L_Knee',
84 | 14: 'L_Ankle',
85 | 15: 'L_Foot',
86 |
87 | 16: 'R_Hip',
88 | 17: 'R_Knee',
89 | 18: 'R_Ankle',
90 | 19: 'R_Foot',
91 |
92 | 1: 'Spine1',
93 | # 'Spine2',
94 | 20: 'Spine3',
95 | 2: 'Neck',
96 | 3: 'Head',
97 |
98 | # 'L_Collar',
99 | 4: 'L_Shoulder',
100 | 5: 'L_Elbow',
101 | 6: 'L_Wrist',
102 | 7: 'L_Hand',
103 | 21: 'L_HandTip', # Not in SMPL
104 | 22: 'L_Thumb', # Not in SMPL
105 |
106 | # 'R_Collar',
107 | 8: 'R_Shoulder',
108 | 9: 'R_Elbow',
109 | 10: 'R_Wrist',
110 | 11: 'R_Hand',
111 | 23: 'R_HandTip', # Not in SMPL
112 | 24: 'R_Thumb', # Not in SMPL
113 | }
114 |
115 | return [joint_names_map[idx] for idx in range(len(joint_names_map))]
116 |
117 | def get_smpl_joint_names():
118 | # Joint names from SMPL Wiki
119 | joint_names_map = {
120 | 0: 'Pelvis',
121 |
122 | 1: 'L_Hip',
123 | 4: 'L_Knee',
124 | 7: 'L_Ankle',
125 | 10: 'L_Foot',
126 |
127 | 2: 'R_Hip',
128 | 5: 'R_Knee',
129 | 8: 'R_Ankle',
130 | 11: 'R_Foot',
131 |
132 | 3: 'Spine1',
133 | 6: 'Spine2',
134 | 9: 'Spine3',
135 | 12: 'Neck',
136 | 15: 'Head',
137 |
138 | 13: 'L_Collar',
139 | 16: 'L_Shoulder',
140 | 18: 'L_Elbow',
141 | 20: 'L_Wrist',
142 | 22: 'L_Hand',
143 | 14: 'R_Collar',
144 | 17: 'R_Shoulder',
145 | 19: 'R_Elbow',
146 | 21: 'R_Wrist',
147 | 23: 'R_Hand'}
148 |
149 | # Return all joints except indices 22 (L_Hand), 23 (R_Hand)
150 | return [joint_names_map[idx] for idx in range(len(joint_names_map)-2)]
151 |
152 | def get_nturgbd_skeleton():
153 | ''' Skeleton ordering such that you traverse joints in this order:
154 | Left lower, Left upper, Spine, Neck, Head, Right lower, Right upper.
155 | '''
156 | return np.array(
157 | [
158 | # Left lower
159 | [0, 12],
160 | [12, 13],
161 | [13, 14],
162 | [14, 15],
163 |
164 | # Left upper
165 | [4, 20],
166 | [4, 5],
167 | [5, 6],
168 | [6, 7],
169 | [7, 21],
170 | [7, 22], # --> L Thumb
171 |
172 | # Spinal column
173 | [0, 1],
174 | [1, 20],
175 | [20, 2],
176 | [2, 3],
177 |
178 | # Right lower
179 | [0, 16],
180 | [16, 17],
181 | [17, 18],
182 | [18, 19],
183 |
184 | # Right upper
185 | [20, 8],
186 | [8, 9],
187 | [9, 10],
188 | [10, 11],
189 | [11, 24],
190 | # [24, 11] --> R Thumb
191 |
192 | [21, 22],
193 |
194 | [23, 24],
195 |
196 | ]
197 | )
198 |
199 | def get_joint_colors(joint_names):
200 | '''Return joints based on a color spectrum. Also, joints on
201 | L and R should have distinctly different colors.
202 | '''
203 | # Convert from plt 0-1 RGBA colors to 0-255 BGR colors for opencv.
204 | cmap = plt.get_cmap('rainbow')
205 | colors = [cmap(i) for i in np.linspace(0, 1, len(joint_names))]
206 | colors = [np.array((c[2], c[1], c[0])) for c in colors]
207 | return colors
208 |
209 | def calc_angle_from_x(sk):
210 | '''Given skeleton, calc. angle from x-axis'''
211 | # Hip bone
212 | id_l_hip = get_smpl_joint_names().index('L_Hip')
213 | id_r_hip = get_smpl_joint_names().index('R_Hip')
214 | pl, pr = sk[id_l_hip], sk[id_r_hip]
215 | bone = np.array(pr-pl)
216 | unit_v = bone / np.linalg.norm(bone)
217 | # Angle with x-axis
218 |     # pdb.set_trace()  # debug breakpoint disabled
219 | x_ax = np.array([1, 0, 0])
220 | x_angle = math.degrees(np.arccos(np.dot(x_ax, unit_v)))
221 |
222 | '''
223 | l_hip_z = seq[0, joint_names.index('L_Hip'), 2]
224 | r_hip_z = seq[0, joint_names.index('R_Hip'), 2]
225 | az = 0 if (l_hip_z > zroot and zroot > r_hip_z) else 180
226 | '''
227 | if bone[1] > 0:
228 | x_angle = - x_angle
229 |
230 | return x_angle
231 |
232 | def calc_angle_from_y(sk):
233 |     '''Given skeleton, calc. angle from y-axis'''
234 | # Hip bone
235 | id_l_hip = get_smpl_joint_names().index('L_Hip')
236 | id_r_hip = get_smpl_joint_names().index('R_Hip')
237 | pl, pr = sk[id_l_hip], sk[id_r_hip]
238 | bone = np.array(pl-pr)
239 | unit_v = bone / np.linalg.norm(bone)
240 | print(unit_v)
241 | # Angle with x-axis
242 |     # pdb.set_trace()  # debug breakpoint disabled
243 | y_ax = np.array([0, 1, 0])
244 | y_angle = math.degrees(np.arccos(np.dot(y_ax, unit_v)))
245 |
246 | '''
247 | l_hip_z = seq[0, joint_names.index('L_Hip'), 2]
248 | r_hip_z = seq[0, joint_names.index('R_Hip'), 2]
249 | az = 0 if (l_hip_z > zroot and zroot > r_hip_z) else 180
250 | '''
251 | # if bone[1] > 0:
252 | # y_angle = - y_angle
253 | seq_y_proj = bone * np.cos(np.deg2rad(y_angle))
254 | print('Bone projected onto y-axis: ', seq_y_proj)
255 |
256 | return y_angle
257 |
258 | def viz_skeleton(seq, folder_p, sk_type='smpl', radius=1, lcolor='#ff0000', rcolor='#0000ff', action='', debug=False):
259 | ''' Visualize skeletons for given sequence and store as images.
260 |
261 | Args:
262 | seq (np.array): Array (frames) of joint positions.
263 | Size depends on sk_type (see below).
264 | if sk_type is 'smpl' then assume:
265 | 1. first 3 dims = translation.
266 | 2. Size = (# frames, 69)
267 | elif sk_type is 'nturgbd', then assume:
268 | 1. no translation.
269 | 2. Size = (# frames, 25, 3)
270 | folder_p (str): Path to root folder containing visualized frames.
271 | Frames are dumped to the path: folder_p/frames/*.jpg
272 |         radius (float): Extent of the space shown around the subject.
273 |
274 | Returns:
275 | Stores skeleton sequence as jpg frames.
276 | '''
277 | joint_names = get_nturgbd_joint_names() if 'nturgbd' == sk_type \
278 | else get_smpl_joint_names()
279 |     n_j = len(joint_names)
280 |
281 | az = 90
282 | if 'smpl' == sk_type:
283 | # SMPL kinematic chain, joint list.
284 | # NOTE that hands are skipped.
285 | kin_chain = get_smpl_skeleton()
286 | # Reshape flat pose features into (frames, joints, (x,y,z)) (skip trans)
287 | seq = seq[:, 3:].reshape(-1, n_j, 3).cpu().detach().numpy()
288 |
289 | elif 'nturgbd' == sk_type:
290 | kin_chain = get_nturgbd_skeleton()
291 | az = 0
292 |
293 | # Get color-spectrum for skeleton
294 | colors = get_joint_colors(joint_names)
295 | labels = [(joint_names[jidx[0]], joint_names[jidx[1]]) for jidx in kin_chain]
296 |
297 | # xroot, yroot, zroot = 0.0, 0.0, 0.0
298 | xroot, yroot, zroot = seq[0, 0, 0], seq[0, 0, 1], seq[0, 0, 2]
299 | # seq = seq - seq[0, :, :]
300 |
301 | # Change viewing angle so that first frame is in frontal pose
302 | # az = calc_angle_from_x(seq[0]-np.array([xroot, yroot, zroot]))
303 | # az = calc_angle_from_y(seq[0]-np.array([xroot, yroot, zroot]))
304 |
305 | # Viz. skeleton for each frame
306 | for t in range(seq.shape[0]):
307 |
308 | # Fig. settings
309 | fig = plt.figure(figsize=(7, 6)) if debug else \
310 | plt.figure(figsize=(5, 5))
311 | ax = fig.add_subplot(111, projection='3d')
312 |
313 | for i, (j1, j2) in enumerate(kin_chain):
314 | # Store bones
315 | x = np.array([seq[t, j1, 0], seq[t, j2, 0]])
316 | y = np.array([seq[t, j1, 1], seq[t, j2, 1]])
317 | z = np.array([seq[t, j1, 2], seq[t, j2, 2]])
318 | # Plot bones in skeleton
319 | ax.plot(x, y, z, c=colors[i], marker='o', linewidth=2, label=labels[i])
320 |
321 | # More figure settings
322 | ax.set_title(action)
323 | ax.set_xlabel('X')
324 | ax.set_ylabel('Y')
325 | ax.set_zlabel('Z')
326 | # xroot, yroot, zroot = seq[t, 0, 0], seq[t, 0, 1], seq[t, 0, 2]
327 |
328 | # pdb.set_trace()
329 | ax.set_xlim3d(-radius + xroot, radius + xroot)
330 | ax.set_ylim3d([-radius + yroot, radius + yroot])
331 | ax.set_zlim3d([-radius + zroot, radius + zroot])
332 |
333 | if True==debug:
334 | ax.axis('on')
335 | ax.grid(b=True)
336 | else:
337 | ax.axis('off')
338 | ax.grid(b=None)
339 | # Turn off tick labels
340 | ax.set_yticklabels([])
341 | ax.set_xticklabels([])
342 | ax.set_zticklabels([])
343 |
344 | cv2.waitKey(0)
345 |
346 | # ax.view_init(-75, 90)
347 | # ax.view_init(elev=20, azim=90+az)
348 | ax.view_init(elev=20, azim=az)
349 |
350 | if True==debug:
351 | ax.legend(bbox_to_anchor=(1.1, 1), loc='upper right')
352 | pass
353 |
354 | fig.savefig(osp.join(folder_p, 'frames', '{0}.jpg'.format(t)))
355 | plt.close(fig)
356 |
357 | # break
358 |
359 | def write_vid_from_imgs(folder_p, fps):
360 | '''Collate frames into a video sequence.
361 |
362 | Args:
363 |         folder_p (str): Frame images are in the path: folder_p/frames/*.jpg
364 | fps (float): Output frame rate.
365 |
366 | Returns:
367 | Output video is stored in the path: folder_p/video.mp4
368 | '''
369 | vid_p = osp.join(folder_p, 'video.mp4')
370 | cmd = ['ffmpeg', '-r', str(int(fps)), '-i',
371 | osp.join(folder_p, 'frames', '%d.jpg'), '-y', vid_p]
372 | FNULL = open(os.devnull, 'w')
373 | retcode = subprocess.call(cmd, stdout=FNULL, stderr=subprocess.STDOUT)
374 | if not 0 == retcode:
375 |         print('Error {0} executing command: {1}'.format(retcode, ' '.join(cmd)))
376 | shutil.rmtree(osp.join(folder_p, 'frames'))
377 |
378 | def viz_seq(seq, folder_p, sk_type, orig_fps=30.0, debug=False):
379 | '''1. Dumps sequence of skeleton images for the given sequence of joints.
380 | 2. Collates the sequence of images into an mp4 video.
381 |
382 | Args:
383 | seq (np.array): Array of joint positions.
384 | folder_p (str): Path to root folder that will contain frames folder.
385 | sk_type (str): {'smpl', 'nturgbd'}
386 |
387 | Return:
388 | None. Path of mp4 video: folder_p/video.mp4
389 | '''
390 | # Delete folder if exists
391 | if osp.exists(folder_p):
392 | print('Deleting existing folder ', folder_p)
393 | shutil.rmtree(folder_p)
394 |
395 | # Create folder for frames
396 | os.makedirs(osp.join(folder_p, 'frames'))
397 |
398 | # Dump frames into folder. Args: (data, radius, frames path)
399 | viz_skeleton(seq, folder_p=folder_p, sk_type=sk_type, radius=1.2, debug=debug)
400 | write_vid_from_imgs(folder_p, orig_fps)
401 |
402 | return None
403 |
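# Example (illustrative): visualize a T x 25 x 3 NTU-skeleton sequence at 30 fps.
# Array values here are placeholders; see `dutils.viz_ntu_jpos` for a real caller.
#   seq = np.zeros((150, 25, 3))
#   viz_seq(seq=seq, folder_p='test_viz/example', sk_type='nturgbd', orig_fps=30.0, debug=True)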
404 | def viz_rand_seq(X, Y, dtype, epoch, wb, urls=None,
405 | k=3, pred_labels=None):
406 | '''
407 | Args:
408 | X (np.array): Array (frames) of SMPL joint positions.
409 |         Y (np.array): Multiple labels for each frame in each x in X.
410 | dtype (str): {'input', 'pred'}
411 | k (int): # samples to viz.
412 | urls (tuple): Tuple of URLs of the rendered videos from original mocap.
413 | wb (dict): Wandb log dict.
414 |     Returns:
415 |         wb (dict): Updated wandb log dict containing the visualized videos.
416 | '''
417 | import wandb
418 | # `idx2al`: idx --> action label string
419 | al2idx = dutils.read_json('data/action_label_to_idx.json')
420 | idx2al = {al2idx[k]: k for k in al2idx}
421 |
422 | # Sample k random seqs. to viz.
423 | for s_idx in random.sample(list(range(X.shape[0])), k):
424 | # Visualize a single seq. in path `folder_p`
425 | folder_p = osp.join('viz', str(uuid.uuid4()))
426 |         viz_seq(seq=X[s_idx], folder_p=folder_p, sk_type='smpl')  # sk_type is required; X holds SMPL joints
427 | title='{0} seq. {1}: '.format(dtype, s_idx)
428 | acts_str = ', '.join([idx2al[l] for l in torch.unique(Y[s_idx])])
429 | wb[title+urls[s_idx]] = wandb.Video(osp.join(folder_p, 'video.mp4'),
430 | caption='Actions: '+acts_str)
431 |
432 | if 'pred' == dtype or 'preds'==dtype:
433 | raise NotImplementedError
434 |
435 | print('Done viz. {0} seqs.'.format(k))
436 | return wb
437 |
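438 | # Example usage (a minimal sketch): visualize one pre-processed sample produced by
439 | # data_gen/create_dataset.py. The .npy path and sample index below are placeholder
440 | # assumptions; any (T, 25, 3) array of NTU-skeleton joint positions works.
441 | if __name__ == '__main__':
442 |     import numpy as np
443 |     samples = np.load('../data/babel_v1.0/val_ntu_sk_60.npy')   # (N, C, T, V, M)
444 |     seq = samples[0].squeeze(-1).transpose(1, 2, 0)             # (C, T, V) -> (T, V, C)
445 |     viz_seq(seq=seq, folder_p='test_viz/sample_0', sk_type='nturgbd', debug=True)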
--------------------------------------------------------------------------------
/notebooks/BABEL_visualization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Visualizing BABEL labels\n",
8 | "[BABEL](https://babel.is.tue.mpg.de/) labels mocap sequences from [AMASS](https://amass.is.tue.mpg.de) with action labels. \n",
9 | "A single sequence in BABEL can have multiple action labels associated with it, from multiple annotators. \n",
10 | "Here, we present code to load data from BABEL, visualize the mocap sequence rendered as a 2D video, and view the action labels corresponding to the sequence. "
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "# Preparing the environment\n",
20 | "%load_ext autoreload\n",
21 | "%autoreload 2\n",
22 | "%matplotlib notebook\n",
23 | "%matplotlib inline"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "import json\n",
33 | "from os.path import join as ospj\n",
34 | "\n",
35 | "import numpy as np\n",
36 | "\n",
37 | "import pprint\n",
38 | "pp = pprint.PrettyPrinter()\n",
39 | "\n",
40 | "from IPython.display import HTML"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "### Load BABEL\n",
48 | "We assume that you have downloaded BABEL annotations from the [website](https://babel.is.tue.mpg.de/data.html) and placed the downloaded `babel_v1.0_release` folder in `data/`. The BABEL data is provided as two sets -- BABEL dense and BABEL extra. "
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "d_folder = '../data/babel_v1.0_release' # Data folder\n",
58 | "l_babel_dense_files = ['train', 'val', 'test']\n",
59 | "l_babel_extra_files = ['extra_train', 'extra_val']\n",
60 | "\n",
61 | "# BABEL Dataset \n",
62 | "babel = {}\n",
63 | "for file in l_babel_dense_files:\n",
64 | " babel[file] = json.load(open(ospj(d_folder, file+'.json')))\n",
65 | " \n",
66 | "for file in l_babel_extra_files:\n",
67 | " babel[file] = json.load(open(ospj(d_folder, file+'.json'))) "
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "### View random annotation"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "Now, let us view an annotation data structure from BABEL. \n",
82 | "The overall data structure is a dictionary, with a unique sequence ID as key and the annotation as value. "
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 4,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "def get_random_babel_ann():\n",
92 | " '''Get annotation from random sequence from a random file'''\n",
93 | " file = np.random.choice(l_babel_dense_files + l_babel_extra_files)\n",
94 | " seq_id = np.random.choice(list(babel[file].keys()))\n",
95 | " print('We are visualizing annotations for seq ID: {0} in \"{1}.json\"'.format(seq_id, file))\n",
96 | " ann = babel[file][seq_id]\n",
97 | " return ann, file"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 5,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "We are visualizing annotations for seq ID: 3312 in \"test.json\"\n",
110 | "{'babel_sid': 3312,\n",
111 | " 'dur': 76.73,\n",
112 | " 'feat_p': 'CMU/CMU/86/86_08_poses.npz',\n",
113 | " 'frame_ann': {'anntr_id': 'c6065e9c-1652-46df-a45f-fe8b8158428f',\n",
114 | " 'babel_lid': 'a642048f-7fa9-402f-a4c1-d7e9e7f696d1',\n",
115 | " 'labels': [{'act_cat': None,\n",
116 | " 'end_t': 68.093,\n",
117 | " 'proc_label': None,\n",
118 | " 'raw_label': None,\n",
119 | " 'seg_id': 'ad703788-bd17-42d4-854b-2b64cb58ee16',\n",
120 | " 'start_t': 59.51},\n",
121 | " {'act_cat': None,\n",
122 | " 'end_t': 32.82,\n",
123 | " 'proc_label': None,\n",
124 | " 'raw_label': None,\n",
125 | " 'seg_id': '1785aeca-53ce-4a33-a249-8a5d3466ea95',\n",
126 | " 'start_t': 27.445},\n",
127 | " {'act_cat': None,\n",
128 | " 'end_t': 52.426,\n",
129 | " 'proc_label': None,\n",
130 | " 'raw_label': None,\n",
131 | " 'seg_id': '12768b82-b342-46ee-ae60-e158f8b1dd47',\n",
132 | " 'start_t': 47.843},\n",
133 | " {'act_cat': None,\n",
134 | " 'end_t': 59.51,\n",
135 | " 'proc_label': None,\n",
136 | " 'raw_label': None,\n",
137 | " 'seg_id': '435bd5a6-01e9-4fc4-abee-642954466832',\n",
138 | " 'start_t': 53.26},\n",
139 | " {'act_cat': None,\n",
140 | " 'end_t': 40.007,\n",
141 | " 'proc_label': None,\n",
142 | " 'raw_label': None,\n",
143 | " 'seg_id': 'd3911406-ad83-4438-941c-919bf296d5e1',\n",
144 | " 'start_t': 33.382},\n",
145 | " {'act_cat': None,\n",
146 | " 'end_t': 76.733,\n",
147 | " 'proc_label': None,\n",
148 | " 'raw_label': None,\n",
149 | " 'seg_id': 'f222a4d9-a8d5-4002-893b-4df102e1e0fa',\n",
150 | " 'start_t': 70.593},\n",
151 | " {'act_cat': None,\n",
152 | " 'end_t': 2.252,\n",
153 | " 'proc_label': None,\n",
154 | " 'raw_label': None,\n",
155 | " 'seg_id': '35e605ec-c9f8-4c9d-8320-680de71837ce',\n",
156 | " 'start_t': 0.294},\n",
157 | " {'act_cat': None,\n",
158 | " 'end_t': 6.961,\n",
159 | " 'proc_label': None,\n",
160 | " 'raw_label': None,\n",
161 | " 'seg_id': 'fdaead4c-0a37-4579-a42a-4a94145570b9',\n",
162 | " 'start_t': 4.232},\n",
163 | " {'act_cat': None,\n",
164 | " 'end_t': 70.593,\n",
165 | " 'proc_label': None,\n",
166 | " 'raw_label': None,\n",
167 | " 'seg_id': '52d3c3e9-102b-4cf0-b082-cd416a7b5f64',\n",
168 | " 'start_t': 68.093},\n",
169 | " {'act_cat': None,\n",
170 | " 'end_t': 4.232,\n",
171 | " 'proc_label': None,\n",
172 | " 'raw_label': None,\n",
173 | " 'seg_id': 'f524e2df-36e2-45ce-a54e-892fdb7353d0',\n",
174 | " 'start_t': 2.252},\n",
175 | " {'act_cat': None,\n",
176 | " 'end_t': 9.336,\n",
177 | " 'proc_label': None,\n",
178 | " 'raw_label': None,\n",
179 | " 'seg_id': '7f265bed-f445-4b6b-a41f-c62106d7be3b',\n",
180 | " 'start_t': 6.961},\n",
181 | " {'act_cat': None,\n",
182 | " 'end_t': 47.843,\n",
183 | " 'proc_label': None,\n",
184 | " 'raw_label': None,\n",
185 | " 'seg_id': '1aa33355-a669-45a6-86a9-19ae862a47e9',\n",
186 | " 'start_t': 40.007},\n",
187 | " {'act_cat': None,\n",
188 | " 'end_t': 15.523,\n",
189 | " 'proc_label': None,\n",
190 | " 'raw_label': None,\n",
191 | " 'seg_id': 'd9c310f5-fc1e-47d8-b2f7-075c31a2eb6d',\n",
192 | " 'start_t': 9.523},\n",
193 | " {'act_cat': None,\n",
194 | " 'end_t': 22.507,\n",
195 | " 'proc_label': None,\n",
196 | " 'raw_label': None,\n",
197 | " 'seg_id': 'f7a71a16-2807-49f7-8a66-7df3e678e161',\n",
198 | " 'start_t': 15.523},\n",
199 | " {'act_cat': None,\n",
200 | " 'end_t': 0.294,\n",
201 | " 'proc_label': None,\n",
202 | " 'raw_label': None,\n",
203 | " 'seg_id': '3f57a657-2c8f-4995-87a4-965bcf8ea2a6',\n",
204 | " 'start_t': 0},\n",
205 | " {'act_cat': None,\n",
206 | " 'end_t': 9.523,\n",
207 | " 'proc_label': None,\n",
208 | " 'raw_label': None,\n",
209 | " 'seg_id': 'c9f97199-97eb-463c-a04e-a511413ad5ba',\n",
210 | " 'start_t': 9.336},\n",
211 | " {'act_cat': None,\n",
212 | " 'end_t': 33.382,\n",
213 | " 'proc_label': None,\n",
214 | " 'raw_label': None,\n",
215 | " 'seg_id': 'dac4fabe-e96c-411c-ad2e-29211e8c212a',\n",
216 | " 'start_t': 32.82},\n",
217 | " {'act_cat': None,\n",
218 | " 'end_t': 53.26,\n",
219 | " 'proc_label': None,\n",
220 | " 'raw_label': None,\n",
221 | " 'seg_id': 'ed99bf22-3ea5-45a6-9df3-17e67e49f119',\n",
222 | " 'start_t': 52.426},\n",
223 | " {'act_cat': None,\n",
224 | " 'end_t': 27.445,\n",
225 | " 'proc_label': None,\n",
226 | " 'raw_label': None,\n",
227 | " 'seg_id': '5c459b13-35e6-4c36-8ec4-9eb1536bfe95',\n",
228 | " 'start_t': 22.507}],\n",
229 | " 'mul_act': True},\n",
230 | " 'seq_ann': {'anntr_id': 'a217bb6b-93ae-4611-8e53-d4318ed5be00',\n",
231 | " 'babel_lid': '037dc092-28d5-4537-9632-9a91fc9f7fb9',\n",
232 | " 'labels': [{'act_cat': None,\n",
233 | " 'proc_label': None,\n",
234 | " 'raw_label': None,\n",
235 | " 'seg_id': 'f7d4b8fa-de77-487f-a08c-84bbc05c3148'}],\n",
236 | " 'mul_act': True},\n",
237 | " 'url': 'https://babel-renders.s3.eu-central-1.amazonaws.com/003312.mp4'}\n"
238 | ]
239 | }
240 | ],
241 | "source": [
242 | "ann, _ = get_random_babel_ann()\n",
243 | "pp.pprint(ann)"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "Note that the action labels from `test.json` are not available publicly. \n",
251 | "Also note that the internal data structures of BABEL dense and BABEL extra differ slightly. \n",
252 | "For a detailed description of the annotation, see [BABEL's data page](https://babel.is.tue.mpg.de/data.html)."
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 | "### Visualize a mocap seq. and its action labels "
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 6,
265 | "metadata": {},
266 | "outputs": [],
267 | "source": [
268 | "def get_vid_html(url):\n",
269 | " '''Helper code to embed a URL in a notebook'''\n",
270 | "    html_code = '<video width=\"640\" height=\"480\" controls>'\n",
271 | "    html_code += '<source src=\"{0}\" type=\"video/mp4\">'.format(url)\n",
272 | "    html_code += '</video>'\n",
273 | "    return html_code"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 7,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "def get_labels(ann, file):\n",
283 | " # Get sequence labels and frame labels if they exist\n",
284 | " seq_l, frame_l = None, None\n",
285 | " if 'extra' not in file:\n",
286 | " if ann['seq_ann'] is not None:\n",
287 | " seq_l = [seg['raw_label'] for seg in ann['seq_ann']['labels']]\n",
288 | " if ann['frame_ann'] is not None:\n",
289 | " frame_l = [(seg['raw_label'], seg['start_t'], seg['end_t']) for seg in ann['frame_ann']['labels']]\n",
290 | " else:\n",
291 | " # Load labels from 1st annotator (random) if there are multiple annotators\n",
292 | " if ann['seq_anns'] is not None:\n",
293 | " seq_l = [seg['raw_label'] for seg in ann['seq_anns'][0]['labels']]\n",
294 | " if ann['frame_anns'] is not None:\n",
295 | " frame_l = [(seg['raw_label'], seg['start_t'], seg['end_t']) for seg in ann['frame_anns'][0]['labels']]\n",
296 | " return seq_l, frame_l"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {},
302 | "source": [
303 | "#### Visualize a random mocap and its annotation from BABEL, by running the cell below. "
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 8,
309 | "metadata": {},
310 | "outputs": [
311 | {
312 | "name": "stdout",
313 | "output_type": "stream",
314 | "text": [
315 | "We are visualizing annotations for seq ID: 7536 in \"train.json\"\n",
316 | "Sequence labels: ['pace and shake hand']\n",
317 | "Frame labels: (action label, start time, end time)\n",
318 | "[('walk', 0, 2.106),\n",
319 | " ('transition', 2.106, 2.845),\n",
320 | " ('make a knocking gesture', 2.845, 3.507),\n",
321 | " ('transition', 3.466, 4.6),\n",
322 | " ('turn around', 4.519, 5.519),\n",
323 | " ('walk back', 5.424, 7.734)]\n"
324 | ]
325 | },
326 | {
327 | "data": {
328 | "text/html": [
329 | ""
330 | ],
331 | "text/plain": [
332 | ""
333 | ]
334 | },
335 | "execution_count": 8,
336 | "metadata": {},
337 | "output_type": "execute_result"
338 | }
339 | ],
340 | "source": [
341 | "ann, file = get_random_babel_ann()\n",
342 | "seq_l, frame_l = get_labels(ann, file)\n",
343 | "print('Sequence labels: ', seq_l)\n",
344 | "print('Frame labels: (action label, start time, end time)')\n",
345 | "pp.pprint(frame_l) \n",
346 | "HTML(get_vid_html(ann['url']))"
347 | ]
348 | },
349 | {
350 | "cell_type": "markdown",
351 | "metadata": {},
352 | "source": [
353 | "- If you are interested in loading the mocap sequence in 3D, please refer to the tutorials in [AMASS](https://github.com/nghorbani/amass/tree/master/notebooks)"
354 | ]
355 | }
356 | ],
357 | "metadata": {
358 | "kernelspec": {
359 | "display_name": "Python 3",
360 | "language": "python",
361 | "name": "python3"
362 | },
363 | "language_info": {
364 | "codemirror_mode": {
365 | "name": "ipython",
366 | "version": 3
367 | },
368 | "file_extension": ".py",
369 | "mimetype": "text/x-python",
370 | "name": "python",
371 | "nbconvert_exporter": "python",
372 | "pygments_lexer": "ipython3",
373 | "version": "3.8.3"
374 | }
375 | },
376 | "nbformat": 4,
377 | "nbformat_minor": 4
378 | }
379 |
--------------------------------------------------------------------------------
/action_recognition/data_gen/create_dataset.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:fenc=utf-8
4 | #
5 | # Copyright © 2021 achandrasekaran
6 | #
7 | # Distributed under terms of the MIT license.
8 |
9 |
10 | import sys, os, pdb
11 | from os.path import join as ospj
12 | from os.path import basename as ospb
13 | from os.path import dirname as ospd
14 | import numpy as np
15 | import torch
16 | from collections import *
17 | from itertools import *
18 | import pandas as pd
19 | import pickle, json, csv
20 | from tqdm import tqdm
21 | from pandas.core.common import flatten
22 | import ipdb
23 | import pickle
24 |
25 | # Custom
26 | import preprocess
27 | import dutils
28 | import viz
29 |
30 | """
31 | Script to load BABEL segments with NTU skeleton format and pre-process.
32 | """
33 |
34 |
35 | def ntu_style_preprocessing(b_dset_path):
36 |     '''Normalize the loaded samples NTU RGB+D-style and store the
37 |     pre-processed dataset next to the input pickle.
38 |     '''
39 | print('Load BABEL v1.0 dataset subset', b_dset_path)
40 | b_dset = dutils.read_pkl(b_dset_path)
41 | # Get unnormalized 5-sec. samples
42 | X = np.array(b_dset['X'])
43 | print('X (old) = ', np.shape(X)) # N, T, V, C
44 |
45 | # Prep. data for normalization
46 | X = X.transpose(0, 3, 1, 2) # N, C, T, V
47 | X = X[:, :, :, :, np.newaxis] # N, C, T, V, M
48 | print('Shape of prepped X: ', X.shape)
49 |
50 | # Normalize (pre-process) in NTU RGBD-style
51 | ntu_sk_spine_bone = np.array([0, 1])
52 | ntu_sk_shoulder_bone = np.array([8, 4])
53 | X, l_m_sk = preprocess.pre_normalization(X, zaxis=ntu_sk_spine_bone,
54 | xaxis=ntu_sk_shoulder_bone)
55 | print('Shape of normalized X: ', X.shape)
56 | print('Skipping {0} samples because "skeleton is missing"'.format(len(l_m_sk)))
57 | print('Skipped idxs = ', l_m_sk)
58 |
59 | # Dataset w/ processed seg. chunks. (Skip samples w/ missing skeletons)
60 | b_AR_dset = {k: np.delete(b_dset[k], l_m_sk) for k in b_dset if k!='X'}
61 | b_AR_dset['X'] = np.delete(X, l_m_sk, axis=0)
62 | print('Shape of dataset = ', b_AR_dset['X'].shape)
63 |
64 | fp = b_dset_path.replace('samples', 'ntu_sk_ntu-style_preprocessed' )
65 | # fp = '../data/babel_v1.0/babel_v1.0_ntu_sk_ntu-style_preprocessed.pkl'
66 | # dutils.write_pkl(b_AR_dset, fp)
67 | with open(fp, 'wb') as of:
68 | pickle.dump(b_AR_dset, of, protocol=4)
69 |
70 | def get_act_idx(y, act2idx, n_classes):
71 |     '''Map action category `y` to its index; unseen categories map to `n_classes`.
72 |     '''
73 | if y in act2idx:
74 | return act2idx[y]
75 | else:
76 | return n_classes
77 |
78 | def store_splits_subsets(n_classes, spl, plus_extra=True, w_folder='../data/babel_v1.0/'):
79 |     '''Store the features (.npy) and labels (.pkl) that the dataloader reads,
80 |     for the given split `spl` and label set size `n_classes`.'''
81 | # Get splits
82 | splits = dutils.read_json('../data/amass_splits.json')
83 |     sid2split = {int(ospb(u).replace('.mp4', '')): s for s in splits
84 |                  for u in splits[s]}
85 |
86 | # In labels, act. cat. --> idx
87 | act2idx_150 = dutils.read_json('../data/action_label_2_idx.json')
88 | act2idx = {k: act2idx_150[k] for k in act2idx_150 if act2idx_150[k] < n_classes}
89 | print('{0} actions in label set: {1}'.format(len(act2idx), act2idx))
90 |
91 | if plus_extra :
92 | fp = w_folder + 'babel_v1.0_'+spl+'_extra_ntu_sk_ntu-style_preprocessed.pkl'
93 | else:
94 | fp = w_folder + 'babel_v1.0_'+spl+'_ntu_sk_ntu-style_preprocessed.pkl'
95 |
96 | # Get full dataset
97 | b_AR_dset = dutils.read_pkl(fp)
98 |
99 | # Store idxs of samples to include in learning
100 | split_idxs = defaultdict(list)
101 | for i, y1 in enumerate(b_AR_dset['Y1']):
102 |
103 | # Check if action category in list of classes
104 | if y1 not in act2idx:
105 | continue
106 |
107 | sid = b_AR_dset['sid'][i]
108 | split_idxs[sid2split[sid]].append(i) # Include idx in dataset
109 |
110 | # Save features that'll be loaded by dataloader
111 | ar_idxs = np.array(split_idxs[spl])
112 | X = b_AR_dset['X'][ar_idxs]
113 | if plus_extra:
114 | fn = w_folder + f'{spl}_extra_ntu_sk_{n_classes}.npy'
115 | else:
116 | fn = w_folder + f'{spl}_ntu_sk_{n_classes}.npy'
117 | np.save(fn, X)
118 |
119 | # labels
120 | labels = {k: np.array(b_AR_dset[k])[ar_idxs] for k in b_AR_dset if k!='X'}
121 |
122 | # Create, save label data structure that'll be loaded by dataloader
123 | label_idxs = defaultdict(list)
124 | for i, y1 in enumerate(labels['Y1']):
125 | # y1
126 | label_idxs['Y1'].append(act2idx[y1])
127 | # yk
128 | yk = [get_act_idx(y, act2idx, n_classes) for y in labels['Yk'][i]]
129 | label_idxs['Yk'].append(yk)
130 | # yov
131 | yov_o = labels['Yov'][i]
132 | yov = {get_act_idx(y, act2idx, n_classes): yov_o[y] for y in yov_o}
133 | label_idxs['Yov'].append(yov)
134 | #
135 | label_idxs['seg_id'].append(labels['seg_id'][i])
136 | label_idxs['sid'].append(labels['sid'][i])
137 | label_idxs['chunk_n'].append(labels['chunk_n'][i])
138 | label_idxs['anntr_id'].append(labels['anntr_id'][i])
139 |
140 | if plus_extra:
141 | wr_f = w_folder + f'{spl}_extra_label_{n_classes}.pkl'
142 | else:
143 | wr_f = w_folder + f'{spl}_label_{n_classes}.pkl'
144 | dutils.write_pkl(\
145 | (label_idxs['seg_id'], (label_idxs['Y1'], label_idxs['sid'],
146 | label_idxs['chunk_n'], label_idxs['anntr_id'])), \
147 | wr_f)
148 |
149 | class Babel_AR:
150 | '''Object containing data, methods for Action Recognition.
151 |
152 | Task
153 | -----
154 | Given: x (Segment from Babel)
155 | Predict: \hat{p}(x) (Distribution over action categories)
156 |
157 | GT
158 | ---
159 | How to compute GT for a given segment?
160 | - yk: All action categories that are labeled for the entirety of segment
161 | - y1: One of yk
162 | - yov: Any y that belongs to part of a segment is considered to be GT.
163 | Fraction of segment covered by an action: {'walk': 1.0, 'wave': 0.5}
164 |
165 | '''
166 | def __init__(self, dataset, dense=True, seq_dense_ann_type={}):
167 | '''Dataset with (samples, different GTs)
168 | '''
169 | # Load dataset
170 | self.babel = dataset
171 | self.dense = dense
172 | self.seq_dense_ann_type = seq_dense_ann_type
173 | self.jpos_p = '../../../../../amass/'
174 |
175 | # Get frame-rate for each seq. in AMASS
176 | f_p = '../data/featp_2_fps.json'
177 | self.ft_p_2_fps = dutils.read_json(f_p)
178 |
179 | # Dataset w/ keys = {'X', 'Y1', 'Yk', 'Yov', 'seg_id', 'sid',
180 | # 'seg_dur'}
181 | self.d = defaultdict(list)
182 | for ann in tqdm(self.babel):
183 | self._update_dataset(ann)
184 |
185 | def _subsample_to_30fps(self, orig_ft, orig_fps):
186 | '''Get features at 30fps frame-rate
187 | Args:
188 | orig_ft (T, 25*3): Feats. @ `orig_fps` frame-rate
189 | orig_fps : Frame-rate in original (ft) seq.
190 | Return:
191 | ft (T', 25*3): Feats. @ 30fps
192 | '''
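        # For intuition: with orig_fps=100, the branch below keeps
        # floor(100/30 * [0..29]) = [0, 3, 6, 10, ...] within each 100-frame
        # second (i.e. 30 frames kept per second of mocap); with orig_fps=120
        # it simply takes every 4th frame (np.arange step = 120/30).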
193 | T, n_j, _ = orig_ft.shape
194 | out_fps = 30.0
195 | # Matching the sub-sampling used for rendering
196 | if int(orig_fps)%int(out_fps):
197 | sel_fr = np.floor(orig_fps / out_fps * np.arange(int(out_fps))).astype(int)
198 | n_duration = int(T/int(orig_fps))
199 | t_idxs = []
200 | for i in range(n_duration):
201 | t_idxs += list(i * int(orig_fps) + sel_fr)
202 | if int(T % int(orig_fps)):
203 | last_sec_frame_idx = n_duration*int(orig_fps)
204 | t_idxs += [x+ last_sec_frame_idx for x in sel_fr if x + last_sec_frame_idx < T ]
205 | else:
206 | t_idxs = np.arange(0, T, orig_fps/out_fps, dtype=int)
207 |
208 | ft = orig_ft[t_idxs, :, :]
209 | return ft
210 |
211 | def _viz_x(self, ft, fn='test_sample'):
212 |         '''Wrapper to viz. the given sample (w/ NTU RGBD skeleton)'''
213 | viz.viz_seq(seq=ft, folder_p=f'test_viz/{fn}', sk_type='nturgbd',
214 | debug=True)
215 | return None
216 |
217 | def _load_seq_feats(self, ft_p, sk_type):
218 | '''Given path to joint position features, return them in 30fps'''
219 | # Identify appropriate feature directory path on disk
220 | if 'smpl_wo_hands' == sk_type: # SMPL w/o hands (T, 22*3)
221 | jpos_p = ospj(self.jpos_p, 'joint_pos')
222 | if 'nturgbd' == sk_type: # NTU (T, 219)
223 | jpos_p = ospj(self.jpos_p, 'babel_joint_pos')
224 |
225 | # Get the correct dataset folder name
226 | ddir_n = ospb(ospd(ospd(ft_p)))
227 | ddir_map = {'BioMotionLab_NTroje': 'BMLrub', 'DFaust_67': 'DFaust'}
228 | ddir_n = ddir_map[ddir_n] if ddir_n in ddir_map else ddir_n
229 | # Get the subject folder name
230 | sub_fol_n = ospb(ospd(ft_p))
231 |
232 | # Sanity check
233 | fft_p = ospj(jpos_p, ddir_n, sub_fol_n, ospb(ft_p))
234 | assert os.path.exists(fft_p)
235 |
236 | # Load seq. fts.
237 | ft = np.load(fft_p)['joint_pos']
238 | T, ft_sz = ft.shape
239 |
240 | # Get NTU skeleton joints
241 | ntu_js = dutils.smpl_to_nturgbd(model_type='smplh', out_format='nturgbd')
242 | ft = ft.reshape(T, -1, 3)
243 | ft = ft[:, ntu_js, :]
244 |
245 | # Sub-sample to 30fps
246 | orig_fps = self.ft_p_2_fps[ft_p]
247 | ft = self._subsample_to_30fps(ft, orig_fps)
248 | # print(f'Feat. shape = {ft.shape}, fps = {orig_fps}')
249 | # if orig_fps != 30.0:
250 | # self._viz_x(ft)
251 | return ft
252 |
253 | def _get_per_f_labels(self, ann, ann_type, seq_dur):
254 |         '''Build per-frame action labels (at 30fps) for the given annotation.'''
255 | # Per-frame labels: {0: ['walk'], 1: ['walk', 'wave'], ... T: ['stand']}
256 | yf = defaultdict(list)
257 | T = int(30.0*seq_dur)
258 | for n_f in range(T):
259 | cur_t = float(n_f/30.0)
260 | for seg in ann['labels']:
261 |
262 | if seg['act_cat'] is None:
263 | continue
264 |
265 | if 'seq_ann' == ann_type:
266 | seg['start_t'] = 0.0
267 | seg['end_t'] = seq_dur
268 |
269 | if cur_t >= float(seg['start_t']) and cur_t < float(seg['end_t']):
270 | yf[n_f] += seg['act_cat']
271 | return yf
272 |
273 | def _compute_dur_samples(self, ann, ann_type, seq_ft, seq_dur, dur=5.0):
274 | '''Return each GT action, corresponding to the fraction of the
275 | segment that it overlaps with.
276 | There are 2 conditions that we need to handle:
277 | 1. Multiple action categories in 'act_cat'
278 | 2. Simultaneous (overlapping action segments).
279 |
280 | Example Input:
281 | Seq. => frames [0, 1, 2, 3, 4, 5]
282 | GT acts. => [[2,3], [2,3], [2], [0], [0,1], [0,1]]
283 |
284 | Segs, GT:
285 | 1. seg_x = seq[0: 3], y1 = 2, yall = {2: 1.0, 3: 0.66}
286 | 2. seg_x = seq[0: 2], y1 = 3, yall = {2: 1.0, 3: 1.0}
287 | 3. seg_x = seq[3: ], y1 = 0, yall = {0: 1.0, 1: 0.66}
288 | 4. seg_x = seq[4: ], y1 = 1, yall = {0: 1.0, 1: 1.0}
289 |
290 | - Note that we should do the above for each chunk in a segment,
291 |           each of duration = `dur` seconds.
292 |
293 | Return:
294 | [ { 'x': [st_t, end_t],
295 |             'y1': <action category>,
296 |             'yov': {<action category>: <overlap fraction>, ...}},
297 | { ... }, ...
298 | ]
299 | '''
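        # Worked example of `yov` for one 5-sec. chunk (x_T = 150 frames @ 30fps):
        # if 'walk' is labeled on all 150 frames and 'wave' on 75 of them, the
        # per-frame counts below give yov = {'walk': 150/150, 'wave': 75/150}
        #                                 = {'walk': 1.0, 'wave': 0.5}.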
300 | #
301 | yf = self._get_per_f_labels(ann, ann_type, seq_dur)
302 |
303 | # Compute, store all samples for each segment
304 | seq_samples = []
305 | for seg in ann['labels']:
306 |
307 | # If no labeled act. cats. for current seg., skip it
308 | if seg['act_cat'] is None or 0 == len(seg['act_cat']):
309 | continue
310 |
311 | # Handle stage 1 missing durs.
312 | if 'seq_ann' == ann_type:
313 | seg['start_t'] = 0.0
314 | seg['end_t'] = seq_dur
315 |
316 | # Get segment feats.
317 | seg_st_f, seg_end_f = int(30.0*seg['start_t']), int(30.0*seg['end_t'])
318 | seg_x = seq_ft[seg_st_f: seg_end_f, :, :]
319 |
320 |             # Split segment into `dur`-second chunks
321 | n_f_pc = 30.0 * dur
322 | n_chunks = int(np.ceil(seg_x.shape[0]/n_f_pc))
323 | for n_ch in range(n_chunks):
324 |
325 |                 # Single `dur`-sec. chunk in segment
326 | ch_st_f = int(n_f_pc * n_ch)
327 | ch_end_f = int(min(ch_st_f + n_f_pc, seg_x.shape[0]))
328 | x = seg_x[ch_st_f: ch_end_f, :, :]
329 |
330 | # Handle case where chunk_T < n_f_pc
331 | x_T, nj, xyz = x.shape
332 | x_ch = np.concatenate((x, np.zeros((int(n_f_pc)- x_T, nj, xyz))), axis=0)
333 |
334 | # Labels for this chunk
335 | yov = Counter(flatten([yf[seg_st_f + n_f] for n_f in range(ch_st_f, ch_end_f)]))
336 |
337 | # Sanity check -- is segment smaller than 1 frame?
338 | if seg['act_cat'][0] not in yov:
339 | # print('Skipping seg:', seg)
340 | # print(f'Chunk # {n_ch}, Yov: ', yov)
341 | continue
342 |
343 | yov = {k: round(yov[k]/x_T, 3) for k in yov}
344 |
345 | # For each act_cat in segment, create a separate sample
346 | for cat in seg['act_cat']:
347 | # Add to samples GTs
348 | seq_samples.append({'seg_id': seg['seg_id'],
349 | 'chunk_n': n_ch,
350 | 'chunk_dur': round(x_T/n_f_pc, 3),
351 | 'x': x_ch,
352 | 'y1': cat,
353 | 'yk': seg['act_cat'],
354 | 'yov': yov,
355 | 'anntr_id': ann['anntr_id']
356 | })
357 | return seq_samples
358 |
359 | def _sample_at_seg_chunk_level(self, ann, seq_samples):
360 | # Samples at segment-chunk-level
361 | for i, sample in enumerate(seq_samples):
362 |
363 | self.d['sid'].append(ann['babel_sid']) # Seq. info
364 | self.d['seg_id'].append(sample['seg_id']) # Seg. info
365 | self.d['chunk_n'].append(sample['chunk_n']) # Seg. chunk info
366 | self.d['anntr_id'].append(sample['anntr_id']) # Annotator id (useful in rebuttal exp.)
367 | self.d['chunk_dur'].append(sample['chunk_dur']) # Seg. chunk info
368 | self.d['X'].append(sample['x']) # Seg. chunk feats.
369 | self.d['Y1'].append(sample['y1']) # 1 out of k GT act. cats.
370 | self.d['Yk'].append(sample['yk']) # List of k GT act. cats.
371 | # : fractions of overlapping act. cats.
372 | self.d['Yov'].append(sample['yov'])
373 | return
374 |
375 | def _update_dataset(self, ann):
376 | '''Return one sample (one segment) = (X, Y1, Yall)'''
377 |
378 | # Get feats. for seq.
379 | seq_ft = self._load_seq_feats(ann['feat_p'], 'nturgbd')
380 |
381 | # To keep track of type of annotation for loading 'extra'
382 | # Compute all GT labels for this seq.
383 | seq_samples = None
384 | if self.dense:
385 | if ann['frame_ann'] is not None:
386 | ann_ar = ann['frame_ann']
387 | self.seq_dense_ann_type[ann['babel_sid']] = 'frame_ann'
388 | seq_samples = self._compute_dur_samples(ann_ar, 'frame_ann', seq_ft, ann['dur'])
389 | else:
390 | ann_ar = ann['seq_ann']
391 | self.seq_dense_ann_type[ann['babel_sid']] = 'seq_ann'
392 | seq_samples = self._compute_dur_samples(ann_ar, 'seq_ann', seq_ft, ann['dur'])
393 | self._sample_at_seg_chunk_level(ann, seq_samples)
394 | else:
395 | # check if extra exists
396 | if 'frame_anns' in ann.keys() or 'seq_anns' in ann.keys():
397 | ann_type = None
398 | if ann['babel_sid'] in self.seq_dense_ann_type:
399 | ann_type = self.seq_dense_ann_type[ann['babel_sid']]
400 | else:
401 | if ann['frame_anns'] is not None:
402 | ann_type = 'frame_ann'
403 | elif ann['seq_anns'] is not None:
404 | ann_type = 'seq_ann'
405 | else:
406 | ipdb.set_trace()
407 |                 self.seq_dense_ann_type[ann['babel_sid']] = ann_type
408 | ann_ar = None
409 | if ann_type == 'frame_ann':
410 | if ann['frame_anns'] is not None:
411 | ann_ar = ann['frame_anns']
412 | elif ann_type == 'seq_ann':
413 | if ann['seq_anns'] is not None:
414 | ann_ar = ann['seq_anns']
415 | else:
416 | ipdb.set_trace()
417 | if ann_ar:
418 | for an in ann_ar:
419 | seq_samples = self._compute_dur_samples(an, ann_type, \
420 | seq_ft, ann['dur'])
421 | self._sample_at_seg_chunk_level(ann, seq_samples)
422 | else:
423 | print('Unexpected format for extra!')
424 | return
425 |
426 |
427 | # Create dataset
428 | # --------------------------
429 | d_folder = '../../data/babel_v1.0_release/'
430 | w_folder = '../data/babel_v1.0/'
431 | for spl in ['train', 'val']:
432 |
433 | # Load Dense BABEL
434 | data = dutils.read_json(ospj(d_folder, f'{spl}.json'))
435 | dataset = [data[sid] for sid in data]
436 | dense_babel = Babel_AR(dataset, dense=True)
437 | # Store Dense BABEL
438 | d_filename = w_folder + 'babel_v1.0_'+ spl + '_samples.pkl'
439 | dutils.write_pkl(dense_babel.d, d_filename)
440 |
441 | # Load Extra BABEL
442 | data = dutils.read_json(ospj(d_folder, f'extra_{spl}.json'))
443 | dataset = [data[sid] for sid in data]
444 | extra_babel = Babel_AR(dataset, dense=False,
445 | seq_dense_ann_type=dense_babel.seq_dense_ann_type)
446 | # Store Dense + Extra
447 | de = {}
448 | for k in dense_babel.d.keys():
449 | de[k] = dense_babel.d[k] + extra_babel.d[k]
450 | ex_filename = w_folder + 'babel_v1.0_' + spl + '_extra_samples.pkl'
451 | dutils.write_pkl(de, ex_filename)
452 |
453 | # Pre-process, Store data in dataset
454 | print('NTU-style preprocessing')
455 |     ntu_style_preprocessing(d_filename)
456 |     ntu_style_preprocessing(ex_filename)
457 |
458 | for ex, C in product(('', '_extra'), (120, 60)):
459 |
460 | # Split, store data in npy file, labels in pkl
461 |         # `plus_extra` mirrors `ex`, so each subset is written exactly once
462 |         store_splits_subsets(n_classes=C, spl=spl, plus_extra=bool(ex))
463 |
464 | # Store counts of samples for training with class-balanced focal loss
465 | label_fp = ospj(w_folder, f'{spl}{ex}_label_{C}.pkl')
466 | dutils.store_counts(label_fp)
467 |
468 |
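469 | # Sketch of how the stored files can be inspected (paths assume `w_folder` above and
470 | # the 60-class label set; adjust as needed). The label pickle layout matches what
471 | # store_splits_subsets() writes: (seg_ids, (Y1, sid, chunk_n, anntr_id)).
472 | def _inspect_outputs(spl='train', n_classes=60):
473 |     X = np.load(w_folder + f'{spl}_ntu_sk_{n_classes}.npy')       # (N, C, T, V, M)
474 |     with open(w_folder + f'{spl}_label_{n_classes}.pkl', 'rb') as f:
475 |         seg_ids, (y1, sid, chunk_n, anntr_id) = pickle.load(f)
476 |     print('X:', X.shape, '| # labeled chunks:', len(y1))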
--------------------------------------------------------------------------------
/action_recognition/train_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # -*- coding: utf-8 -*-
4 | #
5 | # Adapted from https://github.com/lshiwjx/2s-AGCN for BABEL (https://babel.is.tue.mpg.de/)
6 |
7 | from __future__ import print_function
8 |
9 | import argparse
10 | import inspect
11 | import os
12 | import pickle
13 | import random
14 | import shutil
15 | import time
16 | from collections import *
17 | import numpy as np
18 |
19 | # torch
20 | import torch
21 | import torch.backends.cudnn as cudnn
22 | import torch.nn as nn
23 | import torch.optim as optim
24 | import torch.nn.functional as F
25 |
26 | import yaml
27 | from tensorboardX import SummaryWriter
28 | from torch.autograd import Variable
29 | from torch.optim.lr_scheduler import _LRScheduler
30 | from tqdm import tqdm
31 |
32 | import pdb
33 | import ipdb
34 |
35 | # Custom
36 | from class_balanced_loss import CB_loss
37 |
38 |
39 | # class GradualWarmupScheduler(_LRScheduler):
40 | # def __init__(self, optimizer, total_epoch, after_scheduler=None):
41 | # self.total_epoch = total_epoch
42 | # self.after_scheduler = after_scheduler
43 | # self.finished = False
44 | # self.last_epoch = -1
45 | # super().__init__(optimizer)
46 |
47 | # def get_lr(self):
48 | # return [base_lr * (self.last_epoch + 1) / self.total_epoch for base_lr in self.base_lrs]
49 |
50 | # def step(self, epoch=None, metric=None):
51 | # if self.last_epoch >= self.total_epoch - 1:
52 | # if metric is None:
53 | # return self.after_scheduler.step(epoch)
54 | # else:
55 | # return self.after_scheduler.step(metric, epoch)
56 | # else:
57 | # return super(GradualWarmupScheduler, self).step(epoch)
58 |
59 |
60 | def init_seed(_):
61 | torch.cuda.manual_seed_all(1)
62 | torch.manual_seed(1)
63 | np.random.seed(1)
64 | random.seed(1)
65 | # torch.backends.cudnn.enabled = False
66 | torch.backends.cudnn.deterministic = True
67 | torch.backends.cudnn.benchmark = False
68 |
69 |
70 | def get_parser():
71 | # parameter priority: command line > config > default
72 | parser = argparse.ArgumentParser(
73 | description='Spatial Temporal Graph Convolution Network')
74 | parser.add_argument(
75 | '--work-dir',
76 | default='./work_dir/temp',
77 | help='the work folder for storing results')
78 |
79 | parser.add_argument('-model_saved_name', default='')
80 | parser.add_argument(
81 | '--config',
82 | default='./config/nturgbd-cross-view/test_bone.yaml',
83 | help='path to the configuration file')
84 |
85 | # processor
86 | parser.add_argument(
87 | '--phase', default='train', help='must be train or test')
88 | parser.add_argument(
89 | '--save-score',
90 | type=str2bool,
91 | default=True,
92 |         help='if true, the classification score will be stored')
93 |
94 |     # visualize and debug
95 | parser.add_argument(
96 | '--seed', type=int, default=1, help='random seed for pytorch')
97 | parser.add_argument(
98 | '--log-interval',
99 | type=int,
100 | default=100,
101 | help='the interval for printing messages (#iteration)')
102 | parser.add_argument(
103 | '--save-interval',
104 | type=int,
105 | default=2,
106 | help='the interval for storing models (#iteration)')
107 | parser.add_argument(
108 | '--eval-interval',
109 | type=int,
110 | default=5,
111 | help='the interval for evaluating models (#iteration)')
112 | parser.add_argument(
113 | '--print-log',
114 | type=str2bool,
115 | default=True,
116 | help='print logging or not')
117 | parser.add_argument(
118 | '--show-topk',
119 | type=int,
120 | default=[1, 5],
121 | nargs='+',
122 | help='which Top K accuracy will be shown')
123 |
124 | # feeder
125 | parser.add_argument(
126 | '--feeder', default='feeder.feeder', help='data loader will be used')
127 | parser.add_argument(
128 | '--num-worker',
129 | type=int,
130 | default=32,
131 |         help='the number of workers for the data loader')
132 | parser.add_argument(
133 | '--train-feeder-args',
134 | default=dict(),
135 | help='the arguments of data loader for training')
136 | parser.add_argument(
137 | '--test-feeder-args',
138 | default=dict(),
139 | help='the arguments of data loader for test')
140 |
141 | # model
142 | parser.add_argument('--model', default=None, help='the model will be used')
143 | parser.add_argument(
144 | '--model-args',
145 | type=dict,
146 | default=dict(),
147 | help='the arguments of model')
148 | parser.add_argument(
149 | '--weights',
150 | default=None,
151 | help='the weights for network initialization')
152 | parser.add_argument(
153 | '--ignore-weights',
154 | type=str,
155 | default=[],
156 | nargs='+',
157 | help='the name of weights which will be ignored in the initialization')
158 |
159 | # optim
160 | parser.add_argument(
161 | '--base-lr', type=float, default=0.01, help='initial learning rate')
162 | parser.add_argument(
163 | '--step',
164 | type=int,
165 | default=[20, 40, 60],
166 | nargs='+',
167 |         help='the epochs at which the optimizer reduces the learning rate')
168 |
169 | #training
170 | parser.add_argument(
171 | '--device',
172 | type=int,
173 | default=0,
174 | nargs='+',
175 | help='the indexes of GPUs for training or testing')
176 | parser.add_argument('--optimizer', default='SGD', help='type of optimizer')
177 | parser.add_argument(
178 | '--nesterov', type=str2bool, default=False, help='use nesterov or not')
179 | parser.add_argument(
180 | '--batch-size', type=int, default=256, help='training batch size')
181 | parser.add_argument(
182 | '--test-batch-size', type=int, default=256, help='test batch size')
183 | parser.add_argument(
184 | '--start-epoch',
185 | type=int,
186 | default=0,
187 | help='start training from which epoch')
188 | parser.add_argument(
189 | '--num-epoch',
190 | type=int,
191 | default=80,
192 | help='stop training in which epoch')
193 | parser.add_argument(
194 | '--weight-decay',
195 | type=float,
196 | default=0.0005,
197 | help='weight decay for optimizer')
198 | # loss
199 | parser.add_argument(
200 | '--loss',
201 | type=str,
202 | default='CE',
203 | help='loss type(CE or focal)')
204 | parser.add_argument(
205 | '--label_count_path',
206 | default=None,
207 | type=str,
208 | help='Path to label counts (used in loss weighting)')
209 | parser.add_argument(
210 |         '--beta',
211 | type=float,
212 | default=0.9999,
213 | help='Hyperparameter for Class balanced loss')
214 | parser.add_argument(
215 | '--gamma',
216 | type=float,
217 | default=2.0,
218 | help='Hyperparameter for Focal loss')
219 |
220 | parser.add_argument('--only_train_part', default=False)
221 | parser.add_argument('--only_train_epoch', default=0)
222 | parser.add_argument('--warm_up_epoch', default=0)
223 | return parser
224 |
225 |
226 | class Processor():
227 | """
228 |     Processor for Skeleton-based Action Recognition
229 | """
230 | def __init__(self, arg):
231 | self.arg = arg
232 | self.save_arg()
233 | if arg.phase == 'train':
234 | if not arg.train_feeder_args['debug']:
235 | if os.path.isdir(arg.model_saved_name):
236 | print('log_dir: ', arg.model_saved_name, 'already exist')
237 | # answer = input('delete it? y/n:')
238 | answer = 'y'
239 | if answer == 'y':
240 | print('Deleting dir...')
241 | shutil.rmtree(arg.model_saved_name)
242 | print('Dir removed: ', arg.model_saved_name)
243 | # input('Refresh the website of tensorboard by pressing any keys')
244 | else:
245 | print('Dir not removed: ', arg.model_saved_name)
246 | self.train_writer = SummaryWriter(os.path.join(arg.model_saved_name, 'train'), 'train')
247 | self.val_writer = SummaryWriter(os.path.join(arg.model_saved_name, 'val'), 'val')
248 | else:
249 | self.train_writer = self.val_writer = SummaryWriter(os.path.join(arg.model_saved_name, 'test'), 'test')
250 | self.global_step = 0
251 | self.load_model()
252 | self.load_optimizer()
253 | self.load_data()
254 | self.lr = self.arg.base_lr
255 | self.best_acc = 0
256 | self.best_per_class_acc = 0
257 |
258 | def load_data(self):
259 | Feeder = import_class(self.arg.feeder)
260 | self.data_loader = dict()
261 | if self.arg.phase == 'train':
262 | self.data_loader['train'] = torch.utils.data.DataLoader(
263 | dataset=Feeder(**self.arg.train_feeder_args),
264 | batch_size=self.arg.batch_size,
265 | shuffle=True,
266 | num_workers=self.arg.num_worker,
267 | drop_last=True,
268 | worker_init_fn=init_seed)
269 | self.data_loader['test'] = torch.utils.data.DataLoader(
270 | dataset=Feeder(**self.arg.test_feeder_args),
271 | batch_size=self.arg.test_batch_size,
272 | shuffle=False,
273 | num_workers=self.arg.num_worker,
274 | drop_last=False,
275 | worker_init_fn=init_seed)
276 |
277 | def load_class_weights(self):
278 |         if arg.label_count_path is None:
279 |             raise ValueError('label_count_path is required when using a class-balanced loss')
280 | with open(arg.label_count_path, 'rb') as f:
281 | label_count = pickle.load(f)
282 | img_num_per_cls = []
283 | # ipdb.set_trace()
284 | for cls_idx in range(len(label_count)):
285 | img_num_per_cls.append(int(label_count[cls_idx]))
286 | self.samples_per_class = img_num_per_cls
287 |
288 | def load_model(self):
289 | output_device = self.arg.device[0] if type(self.arg.device) is list else self.arg.device
290 | self.output_device = output_device
291 | Model = import_class(self.arg.model)
292 | shutil.copy2(inspect.getfile(Model), self.arg.work_dir)
293 | print(Model)
294 | self.model = Model(**self.arg.model_args).cuda(output_device)
295 | print(self.model)
296 | self.loss_type = arg.loss
297 | if self.loss_type != 'CE':
298 | self.load_class_weights()
299 |
300 | if self.arg.weights:
301 | self.global_step = int(arg.weights[:-3].split('-')[-1])
302 | self.print_log('Load weights from {}.'.format(self.arg.weights))
303 | if '.pkl' in self.arg.weights:
304 | with open(self.arg.weights, 'r') as f:
305 | weights = pickle.load(f)
306 | else:
307 | weights = torch.load(self.arg.weights)
308 |
309 | weights = OrderedDict(
310 | [[k.split('module.')[-1],
311 | v.cuda(output_device)] for k, v in weights.items()])
312 |
313 | keys = list(weights.keys())
314 | for w in self.arg.ignore_weights:
315 | for key in keys:
316 | if w in key:
317 | if weights.pop(key, None) is not None:
318 |                             self.print_log('Successfully removed weights: {}.'.format(key))
319 |                         else:
320 |                             self.print_log('Could not remove weights: {}.'.format(key))
321 |
322 | try:
323 | self.model.load_state_dict(weights)
324 | except:
325 | state = self.model.state_dict()
326 | diff = list(set(state.keys()).difference(set(weights.keys())))
327 | print('Can not find these weights:')
328 | for d in diff:
329 | print(' ' + d)
330 | state.update(weights)
331 | self.model.load_state_dict(state)
332 |
333 | if type(self.arg.device) is list:
334 | if len(self.arg.device) > 1:
335 | self.model = nn.DataParallel(
336 | self.model,
337 | device_ids=self.arg.device,
338 | output_device=output_device)
339 |
340 | def load_optimizer(self):
341 | if self.arg.optimizer == 'SGD':
342 | self.optimizer = optim.SGD(
343 | self.model.parameters(),
344 | lr=self.arg.base_lr,
345 | momentum=0.9,
346 | nesterov=self.arg.nesterov,
347 | weight_decay=self.arg.weight_decay)
348 | elif self.arg.optimizer == 'Adam':
349 | self.optimizer = optim.Adam(
350 | self.model.parameters(),
351 | lr=self.arg.base_lr,
352 | weight_decay=self.arg.weight_decay)
353 | else:
354 | raise ValueError()
355 |
356 | def save_arg(self):
357 | # save arg
358 | arg_dict = vars(self.arg)
359 | if not os.path.exists(self.arg.work_dir):
360 | os.makedirs(self.arg.work_dir)
361 | with open('{}/config.yaml'.format(self.arg.work_dir), 'w') as f:
362 | yaml.dump(arg_dict, f)
363 |
364 | def adjust_learning_rate(self, epoch):
365 | if self.arg.optimizer == 'SGD' or self.arg.optimizer == 'Adam':
366 | if epoch < self.arg.warm_up_epoch:
367 | lr = self.arg.base_lr * (epoch + 1) / self.arg.warm_up_epoch
368 | else:
369 | lr = self.arg.base_lr * (
370 | 0.1 ** np.sum(epoch >= np.array(self.arg.step)))
371 | for param_group in self.optimizer.param_groups:
372 | param_group['lr'] = lr
373 |
374 | return lr
375 | else:
376 | raise ValueError()
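    # With the parser defaults (base_lr=0.01, warm_up_epoch=0, step=[20, 40, 60]),
    # the schedule above gives lr = 0.01 for epochs 0-19, 1e-3 for 20-39,
    # 1e-4 for 40-59, and 1e-5 from epoch 60 onwards.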
377 |
378 | def print_time(self):
379 | localtime = time.asctime(time.localtime(time.time()))
380 | self.print_log("Local current time : " + localtime)
381 |
382 |     def print_log(self, msg, print_time=True):
383 |         if print_time:
384 |             localtime = time.asctime(time.localtime(time.time()))
385 |             msg = "[ " + localtime + ' ] ' + msg
386 |         print(msg)
387 |         if self.arg.print_log:
388 |             with open('{}/log.txt'.format(self.arg.work_dir), 'a') as f:
389 |                 print(msg, file=f)
390 |
391 | def record_time(self):
392 | self.cur_time = time.time()
393 | return self.cur_time
394 |
395 | def split_time(self):
396 | split_time = time.time() - self.cur_time
397 | self.record_time()
398 | return split_time
399 |
400 | def train(self, epoch, wb_dict, save_model=False):
401 | self.model.train()
402 | self.print_log('Training epoch: {}'.format(epoch + 1))
403 | loader = self.data_loader['train']
404 | self.adjust_learning_rate(epoch)
405 |
406 | loss_value, batch_acc, batch_per_class_acc = [], [], []
407 | self.train_writer.add_scalar('epoch', epoch, self.global_step)
408 | self.record_time()
409 | timer = dict(dataloader=0.001, model=0.001, statistics=0.001)
410 | process = tqdm(loader)
411 | if self.arg.only_train_part:
412 | if epoch > self.arg.only_train_epoch:
413 | print('only train part, require grad')
414 | for key, value in self.model.named_parameters():
415 | if 'PA' in key:
416 | value.requires_grad = True
417 | else:
418 | print('only train part, do not require grad')
419 | for key, value in self.model.named_parameters():
420 | if 'PA' in key:
421 | value.requires_grad = False
422 |
423 | nb_classes = self.arg.model_args['num_class']
424 | confusion_matrix = torch.zeros(nb_classes, nb_classes)
425 | for batch_idx, (data, label, sid, seg_id, chunk_n, anntr_id, index) in enumerate(process):
426 |
427 | self.global_step += 1
428 | # get data
429 | data = Variable(data.float().cuda(self.output_device), requires_grad=False)
430 | label = Variable(label.long().cuda(self.output_device), requires_grad=False)
431 | timer['dataloader'] += self.split_time()
432 |
433 | # forward
434 | output = self.model(data)
435 |
436 | if self.loss_type == "CE":
437 | l_type = nn.CrossEntropyLoss()
438 | loss = l_type(output, label)
439 | else:
440 | loss = CB_loss(label, output,
441 | self.samples_per_class,
442 | nb_classes, self.loss_type,
443 | self.arg.beta,
444 | self.arg.gamma,
445 | self.arg.device[0]
446 | )
447 |
448 | # backward
449 | self.optimizer.zero_grad()
450 | loss.backward()
451 | self.optimizer.step()
452 | loss_value.append(loss.data.item())
453 | timer['model'] += self.split_time()
454 |
455 | # Compute per-class acc.
456 | value, predict_label = torch.max(output.data, 1)
457 | for t, p in zip(label.view(-1), predict_label.view(-1)):
458 | confusion_matrix[t.long(), p.long()] += 1
459 |
460 | # Acc.
461 | acc = torch.mean((predict_label == label.data).float())
462 | batch_acc.append(acc.item())
463 | self.train_writer.add_scalar('acc', acc, self.global_step)
464 | self.train_writer.add_scalar('loss', loss.data.item(), self.global_step)
465 |
466 | # statistics
467 | self.lr = self.optimizer.param_groups[0]['lr']
468 | self.train_writer.add_scalar('lr', self.lr, self.global_step)
469 | # if self.global_step % self.arg.log_interval == 0:
470 | # self.print_log(
471 | # '\tBatch({}/{}) done. Loss: {:.4f} lr:{:.6f}'.format(
472 | # batch_idx, len(loader), loss.data[0], lr))
473 | timer['statistics'] += self.split_time()
474 |
475 | per_class_acc_vals = confusion_matrix.diag()/confusion_matrix.sum(1)
476 | per_class_acc = torch.mean(per_class_acc_vals).float()
477 |
478 | # statistics of time consumption and loss
479 | proportion = {
480 | k: '{:02d}%'.format(int(round(v * 100 / sum(timer.values()))))
481 | for k, v in timer.items()
482 | }
483 | self.print_log(
484 | '\tMean training loss: {:.4f}.'.format(np.mean(loss_value)))
485 | self.print_log('\tTop-1-norm: {:.3f}%'.format(100*per_class_acc))
486 |
487 | # Log
488 | wb_dict['train loss'] = np.mean(loss_value)
489 | wb_dict['train acc'] = np.mean(batch_acc)
490 |
491 | if save_model:
492 | state_dict = self.model.state_dict()
493 | weights = OrderedDict([[k.split('module.')[-1],
494 | v.cpu()] for k, v in state_dict.items()])
495 |
496 | torch.save(weights, self.arg.model_saved_name + '-' + str(epoch) + '-' + str(int(self.global_step)) + '.pt')
497 |
498 | return wb_dict
499 |
500 | @torch.no_grad()
501 | def eval(self, epoch,
502 | wb_dict,
503 | save_score=True,
504 | loader_name=['test'],
505 | wrong_file=None,
506 | result_file=None
507 | ):
508 | if wrong_file is not None:
509 | f_w = open(wrong_file, 'w')
510 | if result_file is not None:
511 | f_r = open(result_file, 'w')
512 | self.model.eval()
513 | self.print_log('Eval epoch: {}'.format(epoch + 1))
514 | for ln in loader_name:
515 | loss_value = []
516 | score_frag = []
517 | pred_label_list = []
518 | step = 0
519 | nb_classes = self.arg.model_args['num_class']
520 | confusion_matrix = torch.zeros(nb_classes, nb_classes)
521 | process = tqdm(self.data_loader[ln])
522 | for batch_idx, (data, label, sid, seg_id, chunk_n, anntr_id, index) in enumerate(process):
523 | data = Variable(
524 | data.float().cuda(self.output_device),
525 | requires_grad=False)
526 | # volatile=True)
527 | label = Variable(
528 | label.long().cuda(self.output_device),
529 | requires_grad=False)
530 | # volatile=True)
531 | output = self.model(data)
532 |
533 | if self.loss_type == "CE":
534 | l_type = nn.CrossEntropyLoss()
535 | loss = l_type(output, label)
536 | else:
537 | loss = CB_loss(label, output,
538 | self.samples_per_class,
539 | nb_classes, self.loss_type,
540 | self.arg.beta,
541 | self.arg.gamma,
542 | self.arg.device[0]
543 | )
544 | # Store outputs
545 | logits = output.data.cpu().numpy()
546 | score_frag.append(logits)
547 | loss_value.append(loss.data.item())
548 |
549 | _, predict_label = torch.max(output.data, 1)
550 | pred_label_list.append(predict_label)
551 |
552 | step += 1
553 |
554 | # Compute per-class acc.
555 | for t, p in zip(label.view(-1), predict_label.view(-1)):
556 | confusion_matrix[t.long(), p.long()] += 1
557 | if wrong_file is not None or result_file is not None:
558 | predict = list(predict_label.cpu().numpy())
559 | true = list(label.data.cpu().numpy())
560 | for i, x in enumerate(predict):
561 | if result_file is not None:
562 | f_r.write(str(x) + ',' + str(true[i]) + '\n')
563 | if x != true[i] and wrong_file is not None:
564 | f_w.write(str(index[i]) + ',' + str(x) + ',' + str(true[i]) + '\n')
565 | per_class_acc_vals = confusion_matrix.diag()/confusion_matrix.sum(1)
566 | per_class_acc = torch.mean(per_class_acc_vals).float()
567 | score = np.concatenate(score_frag)
568 | loss = np.mean(loss_value)
569 |
570 | accuracy = self.data_loader[ln].dataset.top_k(score, 1)
571 | topk_scores = { k: self.data_loader[ln].dataset.top_k(score, k) \
572 | for k in self.arg.show_topk }
573 |
574 | wb_dict['val loss'] = loss
575 | wb_dict['val acc'] = accuracy
576 | wb_dict['val per class acc'] = per_class_acc
577 | for k in topk_scores:
578 | wb_dict['val top{0} score'.format(k)] = topk_scores[k]
579 |
580 | if accuracy > self.best_acc:
581 | self.best_acc = accuracy
582 | if per_class_acc > self.best_per_class_acc:
583 | self.best_per_class_acc = per_class_acc
584 |
585 | print('Accuracy: ', accuracy, ' model: ', self.arg.model_saved_name)
586 | if self.arg.phase == 'train':
587 | self.val_writer.add_scalar('loss', loss, self.global_step)
588 | self.val_writer.add_scalar('acc', accuracy, self.global_step)
589 | self.val_writer.add_scalar('per_class_acc', per_class_acc , self.global_step)
590 |
591 | pred_scores = list(zip(
592 | self.data_loader[ln].dataset.label[1], # sid
593 | self.data_loader[ln].dataset.sample_name, # seg_id
594 | self.data_loader[ln].dataset.label[2], # chunk_id
595 | score))
596 |
597 | self.print_log('\tMean {} loss of {} batches: {}.'.format(
598 | ln, len(self.data_loader[ln]), np.mean(loss_value)))
599 | self.print_log('\tTop-1-norm: {:.3f}%'.format(100*per_class_acc))
600 | for k in topk_scores:
601 | self.print_log('\tTop{}: {:.3f}%'.format(k, 100*topk_scores[k]))
602 |
603 | if save_score:
604 | with open('{}/epoch{}_{}_score.pkl'.format(
605 | self.arg.work_dir, epoch + 1, ln), 'wb') as f:
606 | pickle.dump(pred_scores, f)
607 | return wb_dict
608 |
609 | def start(self):
610 | wb_dict = {}
611 | if self.arg.phase == 'train':
612 | self.print_log('Parameters:\n{}\n'.format(str(vars(self.arg))))
613 | self.global_step = self.arg.start_epoch * len(self.data_loader['train']) / self.arg.batch_size
614 |
615 | for epoch in range(self.arg.start_epoch, self.arg.num_epoch):
616 |
617 | save_model = ((epoch + 1) % self.arg.save_interval == 0) or (
618 | epoch + 1 == self.arg.num_epoch)
619 |
620 | # Wandb logging
621 | wb_dict = {'lr': self.lr}
622 |
623 | # Train
624 | wb_dict = self.train(epoch, wb_dict, save_model=save_model)
625 |
626 | # Eval. on val set
627 | wb_dict = self.eval(
628 | epoch,
629 | wb_dict,
630 | save_score=self.arg.save_score,
631 | loader_name=['test'])
632 | # Log stats. for this epoch
633 | print('Epoch: {0}\nMetrics: {1}'.format(epoch, wb_dict))
634 |
635 | print('best accuracy: ', self.best_acc, ' model_name: ', self.arg.model_saved_name)
636 |
637 | elif self.arg.phase == 'test':
638 | if not self.arg.test_feeder_args['debug']:
639 | wf = self.arg.model_saved_name + '_wrong.txt'
640 | rf = self.arg.model_saved_name + '_right.txt'
641 | else:
642 | wf = rf = None
643 | if self.arg.weights is None:
644 |                 raise ValueError('Please specify --weights.')
645 | self.arg.print_log = False
646 | self.print_log('Model: {}.'.format(self.arg.model))
647 | self.print_log('Weights: {}.'.format(self.arg.weights))
648 |
649 | wb_dict = self.eval(epoch=0, wb_dict=wb_dict,
650 | save_score=self.arg.save_score,
651 | loader_name=['test'],
652 | wrong_file=wf,
653 | result_file=rf
654 | )
655 | print('Inference metrics: ', wb_dict)
656 | self.print_log('Done.\n')
657 |
658 |
659 | def str2bool(v):
660 | if v.lower() in ('yes', 'true', 't', 'y', '1'):
661 | return True
662 | elif v.lower() in ('no', 'false', 'f', 'n', '0'):
663 | return False
664 | else:
665 | raise argparse.ArgumentTypeError('Boolean value expected.')
666 |
667 |
668 | def import_class(name):
669 | components = name.split('.')
670 | mod = __import__(components[0])
671 | for comp in components[1:]:
672 | mod = getattr(mod, comp)
673 | return mod
674 |
675 |
676 | if __name__ == '__main__':
677 | parser = get_parser()
678 |
679 |     # load args from the config file
680 | p = parser.parse_args()
681 | if p.config is not None:
682 | with open(p.config, 'r') as f:
683 |             default_arg = yaml.load(f, Loader=yaml.FullLoader)
684 | key = vars(p).keys()
685 | for k in default_arg.keys():
686 | if k not in key:
687 | print('WRONG ARG: {}'.format(k))
688 | assert (k in key)
689 | parser.set_defaults(**default_arg)
690 |
691 | arg = parser.parse_args()
692 | print('BABEL Action Recognition')
693 | print('Config: ', arg)
694 | init_seed(0)
695 | processor = Processor(arg)
696 | processor.start()
697 |
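698 | # Example invocations (using the configs shipped under ./config/babel_v1.0/):
699 | #   python train_test.py --config ./config/babel_v1.0/train_60.yaml
700 | #   python train_test.py --config ./config/babel_v1.0/test_60.yaml
701 | # Any key in the YAML config can also be overridden on the command line, since the
702 | # config values are applied through parser.set_defaults() above.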
--------------------------------------------------------------------------------