├── checkpoint └── loss_3d.png ├── Documentation on the code.pdf ├── README.md ├── data └── prepare_data_mpi_inf_3dhp.py ├── common └── mpi_inf_3dhp_dataset.py └── run.py /checkpoint/loss_3d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zemingxie/MPI_INF_3DHP-on-VideoPose3D/HEAD/checkpoint/loss_3d.png -------------------------------------------------------------------------------- /Documentation on the code.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zemingxie/MPI_INF_3DHP-on-VideoPose3D/HEAD/Documentation on the code.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MPI_INF_3DHP-on-VideoPose3D 2 | 3 | ## Instructions 4 | Copy and paste these files into your VideoPose3D directory. 5 | The rest follows the same procedure as VideoPose3D. 6 | ### Setup dataset 7 | Go to the [mpi_inf_3dhp website](http://vcai.mpi-inf.mpg.de/3dhp-dataset/), follow the instructions there to download the dataset, and run the following from the `data` directory: 8 | ```bash 9 | python prepare_data_mpi_inf_3dhp.py --from-source path/to/mpi_inf_3dhp/dataset 10 | ``` 11 | 12 | ### Running 13 | To achieve the same result as in the picture below, run the following command: 14 | ```bash 15 | python run.py -d mpi_inf_3dhp -k gt -str S1,S2,S3,S4,S5,S6,S7,S8 -ste TS1,TS2,TS3,TS4,TS5,TS6 --export-training-curves -b 256 -e 200 -lrd 0.98 16 | ``` 17 | 18 | ## Result 19 | ![Training loss (loss_3d)](checkpoint/loss_3d.png)
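To re-evaluate a trained model without retraining, point run.py at one of the checkpoint files it writes (a sketch using the standard VideoPose3D `--evaluate` flag; `epoch_200.bin` is the name a 200-epoch run saves, assuming the default checkpoint frequency divides 200):
```bash
python run.py -d mpi_inf_3dhp -k gt -str S1,S2,S3,S4,S5,S6,S7,S8 -ste TS1,TS2,TS3,TS4,TS5,TS6 --evaluate epoch_200.bin
```

The prepared archives can also be inspected directly with NumPy (a quick sketch; the file names are the ones `prepare_data_mpi_inf_3dhp.py` writes into `data/`):
```python
import numpy as np

# 3D poses: dict of subject -> sequence -> list over cameras of (n_frames, 17, 3) arrays
data_3d = np.load('data/data_3d_mpi_inf_3dhp.npz', allow_pickle=True)['positions_3d'].item()
print(data_3d['S1'].keys())   # dict_keys(['Seq1', 'Seq2'])

# 2D ground-truth keypoints plus skeleton metadata
archive_2d = np.load('data/data_2d_mpi_inf_3dhp_gt.npz', allow_pickle=True)
print(archive_2d['metadata'].item()['num_joints'])                 # 17
print(archive_2d['positions_2d'].item()['TS1']['Test'][0].shape)   # (n_frames, 17, 2)
```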

20 | 21 | ## Acknowledgement 22 | Part of my code is borrowed from [VideoPose3D](https://github.com/facebookresearch/VideoPose3D). I thank the authors for releasing the code. 23 | -------------------------------------------------------------------------------- /data/prepare_data_mpi_inf_3dhp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import argparse 9 | import os 10 | import numpy as np 11 | import h5py 12 | 13 | import sys 14 | sys.path.append('../') 15 | from common.mpi_inf_3dhp_dataset import MpiInf3dhpDataset 16 | from common.camera import project_to_2d, image_coordinates 17 | from common.utils import wrap 18 | 19 | output_filename = 'data_3d_mpi_inf_3dhp' 20 | output_filename_2d = 'data_2d_mpi_inf_3dhp_gt' 21 | output_filename_2d2 = 'data_2d_mpi_inf_3dhp_computed_gt' 22 | subjects_train = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8'] 23 | subjects_test = ['TS1', 'TS2', 'TS3', 'TS4', 'TS5', 'TS6'] 24 | joint_idx_train_matlab = [8, 6, 15, 16, 17, 10, 11, 12, 24, 25, 26, 19, 20, 21, 5, 4, 7] # note: 1-based MATLAB indices, converted to 0-based below 25 | joint_idx_train = [i-1 for i in joint_idx_train_matlab] 26 | 27 | if __name__ == '__main__': 28 | if os.path.basename(os.getcwd()) != 'data': 29 | print('This script must be launched from the "data" directory') 30 | exit(0) 31 | 32 | parser = argparse.ArgumentParser(description='MPI_INF_3DHP dataset converter') 33 | 34 | # Convert the dataset from the original source (the MPI_INF_3DHP dataset path must be specified manually) 35 | # The annotations are read directly from the .mat files shipped with the release (annot.mat for training, annot_data.mat for the test set) 36 | parser.add_argument('--from-source', default='', type=str, metavar='PATH', help='convert original dataset') 37 | 38 | args = parser.parse_args() 39 | 40 | if os.path.exists(output_filename + '.npz'): 41 | print('The dataset already exists at', output_filename + '.npz') 42 | exit(0) 43 | 44 | if args.from_source: 45 | print('Converting original MPI_INF_3DHP dataset from', args.from_source) 46 | output = {} 47 | output_2d_poses = {} 48 | from scipy.io import loadmat 49 | 50 | for subject in subjects_train: 51 | output[subject] = {} 52 | output_2d_poses[subject] = {} 53 | file_1 = args.from_source + '/' + subject + '/Seq1/annot.mat' 54 | file_2 = args.from_source + '/' + subject + '/Seq2/annot.mat' 55 | hf = loadmat(file_1) 56 | positions_3d_temp = [] 57 | positions_2d_temp = [] 58 | 59 | for index in range(14): # Iterate over the 14 training cameras 60 | positions = hf['annot3'][index, 0].reshape(-1, 28, 3) 61 | positions /= 1000 # Meters instead of millimeters 62 | positions_17 = positions[:,joint_idx_train,:] 63 | positions_17[:, 1:] -= positions_17[:, :1] # Remove global offset, but keep trajectory in first position 64 | positions_3d_temp.append(positions_17.astype('float32')) 65 | positions_2d = hf['annot2'][index, 0].reshape(-1, 28, 2) 66 | positions_2d_temp.append(positions_2d[:,joint_idx_train,:].astype('float32')) 67 | 68 | output[subject]['Seq1'] = positions_3d_temp 69 | output_2d_poses[subject]['Seq1'] = positions_2d_temp 70 | 71 | positions_3d_temp = [] 72 | positions_2d_temp = [] 73 | hf = loadmat(file_2) 74 | for index in range(14): 75 | positions = hf['annot3'][index, 0].reshape(-1, 28, 3) 76 | positions /= 1000 # Meters instead of millimeters 77 | positions_17 = positions[:,joint_idx_train,:] 78 
| positions_17[:, 1:] -= positions_17[:, :1] # Remove global offset, but keep trajectory in first position 79 | positions_3d_temp.append(positions_17.astype('float32')) 80 | positions_2d = hf['annot2'][index, 0].reshape(-1, 28, 2) 81 | positions_2d_temp.append(positions_2d[:,joint_idx_train,:].astype('float32')) 82 | output[subject]['Seq2'] = positions_3d_temp 83 | output_2d_poses[subject]['Seq2'] = positions_2d_temp 84 | 85 | for subject in subjects_test: 86 | output[subject] = {} 87 | output_2d_poses[subject] = {} 88 | file_1 = args.from_source + '/mpi_inf_3dhp_test_set/mpi_inf_3dhp_test_set/' + subject + '/annot_data.mat' 89 | hf = {} 90 | f = h5py.File(file_1, 'r') # The test annotations are MATLAB v7.3 (HDF5) files, hence h5py; open read-only 91 | for k, v in f.items(): 92 | hf[k] = np.array(v) 93 | positions = hf['annot3'].reshape(-1, 17, 3) 94 | positions /= 1000 # Meters instead of millimeters 95 | positions_17 = positions # The test annotations already use the 17-joint format 96 | positions_17[:, 1:] -= positions_17[:, :1] # Remove global offset, but keep trajectory in first position 97 | output[subject]['Test'] = [positions_17.astype('float32')] 98 | positions_2d = hf['annot2'].reshape(-1, 17, 2) 99 | output_2d_poses[subject]['Test'] = [positions_2d.astype('float32')] 100 | 101 | print('Saving...') 102 | np.savez_compressed(output_filename, positions_3d=output) 103 | print('') 104 | print('Getting 2D poses...') 105 | dataset = MpiInf3dhpDataset(output_filename + '.npz') 106 | metadata = { 107 | 'num_joints': dataset.skeleton().num_joints(), 108 | 'keypoints_symmetry': [dataset.skeleton().joints_left(), dataset.skeleton().joints_right()] 109 | } 110 | print('Saving...') 111 | np.savez_compressed(output_filename_2d, positions_2d=output_2d_poses, metadata=metadata) 112 | 113 | print('Done.') 114 | else: 115 | print('Please specify the dataset source') 116 | exit(0) 117 | ''' 118 | # Create 2D pose file by reprojecting the 3D poses (kept for reference, currently disabled) 119 | print('') 120 | print('Computing ground-truth 2D poses...') 121 | dataset = MpiInf3dhpDataset(output_filename + '.npz') 122 | output_2d_poses = {} 123 | for subject in dataset.subjects(): 124 | output_2d_poses[subject] = {} 125 | for action in dataset[subject].keys(): 126 | anim = dataset[subject][action] 127 | 128 | positions_2d = [] 129 | for i,cam in enumerate(anim['cameras']): 130 | pos_3d = anim['positions'][i] 131 | pos_2d = wrap(project_to_2d, pos_3d, cam['intrinsic'], unsqueeze=True) 132 | pos_2d_pixel_space = image_coordinates(pos_2d, w=cam['res_w'], h=cam['res_h']) 133 | positions_2d.append(pos_2d_pixel_space.astype('float32')) 134 | output_2d_poses[subject][action] = positions_2d 135 | 136 | print('Saving...') 137 | metadata = { 138 | 'num_joints': dataset.skeleton().num_joints(), 139 | 'keypoints_symmetry': [dataset.skeleton().joints_left(), dataset.skeleton().joints_right()] 140 | } 141 | np.savez_compressed(output_filename_2d2, positions_2d=output_2d_poses, metadata=metadata) 142 | ''' 143 | 144 | -------------------------------------------------------------------------------- /common/mpi_inf_3dhp_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | import numpy as np 9 | import copy 10 | from common.skeleton import Skeleton 11 | from common.mocap_dataset import MocapDataset 12 | from common.camera import normalize_screen_coordinates, image_coordinates 13 | 14 | mpi_inf_3dhp_skeleton = Skeleton(parents=[-1, 0, 1, 2, 3, 4, 0, 6, 7, 8, 9, 0, 11, 12, 13, 14, 12, 15 | 16, 17, 18, 19, 20, 19, 22, 12, 24, 25, 26, 27, 28, 27, 30], 16 | joints_left=[6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 21, 22, 23], 17 | joints_right=[1, 2, 3, 4, 5, 24, 25, 26, 27, 28, 29, 30, 31]) 18 | 19 | subjects_train = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8'] 20 | subjects_test1 = ['TS1', 'TS2', 'TS3', 'TS4'] 21 | subjects_test2 = ['TS5', 'TS6'] 22 | 23 | mpi_inf_3dhp_cameras_intrinsic_params = [ 24 | { 25 | 'id': 'cam_0', 26 | 'center': [1024.704, 1051.394], 27 | 'focal_length': [1497.693, 1497.103], 28 | 'radial_distortion': [0, 0, 0], 29 | 'tangential_distortion': [0, 0], 30 | 'res_w': 2048, 31 | 'res_h': 2048, 32 | 'azimuth': 70, # Only used for visualization 33 | }, 34 | { 35 | 'id': 'cam_1', 36 | 'center': [1030.519, 1052.626], 37 | 'focal_length': [1495.217, 1495.52], 38 | 'radial_distortion': [0, 0, 0], 39 | 'tangential_distortion': [0, 0], 40 | 'res_w': 2048, 41 | 'res_h': 2048, 42 | 'azimuth': 70, # Only used for visualization 43 | }, 44 | { 45 | 'id': 'cam_2', 46 | 'center': [983.8873, 987.5902], 47 | 'focal_length': [1495.587, 1497.828], 48 | 'radial_distortion': [0, 0, 0], 49 | 'tangential_distortion': [0, 0], 50 | 'res_w': 2048, 51 | 'res_h': 2048, 52 | 'azimuth': 70, # Only used for visualization 53 | }, 54 | { 55 | 'id': 'cam_3', 56 | 'center': [1029.06, 1041.409], 57 | 'focal_length': [1495.886, 1496.033], 58 | 'radial_distortion': [0, 0, 0], 59 | 'tangential_distortion': [0, 0], 60 | 'res_w': 2048, 61 | 'res_h': 2048, 62 | 'azimuth': -110, # Only used for visualization 63 | }, 64 | { 65 | 'id': 'cam_4', 66 | 'center': [987.6075, 1019.069], 67 | 'focal_length': [1490.952, 1491.108], 68 | 'radial_distortion': [0, 0, 0], 69 | 'tangential_distortion': [0, 0], 70 | 'res_w': 2048, 71 | 'res_h': 2048, 72 | 'azimuth': 70, # Only used for visualization 73 | }, 74 | { 75 | 'id': 'cam_5', 76 | 'center': [1012.331, 998.5009], 77 | 'focal_length': [1500.414, 1499.971], 78 | 'radial_distortion': [0, 0, 0], 79 | 'tangential_distortion': [0, 0], 80 | 'res_w': 2048, 81 | 'res_h': 2048, 82 | 'azimuth': 70, # Only used for visualization 83 | }, 84 | { 85 | 'id': 'cam_6', 86 | 'center': [999.7319, 1010.251], 87 | 'focal_length': [1498.471, 1498.8], 88 | 'radial_distortion': [0, 0, 0], 89 | 'tangential_distortion': [0, 0], 90 | 'res_w': 2048, 91 | 'res_h': 2048, 92 | 'azimuth': 70, # Only used for visualization 93 | }, 94 | { 95 | 'id': 'cam_7', 96 | 'center': [987.2716, 976.8773], 97 | 'focal_length': [1498.831, 1499.674], 98 | 'radial_distortion': [0, 0, 0], 99 | 'tangential_distortion': [0, 0], 100 | 'res_w': 2048, 101 | 'res_h': 2048, 102 | 'azimuth': 70, # Only used for visualization 103 | }, 104 | { 105 | 'id': 'cam_8', 106 | 'center': [1017.387, 1043.032], 107 | 'focal_length': [1500.172, 1500.837], 108 | 'radial_distortion': [0, 0, 0], 109 | 'tangential_distortion': [0, 0], 110 | 'res_w': 2048, 111 | 'res_h': 2048, 112 | 'azimuth': 70, # Only used for visualization 113 | }, 114 | { 115 | 'id': 'cam_9', 116 | 'center': [1010.423, 1037.096], 117 | 'focal_length': [1501.554, 1501.9], 118 | 'radial_distortion': [0, 0, 0], 119 | 'tangential_distortion': [0, 0], 120 | 'res_w': 2048, 121 | 'res_h': 2048, 122 | 'azimuth': 70, # Only used for 
visualization 123 | }, 124 | { 125 | 'id': 'cam_10', 126 | 'center': [1041.614, 997.0433], 127 | 'focal_length': [1498.423, 1498.585], 128 | 'radial_distortion': [0, 0, 0], 129 | 'tangential_distortion': [0, 0], 130 | 'res_w': 2048, 131 | 'res_h': 2048, 132 | 'azimuth': 70, # Only used for visualization 133 | }, 134 | { 135 | 'id': 'cam_11', 136 | 'center': [1009.802, 999.9984], 137 | 'focal_length': [1495.779, 1493.703], 138 | 'radial_distortion': [0, 0, 0], 139 | 'tangential_distortion': [0, 0], 140 | 'res_w': 2048, 141 | 'res_h': 2048, 142 | 'azimuth': 70, # Only used for visualization 143 | }, 144 | { 145 | 'id': 'cam_12', 146 | 'center': [1000.56, 1014.975], 147 | 'focal_length': [1501.326, 1501.491], 148 | 'radial_distortion': [0, 0, 0], 149 | 'tangential_distortion': [0, 0], 150 | 'res_w': 2048, 151 | 'res_h': 2048, 152 | 'azimuth': 70, # Only used for visualization 153 | }, 154 | { 155 | 'id': 'cam_13', 156 | 'center': [1005.702, 1004.214], 157 | 'focal_length': [1496.961, 1497.378], 158 | 'radial_distortion': [0, 0, 0], 159 | 'tangential_distortion': [0, 0], 160 | 'res_w': 2048, 161 | 'res_h': 2048, 162 | 'azimuth': 70, # Only used for visualization 163 | }, 164 | { 165 | 'id': 'TS56', 166 | 'center': [939.85754016, 560.140743168], 167 | 'focal_length': [1683.98345952, 1672.59370772], 168 | 'radial_distortion': [-0.276859611, 0.131125256, -0.049318332], 169 | 'tangential_distortion': [-0.000360494, -0.001149441], 170 | 'res_w': 1920, 171 | 'res_h': 1080, 172 | 'azimuth': 70, # Only used for visualization 173 | }, 174 | ] 175 | 176 | mpi_inf_3dhp_cameras_extrinsic_params = { 177 | 'Train': [ 178 | { 179 | 'orientation': [0.9910573, 0.0000989, 0.1322565, -0.017709], 180 | 'translation': [-562.8666, 1398.138, 3852.623], 181 | }, 182 | { 183 | 'orientation': [0.8882246, -0.0698901, 0.4388433, -0.1165721], 184 | 'translation': [-1429.856, 738.1779, 4897.966], 185 | }, 186 | { 187 | 'orientation': [0.5651277, -0.0301201, 0.824319, -0.0148915], 188 | 'translation': [57.25702, 1307.287, 2799.822], 189 | }, 190 | { 191 | 'orientation': [0.6670245, -0.1827152, 0.7089925, -0.1379241], 192 | 'translation': [-284.8168, 807.9184, 3177.16], 193 | }, 194 | { 195 | 'orientation': [0.8273998, 0.0263385, 0.5589656, -0.0476783], 196 | 'translation': [-1563.911, 801.9608, 3517.316], 197 | }, 198 | { 199 | 'orientation': [-0.568842, 0.0159665, 0.8220693, -0.0191314], 200 | 'translation': [358.4134, 994.5658, 3439.832], 201 | }, 202 | { 203 | 'orientation': [0.2030824, -0.2818073, 0.9370704, -0.0352313], 204 | 'translation': [569.4388, 528.871, 3687.369], 205 | }, 206 | { 207 | 'orientation': [0.00086, 0.0123344, 0.9998223, -0.0142292], 208 | 'translation': [1378.866, 1270.781, 2631.567], 209 | }, 210 | { 211 | 'orientation': [0.7053718, 0.095632, -0.7004048, -0.0523286], 212 | 'translation': [221.3543, 659.87, 3644.688], 213 | }, 214 | { 215 | 'orientation': [0.6914033, 0.2036966, -0.6615312, -0.2069921], 216 | 'translation': [388.6217, 137.5452, 4216.635], 217 | }, 218 | { 219 | 'orientation': [-0.2266321, -0.2540748, 0.9401911, -0.0111636], 220 | 'translation': [1167.962, 617.6362, 4472.351], 221 | }, 222 | { 223 | 'orientation': [-0.4536946, -0.2035304, -0.0072578, 0.8675736], 224 | 'translation': [134.8272, 251.5094, 4570.244], 225 | }, 226 | { 227 | 'orientation': [-0.0778876, 0.8469901, -0.4230185, 0.3124046], 228 | 'translation': [412.4695, 532.7588, 4887.095], 229 | }, 230 | { 231 | 'orientation': [0.098712, 0.8023286, -0.5397436, -0.2349501], 232 | 'translation': [867.1278, 827.4572, 
3985.159], 233 | }, 234 | ], 235 | 'chestHeight': [ 236 | { 237 | 'orientation': [0.7053718, 0.095632, -0.7004048, -0.0523286], 238 | 'translation': [221.3543, 659.87, 3644.688], 239 | }, 240 | ], 241 | } 242 | 243 | 244 | class MpiInf3dhpDataset(MocapDataset): 245 | def __init__(self, path, remove_static_joints=True): 246 | super().__init__(fps=25, skeleton=mpi_inf_3dhp_skeleton) 247 | 248 | self._cameras = {} 249 | 250 | for subject in subjects_train: 251 | self._cameras[subject] = copy.deepcopy(mpi_inf_3dhp_cameras_extrinsic_params['Train']) 252 | 253 | for cameras in self._cameras.values(): 254 | for i, cam in enumerate(cameras): 255 | cam.update(mpi_inf_3dhp_cameras_intrinsic_params[i]) 256 | for k, v in cam.items(): 257 | if k not in ['id', 'res_w', 'res_h']: 258 | cam[k] = np.array(v, dtype='float32') 259 | 260 | # Normalize camera frame 261 | cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32') 262 | cam['focal_length'] = cam['focal_length']/cam['res_w']*2 263 | if 'translation' in cam: 264 | cam['translation'] = cam['translation']/1000 # mm to meters 265 | 266 | # Add intrinsic parameters vector 267 | cam['intrinsic'] = np.concatenate((cam['focal_length'], 268 | cam['center'], 269 | cam['radial_distortion'], 270 | cam['tangential_distortion'])) 271 | 272 | for subject in subjects_test1: # TS1-TS4 (studio test sequences) reuse the intrinsics of cam_8 273 | self._cameras[subject] = copy.deepcopy(mpi_inf_3dhp_cameras_extrinsic_params['chestHeight']) 274 | cam = self._cameras[subject][0] 275 | cam.update(mpi_inf_3dhp_cameras_intrinsic_params[8]) 276 | for k, v in cam.items(): 277 | if k not in ['id', 'res_w', 'res_h']: 278 | cam[k] = np.array(v, dtype='float32') 279 | 280 | # Normalize camera frame 281 | cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32') 282 | cam['focal_length'] = cam['focal_length']/cam['res_w']*2 283 | if 'translation' in cam: 284 | cam['translation'] = cam['translation']/1000 # mm to meters 285 | 286 | # Add intrinsic parameters vector 287 | cam['intrinsic'] = np.concatenate((cam['focal_length'], 288 | cam['center'], 289 | cam['radial_distortion'], 290 | cam['tangential_distortion'])) 291 | 292 | for subject in subjects_test2: # TS5-TS6 (outdoor test sequences) use the 1920x1080 'TS56' intrinsics 293 | self._cameras[subject] = copy.deepcopy(mpi_inf_3dhp_cameras_extrinsic_params['chestHeight']) 294 | cam = self._cameras[subject][0] 295 | cam.update(mpi_inf_3dhp_cameras_intrinsic_params[14]) 296 | for k, v in cam.items(): 297 | if k not in ['id', 'res_w', 'res_h']: 298 | cam[k] = np.array(v, dtype='float32') 299 | 300 | # Normalize camera frame 301 | cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32') 302 | cam['focal_length'] = cam['focal_length']/cam['res_w']*2 303 | if 'translation' in cam: 304 | cam['translation'] = cam['translation']/1000 # mm to meters 305 | 306 | # Add intrinsic parameters vector 307 | cam['intrinsic'] = np.concatenate((cam['focal_length'], 308 | cam['center'], 309 | cam['radial_distortion'], 310 | cam['tangential_distortion'])) 311 | 312 | # Load serialized dataset 313 | data = np.load(path, allow_pickle=True)['positions_3d'].item() 314 | 315 | self._data = {} 316 | 317 | for subject, actions in data.items(): 318 | self._data[subject] = {} 319 | for action_name, positions in actions.items(): 320 | self._data[subject][action_name] = { 321 | 'positions': positions, 322 | 'cameras': self._cameras[subject], 323 | } 324 | 325 | if remove_static_joints: 326 | # Bring the skeleton to 17 joints instead of the 
original 32 327 | self._skeleton.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31]) 328 | 329 | # Rewire shoulders to the correct parents 330 | self._skeleton._parents[11] = 8 331 | self._skeleton._parents[14] = 8 332 | 333 | 334 | def supports_semi_supervised(self): 335 | return True 336 | 337 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import numpy as np 9 | 10 | from common.arguments import parse_args 11 | import torch 12 | 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | import torch.optim as optim 16 | import os 17 | import sys 18 | import errno 19 | 20 | from common.camera import * 21 | from common.model import * 22 | from common.loss import * 23 | from common.generators import ChunkedGenerator, UnchunkedGenerator 24 | from time import time 25 | from common.utils import deterministic_random 26 | 27 | args = parse_args() 28 | print(args) 29 | 30 | try: 31 | # Create checkpoint directory if it does not exist 32 | os.makedirs(args.checkpoint) 33 | except OSError as e: 34 | if e.errno != errno.EEXIST: 35 | raise RuntimeError('Unable to create checkpoint directory:', args.checkpoint) 36 | 37 | print('Loading dataset...') 38 | dataset_path = 'data/data_3d_' + args.dataset + '.npz' 39 | if args.dataset == 'h36m': 40 | from common.h36m_dataset import Human36mDataset 41 | dataset = Human36mDataset(dataset_path) 42 | elif args.dataset.startswith('humaneva'): 43 | from common.humaneva_dataset import HumanEvaDataset 44 | dataset = HumanEvaDataset(dataset_path) 45 | elif args.dataset.startswith('mpi'): 46 | from common.mpi_inf_3dhp_dataset import MpiInf3dhpDataset 47 | dataset = MpiInf3dhpDataset(dataset_path) 48 | elif args.dataset.startswith('custom'): 49 | from common.custom_dataset import CustomDataset 50 | dataset = CustomDataset('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz') 51 | else: 52 | raise KeyError('Invalid dataset') 53 | 54 | print('Preparing data...') 55 | if args.dataset.startswith('mpi'): 56 | for subject in dataset.subjects(): 57 | for action in dataset[subject].keys(): 58 | anim = dataset[subject][action] 59 | 60 | if 'positions' in anim: 61 | anim['positions_3d'] = anim['positions'] 62 | else: 63 | for subject in dataset.subjects(): 64 | for action in dataset[subject].keys(): 65 | anim = dataset[subject][action] 66 | 67 | if 'positions' in anim: 68 | positions_3d = [] 69 | for cam in anim['cameras']: 70 | pos_3d = world_to_camera(anim['positions'], R=cam['orientation'], t=cam['translation']) 71 | pos_3d[:, 1:] -= pos_3d[:, :1] # Remove global offset, but keep trajectory in first position 72 | positions_3d.append(pos_3d) 73 | anim['positions_3d'] = positions_3d 74 | 75 | print('Loading 2D detections...') 76 | keypoints = np.load('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz', allow_pickle=True) 77 | keypoints_metadata = keypoints['metadata'].item() 78 | keypoints_symmetry = keypoints_metadata['keypoints_symmetry'] 79 | kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1]) 80 | joints_left, joints_right = list(dataset.skeleton().joints_left()), list(dataset.skeleton().joints_right()) 81 | keypoints = keypoints['positions_2d'].item() 
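# Layout note: `keypoints` is now a plain nested dict, exactly as saved by
# data/prepare_data_mpi_inf_3dhp.py:
#   keypoints[subject][action] -> list over cameras of float32 arrays of shape (n_frames, 17, 2)
# The 3D poses in `dataset` mirror this layout with (n_frames, 17, 3) arrays; the checks
# below rely on the two structures lining up camera by camera.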
82 | 83 | for subject in dataset.subjects(): 84 | assert subject in keypoints, 'Subject {} is missing from the 2D detections dataset'.format(subject) 85 | for action in dataset[subject].keys(): 86 | assert action in keypoints[subject], 'Action {} of subject {} is missing from the 2D detections dataset'.format(action, subject) 87 | if 'positions_3d' not in dataset[subject][action]: 88 | continue 89 | 90 | for cam_idx in range(len(keypoints[subject][action])): 91 | 92 | # We check for >= instead of == because some videos in H3.6M contain extra frames 93 | mocap_length = dataset[subject][action]['positions_3d'][cam_idx].shape[0] 94 | assert keypoints[subject][action][cam_idx].shape[0] >= mocap_length 95 | 96 | if keypoints[subject][action][cam_idx].shape[0] > mocap_length: 97 | # Shorten sequence 98 | keypoints[subject][action][cam_idx] = keypoints[subject][action][cam_idx][:mocap_length] 99 | 100 | assert len(keypoints[subject][action]) == len(dataset[subject][action]['positions_3d']) 101 | 102 | for subject in keypoints.keys(): 103 | for action in keypoints[subject]: 104 | for cam_idx, kps in enumerate(keypoints[subject][action]): 105 | # Normalize camera frame 106 | cam = dataset.cameras()[subject][cam_idx] 107 | kps[..., :2] = normalize_screen_coordinates(kps[..., :2], w=cam['res_w'], h=cam['res_h']) 108 | keypoints[subject][action][cam_idx] = kps 109 | 110 | subjects_train = args.subjects_train.split(',') 111 | subjects_semi = [] if not args.subjects_unlabeled else args.subjects_unlabeled.split(',') 112 | if not args.render: 113 | subjects_test = args.subjects_test.split(',') 114 | else: 115 | subjects_test = [args.viz_subject] 116 | 117 | semi_supervised = len(subjects_semi) > 0 118 | if semi_supervised and not dataset.supports_semi_supervised(): 119 | raise RuntimeError('Semi-supervised training is not implemented for this dataset') 120 | 121 | def fetch(subjects, action_filter=None, subset=1, parse_3d_poses=True): 122 | out_poses_3d = [] 123 | out_poses_2d = [] 124 | out_camera_params = [] 125 | for subject in subjects: 126 | for action in keypoints[subject].keys(): 127 | if action_filter is not None: 128 | found = False 129 | for a in action_filter: 130 | if action.startswith(a): 131 | found = True 132 | break 133 | if not found: 134 | continue 135 | 136 | poses_2d = keypoints[subject][action] 137 | for i in range(len(poses_2d)): # Iterate across cameras 138 | out_poses_2d.append(poses_2d[i]) 139 | 140 | if subject in dataset.cameras(): 141 | cams = dataset.cameras()[subject] 142 | assert len(cams) == len(poses_2d), 'Camera count mismatch' 143 | for cam in cams: 144 | if 'intrinsic' in cam: 145 | out_camera_params.append(cam['intrinsic']) 146 | 147 | if parse_3d_poses and 'positions_3d' in dataset[subject][action]: 148 | poses_3d = dataset[subject][action]['positions_3d'] 149 | assert len(poses_3d) == len(poses_2d), 'Camera count mismatch' 150 | for i in range(len(poses_3d)): # Iterate across cameras 151 | out_poses_3d.append(poses_3d[i]) 152 | 153 | if len(out_camera_params) == 0: 154 | out_camera_params = None 155 | if len(out_poses_3d) == 0: 156 | out_poses_3d = None 157 | 158 | stride = args.downsample 159 | if subset < 1: 160 | for i in range(len(out_poses_2d)): 161 | n_frames = int(round(len(out_poses_2d[i])//stride * subset)*stride) 162 | start = deterministic_random(0, len(out_poses_2d[i]) - n_frames + 1, str(len(out_poses_2d[i]))) 163 | out_poses_2d[i] = out_poses_2d[i][start:start+n_frames:stride] 164 | if out_poses_3d is not None: 165 | out_poses_3d[i] = 
out_poses_3d[i][start:start+n_frames:stride] 166 | elif stride > 1: 167 | # Downsample as requested 168 | for i in range(len(out_poses_2d)): 169 | out_poses_2d[i] = out_poses_2d[i][::stride] 170 | if out_poses_3d is not None: 171 | out_poses_3d[i] = out_poses_3d[i][::stride] 172 | 173 | 174 | return out_camera_params, out_poses_3d, out_poses_2d 175 | 176 | action_filter = None if args.actions == '*' else args.actions.split(',') 177 | if action_filter is not None: 178 | print('Selected actions:', action_filter) 179 | 180 | cameras_valid, poses_valid, poses_valid_2d = fetch(subjects_test, action_filter) 181 | 182 | filter_widths = [int(x) for x in args.architecture.split(',')] 183 | if not args.disable_optimizations and not args.dense and args.stride == 1: 184 | # Use optimized model for single-frame predictions 185 | model_pos_train = TemporalModelOptimized1f(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], dataset.skeleton().num_joints(), 186 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels) 187 | else: 188 | # When incompatible settings are detected (stride > 1, dense filters, or disabled optimization) fall back to the normal model 189 | model_pos_train = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], dataset.skeleton().num_joints(), 190 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels, 191 | dense=args.dense) 192 | 193 | model_pos = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], dataset.skeleton().num_joints(), 194 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels, 195 | dense=args.dense) 196 | 197 | receptive_field = model_pos.receptive_field() 198 | print('INFO: Receptive field: {} frames'.format(receptive_field)) 199 | pad = (receptive_field - 1) // 2 # Padding on each side 200 | if args.causal: 201 | print('INFO: Using causal convolutions') 202 | causal_shift = pad 203 | else: 204 | causal_shift = 0 205 | 206 | model_params = 0 207 | for parameter in model_pos.parameters(): 208 | model_params += parameter.numel() 209 | print('INFO: Trainable parameter count:', model_params) 210 | 211 | if torch.cuda.is_available(): 212 | model_pos = model_pos.cuda() 213 | model_pos_train = model_pos_train.cuda() 214 | 215 | if args.resume or args.evaluate: 216 | chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate) 217 | print('Loading checkpoint', chk_filename) 218 | checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage) 219 | print('This model was trained for {} epochs'.format(checkpoint['epoch'])) 220 | model_pos_train.load_state_dict(checkpoint['model_pos']) 221 | model_pos.load_state_dict(checkpoint['model_pos']) 222 | 223 | if args.evaluate and 'model_traj' in checkpoint: 224 | # Load trajectory model if it is contained in the checkpoint (e.g. 
for inference in the wild) 225 | model_traj = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], 1, 226 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels, 227 | dense=args.dense) 228 | if torch.cuda.is_available(): 229 | model_traj = model_traj.cuda() 230 | model_traj.load_state_dict(checkpoint['model_traj']) 231 | else: 232 | model_traj = None 233 | 234 | 235 | test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d, 236 | pad=pad, causal_shift=causal_shift, augment=False, 237 | kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) 238 | print('INFO: Testing on {} frames'.format(test_generator.num_frames())) 239 | 240 | if not args.evaluate: 241 | cameras_train, poses_train, poses_train_2d = fetch(subjects_train, action_filter, subset=args.subset) 242 | 243 | lr = args.learning_rate 244 | if semi_supervised: 245 | cameras_semi, _, poses_semi_2d = fetch(subjects_semi, action_filter, parse_3d_poses=False) 246 | 247 | if not args.disable_optimizations and not args.dense and args.stride == 1: 248 | # Use optimized model for single-frame predictions 249 | model_traj_train = TemporalModelOptimized1f(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], 1, 250 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels) 251 | else: 252 | # When incompatible settings are detected (stride > 1, dense filters, or disabled optimization) fall back to normal model 253 | model_traj_train = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], 1, 254 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels, 255 | dense=args.dense) 256 | 257 | model_traj = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], 1, 258 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels, 259 | dense=args.dense) 260 | if torch.cuda.is_available(): 261 | model_traj = model_traj.cuda() 262 | model_traj_train = model_traj_train.cuda() 263 | optimizer = optim.Adam(list(model_pos_train.parameters()) + list(model_traj_train.parameters()), 264 | lr=lr, amsgrad=True) 265 | 266 | losses_2d_train_unlabeled = [] 267 | losses_2d_train_labeled_eval = [] 268 | losses_2d_train_unlabeled_eval = [] 269 | losses_2d_valid = [] 270 | 271 | losses_traj_train = [] 272 | losses_traj_train_eval = [] 273 | losses_traj_valid = [] 274 | else: 275 | optimizer = optim.Adam(model_pos_train.parameters(), lr=lr, amsgrad=True) 276 | 277 | lr_decay = args.lr_decay 278 | 279 | losses_3d_train = [] 280 | losses_3d_train_eval = [] 281 | losses_3d_valid = [] 282 | 283 | epoch = 0 284 | initial_momentum = 0.8 #0.1 285 | final_momentum = 0.6 #0.001 286 | 287 | 288 | train_generator = ChunkedGenerator(args.batch_size//args.stride, cameras_train, poses_train, poses_train_2d, args.stride, 289 | pad=pad, causal_shift=causal_shift, shuffle=True, augment=args.data_augmentation, 290 | kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) 291 | train_generator_eval = UnchunkedGenerator(cameras_train, poses_train, poses_train_2d, 292 | pad=pad, causal_shift=causal_shift, augment=False) 293 | print('INFO: Training on {} frames'.format(train_generator_eval.num_frames())) 294 | if semi_supervised: 295 | semi_generator = ChunkedGenerator(args.batch_size//args.stride, cameras_semi, None, poses_semi_2d, args.stride, 296 | pad=pad, causal_shift=causal_shift, 
shuffle=True, 297 | random_seed=4321, augment=args.data_augmentation, 298 | kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right, 299 | endless=True) 300 | semi_generator_eval = UnchunkedGenerator(cameras_semi, None, poses_semi_2d, 301 | pad=pad, causal_shift=causal_shift, augment=False) 302 | print('INFO: Semi-supervision on {} frames'.format(semi_generator_eval.num_frames())) 303 | 304 | if args.resume: 305 | epoch = checkpoint['epoch'] 306 | if 'optimizer' in checkpoint and checkpoint['optimizer'] is not None: 307 | optimizer.load_state_dict(checkpoint['optimizer']) 308 | train_generator.set_random_state(checkpoint['random_state']) 309 | else: 310 | print('WARNING: this checkpoint does not contain an optimizer state. The optimizer will be reinitialized.') 311 | 312 | lr = checkpoint['lr'] 313 | if semi_supervised: 314 | model_traj_train.load_state_dict(checkpoint['model_traj']) 315 | model_traj.load_state_dict(checkpoint['model_traj']) 316 | semi_generator.set_random_state(checkpoint['random_state_semi']) 317 | 318 | print('** Note: reported losses are averaged over all frames and test-time augmentation is not used here.') 319 | print('** The final evaluation will be carried out after the last training epoch.') 320 | 321 | # Pos model only 322 | while epoch < args.epochs: 323 | start_time = time() 324 | epoch_loss_3d_train = 0 325 | epoch_loss_traj_train = 0 326 | epoch_loss_2d_train_unlabeled = 0 327 | N = 0 328 | N_semi = 0 329 | model_pos_train.train() 330 | if semi_supervised: 331 | # Semi-supervised scenario 332 | model_traj_train.train() 333 | for (_, batch_3d, batch_2d), (cam_semi, _, batch_2d_semi) in \ 334 | zip(train_generator.next_epoch(), semi_generator.next_epoch()): 335 | 336 | # Fall back to supervised training for the first epoch (to avoid instability) 337 | skip = epoch < args.warmup 338 | 339 | cam_semi = torch.from_numpy(cam_semi.astype('float32')) 340 | inputs_3d = torch.from_numpy(batch_3d.astype('float32')) 341 | if torch.cuda.is_available(): 342 | cam_semi = cam_semi.cuda() 343 | inputs_3d = inputs_3d.cuda() 344 | 345 | inputs_traj = inputs_3d[:, :, :1].clone() 346 | inputs_3d[:, :, 0] = 0 347 | 348 | # Split point between labeled and unlabeled samples in the batch 349 | split_idx = inputs_3d.shape[0] 350 | 351 | inputs_2d = torch.from_numpy(batch_2d.astype('float32')) 352 | inputs_2d_semi = torch.from_numpy(batch_2d_semi.astype('float32')) 353 | if torch.cuda.is_available(): 354 | inputs_2d = inputs_2d.cuda() 355 | inputs_2d_semi = inputs_2d_semi.cuda() 356 | inputs_2d_cat = torch.cat((inputs_2d, inputs_2d_semi), dim=0) if not skip else inputs_2d 357 | 358 | optimizer.zero_grad() 359 | 360 | # Compute 3D poses 361 | predicted_3d_pos_cat = model_pos_train(inputs_2d_cat) 362 | 363 | loss_3d_pos = mpjpe(predicted_3d_pos_cat[:split_idx], inputs_3d) 364 | epoch_loss_3d_train += inputs_3d.shape[0]*inputs_3d.shape[1] * loss_3d_pos.item() 365 | N += inputs_3d.shape[0]*inputs_3d.shape[1] 366 | loss_total = loss_3d_pos 367 | 368 | # Compute global trajectory 369 | predicted_traj_cat = model_traj_train(inputs_2d_cat) 370 | w = 1 / inputs_traj[:, :, :, 2] # Weight inversely proportional to depth 371 | loss_traj = weighted_mpjpe(predicted_traj_cat[:split_idx], inputs_traj, w) 372 | epoch_loss_traj_train += inputs_3d.shape[0]*inputs_3d.shape[1] * loss_traj.item() 373 | assert inputs_traj.shape[0]*inputs_traj.shape[1] == inputs_3d.shape[0]*inputs_3d.shape[1] 374 | loss_total += loss_traj 375 | 376 | if not skip: 377 | # Semi-supervised loss 
for unlabeled samples 378 | predicted_semi = predicted_3d_pos_cat[split_idx:] 379 | if pad > 0: 380 | target_semi = inputs_2d_semi[:, pad:-pad, :, :2].contiguous() 381 | else: 382 | target_semi = inputs_2d_semi[:, :, :, :2].contiguous() 383 | 384 | projection_func = project_to_2d_linear if args.linear_projection else project_to_2d 385 | reconstruction_semi = projection_func(predicted_semi + predicted_traj_cat[split_idx:], cam_semi) 386 | 387 | loss_reconstruction = mpjpe(reconstruction_semi, target_semi) # On 2D poses 388 | epoch_loss_2d_train_unlabeled += predicted_semi.shape[0]*predicted_semi.shape[1] * loss_reconstruction.item() 389 | if not args.no_proj: 390 | loss_total += loss_reconstruction 391 | 392 | # Bone length term to enforce kinematic constraints 393 | if args.bone_length_term: 394 | dists = predicted_3d_pos_cat[:, :, 1:] - predicted_3d_pos_cat[:, :, dataset.skeleton().parents()[1:]] 395 | bone_lengths = torch.mean(torch.norm(dists, dim=3), dim=1) 396 | penalty = torch.mean(torch.abs(torch.mean(bone_lengths[:split_idx], dim=0) \ 397 | - torch.mean(bone_lengths[split_idx:], dim=0))) 398 | loss_total += penalty 399 | 400 | 401 | N_semi += predicted_semi.shape[0]*predicted_semi.shape[1] 402 | else: 403 | N_semi += 1 # To avoid division by zero 404 | 405 | loss_total.backward() 406 | 407 | optimizer.step() 408 | losses_traj_train.append(epoch_loss_traj_train / N) 409 | losses_2d_train_unlabeled.append(epoch_loss_2d_train_unlabeled / N_semi) 410 | else: 411 | # Regular supervised scenario 412 | for _, batch_3d, batch_2d in train_generator.next_epoch(): 413 | inputs_3d = torch.from_numpy(batch_3d.astype('float32')) 414 | inputs_2d = torch.from_numpy(batch_2d.astype('float32')) 415 | if torch.cuda.is_available(): 416 | inputs_3d = inputs_3d.cuda() 417 | inputs_2d = inputs_2d.cuda() 418 | inputs_3d[:, :, 0] = 0 419 | 420 | optimizer.zero_grad() 421 | 422 | # Predict 3D poses 423 | predicted_3d_pos = model_pos_train(inputs_2d) 424 | loss_3d_pos = mpjpe(predicted_3d_pos, inputs_3d) 425 | epoch_loss_3d_train += inputs_3d.shape[0]*inputs_3d.shape[1] * loss_3d_pos.item() 426 | N += inputs_3d.shape[0]*inputs_3d.shape[1] 427 | 428 | loss_total = loss_3d_pos 429 | loss_total.backward() 430 | 431 | optimizer.step() 432 | 433 | losses_3d_train.append(epoch_loss_3d_train / N) 434 | 435 | # End-of-epoch evaluation 436 | with torch.no_grad(): 437 | model_pos.load_state_dict(model_pos_train.state_dict()) 438 | model_pos.eval() 439 | model_pos.set_bn_momentum(0.9) 440 | if semi_supervised: 441 | model_traj.load_state_dict(model_traj_train.state_dict()) 442 | model_traj.eval() 443 | 444 | epoch_loss_3d_valid = 0 445 | epoch_loss_traj_valid = 0 446 | epoch_loss_2d_valid = 0 447 | N = 0 448 | 449 | if not args.no_eval: 450 | # Evaluate on test set 451 | for cam, batch, batch_2d in test_generator.next_epoch(): 452 | inputs_3d = torch.from_numpy(batch.astype('float32')) 453 | inputs_2d = torch.from_numpy(batch_2d.astype('float32')) 454 | if torch.cuda.is_available(): 455 | inputs_3d = inputs_3d.cuda() 456 | inputs_2d = inputs_2d.cuda() 457 | inputs_traj = inputs_3d[:, :, :1].clone() 458 | inputs_3d[:, :, 0] = 0 459 | 460 | # Predict 3D poses 461 | predicted_3d_pos = model_pos(inputs_2d) 462 | loss_3d_pos = mpjpe(predicted_3d_pos, inputs_3d) 463 | epoch_loss_3d_valid += inputs_3d.shape[0]*inputs_3d.shape[1] * loss_3d_pos.item() 464 | N += inputs_3d.shape[0]*inputs_3d.shape[1] 465 | 466 | if semi_supervised: 467 | cam = torch.from_numpy(cam.astype('float32')) 468 | if torch.cuda.is_available(): 469 | cam 
= cam.cuda() 470 | 471 | predicted_traj = model_traj(inputs_2d) 472 | loss_traj = mpjpe(predicted_traj, inputs_traj) 473 | epoch_loss_traj_valid += inputs_traj.shape[0]*inputs_traj.shape[1] * loss_traj.item() 474 | assert inputs_traj.shape[0]*inputs_traj.shape[1] == inputs_3d.shape[0]*inputs_3d.shape[1] 475 | 476 | if pad > 0: 477 | target = inputs_2d[:, pad:-pad, :, :2].contiguous() 478 | else: 479 | target = inputs_2d[:, :, :, :2].contiguous() 480 | reconstruction = project_to_2d(predicted_3d_pos + predicted_traj, cam) 481 | loss_reconstruction = mpjpe(reconstruction, target) # On 2D poses 482 | epoch_loss_2d_valid += reconstruction.shape[0]*reconstruction.shape[1] * loss_reconstruction.item() 483 | assert reconstruction.shape[0]*reconstruction.shape[1] == inputs_3d.shape[0]*inputs_3d.shape[1] 484 | 485 | losses_3d_valid.append(epoch_loss_3d_valid / N) 486 | if semi_supervised: 487 | losses_traj_valid.append(epoch_loss_traj_valid / N) 488 | losses_2d_valid.append(epoch_loss_2d_valid / N) 489 | 490 | 491 | # Evaluate on training set, this time in evaluation mode 492 | epoch_loss_3d_train_eval = 0 493 | epoch_loss_traj_train_eval = 0 494 | epoch_loss_2d_train_labeled_eval = 0 495 | N = 0 496 | for cam, batch, batch_2d in train_generator_eval.next_epoch(): 497 | if batch_2d.shape[1] == 0: 498 | # This can only happen when downsampling the dataset 499 | continue 500 | 501 | inputs_3d = torch.from_numpy(batch.astype('float32')) 502 | inputs_2d = torch.from_numpy(batch_2d.astype('float32')) 503 | if torch.cuda.is_available(): 504 | inputs_3d = inputs_3d.cuda() 505 | inputs_2d = inputs_2d.cuda() 506 | inputs_traj = inputs_3d[:, :, :1].clone() 507 | inputs_3d[:, :, 0] = 0 508 | 509 | # Compute 3D poses 510 | predicted_3d_pos = model_pos(inputs_2d) 511 | loss_3d_pos = mpjpe(predicted_3d_pos, inputs_3d) 512 | epoch_loss_3d_train_eval += inputs_3d.shape[0]*inputs_3d.shape[1] * loss_3d_pos.item() 513 | N += inputs_3d.shape[0]*inputs_3d.shape[1] 514 | 515 | if semi_supervised: 516 | cam = torch.from_numpy(cam.astype('float32')) 517 | if torch.cuda.is_available(): 518 | cam = cam.cuda() 519 | predicted_traj = model_traj(inputs_2d) 520 | loss_traj = mpjpe(predicted_traj, inputs_traj) 521 | epoch_loss_traj_train_eval += inputs_traj.shape[0]*inputs_traj.shape[1] * loss_traj.item() 522 | assert inputs_traj.shape[0]*inputs_traj.shape[1] == inputs_3d.shape[0]*inputs_3d.shape[1] 523 | 524 | if pad > 0: 525 | target = inputs_2d[:, pad:-pad, :, :2].contiguous() 526 | else: 527 | target = inputs_2d[:, :, :, :2].contiguous() 528 | reconstruction = project_to_2d(predicted_3d_pos + predicted_traj, cam) 529 | loss_reconstruction = mpjpe(reconstruction, target) 530 | epoch_loss_2d_train_labeled_eval += reconstruction.shape[0]*reconstruction.shape[1] * loss_reconstruction.item() 531 | assert reconstruction.shape[0]*reconstruction.shape[1] == inputs_3d.shape[0]*inputs_3d.shape[1] 532 | 533 | losses_3d_train_eval.append(epoch_loss_3d_train_eval / N) 534 | if semi_supervised: 535 | losses_traj_train_eval.append(epoch_loss_traj_train_eval / N) 536 | losses_2d_train_labeled_eval.append(epoch_loss_2d_train_labeled_eval / N) 537 | 538 | # Evaluate 2D loss on unlabeled training set (in evaluation mode) 539 | epoch_loss_2d_train_unlabeled_eval = 0 540 | N_semi = 0 541 | if semi_supervised: 542 | for cam, _, batch_2d in semi_generator_eval.next_epoch(): 543 | cam = torch.from_numpy(cam.astype('float32')) 544 | inputs_2d_semi = torch.from_numpy(batch_2d.astype('float32')) 545 | if torch.cuda.is_available(): 546 | cam = 
cam.cuda() 547 | inputs_2d_semi = inputs_2d_semi.cuda() 548 | 549 | predicted_3d_pos_semi = model_pos(inputs_2d_semi) 550 | predicted_traj_semi = model_traj(inputs_2d_semi) 551 | if pad > 0: 552 | target_semi = inputs_2d_semi[:, pad:-pad, :, :2].contiguous() 553 | else: 554 | target_semi = inputs_2d_semi[:, :, :, :2].contiguous() 555 | reconstruction_semi = project_to_2d(predicted_3d_pos_semi + predicted_traj_semi, cam) 556 | loss_reconstruction_semi = mpjpe(reconstruction_semi, target_semi) 557 | 558 | epoch_loss_2d_train_unlabeled_eval += reconstruction_semi.shape[0]*reconstruction_semi.shape[1] \ 559 | * loss_reconstruction_semi.item() 560 | N_semi += reconstruction_semi.shape[0]*reconstruction_semi.shape[1] 561 | losses_2d_train_unlabeled_eval.append(epoch_loss_2d_train_unlabeled_eval / N_semi) 562 | 563 | elapsed = (time() - start_time)/60 564 | 565 | if args.no_eval: 566 | print('[%d] time %.2f lr %f 3d_train %f' % ( 567 | epoch + 1, 568 | elapsed, 569 | lr, 570 | losses_3d_train[-1] * 1000)) 571 | else: 572 | if semi_supervised: 573 | print('[%d] time %.2f lr %f 3d_train %f 3d_eval %f traj_eval %f 3d_valid %f ' 574 | 'traj_valid %f 2d_train_sup %f 2d_train_unsup %f 2d_valid %f' % ( 575 | epoch + 1, 576 | elapsed, 577 | lr, 578 | losses_3d_train[-1] * 1000, 579 | losses_3d_train_eval[-1] * 1000, 580 | losses_traj_train_eval[-1] * 1000, 581 | losses_3d_valid[-1] * 1000, 582 | losses_traj_valid[-1] * 1000, 583 | losses_2d_train_labeled_eval[-1], 584 | losses_2d_train_unlabeled_eval[-1], 585 | losses_2d_valid[-1])) 586 | else: 587 | print('[%d] time %.2f lr %f 3d_train %f 3d_eval %f 3d_valid %f' % ( 588 | epoch + 1, 589 | elapsed, 590 | lr, 591 | losses_3d_train[-1] * 1000, 592 | losses_3d_train_eval[-1] * 1000, 593 | losses_3d_valid[-1] *1000)) 594 | 595 | # Decay learning rate exponentially 596 | lr *= lr_decay 597 | for param_group in optimizer.param_groups: 598 | param_group['lr'] *= lr_decay 599 | epoch += 1 600 | 601 | # Decay BatchNorm momentum 602 | momentum = initial_momentum * np.exp(-epoch/args.epochs * np.log(initial_momentum/final_momentum)) 603 | model_pos_train.set_bn_momentum(momentum) 604 | if semi_supervised: 605 | model_traj_train.set_bn_momentum(momentum) 606 | 607 | # Save checkpoint if necessary 608 | if epoch % args.checkpoint_frequency == 0: 609 | chk_path = os.path.join(args.checkpoint, 'epoch_{}.bin'.format(epoch)) 610 | print('Saving checkpoint to', chk_path) 611 | 612 | torch.save({ 613 | 'epoch': epoch, 614 | 'lr': lr, 615 | 'random_state': train_generator.random_state(), 616 | 'optimizer': optimizer.state_dict(), 617 | 'model_pos': model_pos_train.state_dict(), 618 | 'model_traj': model_traj_train.state_dict() if semi_supervised else None, 619 | 'random_state_semi': semi_generator.random_state() if semi_supervised else None, 620 | }, chk_path) 621 | 622 | # Save training curves after every epoch, as .png images (if requested) 623 | if args.export_training_curves and epoch > 3: 624 | if 'matplotlib' not in sys.modules: 625 | import matplotlib 626 | matplotlib.use('Agg') 627 | import matplotlib.pyplot as plt 628 | 629 | plt.figure() 630 | epoch_x = np.arange(3, len(losses_3d_train)) + 1 631 | plt.plot(epoch_x, losses_3d_train[3:], '--', color='C0') 632 | plt.plot(epoch_x, losses_3d_train_eval[3:], color='C0') 633 | plt.plot(epoch_x, losses_3d_valid[3:], color='C1') 634 | plt.legend(['3d train', '3d train (eval)', '3d valid (eval)']) 635 | plt.ylabel('MPJPE (m)') 636 | plt.xlabel('Epoch') 637 | plt.xlim((3, epoch)) 638 | 
plt.savefig(os.path.join(args.checkpoint, 'loss_3d.png')) 639 | 640 | if semi_supervised: 641 | plt.figure() 642 | plt.plot(epoch_x, losses_traj_train[3:], '--', color='C0') 643 | plt.plot(epoch_x, losses_traj_train_eval[3:], color='C0') 644 | plt.plot(epoch_x, losses_traj_valid[3:], color='C1') 645 | plt.legend(['traj. train', 'traj. train (eval)', 'traj. valid (eval)']) 646 | plt.ylabel('Mean distance (m)') 647 | plt.xlabel('Epoch') 648 | plt.xlim((3, epoch)) 649 | plt.savefig(os.path.join(args.checkpoint, 'loss_traj.png')) 650 | 651 | plt.figure() 652 | plt.plot(epoch_x, losses_2d_train_labeled_eval[3:], color='C0') 653 | plt.plot(epoch_x, losses_2d_train_unlabeled[3:], '--', color='C1') 654 | plt.plot(epoch_x, losses_2d_train_unlabeled_eval[3:], color='C1') 655 | plt.plot(epoch_x, losses_2d_valid[3:], color='C2') 656 | plt.legend(['2d train labeled (eval)', '2d train unlabeled', '2d train unlabeled (eval)', '2d valid (eval)']) 657 | plt.ylabel('MPJPE (2D)') 658 | plt.xlabel('Epoch') 659 | plt.xlim((3, epoch)) 660 | plt.savefig(os.path.join(args.checkpoint, 'loss_2d.png')) 661 | plt.close('all') 662 | 663 | # Evaluate 664 | def evaluate(test_generator, action=None, return_predictions=False, use_trajectory_model=False): 665 | epoch_loss_3d_pos = 0 666 | epoch_loss_3d_pos_procrustes = 0 667 | epoch_loss_3d_pos_scale = 0 668 | epoch_loss_3d_vel = 0 669 | with torch.no_grad(): 670 | if not use_trajectory_model: 671 | model_pos.eval() 672 | else: 673 | model_traj.eval() 674 | N = 0 675 | for _, batch, batch_2d in test_generator.next_epoch(): 676 | inputs_2d = torch.from_numpy(batch_2d.astype('float32')) 677 | if torch.cuda.is_available(): 678 | inputs_2d = inputs_2d.cuda() 679 | 680 | # Positional model 681 | if not use_trajectory_model: 682 | predicted_3d_pos = model_pos(inputs_2d) 683 | else: 684 | predicted_3d_pos = model_traj(inputs_2d) 685 | 686 | # Test-time augmentation (if enabled) 687 | if test_generator.augment_enabled(): 688 | # Undo flipping and take average with non-flipped version 689 | predicted_3d_pos[1, :, :, 0] *= -1 690 | if not use_trajectory_model: 691 | predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left] 692 | predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True) 693 | 694 | if return_predictions: 695 | return predicted_3d_pos.squeeze(0).cpu().numpy() 696 | 697 | inputs_3d = torch.from_numpy(batch.astype('float32')) 698 | if torch.cuda.is_available(): 699 | inputs_3d = inputs_3d.cuda() 700 | inputs_3d[:, :, 0] = 0 701 | if test_generator.augment_enabled(): 702 | inputs_3d = inputs_3d[:1] 703 | 704 | error = mpjpe(predicted_3d_pos, inputs_3d) 705 | epoch_loss_3d_pos_scale += inputs_3d.shape[0]*inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item() 706 | 707 | epoch_loss_3d_pos += inputs_3d.shape[0]*inputs_3d.shape[1] * error.item() 708 | N += inputs_3d.shape[0] * inputs_3d.shape[1] 709 | 710 | inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1]) 711 | predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1]) 712 | 713 | epoch_loss_3d_pos_procrustes += inputs_3d.shape[0]*inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs) 714 | 715 | # Compute velocity error 716 | epoch_loss_3d_vel += inputs_3d.shape[0]*inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs) 717 | 718 | if action is None: 719 | print('----------') 720 | else: 721 | print('----'+action+'----') 722 | e1 = (epoch_loss_3d_pos / N)*1000 723 | 
e2 = (epoch_loss_3d_pos_procrustes / N)*1000 724 | e3 = (epoch_loss_3d_pos_scale / N)*1000 725 | ev = (epoch_loss_3d_vel / N)*1000 726 | print('Test time augmentation:', test_generator.augment_enabled()) 727 | print('Protocol #1 Error (MPJPE):', e1, 'mm') 728 | print('Protocol #2 Error (P-MPJPE):', e2, 'mm') 729 | print('Protocol #3 Error (N-MPJPE):', e3, 'mm') 730 | print('Velocity Error (MPJVE):', ev, 'mm') 731 | print('----------') 732 | 733 | return e1, e2, e3, ev 734 | 735 | 736 | if args.render: 737 | print('Rendering...') 738 | 739 | input_keypoints = keypoints[args.viz_subject][args.viz_action][args.viz_camera].copy() 740 | ground_truth = None 741 | if args.viz_subject in dataset.subjects() and args.viz_action in dataset[args.viz_subject]: 742 | if 'positions_3d' in dataset[args.viz_subject][args.viz_action]: 743 | ground_truth = dataset[args.viz_subject][args.viz_action]['positions_3d'][args.viz_camera].copy() 744 | if ground_truth is None: 745 | print('INFO: this action is unlabeled. Ground truth will not be rendered.') 746 | 747 | gen = UnchunkedGenerator(None, None, [input_keypoints], 748 | pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation, 749 | kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) 750 | prediction = evaluate(gen, return_predictions=True) 751 | if model_traj is not None and ground_truth is None: 752 | prediction_traj = evaluate(gen, return_predictions=True, use_trajectory_model=True) 753 | prediction += prediction_traj 754 | 755 | if args.viz_export is not None: 756 | print('Exporting joint positions to', args.viz_export) 757 | # Predictions are in camera space 758 | np.save(args.viz_export, prediction) 759 | 760 | if args.viz_output is not None: 761 | if ground_truth is not None: 762 | # Reapply trajectory 763 | trajectory = ground_truth[:, :1] 764 | ground_truth[:, 1:] += trajectory 765 | prediction += trajectory 766 | 767 | # Invert camera transformation 768 | cam = dataset.cameras()[args.viz_subject][args.viz_camera] 769 | if ground_truth is not None: 770 | prediction = camera_to_world(prediction, R=cam['orientation'], t=cam['translation']) 771 | ground_truth = camera_to_world(ground_truth, R=cam['orientation'], t=cam['translation']) 772 | else: 773 | # If the ground truth is not available, take the camera extrinsic params from a random subject. 774 | # They are almost the same, and anyway, we only need this for visualization purposes. 
775 | for subject in dataset.cameras(): 776 | if 'orientation' in dataset.cameras()[subject][args.viz_camera]: 777 | rot = dataset.cameras()[subject][args.viz_camera]['orientation'] 778 | break 779 | prediction = camera_to_world(prediction, R=rot, t=0) 780 | # We don't have the trajectory, but at least we can rebase the height 781 | prediction[:, :, 2] -= np.min(prediction[:, :, 2]) 782 | 783 | anim_output = {'Reconstruction': prediction} 784 | if ground_truth is not None and not args.viz_no_ground_truth: 785 | anim_output['Ground truth'] = ground_truth 786 | 787 | input_keypoints = image_coordinates(input_keypoints[..., :2], w=cam['res_w'], h=cam['res_h']) 788 | 789 | from common.visualization import render_animation 790 | render_animation(input_keypoints, keypoints_metadata, anim_output, 791 | dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'], args.viz_output, 792 | limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size, 793 | input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']), 794 | input_video_skip=args.viz_skip) 795 | 796 | else: 797 | print('Evaluating...') 798 | all_actions = {} 799 | all_actions_by_subject = {} 800 | for subject in subjects_test: 801 | if subject not in all_actions_by_subject: 802 | all_actions_by_subject[subject] = {} 803 | 804 | for action in dataset[subject].keys(): 805 | action_name = action.split(' ')[0] 806 | if action_name not in all_actions: 807 | all_actions[action_name] = [] 808 | if action_name not in all_actions_by_subject[subject]: 809 | all_actions_by_subject[subject][action_name] = [] 810 | all_actions[action_name].append((subject, action)) 811 | all_actions_by_subject[subject][action_name].append((subject, action)) 812 | 813 | def fetch_actions(actions): 814 | out_poses_3d = [] 815 | out_poses_2d = [] 816 | 817 | for subject, action in actions: 818 | poses_2d = keypoints[subject][action] 819 | for i in range(len(poses_2d)): # Iterate across cameras 820 | out_poses_2d.append(poses_2d[i]) 821 | 822 | poses_3d = dataset[subject][action]['positions_3d'] 823 | assert len(poses_3d) == len(poses_2d), 'Camera count mismatch' 824 | for i in range(len(poses_3d)): # Iterate across cameras 825 | out_poses_3d.append(poses_3d[i]) 826 | 827 | stride = args.downsample 828 | if stride > 1: 829 | # Downsample as requested 830 | for i in range(len(out_poses_2d)): 831 | out_poses_2d[i] = out_poses_2d[i][::stride] 832 | if out_poses_3d is not None: 833 | out_poses_3d[i] = out_poses_3d[i][::stride] 834 | 835 | return out_poses_3d, out_poses_2d 836 | 837 | def run_evaluation(actions, action_filter=None): 838 | errors_p1 = [] 839 | errors_p2 = [] 840 | errors_p3 = [] 841 | errors_vel = [] 842 | 843 | for action_key in actions.keys(): 844 | if action_filter is not None: 845 | found = False 846 | for a in action_filter: 847 | if action_key.startswith(a): 848 | found = True 849 | break 850 | if not found: 851 | continue 852 | 853 | poses_act, poses_2d_act = fetch_actions(actions[action_key]) 854 | gen = UnchunkedGenerator(None, poses_act, poses_2d_act, 855 | pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation, 856 | kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) 857 | e1, e2, e3, ev = evaluate(gen, action_key) 858 | errors_p1.append(e1) 859 | errors_p2.append(e2) 860 | errors_p3.append(e3) 861 | errors_vel.append(ev) 862 | 863 | print('Protocol #1 (MPJPE) action-wise average:', round(np.mean(errors_p1), 1), 'mm') 864 | print('Protocol #2 (P-MPJPE) 
action-wise average:', round(np.mean(errors_p2), 1), 'mm') 865 | print('Protocol #3 (N-MPJPE) action-wise average:', round(np.mean(errors_p3), 1), 'mm') 866 | print('Velocity (MPJVE) action-wise average:', round(np.mean(errors_vel), 2), 'mm') 867 | 868 | if not args.by_subject: 869 | run_evaluation(all_actions, action_filter) 870 | else: 871 | for subject in all_actions_by_subject.keys(): 872 | print('Evaluating on subject', subject) 873 | run_evaluation(all_actions_by_subject[subject], action_filter) 874 | print('') 875 | --------------------------------------------------------------------------------