├── checkpoint └── loss_3d.png ├── Documentation on the code.pdf ├── README.md ├── data └── prepare_data_mpi_inf_3dhp.py ├── common └── mpi_inf_3dhp_dataset.py └── run.py /checkpoint/loss_3d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zemingxie/MPI_INF_3DHP-on-VideoPose3D/HEAD/checkpoint/loss_3d.png -------------------------------------------------------------------------------- /Documentation on the code.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zemingxie/MPI_INF_3DHP-on-VideoPose3D/HEAD/Documentation on the code.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MPI_INF_3DHP-on-VideoPose3D 2 | 3 | ## Instructions 4 | Copy and paste these files into your VideoPose3D directory. 5 | The rest follows the same procedure as VideoPose3D. 6 | ### Setup dataset 7 | Go to the [mpi_inf_3dhp website](http://vcai.mpi-inf.mpg.de/3dhp-dataset/), follow the instructions there to download the dataset, and run the following from the `data` directory: 8 | ```bash 9 | python prepare_data_mpi_inf_3dhp.py --from-source path/to/mpi_inf_3dhp/dataset 10 | ``` 11 | 12 | ### Running 13 | To achieve the same result as in the picture below, run the following command: 14 | ```bash 15 | python run.py -d mpi_inf_3dhp -k gt -str S1,S2,S3,S4,S5,S6,S7,S8 -ste TS1,TS2,TS3,TS4,TS5,TS6 --export-training-curves -b 256 -e 200 -lrd 0.98 16 | ``` 17 | 18 | ## Result 19 | ![Training loss (loss_3d)](checkpoint/loss_3d.png)
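To re-evaluate a trained model without retraining, point run.py at one of the checkpoint files it writes (a sketch using the standard VideoPose3D `--evaluate` flag; `epoch_200.bin` is the name a 200-epoch run saves, assuming the default checkpoint frequency divides 200):
```bash
python run.py -d mpi_inf_3dhp -k gt -str S1,S2,S3,S4,S5,S6,S7,S8 -ste TS1,TS2,TS3,TS4,TS5,TS6 --evaluate epoch_200.bin
```

The prepared archives can also be inspected directly with NumPy (a quick sketch; the file names are the ones `prepare_data_mpi_inf_3dhp.py` writes into `data/`):
```python
import numpy as np

# 3D poses: dict of subject -> sequence -> list over cameras of (n_frames, 17, 3) arrays
data_3d = np.load('data/data_3d_mpi_inf_3dhp.npz', allow_pickle=True)['positions_3d'].item()
print(data_3d['S1'].keys())   # dict_keys(['Seq1', 'Seq2'])

# 2D ground-truth keypoints plus skeleton metadata
archive_2d = np.load('data/data_2d_mpi_inf_3dhp_gt.npz', allow_pickle=True)
print(archive_2d['metadata'].item()['num_joints'])                 # 17
print(archive_2d['positions_2d'].item()['TS1']['Test'][0].shape)   # (n_frames, 17, 2)
```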

20 | 21 | ## Acknowledgement 22 | Part of my code is borrowed from [VideoPose3D](https://github.com/facebookresearch/VideoPose3D). I thank the authors for releasing the code. 23 | -------------------------------------------------------------------------------- /data/prepare_data_mpi_inf_3dhp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import argparse 9 | import os 10 | import numpy as np 11 | import h5py 12 | 13 | import sys 14 | sys.path.append('../') 15 | from common.mpi_inf_3dhp_dataset import MpiInf3dhpDataset 16 | from common.camera import project_to_2d, image_coordinates 17 | from common.utils import wrap 18 | 19 | output_filename = 'data_3d_mpi_inf_3dhp' 20 | output_filename_2d = 'data_2d_mpi_inf_3dhp_gt' 21 | output_filename_2d2 = 'data_2d_mpi_inf_3dhp_computed_gt' 22 | subjects_train = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8'] 23 | subjects_test = ['TS1', 'TS2', 'TS3', 'TS4', 'TS5', 'TS6'] 24 | joint_idx_train_matlab = [8, 6, 15, 16, 17, 10, 11, 12, 24, 25, 26, 19, 20, 21, 5, 4, 7] # note: 1-based MATLAB indices, converted to 0-based below 25 | joint_idx_train = [i-1 for i in joint_idx_train_matlab] 26 | 27 | if __name__ == '__main__': 28 | if os.path.basename(os.getcwd()) != 'data': 29 | print('This script must be launched from the "data" directory') 30 | exit(0) 31 | 32 | parser = argparse.ArgumentParser(description='MPI_INF_3DHP dataset converter') 33 | 34 | # Convert the dataset from the original source (the MPI_INF_3DHP dataset path must be specified manually) 35 | # The annotations are read directly from the .mat files shipped with the release (annot.mat for training, annot_data.mat for the test set) 36 | parser.add_argument('--from-source', default='', type=str, metavar='PATH', help='convert original dataset') 37 | 38 | args = parser.parse_args() 39 | 40 | if os.path.exists(output_filename + '.npz'): 41 | print('The dataset already exists at', output_filename + '.npz') 42 | exit(0) 43 | 44 | if args.from_source: 45 | print('Converting original MPI_INF_3DHP dataset from', args.from_source) 46 | output = {} 47 | output_2d_poses = {} 48 | from scipy.io import loadmat 49 | 50 | for subject in subjects_train: 51 | output[subject] = {} 52 | output_2d_poses[subject] = {} 53 | file_1 = args.from_source + '/' + subject + '/Seq1/annot.mat' 54 | file_2 = args.from_source + '/' + subject + '/Seq2/annot.mat' 55 | hf = loadmat(file_1) 56 | positions_3d_temp = [] 57 | positions_2d_temp = [] 58 | 59 | for index in range(14): # Iterate over the 14 training cameras 60 | positions = hf['annot3'][index, 0].reshape(-1, 28, 3) 61 | positions /= 1000 # Meters instead of millimeters 62 | positions_17 = positions[:,joint_idx_train,:] 63 | positions_17[:, 1:] -= positions_17[:, :1] # Remove global offset, but keep trajectory in first position 64 | positions_3d_temp.append(positions_17.astype('float32')) 65 | positions_2d = hf['annot2'][index, 0].reshape(-1, 28, 2) 66 | positions_2d_temp.append(positions_2d[:,joint_idx_train,:].astype('float32')) 67 | 68 | output[subject]['Seq1'] = positions_3d_temp 69 | output_2d_poses[subject]['Seq1'] = positions_2d_temp 70 | 71 | positions_3d_temp = [] 72 | positions_2d_temp = [] 73 | hf = loadmat(file_2) 74 | for index in range(14): 75 | positions = hf['annot3'][index, 0].reshape(-1, 28, 3) 76 | positions /= 1000 # Meters instead of millimeters 77 | positions_17 = positions[:,joint_idx_train,:] 78 
| positions_17[:, 1:] -= positions_17[:, :1] # Remove global offset, but keep trajectory in first position 79 | positions_3d_temp.append(positions_17.astype('float32')) 80 | positions_2d = hf['annot2'][index, 0].reshape(-1, 28, 2) 81 | positions_2d_temp.append(positions_2d[:,joint_idx_train,:].astype('float32')) 82 | output[subject]['Seq2'] = positions_3d_temp 83 | output_2d_poses[subject]['Seq2'] = positions_2d_temp 84 | 85 | for subject in subjects_test: 86 | output[subject] = {} 87 | output_2d_poses[subject] = {} 88 | file_1 = args.from_source + '/mpi_inf_3dhp_test_set/mpi_inf_3dhp_test_set/' + subject + '/annot_data.mat' 89 | hf = {} 90 | f = h5py.File(file_1, 'r') # The test annotations are MATLAB v7.3 (HDF5) files, hence h5py; open read-only 91 | for k, v in f.items(): 92 | hf[k] = np.array(v) 93 | positions = hf['annot3'].reshape(-1, 17, 3) 94 | positions /= 1000 # Meters instead of millimeters 95 | positions_17 = positions # The test annotations already use the 17-joint format 96 | positions_17[:, 1:] -= positions_17[:, :1] # Remove global offset, but keep trajectory in first position 97 | output[subject]['Test'] = [positions_17.astype('float32')] 98 | positions_2d = hf['annot2'].reshape(-1, 17, 2) 99 | output_2d_poses[subject]['Test'] = [positions_2d.astype('float32')] 100 | 101 | print('Saving...') 102 | np.savez_compressed(output_filename, positions_3d=output) 103 | print('') 104 | print('Getting 2D poses...') 105 | dataset = MpiInf3dhpDataset(output_filename + '.npz') 106 | metadata = { 107 | 'num_joints': dataset.skeleton().num_joints(), 108 | 'keypoints_symmetry': [dataset.skeleton().joints_left(), dataset.skeleton().joints_right()] 109 | } 110 | print('Saving...') 111 | np.savez_compressed(output_filename_2d, positions_2d=output_2d_poses, metadata=metadata) 112 | 113 | print('Done.') 114 | else: 115 | print('Please specify the dataset source') 116 | exit(0) 117 | ''' 118 | # Create 2D pose file by reprojecting the 3D poses (kept for reference, currently disabled) 119 | print('') 120 | print('Computing ground-truth 2D poses...') 121 | dataset = MpiInf3dhpDataset(output_filename + '.npz') 122 | output_2d_poses = {} 123 | for subject in dataset.subjects(): 124 | output_2d_poses[subject] = {} 125 | for action in dataset[subject].keys(): 126 | anim = dataset[subject][action] 127 | 128 | positions_2d = [] 129 | for i,cam in enumerate(anim['cameras']): 130 | pos_3d = anim['positions'][i] 131 | pos_2d = wrap(project_to_2d, pos_3d, cam['intrinsic'], unsqueeze=True) 132 | pos_2d_pixel_space = image_coordinates(pos_2d, w=cam['res_w'], h=cam['res_h']) 133 | positions_2d.append(pos_2d_pixel_space.astype('float32')) 134 | output_2d_poses[subject][action] = positions_2d 135 | 136 | print('Saving...') 137 | metadata = { 138 | 'num_joints': dataset.skeleton().num_joints(), 139 | 'keypoints_symmetry': [dataset.skeleton().joints_left(), dataset.skeleton().joints_right()] 140 | } 141 | np.savez_compressed(output_filename_2d2, positions_2d=output_2d_poses, metadata=metadata) 142 | ''' 143 | 144 | -------------------------------------------------------------------------------- /common/mpi_inf_3dhp_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | import numpy as np 9 | import copy 10 | from common.skeleton import Skeleton 11 | from common.mocap_dataset import MocapDataset 12 | from common.camera import normalize_screen_coordinates, image_coordinates 13 | 14 | mpi_inf_3dhp_skeleton = Skeleton(parents=[-1, 0, 1, 2, 3, 4, 0, 6, 7, 8, 9, 0, 11, 12, 13, 14, 12, 15 | 16, 17, 18, 19, 20, 19, 22, 12, 24, 25, 26, 27, 28, 27, 30], 16 | joints_left=[6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 21, 22, 23], 17 | joints_right=[1, 2, 3, 4, 5, 24, 25, 26, 27, 28, 29, 30, 31]) 18 | 19 | subjects_train = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8'] 20 | subjects_test1 = ['TS1', 'TS2', 'TS3', 'TS4'] 21 | subjects_test2 = ['TS5', 'TS6'] 22 | 23 | mpi_inf_3dhp_cameras_intrinsic_params = [ 24 | { 25 | 'id': 'cam_0', 26 | 'center': [1024.704, 1051.394], 27 | 'focal_length': [1497.693, 1497.103], 28 | 'radial_distortion': [0, 0, 0], 29 | 'tangential_distortion': [0, 0], 30 | 'res_w': 2048, 31 | 'res_h': 2048, 32 | 'azimuth': 70, # Only used for visualization 33 | }, 34 | { 35 | 'id': 'cam_1', 36 | 'center': [1030.519, 1052.626], 37 | 'focal_length': [1495.217, 1495.52], 38 | 'radial_distortion': [0, 0, 0], 39 | 'tangential_distortion': [0, 0], 40 | 'res_w': 2048, 41 | 'res_h': 2048, 42 | 'azimuth': 70, # Only used for visualization 43 | }, 44 | { 45 | 'id': 'cam_2', 46 | 'center': [983.8873, 987.5902], 47 | 'focal_length': [1495.587, 1497.828], 48 | 'radial_distortion': [0, 0, 0], 49 | 'tangential_distortion': [0, 0], 50 | 'res_w': 2048, 51 | 'res_h': 2048, 52 | 'azimuth': 70, # Only used for visualization 53 | }, 54 | { 55 | 'id': 'cam_3', 56 | 'center': [1029.06, 1041.409], 57 | 'focal_length': [1495.886, 1496.033], 58 | 'radial_distortion': [0, 0, 0], 59 | 'tangential_distortion': [0, 0], 60 | 'res_w': 2048, 61 | 'res_h': 2048, 62 | 'azimuth': -110, # Only used for visualization 63 | }, 64 | { 65 | 'id': 'cam_4', 66 | 'center': [987.6075, 1019.069], 67 | 'focal_length': [1490.952, 1491.108], 68 | 'radial_distortion': [0, 0, 0], 69 | 'tangential_distortion': [0, 0], 70 | 'res_w': 2048, 71 | 'res_h': 2048, 72 | 'azimuth': 70, # Only used for visualization 73 | }, 74 | { 75 | 'id': 'cam_5', 76 | 'center': [1012.331, 998.5009], 77 | 'focal_length': [1500.414, 1499.971], 78 | 'radial_distortion': [0, 0, 0], 79 | 'tangential_distortion': [0, 0], 80 | 'res_w': 2048, 81 | 'res_h': 2048, 82 | 'azimuth': 70, # Only used for visualization 83 | }, 84 | { 85 | 'id': 'cam_6', 86 | 'center': [999.7319, 1010.251], 87 | 'focal_length': [1498.471, 1498.8], 88 | 'radial_distortion': [0, 0, 0], 89 | 'tangential_distortion': [0, 0], 90 | 'res_w': 2048, 91 | 'res_h': 2048, 92 | 'azimuth': 70, # Only used for visualization 93 | }, 94 | { 95 | 'id': 'cam_7', 96 | 'center': [987.2716, 976.8773], 97 | 'focal_length': [1498.831, 1499.674], 98 | 'radial_distortion': [0, 0, 0], 99 | 'tangential_distortion': [0, 0], 100 | 'res_w': 2048, 101 | 'res_h': 2048, 102 | 'azimuth': 70, # Only used for visualization 103 | }, 104 | { 105 | 'id': 'cam_8', 106 | 'center': [1017.387, 1043.032], 107 | 'focal_length': [1500.172, 1500.837], 108 | 'radial_distortion': [0, 0, 0], 109 | 'tangential_distortion': [0, 0], 110 | 'res_w': 2048, 111 | 'res_h': 2048, 112 | 'azimuth': 70, # Only used for visualization 113 | }, 114 | { 115 | 'id': 'cam_9', 116 | 'center': [1010.423, 1037.096], 117 | 'focal_length': [1501.554, 1501.9], 118 | 'radial_distortion': [0, 0, 0], 119 | 'tangential_distortion': [0, 0], 120 | 'res_w': 2048, 121 | 'res_h': 2048, 122 | 'azimuth': 70, # Only used for 
visualization 123 | }, 124 | { 125 | 'id': 'cam_10', 126 | 'center': [1041.614, 997.0433], 127 | 'focal_length': [1498.423, 1498.585], 128 | 'radial_distortion': [0, 0, 0], 129 | 'tangential_distortion': [0, 0], 130 | 'res_w': 2048, 131 | 'res_h': 2048, 132 | 'azimuth': 70, # Only used for visualization 133 | }, 134 | { 135 | 'id': 'cam_11', 136 | 'center': [1009.802, 999.9984], 137 | 'focal_length': [1495.779, 1493.703], 138 | 'radial_distortion': [0, 0, 0], 139 | 'tangential_distortion': [0, 0], 140 | 'res_w': 2048, 141 | 'res_h': 2048, 142 | 'azimuth': 70, # Only used for visualization 143 | }, 144 | { 145 | 'id': 'cam_12', 146 | 'center': [1000.56, 1014.975], 147 | 'focal_length': [1501.326, 1501.491], 148 | 'radial_distortion': [0, 0, 0], 149 | 'tangential_distortion': [0, 0], 150 | 'res_w': 2048, 151 | 'res_h': 2048, 152 | 'azimuth': 70, # Only used for visualization 153 | }, 154 | { 155 | 'id': 'cam_13', 156 | 'center': [1005.702, 1004.214], 157 | 'focal_length': [1496.961, 1497.378], 158 | 'radial_distortion': [0, 0, 0], 159 | 'tangential_distortion': [0, 0], 160 | 'res_w': 2048, 161 | 'res_h': 2048, 162 | 'azimuth': 70, # Only used for visualization 163 | }, 164 | { 165 | 'id': 'TS56', 166 | 'center': [939.85754016, 560.140743168], 167 | 'focal_length': [1683.98345952, 1672.59370772], 168 | 'radial_distortion': [-0.276859611, 0.131125256, -0.049318332], 169 | 'tangential_distortion': [-0.000360494, -0.001149441], 170 | 'res_w': 1920, 171 | 'res_h': 1080, 172 | 'azimuth': 70, # Only used for visualization 173 | }, 174 | ] 175 | 176 | mpi_inf_3dhp_cameras_extrinsic_params = { 177 | 'Train': [ 178 | { 179 | 'orientation': [0.9910573, 0.0000989, 0.1322565, -0.017709], 180 | 'translation': [-562.8666, 1398.138, 3852.623], 181 | }, 182 | { 183 | 'orientation': [0.8882246, -0.0698901, 0.4388433, -0.1165721], 184 | 'translation': [-1429.856, 738.1779, 4897.966], 185 | }, 186 | { 187 | 'orientation': [0.5651277, -0.0301201, 0.824319, -0.0148915], 188 | 'translation': [57.25702, 1307.287, 2799.822], 189 | }, 190 | { 191 | 'orientation': [0.6670245, -0.1827152, 0.7089925, -0.1379241], 192 | 'translation': [-284.8168, 807.9184, 3177.16], 193 | }, 194 | { 195 | 'orientation': [0.8273998, 0.0263385, 0.5589656, -0.0476783], 196 | 'translation': [-1563.911, 801.9608, 3517.316], 197 | }, 198 | { 199 | 'orientation': [-0.568842, 0.0159665, 0.8220693, -0.0191314], 200 | 'translation': [358.4134, 994.5658, 3439.832], 201 | }, 202 | { 203 | 'orientation': [0.2030824, -0.2818073, 0.9370704, -0.0352313], 204 | 'translation': [569.4388, 528.871, 3687.369], 205 | }, 206 | { 207 | 'orientation': [0.00086, 0.0123344, 0.9998223, -0.0142292], 208 | 'translation': [1378.866, 1270.781, 2631.567], 209 | }, 210 | { 211 | 'orientation': [0.7053718, 0.095632, -0.7004048, -0.0523286], 212 | 'translation': [221.3543, 659.87, 3644.688], 213 | }, 214 | { 215 | 'orientation': [0.6914033, 0.2036966, -0.6615312, -0.2069921], 216 | 'translation': [388.6217, 137.5452, 4216.635], 217 | }, 218 | { 219 | 'orientation': [-0.2266321, -0.2540748, 0.9401911, -0.0111636], 220 | 'translation': [1167.962, 617.6362, 4472.351], 221 | }, 222 | { 223 | 'orientation': [-0.4536946, -0.2035304, -0.0072578, 0.8675736], 224 | 'translation': [134.8272, 251.5094, 4570.244], 225 | }, 226 | { 227 | 'orientation': [-0.0778876, 0.8469901, -0.4230185, 0.3124046], 228 | 'translation': [412.4695, 532.7588, 4887.095], 229 | }, 230 | { 231 | 'orientation': [0.098712, 0.8023286, -0.5397436, -0.2349501], 232 | 'translation': [867.1278, 827.4572, 
3985.159], 233 | }, 234 | ], 235 | 'chestHeight': [ 236 | { 237 | 'orientation': [0.7053718, 0.095632, -0.7004048, -0.0523286], 238 | 'translation': [221.3543, 659.87, 3644.688], 239 | }, 240 | ], 241 | } 242 | 243 | 244 | class MpiInf3dhpDataset(MocapDataset): 245 | def __init__(self, path, remove_static_joints=True): 246 | super().__init__(fps=25, skeleton=mpi_inf_3dhp_skeleton) 247 | 248 | self._cameras = {} 249 | 250 | for subject in subjects_train: 251 | self._cameras[subject] = copy.deepcopy(mpi_inf_3dhp_cameras_extrinsic_params['Train']) 252 | 253 | for cameras in self._cameras.values(): 254 | for i, cam in enumerate(cameras): 255 | cam.update(mpi_inf_3dhp_cameras_intrinsic_params[i]) 256 | for k, v in cam.items(): 257 | if k not in ['id', 'res_w', 'res_h']: 258 | cam[k] = np.array(v, dtype='float32') 259 | 260 | # Normalize camera frame 261 | cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32') 262 | cam['focal_length'] = cam['focal_length']/cam['res_w']*2 263 | if 'translation' in cam: 264 | cam['translation'] = cam['translation']/1000 # mm to meters 265 | 266 | # Add intrinsic parameters vector 267 | cam['intrinsic'] = np.concatenate((cam['focal_length'], 268 | cam['center'], 269 | cam['radial_distortion'], 270 | cam['tangential_distortion'])) 271 | 272 | for subject in subjects_test1: # TS1-TS4 (studio test sequences) reuse the intrinsics of cam_8 273 | self._cameras[subject] = copy.deepcopy(mpi_inf_3dhp_cameras_extrinsic_params['chestHeight']) 274 | cam = self._cameras[subject][0] 275 | cam.update(mpi_inf_3dhp_cameras_intrinsic_params[8]) 276 | for k, v in cam.items(): 277 | if k not in ['id', 'res_w', 'res_h']: 278 | cam[k] = np.array(v, dtype='float32') 279 | 280 | # Normalize camera frame 281 | cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32') 282 | cam['focal_length'] = cam['focal_length']/cam['res_w']*2 283 | if 'translation' in cam: 284 | cam['translation'] = cam['translation']/1000 # mm to meters 285 | 286 | # Add intrinsic parameters vector 287 | cam['intrinsic'] = np.concatenate((cam['focal_length'], 288 | cam['center'], 289 | cam['radial_distortion'], 290 | cam['tangential_distortion'])) 291 | 292 | for subject in subjects_test2: # TS5-TS6 (outdoor test sequences) use the 1920x1080 'TS56' intrinsics 293 | self._cameras[subject] = copy.deepcopy(mpi_inf_3dhp_cameras_extrinsic_params['chestHeight']) 294 | cam = self._cameras[subject][0] 295 | cam.update(mpi_inf_3dhp_cameras_intrinsic_params[14]) 296 | for k, v in cam.items(): 297 | if k not in ['id', 'res_w', 'res_h']: 298 | cam[k] = np.array(v, dtype='float32') 299 | 300 | # Normalize camera frame 301 | cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype('float32') 302 | cam['focal_length'] = cam['focal_length']/cam['res_w']*2 303 | if 'translation' in cam: 304 | cam['translation'] = cam['translation']/1000 # mm to meters 305 | 306 | # Add intrinsic parameters vector 307 | cam['intrinsic'] = np.concatenate((cam['focal_length'], 308 | cam['center'], 309 | cam['radial_distortion'], 310 | cam['tangential_distortion'])) 311 | 312 | # Load serialized dataset 313 | data = np.load(path, allow_pickle=True)['positions_3d'].item() 314 | 315 | self._data = {} 316 | 317 | for subject, actions in data.items(): 318 | self._data[subject] = {} 319 | for action_name, positions in actions.items(): 320 | self._data[subject][action_name] = { 321 | 'positions': positions, 322 | 'cameras': self._cameras[subject], 323 | } 324 | 325 | if remove_static_joints: 326 | # Bring the skeleton to 17 joints instead of the 
original 32 327 | self._skeleton.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31]) 328 | 329 | # Rewire shoulders to the correct parents 330 | self._skeleton._parents[11] = 8 331 | self._skeleton._parents[14] = 8 332 | 333 | 334 | def supports_semi_supervised(self): 335 | return True 336 | 337 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import numpy as np 9 | 10 | from common.arguments import parse_args 11 | import torch 12 | 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | import torch.optim as optim 16 | import os 17 | import sys 18 | import errno 19 | 20 | from common.camera import * 21 | from common.model import * 22 | from common.loss import * 23 | from common.generators import ChunkedGenerator, UnchunkedGenerator 24 | from time import time 25 | from common.utils import deterministic_random 26 | 27 | args = parse_args() 28 | print(args) 29 | 30 | try: 31 | # Create checkpoint directory if it does not exist 32 | os.makedirs(args.checkpoint) 33 | except OSError as e: 34 | if e.errno != errno.EEXIST: 35 | raise RuntimeError('Unable to create checkpoint directory:', args.checkpoint) 36 | 37 | print('Loading dataset...') 38 | dataset_path = 'data/data_3d_' + args.dataset + '.npz' 39 | if args.dataset == 'h36m': 40 | from common.h36m_dataset import Human36mDataset 41 | dataset = Human36mDataset(dataset_path) 42 | elif args.dataset.startswith('humaneva'): 43 | from common.humaneva_dataset import HumanEvaDataset 44 | dataset = HumanEvaDataset(dataset_path) 45 | elif args.dataset.startswith('mpi'): 46 | from common.mpi_inf_3dhp_dataset import MpiInf3dhpDataset 47 | dataset = MpiInf3dhpDataset(dataset_path) 48 | elif args.dataset.startswith('custom'): 49 | from common.custom_dataset import CustomDataset 50 | dataset = CustomDataset('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz') 51 | else: 52 | raise KeyError('Invalid dataset') 53 | 54 | print('Preparing data...') 55 | if args.dataset.startswith('mpi'): 56 | for subject in dataset.subjects(): 57 | for action in dataset[subject].keys(): 58 | anim = dataset[subject][action] 59 | 60 | if 'positions' in anim: 61 | anim['positions_3d'] = anim['positions'] 62 | else: 63 | for subject in dataset.subjects(): 64 | for action in dataset[subject].keys(): 65 | anim = dataset[subject][action] 66 | 67 | if 'positions' in anim: 68 | positions_3d = [] 69 | for cam in anim['cameras']: 70 | pos_3d = world_to_camera(anim['positions'], R=cam['orientation'], t=cam['translation']) 71 | pos_3d[:, 1:] -= pos_3d[:, :1] # Remove global offset, but keep trajectory in first position 72 | positions_3d.append(pos_3d) 73 | anim['positions_3d'] = positions_3d 74 | 75 | print('Loading 2D detections...') 76 | keypoints = np.load('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz', allow_pickle=True) 77 | keypoints_metadata = keypoints['metadata'].item() 78 | keypoints_symmetry = keypoints_metadata['keypoints_symmetry'] 79 | kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1]) 80 | joints_left, joints_right = list(dataset.skeleton().joints_left()), list(dataset.skeleton().joints_right()) 81 | keypoints = keypoints['positions_2d'].item() 
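# Layout note: `keypoints` is now a plain nested dict, exactly as saved by
# data/prepare_data_mpi_inf_3dhp.py:
#   keypoints[subject][action] -> list over cameras of float32 arrays of shape (n_frames, 17, 2)
# The 3D poses in `dataset` mirror this layout with (n_frames, 17, 3) arrays; the checks
# below rely on the two structures lining up camera by camera.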
82 | 83 | for subject in dataset.subjects(): 84 | assert subject in keypoints, 'Subject {} is missing from the 2D detections dataset'.format(subject) 85 | for action in dataset[subject].keys(): 86 | assert action in keypoints[subject], 'Action {} of subject {} is missing from the 2D detections dataset'.format(action, subject) 87 | if 'positions_3d' not in dataset[subject][action]: 88 | continue 89 | 90 | for cam_idx in range(len(keypoints[subject][action])): 91 | 92 | # We check for >= instead of == because some videos in H3.6M contain extra frames 93 | mocap_length = dataset[subject][action]['positions_3d'][cam_idx].shape[0] 94 | assert keypoints[subject][action][cam_idx].shape[0] >= mocap_length 95 | 96 | if keypoints[subject][action][cam_idx].shape[0] > mocap_length: 97 | # Shorten sequence 98 | keypoints[subject][action][cam_idx] = keypoints[subject][action][cam_idx][:mocap_length] 99 | 100 | assert len(keypoints[subject][action]) == len(dataset[subject][action]['positions_3d']) 101 | 102 | for subject in keypoints.keys(): 103 | for action in keypoints[subject]: 104 | for cam_idx, kps in enumerate(keypoints[subject][action]): 105 | # Normalize camera frame 106 | cam = dataset.cameras()[subject][cam_idx] 107 | kps[..., :2] = normalize_screen_coordinates(kps[..., :2], w=cam['res_w'], h=cam['res_h']) 108 | keypoints[subject][action][cam_idx] = kps 109 | 110 | subjects_train = args.subjects_train.split(',') 111 | subjects_semi = [] if not args.subjects_unlabeled else args.subjects_unlabeled.split(',') 112 | if not args.render: 113 | subjects_test = args.subjects_test.split(',') 114 | else: 115 | subjects_test = [args.viz_subject] 116 | 117 | semi_supervised = len(subjects_semi) > 0 118 | if semi_supervised and not dataset.supports_semi_supervised(): 119 | raise RuntimeError('Semi-supervised training is not implemented for this dataset') 120 | 121 | def fetch(subjects, action_filter=None, subset=1, parse_3d_poses=True): 122 | out_poses_3d = [] 123 | out_poses_2d = [] 124 | out_camera_params = [] 125 | for subject in subjects: 126 | for action in keypoints[subject].keys(): 127 | if action_filter is not None: 128 | found = False 129 | for a in action_filter: 130 | if action.startswith(a): 131 | found = True 132 | break 133 | if not found: 134 | continue 135 | 136 | poses_2d = keypoints[subject][action] 137 | for i in range(len(poses_2d)): # Iterate across cameras 138 | out_poses_2d.append(poses_2d[i]) 139 | 140 | if subject in dataset.cameras(): 141 | cams = dataset.cameras()[subject] 142 | assert len(cams) == len(poses_2d), 'Camera count mismatch' 143 | for cam in cams: 144 | if 'intrinsic' in cam: 145 | out_camera_params.append(cam['intrinsic']) 146 | 147 | if parse_3d_poses and 'positions_3d' in dataset[subject][action]: 148 | poses_3d = dataset[subject][action]['positions_3d'] 149 | assert len(poses_3d) == len(poses_2d), 'Camera count mismatch' 150 | for i in range(len(poses_3d)): # Iterate across cameras 151 | out_poses_3d.append(poses_3d[i]) 152 | 153 | if len(out_camera_params) == 0: 154 | out_camera_params = None 155 | if len(out_poses_3d) == 0: 156 | out_poses_3d = None 157 | 158 | stride = args.downsample 159 | if subset < 1: 160 | for i in range(len(out_poses_2d)): 161 | n_frames = int(round(len(out_poses_2d[i])//stride * subset)*stride) 162 | start = deterministic_random(0, len(out_poses_2d[i]) - n_frames + 1, str(len(out_poses_2d[i]))) 163 | out_poses_2d[i] = out_poses_2d[i][start:start+n_frames:stride] 164 | if out_poses_3d is not None: 165 | out_poses_3d[i] = 
out_poses_3d[i][start:start+n_frames:stride] 166 | elif stride > 1: 167 | # Downsample as requested 168 | for i in range(len(out_poses_2d)): 169 | out_poses_2d[i] = out_poses_2d[i][::stride] 170 | if out_poses_3d is not None: 171 | out_poses_3d[i] = out_poses_3d[i][::stride] 172 | 173 | 174 | return out_camera_params, out_poses_3d, out_poses_2d 175 | 176 | action_filter = None if args.actions == '*' else args.actions.split(',') 177 | if action_filter is not None: 178 | print('Selected actions:', action_filter) 179 | 180 | cameras_valid, poses_valid, poses_valid_2d = fetch(subjects_test, action_filter) 181 | 182 | filter_widths = [int(x) for x in args.architecture.split(',')] 183 | if not args.disable_optimizations and not args.dense and args.stride == 1: 184 | # Use optimized model for single-frame predictions 185 | model_pos_train = TemporalModelOptimized1f(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], dataset.skeleton().num_joints(), 186 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels) 187 | else: 188 | # When incompatible settings are detected (stride > 1, dense filters, or disabled optimization) fall back to the normal model 189 | model_pos_train = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], dataset.skeleton().num_joints(), 190 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels, 191 | dense=args.dense) 192 | 193 | model_pos = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], dataset.skeleton().num_joints(), 194 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels, 195 | dense=args.dense) 196 | 197 | receptive_field = model_pos.receptive_field() 198 | print('INFO: Receptive field: {} frames'.format(receptive_field)) 199 | pad = (receptive_field - 1) // 2 # Padding on each side 200 | if args.causal: 201 | print('INFO: Using causal convolutions') 202 | causal_shift = pad 203 | else: 204 | causal_shift = 0 205 | 206 | model_params = 0 207 | for parameter in model_pos.parameters(): 208 | model_params += parameter.numel() 209 | print('INFO: Trainable parameter count:', model_params) 210 | 211 | if torch.cuda.is_available(): 212 | model_pos = model_pos.cuda() 213 | model_pos_train = model_pos_train.cuda() 214 | 215 | if args.resume or args.evaluate: 216 | chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate) 217 | print('Loading checkpoint', chk_filename) 218 | checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage) 219 | print('This model was trained for {} epochs'.format(checkpoint['epoch'])) 220 | model_pos_train.load_state_dict(checkpoint['model_pos']) 221 | model_pos.load_state_dict(checkpoint['model_pos']) 222 | 223 | if args.evaluate and 'model_traj' in checkpoint: 224 | # Load trajectory model if it is contained in the checkpoint (e.g. 
for inference in the wild) 225 | model_traj = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], 1, 226 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels, 227 | dense=args.dense) 228 | if torch.cuda.is_available(): 229 | model_traj = model_traj.cuda() 230 | model_traj.load_state_dict(checkpoint['model_traj']) 231 | else: 232 | model_traj = None 233 | 234 | 235 | test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d, 236 | pad=pad, causal_shift=causal_shift, augment=False, 237 | kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) 238 | print('INFO: Testing on {} frames'.format(test_generator.num_frames())) 239 | 240 | if not args.evaluate: 241 | cameras_train, poses_train, poses_train_2d = fetch(subjects_train, action_filter, subset=args.subset) 242 | 243 | lr = args.learning_rate 244 | if semi_supervised: 245 | cameras_semi, _, poses_semi_2d = fetch(subjects_semi, action_filter, parse_3d_poses=False) 246 | 247 | if not args.disable_optimizations and not args.dense and args.stride == 1: 248 | # Use optimized model for single-frame predictions 249 | model_traj_train = TemporalModelOptimized1f(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], 1, 250 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels) 251 | else: 252 | # When incompatible settings are detected (stride > 1, dense filters, or disabled optimization) fall back to normal model 253 | model_traj_train = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], 1, 254 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels, 255 | dense=args.dense) 256 | 257 | model_traj = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], 1, 258 | filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels, 259 | dense=args.dense) 260 | if torch.cuda.is_available(): 261 | model_traj = model_traj.cuda() 262 | model_traj_train = model_traj_train.cuda() 263 | optimizer = optim.Adam(list(model_pos_train.parameters()) + list(model_traj_train.parameters()), 264 | lr=lr, amsgrad=True) 265 | 266 | losses_2d_train_unlabeled = [] 267 | losses_2d_train_labeled_eval = [] 268 | losses_2d_train_unlabeled_eval = [] 269 | losses_2d_valid = [] 270 | 271 | losses_traj_train = [] 272 | losses_traj_train_eval = [] 273 | losses_traj_valid = [] 274 | else: 275 | optimizer = optim.Adam(model_pos_train.parameters(), lr=lr, amsgrad=True) 276 | 277 | lr_decay = args.lr_decay 278 | 279 | losses_3d_train = [] 280 | losses_3d_train_eval = [] 281 | losses_3d_valid = [] 282 | 283 | epoch = 0 284 | initial_momentum = 0.8 #0.1 285 | final_momentum = 0.6 #0.001 286 | 287 | 288 | train_generator = ChunkedGenerator(args.batch_size//args.stride, cameras_train, poses_train, poses_train_2d, args.stride, 289 | pad=pad, causal_shift=causal_shift, shuffle=True, augment=args.data_augmentation, 290 | kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) 291 | train_generator_eval = UnchunkedGenerator(cameras_train, poses_train, poses_train_2d, 292 | pad=pad, causal_shift=causal_shift, augment=False) 293 | print('INFO: Training on {} frames'.format(train_generator_eval.num_frames())) 294 | if semi_supervised: 295 | semi_generator = ChunkedGenerator(args.batch_size//args.stride, cameras_semi, None, poses_semi_2d, args.stride, 296 | pad=pad, causal_shift=causal_shift, 
shuffle=True, 297 | random_seed=4321, augment=args.data_augmentation, 298 | kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right, 299 | endless=True) 300 | semi_generator_eval = UnchunkedGenerator(cameras_semi, None, poses_semi_2d, 301 | pad=pad, causal_shift=causal_shift, augment=False) 302 | print('INFO: Semi-supervision on {} frames'.format(semi_generator_eval.num_frames())) 303 | 304 | if args.resume: 305 | epoch = checkpoint['epoch'] 306 | if 'optimizer' in checkpoint and checkpoint['optimizer'] is not None: 307 | optimizer.load_state_dict(checkpoint['optimizer']) 308 | train_generator.set_random_state(checkpoint['random_state']) 309 | else: 310 | print('WARNING: this checkpoint does not contain an optimizer state. The optimizer will be reinitialized.') 311 | 312 | lr = checkpoint['lr'] 313 | if semi_supervised: 314 | model_traj_train.load_state_dict(checkpoint['model_traj']) 315 | model_traj.load_state_dict(checkpoint['model_traj']) 316 | semi_generator.set_random_state(checkpoint['random_state_semi']) 317 | 318 | print('** Note: reported losses are averaged over all frames and test-time augmentation is not used here.') 319 | print('** The final evaluation will be carried out after the last training epoch.') 320 | 321 | # Pos model only 322 | while epoch < args.epochs: 323 | start_time = time() 324 | epoch_loss_3d_train = 0 325 | epoch_loss_traj_train = 0 326 | epoch_loss_2d_train_unlabeled = 0 327 | N = 0 328 | N_semi = 0 329 | model_pos_train.train() 330 | if semi_supervised: 331 | # Semi-supervised scenario 332 | model_traj_train.train() 333 | for (_, batch_3d, batch_2d), (cam_semi, _, batch_2d_semi) in \ 334 | zip(train_generator.next_epoch(), semi_generator.next_epoch()): 335 | 336 | # Fall back to supervised training for the first epoch (to avoid instability) 337 | skip = epoch < args.warmup 338 | 339 | cam_semi = torch.from_numpy(cam_semi.astype('float32')) 340 | inputs_3d = torch.from_numpy(batch_3d.astype('float32')) 341 | if torch.cuda.is_available(): 342 | cam_semi = cam_semi.cuda() 343 | inputs_3d = inputs_3d.cuda() 344 | 345 | inputs_traj = inputs_3d[:, :, :1].clone() 346 | inputs_3d[:, :, 0] = 0 347 | 348 | # Split point between labeled and unlabeled samples in the batch 349 | split_idx = inputs_3d.shape[0] 350 | 351 | inputs_2d = torch.from_numpy(batch_2d.astype('float32')) 352 | inputs_2d_semi = torch.from_numpy(batch_2d_semi.astype('float32')) 353 | if torch.cuda.is_available(): 354 | inputs_2d = inputs_2d.cuda() 355 | inputs_2d_semi = inputs_2d_semi.cuda() 356 | inputs_2d_cat = torch.cat((inputs_2d, inputs_2d_semi), dim=0) if not skip else inputs_2d 357 | 358 | optimizer.zero_grad() 359 | 360 | # Compute 3D poses 361 | predicted_3d_pos_cat = model_pos_train(inputs_2d_cat) 362 | 363 | loss_3d_pos = mpjpe(predicted_3d_pos_cat[:split_idx], inputs_3d) 364 | epoch_loss_3d_train += inputs_3d.shape[0]*inputs_3d.shape[1] * loss_3d_pos.item() 365 | N += inputs_3d.shape[0]*inputs_3d.shape[1] 366 | loss_total = loss_3d_pos 367 | 368 | # Compute global trajectory 369 | predicted_traj_cat = model_traj_train(inputs_2d_cat) 370 | w = 1 / inputs_traj[:, :, :, 2] # Weight inversely proportional to depth 371 | loss_traj = weighted_mpjpe(predicted_traj_cat[:split_idx], inputs_traj, w) 372 | epoch_loss_traj_train += inputs_3d.shape[0]*inputs_3d.shape[1] * loss_traj.item() 373 | assert inputs_traj.shape[0]*inputs_traj.shape[1] == inputs_3d.shape[0]*inputs_3d.shape[1] 374 | loss_total += loss_traj 375 | 376 | if not skip: 377 | # Semi-supervised loss 
for unlabeled samples 378 | predicted_semi = predicted_3d_pos_cat[split_idx:] 379 | if pad > 0: 380 | target_semi = inputs_2d_semi[:, pad:-pad, :, :2].contiguous() 381 | else: 382 | target_semi = inputs_2d_semi[:, :, :, :2].contiguous() 383 | 384 | projection_func = project_to_2d_linear if args.linear_projection else project_to_2d 385 | reconstruction_semi = projection_func(predicted_semi + predicted_traj_cat[split_idx:], cam_semi) 386 | 387 | loss_reconstruction = mpjpe(reconstruction_semi, target_semi) # On 2D poses 388 | epoch_loss_2d_train_unlabeled += predicted_semi.shape[0]*predicted_semi.shape[1] * loss_reconstruction.item() 389 | if not args.no_proj: 390 | loss_total += loss_reconstruction 391 | 392 | # Bone length term to enforce kinematic constraints 393 | if args.bone_length_term: 394 | dists = predicted_3d_pos_cat[:, :, 1:] - predicted_3d_pos_cat[:, :, dataset.skeleton().parents()[1:]] 395 | bone_lengths = torch.mean(torch.norm(dists, dim=3), dim=1) 396 | penalty = torch.mean(torch.abs(torch.mean(bone_lengths[:split_idx], dim=0) \ 397 | - torch.mean(bone_lengths[split_idx:], dim=0))) 398 | loss_total += penalty 399 | 400 | 401 | N_semi += predicted_semi.shape[0]*predicted_semi.shape[1] 402 | else: 403 | N_semi += 1 # To avoid division by zero 404 | 405 | loss_total.backward() 406 | 407 | optimizer.step() 408 | losses_traj_train.append(epoch_loss_traj_train / N) 409 | losses_2d_train_unlabeled.append(epoch_loss_2d_train_unlabeled / N_semi) 410 | else: 411 | # Regular supervised scenario 412 | for _, batch_3d, batch_2d in train_generator.next_epoch(): 413 | inputs_3d = torch.from_numpy(batch_3d.astype('float32')) 414 | inputs_2d = torch.from_numpy(batch_2d.astype('float32')) 415 | if torch.cuda.is_available(): 416 | inputs_3d = inputs_3d.cuda() 417 | inputs_2d = inputs_2d.cuda() 418 | inputs_3d[:, :, 0] = 0 419 | 420 | optimizer.zero_grad() 421 | 422 | # Predict 3D poses 423 | predicted_3d_pos = model_pos_train(inputs_2d) 424 | loss_3d_pos = mpjpe(predicted_3d_pos, inputs_3d) 425 | epoch_loss_3d_train += inputs_3d.shape[0]*inputs_3d.shape[1] * loss_3d_pos.item() 426 | N += inputs_3d.shape[0]*inputs_3d.shape[1] 427 | 428 | loss_total = loss_3d_pos 429 | loss_total.backward() 430 | 431 | optimizer.step() 432 | 433 | losses_3d_train.append(epoch_loss_3d_train / N) 434 | 435 | # End-of-epoch evaluation 436 | with torch.no_grad(): 437 | model_pos.load_state_dict(model_pos_train.state_dict()) 438 | model_pos.eval() 439 | model_pos.set_bn_momentum(0.9) 440 | if semi_supervised: 441 | model_traj.load_state_dict(model_traj_train.state_dict()) 442 | model_traj.eval() 443 | 444 | epoch_loss_3d_valid = 0 445 | epoch_loss_traj_valid = 0 446 | epoch_loss_2d_valid = 0 447 | N = 0 448 | 449 | if not args.no_eval: 450 | # Evaluate on test set 451 | for cam, batch, batch_2d in test_generator.next_epoch(): 452 | inputs_3d = torch.from_numpy(batch.astype('float32')) 453 | inputs_2d = torch.from_numpy(batch_2d.astype('float32')) 454 | if torch.cuda.is_available(): 455 | inputs_3d = inputs_3d.cuda() 456 | inputs_2d = inputs_2d.cuda() 457 | inputs_traj = inputs_3d[:, :, :1].clone() 458 | inputs_3d[:, :, 0] = 0 459 | 460 | # Predict 3D poses 461 | predicted_3d_pos = model_pos(inputs_2d) 462 | loss_3d_pos = mpjpe(predicted_3d_pos, inputs_3d) 463 | epoch_loss_3d_valid += inputs_3d.shape[0]*inputs_3d.shape[1] * loss_3d_pos.item() 464 | N += inputs_3d.shape[0]*inputs_3d.shape[1] 465 | 466 | if semi_supervised: 467 | cam = torch.from_numpy(cam.astype('float32')) 468 | if torch.cuda.is_available(): 469 | cam 
= cam.cuda() 470 | 471 | predicted_traj = model_traj(inputs_2d) 472 | loss_traj = mpjpe(predicted_traj, inputs_traj) 473 | epoch_loss_traj_valid += inputs_traj.shape[0]*inputs_traj.shape[1] * loss_traj.item() 474 | assert inputs_traj.shape[0]*inputs_traj.shape[1] == inputs_3d.shape[0]*inputs_3d.shape[1] 475 | 476 | if pad > 0: 477 | target = inputs_2d[:, pad:-pad, :, :2].contiguous() 478 | else: 479 | target = inputs_2d[:, :, :, :2].contiguous() 480 | reconstruction = project_to_2d(predicted_3d_pos + predicted_traj, cam) 481 | loss_reconstruction = mpjpe(reconstruction, target) # On 2D poses 482 | epoch_loss_2d_valid += reconstruction.shape[0]*reconstruction.shape[1] * loss_reconstruction.item() 483 | assert reconstruction.shape[0]*reconstruction.shape[1] == inputs_3d.shape[0]*inputs_3d.shape[1] 484 | 485 | losses_3d_valid.append(epoch_loss_3d_valid / N) 486 | if semi_supervised: 487 | losses_traj_valid.append(epoch_loss_traj_valid / N) 488 | losses_2d_valid.append(epoch_loss_2d_valid / N) 489 | 490 | 491 | # Evaluate on training set, this time in evaluation mode 492 | epoch_loss_3d_train_eval = 0 493 | epoch_loss_traj_train_eval = 0 494 | epoch_loss_2d_train_labeled_eval = 0 495 | N = 0 496 | for cam, batch, batch_2d in train_generator_eval.next_epoch(): 497 | if batch_2d.shape[1] == 0: 498 | # This can only happen when downsampling the dataset 499 | continue 500 | 501 | inputs_3d = torch.from_numpy(batch.astype('float32')) 502 | inputs_2d = torch.from_numpy(batch_2d.astype('float32')) 503 | if torch.cuda.is_available(): 504 | inputs_3d = inputs_3d.cuda() 505 | inputs_2d = inputs_2d.cuda() 506 | inputs_traj = inputs_3d[:, :, :1].clone() 507 | inputs_3d[:, :, 0] = 0 508 | 509 | # Compute 3D poses 510 | predicted_3d_pos = model_pos(inputs_2d) 511 | loss_3d_pos = mpjpe(predicted_3d_pos, inputs_3d) 512 | epoch_loss_3d_train_eval += inputs_3d.shape[0]*inputs_3d.shape[1] * loss_3d_pos.item() 513 | N += inputs_3d.shape[0]*inputs_3d.shape[1] 514 | 515 | if semi_supervised: 516 | cam = torch.from_numpy(cam.astype('float32')) 517 | if torch.cuda.is_available(): 518 | cam = cam.cuda() 519 | predicted_traj = model_traj(inputs_2d) 520 | loss_traj = mpjpe(predicted_traj, inputs_traj) 521 | epoch_loss_traj_train_eval += inputs_traj.shape[0]*inputs_traj.shape[1] * loss_traj.item() 522 | assert inputs_traj.shape[0]*inputs_traj.shape[1] == inputs_3d.shape[0]*inputs_3d.shape[1] 523 | 524 | if pad > 0: 525 | target = inputs_2d[:, pad:-pad, :, :2].contiguous() 526 | else: 527 | target = inputs_2d[:, :, :, :2].contiguous() 528 | reconstruction = project_to_2d(predicted_3d_pos + predicted_traj, cam) 529 | loss_reconstruction = mpjpe(reconstruction, target) 530 | epoch_loss_2d_train_labeled_eval += reconstruction.shape[0]*reconstruction.shape[1] * loss_reconstruction.item() 531 | assert reconstruction.shape[0]*reconstruction.shape[1] == inputs_3d.shape[0]*inputs_3d.shape[1] 532 | 533 | losses_3d_train_eval.append(epoch_loss_3d_train_eval / N) 534 | if semi_supervised: 535 | losses_traj_train_eval.append(epoch_loss_traj_train_eval / N) 536 | losses_2d_train_labeled_eval.append(epoch_loss_2d_train_labeled_eval / N) 537 | 538 | # Evaluate 2D loss on unlabeled training set (in evaluation mode) 539 | epoch_loss_2d_train_unlabeled_eval = 0 540 | N_semi = 0 541 | if semi_supervised: 542 | for cam, _, batch_2d in semi_generator_eval.next_epoch(): 543 | cam = torch.from_numpy(cam.astype('float32')) 544 | inputs_2d_semi = torch.from_numpy(batch_2d.astype('float32')) 545 | if torch.cuda.is_available(): 546 | cam = 
cam.cuda() 547 | inputs_2d_semi = inputs_2d_semi.cuda() 548 | 549 | predicted_3d_pos_semi = model_pos(inputs_2d_semi) 550 | predicted_traj_semi = model_traj(inputs_2d_semi) 551 | if pad > 0: 552 | target_semi = inputs_2d_semi[:, pad:-pad, :, :2].contiguous() 553 | else: 554 | target_semi = inputs_2d_semi[:, :, :, :2].contiguous() 555 | reconstruction_semi = project_to_2d(predicted_3d_pos_semi + predicted_traj_semi, cam) 556 | loss_reconstruction_semi = mpjpe(reconstruction_semi, target_semi) 557 | 558 | epoch_loss_2d_train_unlabeled_eval += reconstruction_semi.shape[0]*reconstruction_semi.shape[1] \ 559 | * loss_reconstruction_semi.item() 560 | N_semi += reconstruction_semi.shape[0]*reconstruction_semi.shape[1] 561 | losses_2d_train_unlabeled_eval.append(epoch_loss_2d_train_unlabeled_eval / N_semi) 562 | 563 | elapsed = (time() - start_time)/60 564 | 565 | if args.no_eval: 566 | print('[%d] time %.2f lr %f 3d_train %f' % ( 567 | epoch + 1, 568 | elapsed, 569 | lr, 570 | losses_3d_train[-1] * 1000)) 571 | else: 572 | if semi_supervised: 573 | print('[%d] time %.2f lr %f 3d_train %f 3d_eval %f traj_eval %f 3d_valid %f ' 574 | 'traj_valid %f 2d_train_sup %f 2d_train_unsup %f 2d_valid %f' % ( 575 | epoch + 1, 576 | elapsed, 577 | lr, 578 | losses_3d_train[-1] * 1000, 579 | losses_3d_train_eval[-1] * 1000, 580 | losses_traj_train_eval[-1] * 1000, 581 | losses_3d_valid[-1] * 1000, 582 | losses_traj_valid[-1] * 1000, 583 | losses_2d_train_labeled_eval[-1], 584 | losses_2d_train_unlabeled_eval[-1], 585 | losses_2d_valid[-1])) 586 | else: 587 | print('[%d] time %.2f lr %f 3d_train %f 3d_eval %f 3d_valid %f' % ( 588 | epoch + 1, 589 | elapsed, 590 | lr, 591 | losses_3d_train[-1] * 1000, 592 | losses_3d_train_eval[-1] * 1000, 593 | losses_3d_valid[-1] *1000)) 594 | 595 | # Decay learning rate exponentially 596 | lr *= lr_decay 597 | for param_group in optimizer.param_groups: 598 | param_group['lr'] *= lr_decay 599 | epoch += 1 600 | 601 | # Decay BatchNorm momentum 602 | momentum = initial_momentum * np.exp(-epoch/args.epochs * np.log(initial_momentum/final_momentum)) 603 | model_pos_train.set_bn_momentum(momentum) 604 | if semi_supervised: 605 | model_traj_train.set_bn_momentum(momentum) 606 | 607 | # Save checkpoint if necessary 608 | if epoch % args.checkpoint_frequency == 0: 609 | chk_path = os.path.join(args.checkpoint, 'epoch_{}.bin'.format(epoch)) 610 | print('Saving checkpoint to', chk_path) 611 | 612 | torch.save({ 613 | 'epoch': epoch, 614 | 'lr': lr, 615 | 'random_state': train_generator.random_state(), 616 | 'optimizer': optimizer.state_dict(), 617 | 'model_pos': model_pos_train.state_dict(), 618 | 'model_traj': model_traj_train.state_dict() if semi_supervised else None, 619 | 'random_state_semi': semi_generator.random_state() if semi_supervised else None, 620 | }, chk_path) 621 | 622 | # Save training curves after every epoch, as .png images (if requested) 623 | if args.export_training_curves and epoch > 3: 624 | if 'matplotlib' not in sys.modules: 625 | import matplotlib 626 | matplotlib.use('Agg') 627 | import matplotlib.pyplot as plt 628 | 629 | plt.figure() 630 | epoch_x = np.arange(3, len(losses_3d_train)) + 1 631 | plt.plot(epoch_x, losses_3d_train[3:], '--', color='C0') 632 | plt.plot(epoch_x, losses_3d_train_eval[3:], color='C0') 633 | plt.plot(epoch_x, losses_3d_valid[3:], color='C1') 634 | plt.legend(['3d train', '3d train (eval)', '3d valid (eval)']) 635 | plt.ylabel('MPJPE (m)') 636 | plt.xlabel('Epoch') 637 | plt.xlim((3, epoch)) 638 | 
plt.savefig(os.path.join(args.checkpoint, 'loss_3d.png')) 639 | 640 | if semi_supervised: 641 | plt.figure() 642 | plt.plot(epoch_x, losses_traj_train[3:], '--', color='C0') 643 | plt.plot(epoch_x, losses_traj_train_eval[3:], color='C0') 644 | plt.plot(epoch_x, losses_traj_valid[3:], color='C1') 645 | plt.legend(['traj. train', 'traj. train (eval)', 'traj. valid (eval)']) 646 | plt.ylabel('Mean distance (m)') 647 | plt.xlabel('Epoch') 648 | plt.xlim((3, epoch)) 649 | plt.savefig(os.path.join(args.checkpoint, 'loss_traj.png')) 650 | 651 | plt.figure() 652 | plt.plot(epoch_x, losses_2d_train_labeled_eval[3:], color='C0') 653 | plt.plot(epoch_x, losses_2d_train_unlabeled[3:], '--', color='C1') 654 | plt.plot(epoch_x, losses_2d_train_unlabeled_eval[3:], color='C1') 655 | plt.plot(epoch_x, losses_2d_valid[3:], color='C2') 656 | plt.legend(['2d train labeled (eval)', '2d train unlabeled', '2d train unlabeled (eval)', '2d valid (eval)']) 657 | plt.ylabel('MPJPE (2D)') 658 | plt.xlabel('Epoch') 659 | plt.xlim((3, epoch)) 660 | plt.savefig(os.path.join(args.checkpoint, 'loss_2d.png')) 661 | plt.close('all') 662 | 663 | # Evaluate 664 | def evaluate(test_generator, action=None, return_predictions=False, use_trajectory_model=False): 665 | epoch_loss_3d_pos = 0 666 | epoch_loss_3d_pos_procrustes = 0 667 | epoch_loss_3d_pos_scale = 0 668 | epoch_loss_3d_vel = 0 669 | with torch.no_grad(): 670 | if not use_trajectory_model: 671 | model_pos.eval() 672 | else: 673 | model_traj.eval() 674 | N = 0 675 | for _, batch, batch_2d in test_generator.next_epoch(): 676 | inputs_2d = torch.from_numpy(batch_2d.astype('float32')) 677 | if torch.cuda.is_available(): 678 | inputs_2d = inputs_2d.cuda() 679 | 680 | # Positional model 681 | if not use_trajectory_model: 682 | predicted_3d_pos = model_pos(inputs_2d) 683 | else: 684 | predicted_3d_pos = model_traj(inputs_2d) 685 | 686 | # Test-time augmentation (if enabled) 687 | if test_generator.augment_enabled(): 688 | # Undo flipping and take average with non-flipped version 689 | predicted_3d_pos[1, :, :, 0] *= -1 690 | if not use_trajectory_model: 691 | predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left] 692 | predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True) 693 | 694 | if return_predictions: 695 | return predicted_3d_pos.squeeze(0).cpu().numpy() 696 | 697 | inputs_3d = torch.from_numpy(batch.astype('float32')) 698 | if torch.cuda.is_available(): 699 | inputs_3d = inputs_3d.cuda() 700 | inputs_3d[:, :, 0] = 0 701 | if test_generator.augment_enabled(): 702 | inputs_3d = inputs_3d[:1] 703 | 704 | error = mpjpe(predicted_3d_pos, inputs_3d) 705 | epoch_loss_3d_pos_scale += inputs_3d.shape[0]*inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item() 706 | 707 | epoch_loss_3d_pos += inputs_3d.shape[0]*inputs_3d.shape[1] * error.item() 708 | N += inputs_3d.shape[0] * inputs_3d.shape[1] 709 | 710 | inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1]) 711 | predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1]) 712 | 713 | epoch_loss_3d_pos_procrustes += inputs_3d.shape[0]*inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs) 714 | 715 | # Compute velocity error 716 | epoch_loss_3d_vel += inputs_3d.shape[0]*inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs) 717 | 718 | if action is None: 719 | print('----------') 720 | else: 721 | print('----'+action+'----') 722 | e1 = (epoch_loss_3d_pos / N)*1000 723 | 
e2 = (epoch_loss_3d_pos_procrustes / N)*1000 724 | e3 = (epoch_loss_3d_pos_scale / N)*1000 725 | ev = (epoch_loss_3d_vel / N)*1000 726 | print('Test time augmentation:', test_generator.augment_enabled()) 727 | print('Protocol #1 Error (MPJPE):', e1, 'mm') 728 | print('Protocol #2 Error (P-MPJPE):', e2, 'mm') 729 | print('Protocol #3 Error (N-MPJPE):', e3, 'mm') 730 | print('Velocity Error (MPJVE):', ev, 'mm') 731 | print('----------') 732 | 733 | return e1, e2, e3, ev 734 | 735 | 736 | if args.render: 737 | print('Rendering...') 738 | 739 | input_keypoints = keypoints[args.viz_subject][args.viz_action][args.viz_camera].copy() 740 | ground_truth = None 741 | if args.viz_subject in dataset.subjects() and args.viz_action in dataset[args.viz_subject]: 742 | if 'positions_3d' in dataset[args.viz_subject][args.viz_action]: 743 | ground_truth = dataset[args.viz_subject][args.viz_action]['positions_3d'][args.viz_camera].copy() 744 | if ground_truth is None: 745 | print('INFO: this action is unlabeled. Ground truth will not be rendered.') 746 | 747 | gen = UnchunkedGenerator(None, None, [input_keypoints], 748 | pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation, 749 | kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) 750 | prediction = evaluate(gen, return_predictions=True) 751 | if model_traj is not None and ground_truth is None: 752 | prediction_traj = evaluate(gen, return_predictions=True, use_trajectory_model=True) 753 | prediction += prediction_traj 754 | 755 | if args.viz_export is not None: 756 | print('Exporting joint positions to', args.viz_export) 757 | # Predictions are in camera space 758 | np.save(args.viz_export, prediction) 759 | 760 | if args.viz_output is not None: 761 | if ground_truth is not None: 762 | # Reapply trajectory 763 | trajectory = ground_truth[:, :1] 764 | ground_truth[:, 1:] += trajectory 765 | prediction += trajectory 766 | 767 | # Invert camera transformation 768 | cam = dataset.cameras()[args.viz_subject][args.viz_camera] 769 | if ground_truth is not None: 770 | prediction = camera_to_world(prediction, R=cam['orientation'], t=cam['translation']) 771 | ground_truth = camera_to_world(ground_truth, R=cam['orientation'], t=cam['translation']) 772 | else: 773 | # If the ground truth is not available, take the camera extrinsic params from a random subject. 774 | # They are almost the same, and anyway, we only need this for visualization purposes. 
775 | for subject in dataset.cameras(): 776 | if 'orientation' in dataset.cameras()[subject][args.viz_camera]: 777 | rot = dataset.cameras()[subject][args.viz_camera]['orientation'] 778 | break 779 | prediction = camera_to_world(prediction, R=rot, t=0) 780 | # We don't have the trajectory, but at least we can rebase the height 781 | prediction[:, :, 2] -= np.min(prediction[:, :, 2]) 782 | 783 | anim_output = {'Reconstruction': prediction} 784 | if ground_truth is not None and not args.viz_no_ground_truth: 785 | anim_output['Ground truth'] = ground_truth 786 | 787 | input_keypoints = image_coordinates(input_keypoints[..., :2], w=cam['res_w'], h=cam['res_h']) 788 | 789 | from common.visualization import render_animation 790 | render_animation(input_keypoints, keypoints_metadata, anim_output, 791 | dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'], args.viz_output, 792 | limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size, 793 | input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']), 794 | input_video_skip=args.viz_skip) 795 | 796 | else: 797 | print('Evaluating...') 798 | all_actions = {} 799 | all_actions_by_subject = {} 800 | for subject in subjects_test: 801 | if subject not in all_actions_by_subject: 802 | all_actions_by_subject[subject] = {} 803 | 804 | for action in dataset[subject].keys(): 805 | action_name = action.split(' ')[0] 806 | if action_name not in all_actions: 807 | all_actions[action_name] = [] 808 | if action_name not in all_actions_by_subject[subject]: 809 | all_actions_by_subject[subject][action_name] = [] 810 | all_actions[action_name].append((subject, action)) 811 | all_actions_by_subject[subject][action_name].append((subject, action)) 812 | 813 | def fetch_actions(actions): 814 | out_poses_3d = [] 815 | out_poses_2d = [] 816 | 817 | for subject, action in actions: 818 | poses_2d = keypoints[subject][action] 819 | for i in range(len(poses_2d)): # Iterate across cameras 820 | out_poses_2d.append(poses_2d[i]) 821 | 822 | poses_3d = dataset[subject][action]['positions_3d'] 823 | assert len(poses_3d) == len(poses_2d), 'Camera count mismatch' 824 | for i in range(len(poses_3d)): # Iterate across cameras 825 | out_poses_3d.append(poses_3d[i]) 826 | 827 | stride = args.downsample 828 | if stride > 1: 829 | # Downsample as requested 830 | for i in range(len(out_poses_2d)): 831 | out_poses_2d[i] = out_poses_2d[i][::stride] 832 | if out_poses_3d is not None: 833 | out_poses_3d[i] = out_poses_3d[i][::stride] 834 | 835 | return out_poses_3d, out_poses_2d 836 | 837 | def run_evaluation(actions, action_filter=None): 838 | errors_p1 = [] 839 | errors_p2 = [] 840 | errors_p3 = [] 841 | errors_vel = [] 842 | 843 | for action_key in actions.keys(): 844 | if action_filter is not None: 845 | found = False 846 | for a in action_filter: 847 | if action_key.startswith(a): 848 | found = True 849 | break 850 | if not found: 851 | continue 852 | 853 | poses_act, poses_2d_act = fetch_actions(actions[action_key]) 854 | gen = UnchunkedGenerator(None, poses_act, poses_2d_act, 855 | pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation, 856 | kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) 857 | e1, e2, e3, ev = evaluate(gen, action_key) 858 | errors_p1.append(e1) 859 | errors_p2.append(e2) 860 | errors_p3.append(e3) 861 | errors_vel.append(ev) 862 | 863 | print('Protocol #1 (MPJPE) action-wise average:', round(np.mean(errors_p1), 1), 'mm') 864 | print('Protocol #2 (P-MPJPE) 
action-wise average:', round(np.mean(errors_p2), 1), 'mm') 865 | print('Protocol #3 (N-MPJPE) action-wise average:', round(np.mean(errors_p3), 1), 'mm') 866 | print('Velocity (MPJVE) action-wise average:', round(np.mean(errors_vel), 2), 'mm') 867 | 868 | if not args.by_subject: 869 | run_evaluation(all_actions, action_filter) 870 | else: 871 | for subject in all_actions_by_subject.keys(): 872 | print('Evaluating on subject', subject) 873 | run_evaluation(all_actions_by_subject[subject], action_filter) 874 | print('') 875 | --------------------------------------------------------------------------------