├── CogVideo ├── .github │ ├── ISSUE_TEMPLATE │ │ ├── bug_report.yaml │ │ └── feature-request.yaml │ └── PULL_REQUEST_TEMPLATE │ │ └── pr_template.md ├── .gitignore ├── LICENSE ├── MODEL_LICENSE ├── README.md ├── README_ja.md ├── README_zh.md ├── download.sh ├── finetune │ ├── README.md │ ├── README_ja.md │ ├── README_zh.md │ ├── accelerate_config_machine_single.yaml │ ├── accelerate_config_machine_single_debug.yaml │ ├── finetune_single_rank_injector.sh │ ├── finetune_single_rank_lora.sh │ ├── hostfile.txt │ ├── models │ │ ├── attention.py │ │ ├── attention_processor.py │ │ ├── cogvideox_transformer_3d.py │ │ ├── embeddings.py │ │ ├── pipeline_cogvideox.py │ │ ├── pipeline_output.py │ │ └── utils.py │ ├── train_cogvideox_injector.py │ └── train_cogvideox_lora.py ├── inference │ ├── 3dtrajmaster_inference.py │ ├── entity_zoo.txt │ ├── location_zoo.txt │ ├── output_example │ │ ├── 1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.mp4 │ │ ├── 1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.txt │ │ ├── 1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.mp4 │ │ ├── 1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.txt │ │ ├── 1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.mp4 │ │ ├── 1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.txt │ │ ├── 1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.mp4 │ │ ├── 1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.txt │ │ ├── 1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.mp4 │ │ ├── 1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.txt │ │ ├── 1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.mp4 │ │ ├── 1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.txt │ │ ├── 1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.mp4 │ │ ├── 1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.txt │ │ ├── 1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.mp4 │ │ ├── 1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.txt │ │ ├── 1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.mp4 │ │ ├── 1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.txt │ │ ├── 1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.mp4 │ │ ├── 1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.txt │ │ ├── 2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.mp4 │ │ ├── 2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.txt │ │ ├── 2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.mp4 │ │ ├── 2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.txt │ │ ├── 2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.mp4 │ │ ├── 2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.txt │ │ ├── 2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.mp4 │ │ ├── 
2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.txt │ │ ├── 2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.mp4 │ │ ├── 2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.txt │ │ ├── 2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.mp4 │ │ ├── 2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.txt │ │ ├── 2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.mp4 │ │ ├── 2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.txt │ │ ├── 2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.mp4 │ │ ├── 2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.txt │ │ ├── 2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.mp4 │ │ ├── 2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.txt │ │ ├── 2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.mp4 │ │ ├── 2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.txt │ │ ├── 3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.mp4 │ │ ├── 3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.txt │ │ ├── 3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.mp4 │ │ ├── 3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.txt │ │ ├── 3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.mp4 │ │ ├── 3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.txt │ │ ├── 3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.mp4 │ │ ├── 3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.txt │ │ ├── 3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.mp4 │ │ ├── 3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.txt │ │ ├── 3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.mp4 │ │ ├── 3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.txt │ │ ├── 3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.mp4 │ │ ├── 3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic 
landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.txt │ │ ├── 3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.mp4 │ │ ├── 3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.txt │ │ ├── 3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.mp4 │ │ ├── 3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.txt │ │ ├── 3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.mp4 │ │ └── 3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.txt │ └── test_sets.json ├── pyproject.toml ├── requirements.txt ├── tools │ ├── caption │ │ ├── README.md │ │ ├── README_ja.md │ │ ├── README_zh.md │ │ ├── assests │ │ │ ├── CogVLM2-Caption-example.png │ │ │ └── cogvlm2-video-example.png │ │ ├── requirements.txt │ │ └── video_caption.py │ ├── convert_weight_sat2hf.py │ ├── export_sat_lora_weight.py │ ├── llm_flux_cogvideox │ │ ├── generate.sh │ │ ├── gradio_page.py │ │ └── llm_flux_cogvideox.py │ ├── load_cogvideox_lora.py │ ├── parallel_inference │ │ ├── parallel_inference_xdit.py │ │ └── run.sh │ ├── replicate │ │ ├── cog.yaml │ │ ├── predict_i2v.py │ │ └── predict_t2v.py │ └── venhancer │ │ ├── README.md │ │ ├── README_ja.md │ │ └── README_zh.md └── weights │ └── put weights here.txt ├── README.md ├── dataset ├── load_dataset.py ├── traj_vis │ ├── D_loc1_61_t3n13_003d_Hemi12_1.json │ ├── Hemi12_transforms.json │ └── location_data_desert.json ├── utils.py └── vis_trajectory.py ├── eval ├── GVHMR │ ├── .gitignore │ ├── .gitmodules │ ├── LICENSE │ ├── README.md │ ├── docs │ │ ├── INSTALL.md │ │ └── example_video │ │ │ ├── project_teaser.gif │ │ │ └── tennis.mp4 │ ├── download_eval_pose.sh │ ├── eval.sh │ ├── hmr4d │ │ ├── __init__.py │ │ ├── build_gvhmr.py │ │ ├── configs │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ └── mocap │ │ │ │ │ ├── testY.yaml │ │ │ │ │ └── trainX_testY.yaml │ │ │ ├── demo.yaml │ │ │ ├── exp │ │ │ │ └── gvhmr │ │ │ │ │ └── mixed │ │ │ │ │ └── mixed.yaml │ │ │ ├── global │ │ │ │ ├── debug │ │ │ │ │ ├── debug_train.yaml │ │ │ │ │ └── debug_train_limit_data.yaml │ │ │ │ └── task │ │ │ │ │ └── gvhmr │ │ │ │ │ ├── test_3dpw.yaml │ │ │ │ │ ├── test_3dpw_emdb_rich.yaml │ │ │ │ │ ├── test_emdb.yaml │ │ │ │ │ └── test_rich.yaml │ │ │ ├── hydra │ │ │ │ └── default.yaml │ │ │ ├── siga24_release.yaml │ │ │ ├── store_gvhmr.py │ │ │ └── train.yaml │ │ ├── datamodule │ │ │ └── mocap_trainX_testY.py │ │ ├── dataset │ │ │ ├── bedlam │ │ │ │ ├── bedlam.py │ │ │ │ ├── resource │ │ │ │ │ └── vname2lwh.pt │ │ │ │ └── utils.py │ │ │ ├── emdb │ │ │ │ ├── emdb_motion_test.py │ │ │ │ └── utils.py │ │ │ ├── h36m │ │ │ │ ├── camera-parameters.json │ │ │ │ ├── h36m.py │ │ │ │ └── utils.py │ │ │ ├── imgfeat_motion │ │ │ │ └── base_dataset.py │ │ │ ├── pure_motion │ │ │ │ ├── amass.py │ │ │ │ ├── base_dataset.py │ │ │ │ ├── cam_traj_utils.py │ │ │ │ └── utils.py │ │ │ ├── rich │ │ │ │ ├── resource │ │ │ │ │ ├── cam2params.pt │ │ │ │ │ ├── seqname2imgrange.json │ │ │ │ │ ├── test.txt │ │ │ │ │ ├── train.txt │ │ │ │ │ ├── val.txt │ │ │ │ │ └── w2az_sahmr.json │ │ │ │ ├── 
rich_motion_test.py │ │ │ │ └── rich_utils.py │ │ │ └── threedpw │ │ │ │ ├── threedpw_motion_test.py │ │ │ │ ├── threedpw_motion_train.py │ │ │ │ └── utils.py │ │ ├── model │ │ │ ├── common_utils │ │ │ │ ├── optimizer.py │ │ │ │ ├── scheduler.py │ │ │ │ └── scheduler_cfg.py │ │ │ └── gvhmr │ │ │ │ ├── callbacks │ │ │ │ ├── metric_3dpw.py │ │ │ │ ├── metric_emdb.py │ │ │ │ └── metric_rich.py │ │ │ │ ├── gvhmr_pl.py │ │ │ │ ├── gvhmr_pl_demo.py │ │ │ │ ├── pipeline │ │ │ │ └── gvhmr_pipeline.py │ │ │ │ └── utils │ │ │ │ ├── endecoder.py │ │ │ │ ├── postprocess.py │ │ │ │ └── stats_compose.py │ │ ├── network │ │ │ ├── base_arch │ │ │ │ ├── embeddings │ │ │ │ │ └── rotary_embedding.py │ │ │ │ └── transformer │ │ │ │ │ ├── encoder_rope.py │ │ │ │ │ └── layer.py │ │ │ ├── gvhmr │ │ │ │ └── relative_transformer.py │ │ │ └── hmr2 │ │ │ │ ├── __init__.py │ │ │ │ ├── components │ │ │ │ ├── __init__.py │ │ │ │ ├── pose_transformer.py │ │ │ │ └── t_cond_mlp.py │ │ │ │ ├── configs │ │ │ │ ├── __init__.py │ │ │ │ ├── model_config.yaml │ │ │ │ └── smpl_mean_params.npz │ │ │ │ ├── hmr2.py │ │ │ │ ├── smpl_head.py │ │ │ │ ├── utils │ │ │ │ ├── geometry.py │ │ │ │ ├── preproc.py │ │ │ │ └── smpl_wrapper.py │ │ │ │ └── vit.py │ │ └── utils │ │ │ ├── body_model │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── body_model.py │ │ │ ├── body_model_smplh.py │ │ │ ├── body_model_smplx.py │ │ │ ├── coco_aug_dict.pth │ │ │ ├── min_lbs.py │ │ │ ├── seg_part_info.npy │ │ │ ├── smpl_3dpw14_J_regressor_sparse.pt │ │ │ ├── smpl_coco17_J_regressor.pt │ │ │ ├── smpl_lite.py │ │ │ ├── smpl_neutral_J_regressor.pt │ │ │ ├── smpl_vert_segmentation.json │ │ │ ├── smplx2smpl_sparse.pt │ │ │ ├── smplx_lite.py │ │ │ ├── smplx_verts437.pt │ │ │ └── utils.py │ │ │ ├── callbacks │ │ │ ├── lr_monitor.py │ │ │ ├── prog_bar.py │ │ │ ├── simple_ckpt_saver.py │ │ │ └── train_speed_timer.py │ │ │ ├── comm │ │ │ └── gather.py │ │ │ ├── eval │ │ │ └── eval_utils.py │ │ │ ├── geo │ │ │ ├── augment_noisy_pose.py │ │ │ ├── flip_utils.py │ │ │ ├── hmr_cam.py │ │ │ ├── hmr_global.py │ │ │ ├── quaternion.py │ │ │ └── transforms.py │ │ │ ├── geo_transform.py │ │ │ ├── ik │ │ │ └── ccd_ik.py │ │ │ ├── kpts │ │ │ └── kp2d_utils.py │ │ │ ├── matrix.py │ │ │ ├── net_utils.py │ │ │ ├── preproc │ │ │ ├── __init__.py │ │ │ ├── slam.py │ │ │ ├── tracker.py │ │ │ ├── vitfeat_extractor.py │ │ │ ├── vitpose.py │ │ │ └── vitpose_pytorch │ │ │ │ ├── __init__.py │ │ │ │ └── src │ │ │ │ └── vitpose_infer │ │ │ │ ├── __init__.py │ │ │ │ ├── builder │ │ │ │ ├── __init__.py │ │ │ │ ├── backbones │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── alexnet.py │ │ │ │ │ ├── cpm.py │ │ │ │ │ ├── hourglass.py │ │ │ │ │ ├── hourglass_ae.py │ │ │ │ │ ├── hrformer.py │ │ │ │ │ ├── litehrnet.py │ │ │ │ │ ├── mobilenet_v2.py │ │ │ │ │ ├── mobilenet_v3.py │ │ │ │ │ ├── mspn.py │ │ │ │ │ ├── regnet.py │ │ │ │ │ ├── resnest.py │ │ │ │ │ ├── resnext.py │ │ │ │ │ ├── rsn.py │ │ │ │ │ ├── scnet.py │ │ │ │ │ ├── seresnet.py │ │ │ │ │ ├── seresnext.py │ │ │ │ │ ├── shufflenet_v1.py │ │ │ │ │ ├── shufflenet_v2.py │ │ │ │ │ ├── tcn.py │ │ │ │ │ ├── test_torch.py │ │ │ │ │ ├── utils │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── channel_shuffle.py │ │ │ │ │ │ ├── inverted_residual.py │ │ │ │ │ │ ├── make_divisible.py │ │ │ │ │ │ ├── se_layer.py │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── vgg.py │ │ │ │ │ ├── vipnas_mbv3.py │ │ │ │ │ ├── vipnas_resnet.py │ │ │ │ │ └── vit.py │ │ │ │ ├── configs │ │ │ │ │ └── coco │ │ │ │ │ │ ├── ViTPose_base_coco_256x192.py │ │ │ │ │ │ ├── 
ViTPose_base_simple_coco_256x192.py │ │ │ │ │ │ ├── ViTPose_huge_coco_256x192.py │ │ │ │ │ │ ├── ViTPose_huge_simple_coco_256x192.py │ │ │ │ │ │ ├── ViTPose_large_coco_256x192.py │ │ │ │ │ │ ├── ViTPose_large_simple_coco_256x192.py │ │ │ │ │ │ └── __init__.py │ │ │ │ ├── heads │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── deconv_head.py │ │ │ │ │ ├── deeppose_regression_head.py │ │ │ │ │ ├── hmr_head.py │ │ │ │ │ ├── interhand_3d_head.py │ │ │ │ │ ├── temporal_regression_head.py │ │ │ │ │ ├── topdown_heatmap_base_head.py │ │ │ │ │ ├── topdown_heatmap_multi_stage_head.py │ │ │ │ │ ├── topdown_heatmap_simple_head.py │ │ │ │ │ ├── vipnas_heatmap_simple_head.py │ │ │ │ │ └── voxelpose_head.py │ │ │ │ └── model_builder.py │ │ │ │ ├── model_builder.py │ │ │ │ └── pose_utils │ │ │ │ ├── ViTPose_trt.py │ │ │ │ ├── __init__.py │ │ │ │ ├── convert_to_trt.py │ │ │ │ ├── general_utils.py │ │ │ │ ├── inference_test.py │ │ │ │ ├── logger_helper.py │ │ │ │ ├── pose_utils.py │ │ │ │ ├── pose_viz.py │ │ │ │ ├── timerr.py │ │ │ │ └── visualizer.py │ │ │ ├── pylogger.py │ │ │ ├── seq_utils.py │ │ │ ├── smplx_utils.py │ │ │ ├── video_io_utils.py │ │ │ ├── vis │ │ │ ├── README.md │ │ │ ├── cv2_utils.py │ │ │ ├── renderer.py │ │ │ ├── renderer_tools.py │ │ │ ├── renderer_utils.py │ │ │ └── rich_logger.py │ │ │ └── wis3d_utils.py │ ├── pyproject.toml │ ├── pyrightconfig.json │ ├── requirements.txt │ ├── setup.py │ └── tools │ │ ├── demo │ │ ├── colab_demo.ipynb │ │ ├── demo.py │ │ └── demo_folder.py │ │ ├── eval_pose.py │ │ ├── train.py │ │ ├── unitest │ │ ├── make_hydra_cfg.py │ │ └── run_dataset.py │ │ └── video │ │ ├── merge_folder.py │ │ ├── merge_horizontal.py │ │ └── merge_vertical.py └── common_metrics_on_video_quality │ ├── .gitignore │ ├── README.md │ ├── calculate_clip.py │ ├── calculate_fvd.py │ ├── calculate_fvd_styleganv.py │ ├── calculate_lpips.py │ ├── calculate_psnr.py │ ├── calculate_ssim.py │ ├── download_eval_visual.sh │ ├── eval_prompts.json │ └── eval_visual.sh └── imgs ├── logo.png └── vis_objstraj.png /CogVideo/.github/ISSUE_TEMPLATE/bug_report.yaml: -------------------------------------------------------------------------------- 1 | name: "\U0001F41B Bug Report" 2 | description: Submit a bug report to help us improve CogVideoX / 提交一个 Bug 问题报告来帮助我们改进 CogVideoX 开源模型 3 | body: 4 | - type: textarea 5 | id: system-info 6 | attributes: 7 | label: System Info / 系統信息 8 | description: Your operating environment / 您的运行环境信息 9 | placeholder: Includes Cuda version, Diffusers version, Python version, operating system, hardware information (if you suspect a hardware problem)... / 包括Cuda版本,Diffusers,Python版本,操作系统,硬件信息(如果您怀疑是硬件方面的问题)... 10 | validations: 11 | required: true 12 | 13 | - type: checkboxes 14 | id: information-scripts-examples 15 | attributes: 16 | label: Information / 问题信息 17 | description: 'The problem arises when using: / 问题出现在' 18 | options: 19 | - label: "The official example scripts / 官方的示例脚本" 20 | - label: "My own modified scripts / 我自己修改的脚本和任务" 21 | 22 | - type: textarea 23 | id: reproduction 24 | validations: 25 | required: true 26 | attributes: 27 | label: Reproduction / 复现过程 28 | description: | 29 | Please provide a code example that reproduces the problem you encountered, preferably with a minimal reproduction unit. 30 | If you have code snippets, error messages, stack traces, please provide them here as well. 31 | Please format your code correctly using code tags. 
See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 32 | Do not use screenshots, as they are difficult to read and (more importantly) do not allow others to copy and paste your code. 33 | 34 | 请提供能重现您遇到的问题的代码示例,最好是最小复现单元。 35 | 如果您有代码片段、错误信息、堆栈跟踪,也请在此提供。 36 | 请使用代码标签正确格式化您的代码。请参见 https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 37 | 请勿使用截图,因为截图难以阅读,而且(更重要的是)不允许他人复制粘贴您的代码。 38 | placeholder: | 39 | Steps to reproduce the behavior/复现Bug的步骤: 40 | 41 | 1. 42 | 2. 43 | 3. 44 | 45 | - type: textarea 46 | id: expected-behavior 47 | validations: 48 | required: true 49 | attributes: 50 | label: Expected behavior / 期待表现 51 | description: "A clear and concise description of what you would expect to happen. /简单描述您期望发生的事情。" -------------------------------------------------------------------------------- /CogVideo/.github/ISSUE_TEMPLATE/feature-request.yaml: -------------------------------------------------------------------------------- 1 | name: "\U0001F680 Feature request" 2 | description: Submit a request for a new CogVideoX feature / 提交一个新的 CogVideoX开源模型的功能建议 3 | labels: [ "feature" ] 4 | body: 5 | - type: textarea 6 | id: feature-request 7 | validations: 8 | required: true 9 | attributes: 10 | label: Feature request / 功能建议 11 | description: | 12 | A brief description of the proposed feature. Links to corresponding papers and code are desirable. 13 | 对功能建议的简述。最好提供对应的论文和代码链接。 14 | 15 | - type: textarea 16 | id: motivation 17 | validations: 18 | required: true 19 | attributes: 20 | label: Motivation / 动机 21 | description: | 22 | Your motivation for making the suggestion. If that motivation is related to another GitHub issue, link to it here. 23 | 您提出建议的动机。如果该动机与另一个 GitHub 问题有关,请在此处提供对应的链接。 24 | 25 | - type: textarea 26 | id: contribution 27 | validations: 28 | required: true 29 | attributes: 30 | label: Your contribution / 您的贡献 31 | description: | 32 | 33 | Your PR link or any other link where you can help. 34 | 您的PR链接或者其他您能提供帮助的链接。 -------------------------------------------------------------------------------- /CogVideo/.github/PULL_REQUEST_TEMPLATE/pr_template.md: -------------------------------------------------------------------------------- 1 | # Raise valuable PR / 提出有价值的PR 2 | 3 | ## Caution / 注意事项: 4 | Users should keep the following points in mind when submitting PRs: 5 | 6 | 1. Ensure that your code meets the requirements in the [specification](../../resources/contribute.md). 7 | 2. The proposed PR should be focused; if it contains multiple ideas or optimizations, split them into separate PRs. 8 | 9 | 用户在提交PR时候应该注意以下几点: 10 | 11 | 1. 确保您的代码符合 [规范](../../resources/contribute_zh.md) 中的要求。 12 | 2. 提出的PR应该具有针对性,如果具有多个不同的想法和优化方案,应该分配到不同的PR中。 13 | 14 | ## PRs that should not be proposed / 不应该提出的PR 15 | 16 | If a developer proposes a PR that falls into any of the following categories, it may be closed or rejected. 17 | 18 | 1. PRs that do not describe the proposed improvement. 19 | 2. PRs that combine multiple unrelated issues. 20 | 3. PRs that largely duplicate existing PRs. 21 | 22 | 如果开发者提出关于以下方面的PR,则可能会被直接关闭或拒绝通过。 23 | 24 | 1. 没有说明改进方案的。 25 | 2. 多个不同类型的问题合并在一个PR中的。 26 | 3. 提出的PR与已经存在的PR高度重复的。 27 | 28 | 29 | # Check your PR / 检查您的PR 30 | - [ ] Have you read the Contributor Guidelines, Pull Request section? / 您是否阅读了贡献者指南、Pull Request 部分? 31 | - [ ] Has this been discussed/approved via a Github issue or forum? If so, add a link. 
/ 是否通过 Github 问题或论坛讨论/批准过?如果是,请添加链接。 32 | - [ ] Did you make sure you updated the documentation with your changes? Here are the Documentation Guidelines, and here are the Documentation Formatting Tips. /您是否确保根据您的更改更新了文档?这里是文档指南,这里是文档格式化技巧。 33 | - [ ] Did you write new required tests? / 您是否编写了新的必要测试? 34 | - [ ] Are your PRs for only one issue / 您的PR是否仅针对一个问题 -------------------------------------------------------------------------------- /CogVideo/.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__/ 2 | samples*/ 3 | runs/ 4 | checkpoints/ 5 | master_ip 6 | logs/ 7 | *.DS_Store 8 | .idea 9 | output* 10 | test* -------------------------------------------------------------------------------- /CogVideo/download.sh: -------------------------------------------------------------------------------- 1 | mkdir CogVideoX-2b-sat 2 | cd CogVideoX-2b-sat 3 | wget https://cloud.tsinghua.edu.cn/f/fdba7608a49c463ba754/?dl=1 4 | mv 'index.html?dl=1' vae.zip 5 | unzip vae.zip 6 | wget https://cloud.tsinghua.edu.cn/f/556a3e1329e74f1bac45/?dl=1 7 | mv 'index.html?dl=1' transformer.zip 8 | unzip transformer.zip -------------------------------------------------------------------------------- /CogVideo/finetune/accelerate_config_machine_single.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | gradient_accumulation_steps: 1 5 | gradient_clipping: 1.0 6 | offload_optimizer_device: none 7 | offload_param_device: none 8 | zero3_init_flag: false 9 | zero_stage: 2 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | enable_cpu_affinity: false 13 | machine_rank: 0 14 | main_training_function: main 15 | dynamo_backend: 'no' 16 | mixed_precision: 'no' 17 | num_machines: 1 18 | num_processes: 8 19 | rdzv_backend: static 20 | same_network: true 21 | tpu_env: [] 22 | tpu_use_cluster: false 23 | tpu_use_sudo: false 24 | use_cpu: false -------------------------------------------------------------------------------- /CogVideo/finetune/accelerate_config_machine_single_debug.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | gradient_accumulation_steps: 1 5 | gradient_clipping: 1.0 6 | offload_optimizer_device: none 7 | offload_param_device: none 8 | zero3_init_flag: false 9 | zero_stage: 2 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | enable_cpu_affinity: false 13 | machine_rank: 0 14 | main_training_function: main 15 | dynamo_backend: 'no' 16 | mixed_precision: 'no' 17 | num_machines: 1 18 | num_processes: 1 19 | rdzv_backend: static 20 | same_network: true 21 | tpu_env: [] 22 | tpu_use_cluster: false 23 | tpu_use_sudo: false 24 | use_cpu: false -------------------------------------------------------------------------------- /CogVideo/finetune/finetune_single_rank_injector.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MODEL_PATH="/m2v_intern/fuxiao/CogVideo-release/weights/cogvideox-5b" # Change it to CogVideoX-5B path 4 | export TRANSFORMER_PATH="" # Resume from pretrained injector checkpoint 5 | export LORA_PATH="/m2v_intern/fuxiao/CogVideo-release/weights/lora" # Change it to pretrained lora path 6 | export CACHE_PATH="~/.cache" 7 | export DATASET_PATH="/ytech_m2v2_hdd/fuxiao/360Motion-Dataset" # Change it to 360-Motion Dataset path 8 | 
export OUTPUT_PATH="injector" 9 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True 10 | export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7," 11 | 12 | # if you are not using wth 8 gus, change `accelerate_config_machine_single_debug.yaml` num_processes as your gpu number 13 | accelerate launch --config_file accelerate_config_machine_single.yaml --multi_gpu \ 14 | train_cogvideox_injector.py \ 15 | --gradient_checkpointing \ 16 | --pretrained_model_name_or_path $MODEL_PATH \ 17 | --lora_path $LORA_PATH \ 18 | --cache_dir $CACHE_PATH \ 19 | --enable_tiling \ 20 | --enable_slicing \ 21 | --finetune_init \ 22 | --instance_data_root $DATASET_PATH \ 23 | --validation_prompt "a woman with short black wavy hair, lean figure, a green and yellow plaid shirt, dark brown pants, and black suede shoes and a robotic gazelle with a sturdy aluminum frame, an agile build, articulated legs and curved, metallic horns are moving in the city" \ 24 | --validation_prompt_separator ::: \ 25 | --num_validation_videos 1 \ 26 | --validation_epochs 1 \ 27 | --block_interval 2 \ 28 | --seed 42 \ 29 | --lora_scale 1.0 \ 30 | --mixed_precision bf16 \ 31 | --output_dir $OUTPUT_PATH \ 32 | --height 480 \ 33 | --width 720 \ 34 | --fps 8 \ 35 | --max_num_frames 49 \ 36 | --skip_frames_start 0 \ 37 | --skip_frames_end 0 \ 38 | --train_batch_size 1 \ 39 | --num_train_epochs 1000 \ 40 | --checkpointing_steps 4000 \ 41 | --gradient_accumulation_steps 1 \ 42 | --learning_rate 1e-4 \ 43 | --lr_scheduler cosine_with_restarts \ 44 | --lr_warmup_steps 200 \ 45 | --lr_num_cycles 1 \ 46 | --enable_slicing \ 47 | --enable_tiling \ 48 | --gradient_checkpointing \ 49 | --optimizer AdamW \ 50 | --adam_beta1 0.9 \ 51 | --adam_beta2 0.95 \ 52 | --max_grad_norm 1.0 \ 53 | --allow_tf32 \ 54 | --report_to wandb 55 | 56 | # --resume_from_checkpoint $TRANSFORMER_PATH \ -------------------------------------------------------------------------------- /CogVideo/finetune/finetune_single_rank_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MODEL_PATH="/m2v_intern/fuxiao/CogVideo-release/weights/cogvideox-5b" # Change it to CogVideoX-5B path 4 | export CACHE_PATH="~/.cache" 5 | export DATASET_PATH="/ytech_m2v2_hdd/fuxiao/360Motion-Dataset" # Change it to 360-Motion Dataset path 6 | export OUTPUT_PATH="lora" 7 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True 8 | export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7," 9 | 10 | # if you are not using wth 1 gpu, change `accelerate_config_machine_single_debug.yaml` num_processes as your gpu number 11 | accelerate launch --config_file accelerate_config_machine_single.yaml --multi_gpu \ 12 | train_cogvideox_lora.py \ 13 | --gradient_checkpointing \ 14 | --pretrained_model_name_or_path $MODEL_PATH \ 15 | --cache_dir $CACHE_PATH \ 16 | --enable_tiling \ 17 | --enable_slicing \ 18 | --instance_data_root $DATASET_PATH \ 19 | --validation_prompt "a woman with short black wavy hair, lean figure, a green and yellow plaid shirt, dark brown pants, and black suede shoes and a robotic gazelle with a sturdy aluminum frame, an agile build, articulated legs and curved, metallic horns are moving in the city" \ 20 | --validation_prompt_separator ::: \ 21 | --num_validation_videos 1 \ 22 | --validation_epochs 1 \ 23 | --seed 42 \ 24 | --rank 32 \ 25 | --lora_alpha 32 \ 26 | --mixed_precision bf16 \ 27 | --output_dir $OUTPUT_PATH \ 28 | --height 480 \ 29 | --width 720 \ 30 | --fps 8 \ 31 | --max_num_frames 49 \ 32 | --skip_frames_start 0 \ 33 | 
--skip_frames_end 0 \ 34 | --train_batch_size 2 \ 35 | --num_train_epochs 1000 \ 36 | --checkpointing_steps 1000 \ 37 | --gradient_accumulation_steps 1 \ 38 | --learning_rate 3e-4 \ 39 | --lr_scheduler cosine_with_restarts \ 40 | --lr_warmup_steps 200 \ 41 | --lr_num_cycles 1 \ 42 | --enable_slicing \ 43 | --enable_tiling \ 44 | --gradient_checkpointing \ 45 | --optimizer AdamW \ 46 | --adam_beta1 0.9 \ 47 | --adam_beta2 0.95 \ 48 | --max_grad_norm 1.0 \ 49 | --allow_tf32 \ 50 | --report_to wandb 51 | -------------------------------------------------------------------------------- /CogVideo/finetune/hostfile.txt: -------------------------------------------------------------------------------- 1 | node1 slots=8 2 | node2 slots=8 -------------------------------------------------------------------------------- /CogVideo/finetune/models/pipeline_output.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | 5 | from diffusers.utils import BaseOutput 6 | 7 | 8 | @dataclass 9 | class CogVideoXPipelineOutput(BaseOutput): 10 | r""" 11 | Output class for CogVideo pipelines. 12 | 13 | Args: 14 | frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): 15 | List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing 16 | denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape 17 | `(batch_size, num_frames, channels, height, width)`. 18 | """ 19 | 20 | frames: torch.Tensor 21 | -------------------------------------------------------------------------------- /CogVideo/inference/location_zoo.txt: -------------------------------------------------------------------------------- 1 | [ 2 | 'fjord', 3 | 'sunset beach', 4 | 'cave', 5 | 'snowy tundra', 6 | 'prairie', 7 | 'asian town', 8 | 'rainforest', 9 | 'canyon', 10 | 'savanna', 11 | 'urban rooftop garden', 12 | 'swamp', 13 | 'riverbank', 14 | 'coral reef', 15 | 'volcanic landscape', 16 | 'wind farm', 17 | 'town street', 18 | 'night city square', 19 | 'mall lobby', 20 | 'glacier', 21 | 'seaside street', 22 | 'gymnastics room', 23 | 'abandoned factory', 24 | 'autumn forest', 25 | 'mountain village', 26 | 'coastal harbor', 27 | 'ancient ruins', 28 | 'modern metropolis', 29 | 'desert', 30 | 'forest', 31 | 'city', 32 | 'snowy street', 33 | 'park', 34 | ] -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.txt: -------------------------------------------------------------------------------- 1 | D_loc1_541_t1n37_021d_Hemi12_1 2 | a rabbit with a body covered in soft fur, quick hops, and a playful demeanor, showcasing its energy 3 | urban rooftop garden 4 | -------------------------------------------------------------------------------- 
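The prompt files under `inference/output_example/` (like the one just above) follow a simple plain-text layout: the first line is the trajectory/sample ID, each middle line is one entity description, and the last line is the location. As a minimal illustration only (the helper name and returned structure are my own, not part of the repository), such a file could be parsed like this:

```python
from pathlib import Path


def parse_prompt_file(path: str) -> dict:
    """Parse an output_example prompt .txt: line 1 = sample ID,
    last line = location, lines in between = entity descriptions."""
    lines = [ln.strip() for ln in Path(path).read_text(encoding="utf-8").splitlines() if ln.strip()]
    return {"sample_id": lines[0], "entities": lines[1:-1], "location": lines[-1]}


if __name__ == "__main__":
    info = parse_prompt_file(
        "CogVideo/inference/output_example/"
        "1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.txt"
    )
    print(info["sample_id"], "|", info["location"], "|", len(info["entities"]), "entity prompt(s)")
```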
/CogVideo/inference/output_example/1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.txt: -------------------------------------------------------------------------------- 1 | D_loc1_66_t1n36_0042_Hemi12_1 2 | a fire spirit with long, twisting flames resembling flowing red and orange hair, a bright yellow core 3 | park 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.txt: -------------------------------------------------------------------------------- 1 | D_loc1_81_t1n42_0051_Hemi12_1 2 | a pickup truck with rugged dark green paint, extended cab, raised suspension, and a modest cargo bed cover 3 | wind farm 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.txt: -------------------------------------------------------------------------------- 1 | D_loc2_17_t1n8_0011_Hemi12_1 2 | a disaster rescue robot with reinforced limbs, advanced AI, and a rugged body designed to navigate 3 | sunset beach 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.txt: -------------------------------------------------------------------------------- 1 | D_loc2_482_t1n48_01e2_Hemi12_1 2 | a man with short spiky blonde hair, slim build, a black trench coat, blue jeans, and brown hiking 
shoes 3 | riverbank 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.txt: -------------------------------------------------------------------------------- 1 | D_loc3_323_t1n15_0143_Hemi12_1 2 | a cloud creature with billowing white and gray plumes forming a soft, rounded body, wisps of darker fog 3 | coral reef 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.txt: -------------------------------------------------------------------------------- 1 | D_loc3_568_t1n3_0238_Hemi12_1 2 | a woman with long straight black hair, toned build, a blue denim jacket, light gray leggings, and black slip-on shoes 3 | cave 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.txt: -------------------------------------------------------------------------------- 1 | D_loc4_1174_t1n9_0496_Hemi12_1 2 | a polar bear with thick white fur, strong paws, and a black nose, embodying the essence of the Arctic 3 | mall lobby 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.txt: -------------------------------------------------------------------------------- 1 | 
D_loc5_1210_t1n34_04ba_Hemi12_1 2 | a moose with a body covered in thick brown fur, massive antlers, and a bulky frame 3 | rainforest 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.txt: -------------------------------------------------------------------------------- 1 | D_loc5_440_t1n35_01b8_Hemi12_1 2 | a dolphin with sleek grey skin, a curved dorsal fin, and intelligent, playful eyes, reflecting its nature 3 | sunset beach 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.txt: -------------------------------------------------------------------------------- 1 | D_loc1_1276_t2n30_04fc_Hemi12_1 2 | a man with short curly red hair, average build, a black leather jacket, dark blue cargo pants, and white sneakers 3 | a fox with sleek russet fur, a bushy tail tipped with black, and bright green and cunning eyes 4 | sunset beach 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.txt: -------------------------------------------------------------------------------- 1 | D_loc1_806_t2n2_0326_Hemi12_1 2 | a porcupine with a body covered in spiky brown quills, a small nose, and curious eyes 3 | a woman with long straight black hair, toned build, a blue denim jacket, light gray leggings, and black slip-on shoes 4 | coral reef 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop 
garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.txt: -------------------------------------------------------------------------------- 1 | D_loc1_886_t2n25_0376_Hemi12_1 2 | a man with medium-length straight brown hair, tall and slender, a gray crew-neck t-shirt, beige trousers, dark green sneakers 3 | a wolf with thick silver-gray fur, alert golden eyes, and a lean yet strong body, exuding confidence and boldness 4 | urban rooftop garden 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.txt: -------------------------------------------------------------------------------- 1 | D_loc2_1442_t2n36_05a2_Hemi12_1 2 | a storm entity with dark swirling clouds as a body, streaks of electric blue lightning shooting across it 3 | a surveillance drone robot with extendable camera arms, thermal vision, and a stealth black body 4 | swamp 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.txt: -------------------------------------------------------------------------------- 1 | D_loc5_1010_t2n2_03f2_Hemi12_1 2 | a man with short curly red hair, average build, a black leather jacket, dark blue cargo pants, and white sneakers 3 | a woman with long wavy blonde hair, petite figure, a red floral dress, white sandals, and a yellow shoulder bag 4 | mall lobby 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.txt: -------------------------------------------------------------------------------- 1 | D_loc5_1095_t2n37_0447_Hemi12_1 2 | a companion robot with a friendly digital face, a smooth white exterior, and social interaction algorithms 3 | a man with short straight black hair, tall and lean build, a navy blue sweater, khaki shorts, and brown sandals 4 | sunset beach 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.txt: -------------------------------------------------------------------------------- 1 | D_loc5_120_t2n37_0078_Hemi12_1 2 | a compact electric vehicle with a silver finish, aerodynamic profile, and efficient battery 3 | a fox with sleek russet fur, a bushy tail tipped with black, and bright green and cunning eyes 4 | night city square 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.txt: -------------------------------------------------------------------------------- 1 | D_loc5_1290_t2n36_050a_Hemi12_1 2 | a firefighting robot with a water cannon arm, heat sensors, and durable red-and-silver exterior 3 | a penguin with a body covered in smooth black-and-white feathers, short wings, and webbed feet 4 | swamp 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.txt: -------------------------------------------------------------------------------- 1 | D_loc5_1440_t2n35_05a0_Hemi12_1 2 | a fire spirit with long, twisting flames resembling flowing red and orange hair, a bright yellow core 3 | a moose with a body covered in thick brown fur, massive antlers, and a bulky frame 4 | forest 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.txt: -------------------------------------------------------------------------------- 1 | D_loc5_65_t2n23_0041_Hemi12_1 2 | a woman with shoulder-length wavy brown hair, slim build, a green parka, black leggings, and gray hiking boots 3 | a parrot with bright red, blue, and yellow feathers, a curved beak, and sharp intelligent eyes 4 | snowy tundra 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.txt: -------------------------------------------------------------------------------- 1 | D_loc1_1041_t3n22_0411_Hemi12_1 2 | a storm entity with dark swirling clouds as a body, streaks of electric blue lightning shooting across it 3 | a regal lion with a thick, flowing golden mane, sharp brown eyes, and a powerful muscular frame 4 | a man with short straight black hair, tall and lean build, a navy blue sweater, khaki shorts, and brown sandals 5 | swamp 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.txt: -------------------------------------------------------------------------------- 1 | D_loc1_1226_t3n24_04ca_Hemi12_1 2 | a woman with short blonde hair, slim athletic build, a red leather jacket, dark blue jeans, and white sneakers 3 | a private jet with a shiny silver body, elongated wings, a slim nose, and a compact rear stabilizer 4 | a wolf with a body covered in thick silver fur, sharp ears, and piercing yellow eyes, showcasing its alertness 5 | prairie 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.txt: -------------------------------------------------------------------------------- 1 | D_loc1_176_t3n26_00b0_Hemi12_1 2 | a horse with chestnut brown fur, muscular legs, a slim neck, and a flowing mane, exuding strength and grace 3 | a flamingo with a body covered in pink feathers, long slender legs, and a gracefully curved neck 4 | a wolf with thick silver-gray fur, alert golden eyes, and a lean yet strong body, exuding confidence and boldness 5 | abandoned factory 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.txt: -------------------------------------------------------------------------------- 1 | D_loc1_196_t3n32_00c4_Hemi12_1 2 | a man with short spiky blonde hair, slim build, a black trench coat, blue jeans, and brown hiking shoes 3 | a polar bear with thick white fur, 
strong paws, and a black nose, embodying the essence of the Arctic 4 | a deer with sleek tan fur, long slender legs, a graceful neck, and tiny antlers atop its head 5 | desert 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.txt: -------------------------------------------------------------------------------- 1 | D_loc1_536_t3n1_0218_Hemi12_1 2 | a tiger with a pristine white coat marked by bold black stripes, bright blue eyes, and a graceful, poised form 3 | a firefighting robot with a water cannon arm, heat sensors, and durable red-and-silver exterior 4 | a sporty roadster with a convertible top, silver trim, and a powerful engine 5 | snowy street 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.txt: -------------------------------------------------------------------------------- 1 | D_loc2_1287_t3n5_0507_Hemi12_1 2 | a panda with a body covered in fluffy black-and-white fur, a round face, and gentle eyes, radiating warmth 3 | a man with short straight black hair, tall and lean build, a navy blue sweater, khaki shorts, and brown sandals 4 | an industrial welding robot with articulated arms, a laser precision welder, and heat-resistant shields 5 | urban rooftop garden 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.mp4 
-------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.txt: -------------------------------------------------------------------------------- 1 | D_loc2_1392_t3n4_0570_Hemi12_1 2 | a fluttering butterfly with intricate wing patterns, vivid colors, and graceful flight 3 | a man with buzz-cut blonde hair, stocky build, a gray zip-up sweater, black shorts, and red basketball shoes 4 | a giraffe with golden-yellow fur, long legs, a tall slender neck, and patches of brown spots, exuding elegance and calm 5 | volcanic landscape 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.txt: -------------------------------------------------------------------------------- 1 | D_loc3_1473_t3n23_05c1_Hemi12_1 2 | a firefighting robot with a water cannon arm, heat sensors, and durable red-and-silver exterior 3 | a crocodile with a body covered in scaly green skin, a powerful tail, and sharp teeth 4 | a rabbit with a body covered in soft fur, quick hops, and a playful demeanor, showcasing its energy 5 | coastal harbor 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.txt: -------------------------------------------------------------------------------- 1 | D_loc4_849_t3n28_0351_Hemi12_1 2 | a man with short black wavy hair, lean figure, a green and yellow plaid shirt, dark brown pants, and black suede shoes 3 | a sedan with a sleek metallic silver body, long wheelbase, a low-profile hood, and a small rear spoiler 4 | a gazelle with a body covered in sleek tan fur, long legs, and elegant curved horns, showcasing its grace 5 | desert 6 | -------------------------------------------------------------------------------- 
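Each output_example .mp4 above is paired with a .txt prompt of the same name: line 1 repeats the sequence identifier from the file name, each following line describes one entity, and the last line names the location. A minimal parsing sketch under that assumption (the helper `load_example_prompt` is illustrative and not part of the repo):

```python
from pathlib import Path


def load_example_prompt(txt_path: str) -> dict:
    """Illustrative helper (not part of the repo): parse an output_example .txt."""
    lines = [ln.strip() for ln in Path(txt_path).read_text(encoding="utf-8").splitlines() if ln.strip()]
    seq_id, *entities, location = lines  # first line: sequence id, last line: location
    return {"seq_id": seq_id, "entities": entities, "location": location}


# A 3-entity sample such as the desert example above yields
# len(info["entities"]) == 3 and info["location"] == "desert".
```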
/CogVideo/inference/output_example/3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.txt: -------------------------------------------------------------------------------- 1 | D_loc5_865_t3n34_0361_Hemi12_1 2 | a man with a shaved head, broad shoulders, a gray graphic t-shirt, dark jeans, and brown leather boots 3 | a foggy apparition with pale gray wisps drifting together in a soft, undefined form, tiny white sparkles 4 | a jaguar with a golden-yellow coat dotted with intricate black rosettes, deep green eyes, and a muscular build 5 | fjord 6 | -------------------------------------------------------------------------------- /CogVideo/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 119 3 | 4 | [tool.ruff.lint] 5 | # Never enforce `E501` (line length violations). 6 | ignore = ["C901", "E501", "E741", "F402", "F823"] 7 | select = ["C", "E", "F", "I", "W"] 8 | 9 | # Ignore import violations in all `__init__.py` files. 10 | [tool.ruff.lint.per-file-ignores] 11 | "__init__.py" = ["E402", "F401", "F403", "F811"] 12 | 13 | [tool.ruff.lint.isort] 14 | lines-after-imports = 2 15 | 16 | [tool.ruff.format] 17 | # Like Black, use double quotes for strings. 18 | quote-style = "double" 19 | 20 | # Like Black, indent with spaces, rather than tabs. 21 | indent-style = "space" 22 | 23 | # Like Black, respect magic trailing commas. 24 | skip-magic-trailing-comma = false 25 | 26 | # Like Black, automatically detect the appropriate line ending. 27 | line-ending = "auto" 28 | -------------------------------------------------------------------------------- /CogVideo/requirements.txt: -------------------------------------------------------------------------------- 1 | diffusers==0.31.0 2 | accelerate==1.1.1 3 | transformers==4.46.2 4 | numpy==1.26.0 5 | # torch==2.5.0 6 | # torchvision==0.20.0 7 | sentencepiece==0.2.0 8 | SwissArmyTransformer==0.4.12 9 | gradio==5.5.0 10 | imageio==2.35.1 11 | imageio-ffmpeg==0.5.1 12 | openai==1.54.0 13 | moviepy==1.0.3 14 | scikit-video==1.1.11 15 | opencv-python 16 | peft==0.12.0 17 | decord 18 | wandb -------------------------------------------------------------------------------- /CogVideo/tools/caption/README.md: -------------------------------------------------------------------------------- 1 | # Video Caption 2 | 3 | Typically, most video data does not come with corresponding descriptive text, so it is necessary to convert the video 4 | data into textual descriptions to provide the essential training data for text-to-video models. 5 | 6 | ## Update and News 7 | - 🔥🔥 **News**: ```2024/9/19```: The caption model used in the CogVideoX training process to convert video data into text 8 | descriptions, [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption), is now open-source. 
Feel 9 | free to download and use it. 10 | 11 | 12 | ## Video Caption via CogVLM2-Caption 13 | 14 | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-llama3-caption) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-caption/) 15 | 16 | CogVLM2-Caption is a video captioning model used to generate training data for the CogVideoX model. 17 | 18 | ### Install 19 | ```shell 20 | pip install -r requirements.txt 21 | ``` 22 | 23 | ### Usage 24 | 25 | ```shell 26 | python video_caption.py 27 | ``` 28 | 29 | Example: 30 |
<div align="center"> 31 | <img src="./assests/CogVLM2-Caption-example.png"> 32 | </div>
33 | 34 | ## Video Caption via CogVLM2-Video 35 | 36 | [Code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-video-llama3-chat) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat) | 📑 [Blog](https://cogvlm2-video.github.io/) | [💬 Online Demo](http://cogvlm2-online.cogviewai.cn:7868/) 37 | 38 | CogVLM2-Video is a versatile video understanding model equipped with timestamp-based question answering capabilities. 39 | Users can input prompts such as `Please describe this video in detail.` to the model to obtain a detailed video caption: 40 |
<div align="center"> 41 | <img src="./assests/cogvlm2-video-example.png"> 42 | </div>
43 | 44 | Users can use the provided [code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) to load the model or configure a RESTful API to generate video captions. 45 | 46 | ## Citation 47 | 48 | 🌟 If you find our work helpful, please leave us a star and cite our paper. 49 | 50 | CogVLM2-Caption: 51 | ``` 52 | @article{yang2024cogvideox, 53 | title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer}, 54 | author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others}, 55 | journal={arXiv preprint arXiv:2408.06072}, 56 | year={2024} 57 | } 58 | ``` 59 | CogVLM2-Video: 60 | ``` 61 | @article{hong2024cogvlm2, 62 | title={CogVLM2: Visual Language Models for Image and Video Understanding}, 63 | author={Hong, Wenyi and Wang, Weihan and Ding, Ming and Yu, Wenmeng and Lv, Qingsong and Wang, Yan and Cheng, Yean and Huang, Shiyu and Ji, Junhui and Xue, Zhao and others}, 64 | journal={arXiv preprint arXiv:2408.16500}, 65 | year={2024} 66 | } 67 | ``` -------------------------------------------------------------------------------- /CogVideo/tools/caption/README_ja.md: -------------------------------------------------------------------------------- 1 | # ビデオキャプション 2 | 3 | 通常、ほとんどのビデオデータには対応する説明文が付いていないため、ビデオデータをテキストの説明に変換して、テキストからビデオへのモデルに必要なトレーニングデータを提供する必要があります。 4 | 5 | ## 更新とニュース 6 | - 🔥🔥 **ニュース**: ```2024/9/19```:CogVideoX 7 | のトレーニングプロセスで、ビデオデータをテキストに変換するためのキャプションモデル [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption) 8 | がオープンソース化されました。ぜひダウンロードしてご利用ください。 9 | ## CogVLM2-Captionによるビデオキャプション 10 | 11 | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-llama3-caption) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-caption/) 12 | 13 | CogVLM2-Captionは、CogVideoXモデルのトレーニングデータを生成するために使用されるビデオキャプションモデルです。 14 | 15 | ### インストール 16 | ```shell 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ### 使用方法 21 | ```shell 22 | python video_caption.py 23 | ``` 24 | 25 | 例: 26 |
<div align="center"> 27 | <img src="./assests/CogVLM2-Caption-example.png"> 28 | </div>
29 | 30 | 31 | 32 | ## CogVLM2-Video を使用したビデオキャプション 33 | 34 | [Code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-video-llama3-chat) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat) | 📑 [Blog](https://cogvlm2-video.github.io/) | [💬 Online Demo](http://cogvlm2-online.cogviewai.cn:7868/) 35 | 36 | 37 | CogVLM2-Video は、タイムスタンプベースの質問応答機能を備えた多機能なビデオ理解モデルです。ユーザーは `このビデオを詳細に説明してください。` などのプロンプトをモデルに入力して、詳細なビデオキャプションを取得できます: 38 |
<div align="center"> 39 | <img src="./assests/cogvlm2-video-example.png"> 40 | </div>
41 | 42 | ユーザーは提供された[コード](https://github.com/THUDM/CogVLM2/tree/main/video_demo)を使用してモデルをロードするか、RESTful API を構成してビデオキャプションを生成できます。 43 | 44 | ## Citation 45 | 46 | 🌟 If you find our work helpful, please leave us a star and cite our paper. 47 | 48 | CogVLM2-Caption: 49 | ``` 50 | @article{yang2024cogvideox, 51 | title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer}, 52 | author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others}, 53 | journal={arXiv preprint arXiv:2408.06072}, 54 | year={2024} 55 | } 56 | ``` 57 | CogVLM2-Video: 58 | ``` 59 | @article{hong2024cogvlm2, 60 | title={CogVLM2: Visual Language Models for Image and Video Understanding}, 61 | author={Hong, Wenyi and Wang, Weihan and Ding, Ming and Yu, Wenmeng and Lv, Qingsong and Wang, Yan and Cheng, Yean and Huang, Shiyu and Ji, Junhui and Xue, Zhao and others}, 62 | journal={arXiv preprint arXiv:2408.16500}, 63 | year={2024} 64 | } 65 | ``` 66 | -------------------------------------------------------------------------------- /CogVideo/tools/caption/README_zh.md: -------------------------------------------------------------------------------- 1 | # 视频Caption 2 | 3 | 通常,大多数视频数据不带有相应的描述性文本,因此需要将视频数据转换为文本描述,以提供必要的训练数据用于文本到视频模型。 4 | 5 | ## 项目更新 6 | - 🔥🔥 **News**: ```2024/9/19```: CogVideoX 训练过程中用于将视频数据转换为文本描述的 Caption 7 | 模型 [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption) 8 | 已经开源。欢迎前往下载并使用。 9 | 10 | ## 通过 CogVLM2-Caption 模型生成视频Caption 11 | 12 | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-llama3-caption) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-caption/) 13 | 14 | CogVLM2-Caption是用于生成CogVideoX模型训练数据的视频caption模型。 15 | 16 | ### 安装依赖 17 | ```shell 18 | pip install -r requirements.txt 19 | ``` 20 | 21 | ### 运行caption模型 22 | 23 | ```shell 24 | python video_caption.py 25 | ``` 26 | 27 | 示例: 28 |
<div align="center"> 29 | <img src="./assests/CogVLM2-Caption-example.png"> 30 | </div>
31 | 32 | ## 通过 CogVLM2-Video 模型生成视频Caption 33 | 34 | [Code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-video-llama3-chat) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat) | 📑 [Blog](https://cogvlm2-video.github.io/) | [💬 Online Demo](http://cogvlm2-online.cogviewai.cn:7868/) 35 | 36 | CogVLM2-Video 是一个多功能的视频理解模型,具备基于时间戳的问题回答能力。用户可以输入诸如 `Describe this video in detail.` 的提示语给模型,以获得详细的视频Caption: 37 | 38 | 39 |
<div align="center"> 40 | <img src="./assests/cogvlm2-video-example.png"> 41 | </div>
42 | 43 | 用户可以使用提供的[代码](https://github.com/THUDM/CogVLM2/tree/main/video_demo)加载模型或配置 RESTful API 来生成视频Caption。 44 | 45 | 46 | ## Citation 47 | 48 | 🌟 If you find our work helpful, please leave us a star and cite our paper. 49 | 50 | CogVLM2-Caption: 51 | ``` 52 | @article{yang2024cogvideox, 53 | title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer}, 54 | author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others}, 55 | journal={arXiv preprint arXiv:2408.06072}, 56 | year={2024} 57 | } 58 | ``` 59 | CogVLM2-Video: 60 | ``` 61 | @article{hong2024cogvlm2, 62 | title={CogVLM2: Visual Language Models for Image and Video Understanding}, 63 | author={Hong, Wenyi and Wang, Weihan and Ding, Ming and Yu, Wenmeng and Lv, Qingsong and Wang, Yan and Cheng, Yean and Huang, Shiyu and Ji, Junhui and Xue, Zhao and others}, 64 | journal={arXiv preprint arXiv:2408.16500}, 65 | year={2024} 66 | } 67 | ``` -------------------------------------------------------------------------------- /CogVideo/tools/caption/assests/CogVLM2-Caption-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/tools/caption/assests/CogVLM2-Caption-example.png -------------------------------------------------------------------------------- /CogVideo/tools/caption/assests/cogvlm2-video-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/tools/caption/assests/cogvlm2-video-example.png -------------------------------------------------------------------------------- /CogVideo/tools/caption/requirements.txt: -------------------------------------------------------------------------------- 1 | decord>=0.6.0 2 | #根据https://download.pytorch.org/whl/torch/,python版本为[3.8,3.11] 3 | torch==2.1.0 4 | torchvision== 0.16.0 5 | pytorchvideo==0.1.5 6 | xformers 7 | transformers==4.42.4 8 | #git+https://github.com/huggingface/transformers.git 9 | huggingface-hub>=0.23.0 10 | pillow 11 | chainlit>=1.0 12 | pydantic>=2.7.1 13 | timm>=0.9.16 14 | openai>=1.30.1 15 | loguru>=0.7.2 16 | pydantic>=2.7.1 17 | einops 18 | sse-starlette>=2.1.0 19 | flask 20 | gunicorn 21 | gevent 22 | requests 23 | gradio -------------------------------------------------------------------------------- /CogVideo/tools/export_sat_lora_weight.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | import torch 3 | import argparse 4 | from diffusers.loaders.lora_base import LoraBaseMixin 5 | from diffusers.models.modeling_utils import load_state_dict 6 | 7 | 8 | def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]: 9 | state_dict = saved_dict 10 | if "model" in saved_dict.keys(): 11 | state_dict = state_dict["model"] 12 | if "module" in saved_dict.keys(): 13 | state_dict = state_dict["module"] 14 | if "state_dict" in saved_dict.keys(): 15 | state_dict = state_dict["state_dict"] 16 | return state_dict 17 | 18 | LORA_KEYS_RENAME = { 19 | 20 | 'attention.query_key_value.matrix_A.0': 'attn1.to_q.lora_A.weight', 21 | 'attention.query_key_value.matrix_A.1': 'attn1.to_k.lora_A.weight', 22 | 'attention.query_key_value.matrix_A.2': 'attn1.to_v.lora_A.weight', 23 | 
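    # matrix_A.* / matrix_B.* are the two low-rank factors of SAT's fused query_key_value LoRA;
    # the entries above map them to diffusers' attn1.to_{q,k,v} lora_A keys and the entries below to the matching lora_B keys.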
'attention.query_key_value.matrix_B.0': 'attn1.to_q.lora_B.weight', 24 | 'attention.query_key_value.matrix_B.1': 'attn1.to_k.lora_B.weight', 25 | 'attention.query_key_value.matrix_B.2': 'attn1.to_v.lora_B.weight', 26 | 'attention.dense.matrix_A.0': 'attn1.to_out.0.lora_A.weight', 27 | 'attention.dense.matrix_B.0': 'attn1.to_out.0.lora_B.weight' 28 | } 29 | 30 | 31 | 32 | PREFIX_KEY = "model.diffusion_model." 33 | SAT_UNIT_KEY = "layers" 34 | LORA_PREFIX_KEY = "transformer_blocks" 35 | 36 | 37 | 38 | def export_lora_weight(ckpt_path, lora_save_directory): 39 | 40 | merge_original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True)) 41 | 42 | 43 | lora_state_dict = {} 44 | for key in list(merge_original_state_dict.keys()): 45 | new_key = key[len(PREFIX_KEY) :] 46 | for special_key, lora_keys in LORA_KEYS_RENAME.items(): 47 | if new_key.endswith(special_key): 48 | new_key = new_key.replace(special_key, lora_keys) 49 | new_key = new_key.replace(SAT_UNIT_KEY, LORA_PREFIX_KEY) 50 | 51 | lora_state_dict[new_key] = merge_original_state_dict[key] 52 | 53 | 54 | 55 | # final length should be 240 56 | if len(lora_state_dict) != 240: 57 | raise ValueError("lora_state_dict length is not 240") 58 | 59 | 60 | 61 | LoraBaseMixin.write_lora_layers( 62 | state_dict=lora_state_dict, 63 | save_directory=lora_save_directory, 64 | is_main_process=True, 65 | weight_name=None, 66 | save_function=None, 67 | safe_serialization=True 68 | ) 69 | 70 | 71 | def get_args(): 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument( 74 | "--sat_pt_path", type=str, required=True, help="Path to original sat transformer checkpoint" 75 | ) 76 | parser.add_argument("--lora_save_directory", type=str, required=True, help="Path where converted lora should be saved") 77 | return parser.parse_args() 78 | 79 | 80 | if __name__ == "__main__": 81 | args = get_args() 82 | 83 | export_lora_weight(args.sat_pt_path, args.lora_save_directory) 84 | -------------------------------------------------------------------------------- /CogVideo/tools/llm_flux_cogvideox/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NUM_VIDEOS=10 4 | INFERENCE_STEPS=50 5 | GUIDANCE_SCALE=7.0 6 | OUTPUT_DIR_PREFIX="outputs/gpu_" 7 | LOG_DIR_PREFIX="logs/gpu_" 8 | 9 | VIDEO_MODEL_PATH="/share/official_pretrains/hf_home/CogVideoX-5b-I2V" 10 | LLM_MODEL_PATH="/share/home/zyx/Models/Meta-Llama-3.1-8B-Instruct" 11 | IMAGE_MODEL_PATH="/share/home/zyx/Models/FLUX.1-dev" 12 | 13 | #VIDEO_MODEL_PATH="THUDM/CogVideoX-5B-I2V" 14 | #LLM_MODEL_PATH="THUDM/glm-4-9b-chat" 15 | #IMAGE_MODEL_PATH="black-forest-labs/FLUX.1-dev" 16 | 17 | CUDA_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"} 18 | 19 | IFS=',' read -r -a GPU_ARRAY <<< "$CUDA_DEVICES" 20 | 21 | for i in "${!GPU_ARRAY[@]}" 22 | do 23 | GPU=${GPU_ARRAY[$i]} 24 | echo "Starting task on GPU $GPU..."
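  # One background job per visible GPU: output goes to ${OUTPUT_DIR_PREFIX}${GPU} and the log to ${LOG_DIR_PREFIX}${GPU}.log.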
25 | CUDA_VISIBLE_DEVICES=$GPU nohup python3 llm_flux_cogvideox.py \ 26 | --caption_generator_model_id $LLM_MODEL_PATH \ 27 | --image_generator_model_id $IMAGE_MODEL_PATH \ 28 | --model_path $VIDEO_MODEL_PATH \ 29 | --num_videos $NUM_VIDEOS \ 30 | --image_generator_num_inference_steps $INFERENCE_STEPS \ 31 | --guidance_scale $GUIDANCE_SCALE \ 32 | --use_dynamic_cfg \ 33 | --output_dir ${OUTPUT_DIR_PREFIX}${GPU} \ 34 | > ${LOG_DIR_PREFIX}${GPU}.log 2>&1 & 35 | done -------------------------------------------------------------------------------- /CogVideo/tools/parallel_inference/run.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | export PYTHONPATH=$PWD:$PYTHONPATH 4 | 5 | # Select the model type 6 | # The model is downloaded to a specified location on disk, 7 | # or you can simply use the model's ID on Hugging Face, 8 | # which will then be downloaded to the default cache path on Hugging Face. 9 | 10 | export MODEL_TYPE="CogVideoX" 11 | # Configuration for different model types 12 | # script, model_id, inference_step 13 | declare -A MODEL_CONFIGS=( 14 | ["CogVideoX"]="parallel_inference_xdit.py /cfs/dit/CogVideoX-2b 20" 15 | ) 16 | 17 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then 18 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}" 19 | export SCRIPT MODEL_ID INFERENCE_STEP 20 | else 21 | echo "Invalid MODEL_TYPE: $MODEL_TYPE" 22 | exit 1 23 | fi 24 | 25 | mkdir -p ./results 26 | 27 | # task args 28 | if [ "$MODEL_TYPE" = "CogVideoX" ]; then 29 | TASK_ARGS="--height 480 --width 720 --num_frames 9" 30 | fi 31 | 32 | # CogVideoX asserts sp_degree == ulysses_degree*ring_degree <= 2. Also, do not set the pipefusion degree. 33 | if [ "$MODEL_TYPE" = "CogVideoX" ]; then 34 | N_GPUS=4 35 | PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 1" 36 | CFG_ARGS="--use_cfg_parallel" 37 | fi 38 | 39 | 40 | torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \ 41 | --model $MODEL_ID \ 42 | $PARALLEL_ARGS \ 43 | $TASK_ARGS \ 44 | $PIPEFUSION_ARGS \ 45 | $OUTPUT_ARGS \ 46 | --num_inference_steps $INFERENCE_STEP \ 47 | --warmup_steps 0 \ 48 | --prompt "A small dog." 
\ 49 | $CFG_ARGS \ 50 | $PARALLLEL_VAE \ 51 | $COMPILE_FLAG 52 | -------------------------------------------------------------------------------- /CogVideo/tools/replicate/cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://cog.run/yaml 3 | 4 | build: 5 | # set to true if your model requires a GPU 6 | gpu: true 7 | 8 | # a list of ubuntu apt packages to install 9 | system_packages: 10 | - "libgl1-mesa-glx" 11 | - "libglib2.0-0" 12 | 13 | # python version in the form '3.11' or '3.11.4' 14 | python_version: "3.11" 15 | 16 | # a list of packages in the format == 17 | python_packages: 18 | - diffusers>=0.30.3 19 | - accelerate>=0.34.2 20 | - transformers>=4.44.2 21 | - numpy==1.26.0 22 | - torch>=2.4.0 23 | - torchvision>=0.19.0 24 | - sentencepiece>=0.2.0 25 | - SwissArmyTransformer>=0.4.12 26 | - imageio>=2.35.1 27 | - imageio-ffmpeg>=0.5.1 28 | - openai>=1.45.0 29 | - moviepy>=1.0.3 30 | - pillow==9.5.0 31 | - pydantic==1.10.7 32 | run: 33 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget 34 | 35 | # predict.py defines how predictions are run on your model 36 | predict: "predict_t2v.py:Predictor" 37 | # predict: "predict_i2v.py:Predictor" 38 | -------------------------------------------------------------------------------- /CogVideo/tools/replicate/predict_i2v.py: -------------------------------------------------------------------------------- 1 | # Prediction interface for Cog ⚙️ 2 | # https://cog.run/python 3 | 4 | import os 5 | import subprocess 6 | import time 7 | import torch 8 | from diffusers import CogVideoXImageToVideoPipeline 9 | from diffusers.utils import export_to_video, load_image 10 | from cog import BasePredictor, Input, Path 11 | 12 | 13 | MODEL_CACHE = "model_cache_i2v" 14 | MODEL_URL = ( 15 | f"https://weights.replicate.delivery/default/THUDM/CogVideo/{MODEL_CACHE}.tar" 16 | ) 17 | os.environ["HF_DATASETS_OFFLINE"] = "1" 18 | os.environ["TRANSFORMERS_OFFLINE"] = "1" 19 | os.environ["HF_HOME"] = MODEL_CACHE 20 | os.environ["TORCH_HOME"] = MODEL_CACHE 21 | os.environ["HF_DATASETS_CACHE"] = MODEL_CACHE 22 | os.environ["TRANSFORMERS_CACHE"] = MODEL_CACHE 23 | os.environ["HUGGINGFACE_HUB_CACHE"] = MODEL_CACHE 24 | 25 | 26 | def download_weights(url, dest): 27 | start = time.time() 28 | print("downloading url: ", url) 29 | print("downloading to: ", dest) 30 | subprocess.check_call(["pget", "-x", url, dest], close_fds=False) 31 | print("downloading took: ", time.time() - start) 32 | 33 | 34 | class Predictor(BasePredictor): 35 | def setup(self) -> None: 36 | """Load the model into memory to make running multiple predictions efficient""" 37 | 38 | if not os.path.exists(MODEL_CACHE): 39 | download_weights(MODEL_URL, MODEL_CACHE) 40 | 41 | # model_id: THUDM/CogVideoX-5b-I2V 42 | self.pipe = CogVideoXImageToVideoPipeline.from_pretrained( 43 | MODEL_CACHE, torch_dtype=torch.bfloat16 44 | ).to("cuda") 45 | 46 | self.pipe.enable_model_cpu_offload() 47 | self.pipe.vae.enable_tiling() 48 | 49 | def predict( 50 | self, 51 | prompt: str = Input( 52 | description="Input prompt", default="Starry sky slowly rotating." 
53 | ), 54 | image: Path = Input(description="Input image"), 55 | num_inference_steps: int = Input( 56 | description="Number of denoising steps", ge=1, le=500, default=50 57 | ), 58 | guidance_scale: float = Input( 59 | description="Scale for classifier-free guidance", ge=1, le=20, default=6 60 | ), 61 | num_frames: int = Input( 62 | description="Number of frames for the output video", default=49 63 | ), 64 | seed: int = Input( 65 | description="Random seed. Leave blank to randomize the seed", default=None 66 | ), 67 | ) -> Path: 68 | """Run a single prediction on the model""" 69 | 70 | if seed is None: 71 | seed = int.from_bytes(os.urandom(2), "big") 72 | print(f"Using seed: {seed}") 73 | 74 | img = load_image(image=str(image)) 75 | 76 | video = self.pipe( 77 | prompt=prompt, 78 | image=img, 79 | num_videos_per_prompt=1, 80 | num_inference_steps=num_inference_steps, 81 | num_frames=num_frames, 82 | guidance_scale=guidance_scale, 83 | generator=torch.Generator(device="cuda").manual_seed(seed), 84 | ).frames[0] 85 | 86 | out_path = "/tmp/out.mp4" 87 | 88 | export_to_video(video, out_path, fps=8) 89 | return Path(out_path) 90 | -------------------------------------------------------------------------------- /CogVideo/tools/replicate/predict_t2v.py: -------------------------------------------------------------------------------- 1 | # Prediction interface for Cog ⚙️ 2 | # https://cog.run/python 3 | 4 | import os 5 | import subprocess 6 | import time 7 | import torch 8 | from diffusers import CogVideoXPipeline 9 | from diffusers.utils import export_to_video 10 | from cog import BasePredictor, Input, Path 11 | 12 | 13 | MODEL_CACHE = "model_cache" 14 | MODEL_URL = ( 15 | f"https://weights.replicate.delivery/default/THUDM/CogVideo/{MODEL_CACHE}.tar" 16 | ) 17 | os.environ["HF_DATASETS_OFFLINE"] = "1" 18 | os.environ["TRANSFORMERS_OFFLINE"] = "1" 19 | os.environ["HF_HOME"] = MODEL_CACHE 20 | os.environ["TORCH_HOME"] = MODEL_CACHE 21 | os.environ["HF_DATASETS_CACHE"] = MODEL_CACHE 22 | os.environ["TRANSFORMERS_CACHE"] = MODEL_CACHE 23 | os.environ["HUGGINGFACE_HUB_CACHE"] = MODEL_CACHE 24 | 25 | 26 | def download_weights(url, dest): 27 | start = time.time() 28 | print("downloading url: ", url) 29 | print("downloading to: ", dest) 30 | subprocess.check_call(["pget", "-x", url, dest], close_fds=False) 31 | print("downloading took: ", time.time() - start) 32 | 33 | 34 | class Predictor(BasePredictor): 35 | def setup(self) -> None: 36 | """Load the model into memory to make running multiple predictions efficient""" 37 | 38 | if not os.path.exists(MODEL_CACHE): 39 | download_weights(MODEL_URL, MODEL_CACHE) 40 | 41 | # model_id: THUDM/CogVideoX-5b 42 | self.pipe = CogVideoXPipeline.from_pretrained( 43 | MODEL_CACHE, 44 | torch_dtype=torch.bfloat16, 45 | ).to("cuda") 46 | 47 | self.pipe.enable_model_cpu_offload() 48 | self.pipe.vae.enable_tiling() 49 | 50 | def predict( 51 | self, 52 | prompt: str = Input( 53 | description="Input prompt", 54 | default="A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. 
The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance.", 55 | ), 56 | num_inference_steps: int = Input( 57 | description="Number of denoising steps", ge=1, le=500, default=50 58 | ), 59 | guidance_scale: float = Input( 60 | description="Scale for classifier-free guidance", ge=1, le=20, default=6 61 | ), 62 | num_frames: int = Input( 63 | description="Number of frames for the output video", default=49 64 | ), 65 | seed: int = Input( 66 | description="Random seed. Leave blank to randomize the seed", default=None 67 | ), 68 | ) -> Path: 69 | """Run a single prediction on the model""" 70 | 71 | if seed is None: 72 | seed = int.from_bytes(os.urandom(2), "big") 73 | print(f"Using seed: {seed}") 74 | 75 | video = self.pipe( 76 | prompt=prompt, 77 | num_videos_per_prompt=1, 78 | num_inference_steps=num_inference_steps, 79 | num_frames=num_frames, 80 | guidance_scale=guidance_scale, 81 | generator=torch.Generator(device="cuda").manual_seed(seed), 82 | ).frames[0] 83 | 84 | out_path = "/tmp/out.mp4" 85 | 86 | export_to_video(video, out_path, fps=8) 87 | return Path(out_path) 88 | -------------------------------------------------------------------------------- /CogVideo/weights/put weights here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/weights/put weights here.txt -------------------------------------------------------------------------------- /dataset/traj_vis/Hemi12_transforms.json: -------------------------------------------------------------------------------- 1 | { 2 | "C_01_35mm": "[-0.8622445326446021 -0.497817113029644 -0.09334070869305826 0] [0.49999999999999994 -0.8660254037844387 0.0 0] [-0.08083542493543144 -0.04667035434652912 0.9956342260592881 0] [692.820323027551 399.99999999999994 0.0 1]", 3 | "C_02_35mm": "[-0.49781711302964426 -0.862244532644602 -0.09334070869305827 0] [0.8660254037844386 -0.5000000000000002 0.0 0] [-0.04667035434652916 -0.08083542493543144 0.9956342260592881 0] [400.0000000000001 692.8203230275509 0.0 1]", 4 | "C_03_35mm": "[-1.6011019497192098e-16 -0.9956342260592881 -0.09334070869305827 0] [1.0 -1.6081226496766366e-16 0.0 0] [-1.5010330778617594e-17 -0.09334070869305827 0.9956342260592881 0] [4.898587196589413e-14 800.0 0.0 1]", 5 | "C_04_35mm": "[0.49781711302964377 -0.8622445326446022 -0.09334070869305827 0] [0.8660254037844388 0.4999999999999997 0.0 0] [0.04667035434652911 -0.08083542493543147 0.9956342260592881 0] [-399.99999999999983 692.820323027551 0.0 1]", 6 | "C_05_35mm": "[0.8622445326446021 -0.4978171130296439 -0.09334070869305826 0] [0.49999999999999983 0.8660254037844387 0.0 0] [0.08083542493543144 -0.046670354346529115 0.9956342260592881 0] [-692.820323027551 399.99999999999994 0.0 1]", 7 | "C_06_35mm": "[0.9956342260592881 -1.2193002680650596e-16 -0.09334070869305827 0] [1.2246467991473532e-16 1.0 0.0 0] [0.09334070869305827 -1.1430940013109933e-17 0.9956342260592881 0] [-800.0 9.797174393178826e-14 0.0 1]", 8 | "C_07_35mm": "[0.862244532644602 0.49781711302964415 -0.09334070869305827 0] [-0.5000000000000001 0.8660254037844386 0.0 0] [0.08083542493543144 0.04667035434652914 0.9956342260592881 0] [-692.8203230275509 -400.0000000000001 0.0 1]", 9 | "C_08_35mm": "[0.4978171130296444 0.8622445326446019 -0.09334070869305827 0] [-0.8660254037844385 0.5000000000000003 0.0 0] [0.046670354346529164 
0.08083542493543144 0.9956342260592881 0] [-400.00000000000034 -692.8203230275508 0.0 1]", 10 | "C_09_35mm": "[2.820402217784269e-16 0.9956342260592881 -0.09334070869305827 0] [-1.0 2.83276944882399e-16 0.0 0] [2.6441270791727528e-17 0.09334070869305827 0.9956342260592881 0] [-1.4695761589768238e-13 -800.0 0.0 1]", 11 | "C_10_35mm": "[-0.49781711302964426 0.862244532644602 -0.09334070869305827 0] [-0.8660254037844386 -0.5000000000000002 0.0 0] [-0.04667035434652916 0.08083542493543144 0.9956342260592881 0] [400.0000000000001 -692.8203230275509 0.0 1]", 12 | "C_11_35mm": "[-0.8622445326446019 0.4978171130296444 -0.09334070869305827 0] [-0.5000000000000003 -0.8660254037844385 0.0 0] [-0.08083542493543144 0.046670354346529164 0.9956342260592881 0] [692.8203230275507 -400.00000000000034 0.0 1]", 13 | "C_12_35mm": "[-0.9956342260592881 1.2193002680650596e-16 -0.09334070869305827 0] [-1.2246467991473532e-16 -1.0 0.0 0] [-0.09334070869305827 1.1430940013109933e-17 0.9956342260592881 0] [800.0 -1.9594348786357651e-13 0.0 1]" 14 | } -------------------------------------------------------------------------------- /eval/GVHMR/.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third-party/DPVO"] 2 | path = third-party/DPVO 3 | url = https://github.com/princeton-vl/DPVO.git 4 | -------------------------------------------------------------------------------- /eval/GVHMR/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022-2023 3D Vision Group at the State Key Lab of CAD&CG, 2 | Zhejiang University. All Rights Reserved. 3 | 4 | For more information see 5 | If you use this software, please cite the corresponding publications 6 | listed on the above website. 7 | 8 | Permission to use, copy, modify and distribute this software and its 9 | documentation for educational, research and non-profit purposes only. 10 | Any modification based on this work must be open-source and prohibited 11 | for commercial use. 12 | You must retain, in the source form of any derivative works that you 13 | distribute, all copyright, patent, trademark, and attribution notices 14 | from the source form of this work. 15 | 16 | For commercial uses of this software, please send email to xwzhou@zju.edu.cn -------------------------------------------------------------------------------- /eval/GVHMR/README.md: -------------------------------------------------------------------------------- 1 | # GVHMR: World-Grounded Human Motion Recovery via Gravity-View Coordinates 2 | ### [Project Page](https://zju3dv.github.io/gvhmr) | [Paper](https://arxiv.org/abs/2409.06662) 3 | 4 | > World-Grounded Human Motion Recovery via Gravity-View Coordinates 5 | > [Zehong Shen](https://zehongs.github.io/)\*, 6 | [Huaijin Pi](https://phj128.github.io/)\*, 7 | [Yan Xia](https://isshikihugh.github.io/scholar), 8 | [Zhi Cen](https://scholar.google.com/citations?user=Xyy-uFMAAAAJ), 9 | [Sida Peng](https://pengsida.net/), 10 | [Zechen Hu](https://zju3dv.github.io/gvhmr), 11 | [Hujun Bao](http://www.cad.zju.edu.cn/home/bao/), 12 | [Ruizhen Hu](https://csse.szu.edu.cn/staff/ruizhenhu/), 13 | [Xiaowei Zhou](https://xzhou.me/) 14 | > SIGGRAPH Asia 2024 15 | 16 |
<p align="center">
17 |   <img src="docs/example_video/project_teaser.gif" alt="animated" />
18 | </p>
19 | 20 | ## Setup 21 | 22 | Please see [installation](docs/INSTALL.md) for details. 23 | 24 | ## Quick Start 25 | 26 | ### [ Google Colab demo for GVHMR](https://colab.research.google.com/drive/1N9WSchizHv2bfQqkE9Wuiegw_OT7mtGj?usp=sharing) 27 | 28 | ### [ HuggingFace demo for GVHMR](https://huggingface.co/spaces/LittleFrog/GVHMR) 29 | 30 | ### Demo 31 | Demo entries are provided in `tools/demo`. Use `-s` to skip visual odometry if you know the camera is static, otherwise the camera will be estimated by DPVO. 32 | We also provide a script `demo_folder.py` to inference a entire folder. 33 | ```shell 34 | python tools/demo/demo.py --video=docs/example_video/tennis.mp4 -s 35 | python tools/demo/demo_folder.py -f inputs/demo/folder_in -d outputs/demo/folder_out -s 36 | ``` 37 | 38 | ### Reproduce 39 | 1. **Test**: 40 | To reproduce the 3DPW, RICH, and EMDB results in a single run, use the following command: 41 | ```shell 42 | python tools/train.py global/task=gvhmr/test_3dpw_emdb_rich exp=gvhmr/mixed/mixed ckpt_path=inputs/checkpoints/gvhmr/gvhmr_siga24_release.ckpt 43 | ``` 44 | To test individual datasets, change `global/task` to `gvhmr/test_3dpw`, `gvhmr/test_rich`, or `gvhmr/test_emdb`. 45 | 46 | 2. **Train**: 47 | To train the model, use the following command: 48 | ```shell 49 | # The gvhmr_siga24_release.ckpt is trained with 2x4090 for 420 epochs, note that different GPU settings may lead to different results. 50 | python tools/train.py exp=gvhmr/mixed/mixed 51 | ``` 52 | During training, note that we do not employ post-processing as in the test script, so the global metrics results will differ (but should still be good for comparison with baseline methods). 53 | 54 | # Citation 55 | 56 | If you find this code useful for your research, please use the following BibTeX entry. 57 | 58 | ``` 59 | @inproceedings{shen2024gvhmr, 60 | title={World-Grounded Human Motion Recovery via Gravity-View Coordinates}, 61 | author={Shen, Zehong and Pi, Huaijin and Xia, Yan and Cen, Zhi and Peng, Sida and Hu, Zechen and Bao, Hujun and Hu, Ruizhen and Zhou, Xiaowei}, 62 | booktitle={SIGGRAPH Asia Conference Proceedings}, 63 | year={2024} 64 | } 65 | ``` 66 | 67 | # Acknowledgement 68 | 69 | We thank the authors of 70 | [WHAM](https://github.com/yohanshin/WHAM), 71 | [4D-Humans](https://github.com/shubham-goel/4D-Humans), 72 | and [ViTPose-Pytorch](https://github.com/gpastal24/ViTPose-Pytorch) for their great works, without which our project/code would not be possible. 73 | -------------------------------------------------------------------------------- /eval/GVHMR/docs/INSTALL.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | ## Environment 4 | 5 | ```bash 6 | git clone https://github.com/zju3dv/GVHMR --recursive 7 | cd GVHMR 8 | 9 | conda create -y -n gvhmr python=3.10 10 | conda activate gvhmr 11 | pip install -r requirements.txt 12 | pip install -e . 13 | # to install gvhmr in other repo as editable, try adding "python.analysis.extraPaths": ["path/to/your/package"] to settings.json 14 | 15 | # DPVO 16 | cd third-party/DPVO 17 | wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.zip 18 | unzip eigen-3.4.0.zip -d thirdparty && rm -rf eigen-3.4.0.zip 19 | pip install torch-scatter -f "https://data.pyg.org/whl/torch-2.3.0+cu121.html" 20 | pip install numba pypose 21 | export CUDA_HOME=/usr/local/cuda-12.1/ 22 | export PATH=$PATH:/usr/local/cuda-12.1/bin/ 23 | pip install -e . 
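# Note: the CUDA_HOME/PATH exports above assume a CUDA 12.1 toolkit; point them at the locally installed CUDA version if it differs.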
24 | ``` 25 | 26 | ## Inputs & Outputs 27 | 28 | ```bash 29 | mkdir inputs 30 | mkdir outputs 31 | ``` 32 | 33 | **Weights** 34 | 35 | ```bash 36 | mkdir -p inputs/checkpoints 37 | 38 | # 1. You need to sign up for downloading [SMPL](https://smpl.is.tue.mpg.de/) and [SMPLX](https://smpl-x.is.tue.mpg.de/). And the checkpoints should be placed in the following structure: 39 | 40 | inputs/checkpoints/ 41 | ├── body_models/smplx/ 42 | │ └── SMPLX_{GENDER}.npz # SMPLX (We predict SMPLX params + evaluation) 43 | └── body_models/smpl/ 44 | └── SMPL_{GENDER}.pkl # SMPL (rendering and evaluation) 45 | 46 | # 2. Download other pretrained models from Google-Drive (By downloading, you agree to the corresponding licences): https://drive.google.com/drive/folders/1eebJ13FUEXrKBawHpJroW0sNSxLjh9xD?usp=drive_link 47 | 48 | inputs/checkpoints/ 49 | ├── dpvo/ 50 | │ └── dpvo.pth 51 | ├── gvhmr/ 52 | │ └── gvhmr_siga24_release.ckpt 53 | ├── hmr2/ 54 | │ └── epoch=10-step=25000.ckpt 55 | ├── vitpose/ 56 | │ └── vitpose-h-multi-coco.pth 57 | └── yolo/ 58 | └── yolov8x.pt 59 | ``` 60 | 61 | **Data** 62 | 63 | We provide preprocessed data for training and evaluation. 64 | Note that we do not intend to distribute the original datasets, and you need to download them (annotation, videos, etc.) from the original websites. 65 | *We're unable to provide the original data due to the license restrictions.* 66 | By downloading the preprocessed data, you agree to the original dataset's terms of use and use the data for research purposes only. 67 | 68 | You can download them from [Google-Drive](https://drive.google.com/drive/folders/10sEef1V_tULzddFxzCmDUpsIqfv7eP-P?usp=drive_link). Please place them in the "inputs" folder and execute the following commands: 69 | 70 | ```bash 71 | cd inputs 72 | # Train 73 | tar -xzvf AMASS_hmr4d_support.tar.gz 74 | tar -xzvf BEDLAM_hmr4d_support.tar.gz 75 | tar -xzvf H36M_hmr4d_support.tar.gz 76 | # Test 77 | tar -xzvf 3DPW_hmr4d_support.tar.gz 78 | tar -xzvf EMDB_hmr4d_support.tar.gz 79 | tar -xzvf RICH_hmr4d_support.tar.gz 80 | 81 | # The folder structure should be like this: 82 | inputs/ 83 | ├── AMASS/hmr4d_support/ 84 | ├── BEDLAM/hmr4d_support/ 85 | ├── H36M/hmr4d_support/ 86 | ├── 3DPW/hmr4d_support/ 87 | ├── EMDB/hmr4d_support/ 88 | └── RICH/hmr4d_support/ 89 | ``` 90 | -------------------------------------------------------------------------------- /eval/GVHMR/docs/example_video/project_teaser.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/docs/example_video/project_teaser.gif -------------------------------------------------------------------------------- /eval/GVHMR/docs/example_video/tennis.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/docs/example_video/tennis.mp4 -------------------------------------------------------------------------------- /eval/GVHMR/download_eval_pose.sh: -------------------------------------------------------------------------------- 1 | gdown https://drive.google.com/uc\?id\=1jMH2-ZC0ZBgtqej5Sp-E5ebBIX7mk3Xz 2 | gdown https://drive.google.com/uc\?id\=1iFcPSlcKb_rDNJ85UPoThdl22BqR2Xgh 3 | 4 | unzip eval_sets.zip 5 | rm -rf eval_sets.zip -------------------------------------------------------------------------------- /eval/GVHMR/eval.sh: 
-------------------------------------------------------------------------------- 1 | python tools/demo/demo_folder.py -f eval_sets -d outputs/eval_sets_gvhmr -s 2 | python tools/eval_pose.py -f outputs/eval_sets_gvhmr_v2 3 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | PROJ_ROOT = Path(__file__).resolve().parents[1] 5 | 6 | 7 | def os_chdir_to_proj_root(): 8 | """useful for running notebooks in different directories.""" 9 | os.chdir(PROJ_ROOT) 10 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/build_gvhmr.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | from hmr4d import PROJ_ROOT 3 | from hydra.utils import instantiate 4 | from hmr4d.model.gvhmr.gvhmr_pl_demo import DemoPL 5 | 6 | 7 | def build_gvhmr_demo(): 8 | cfg = OmegaConf.load(PROJ_ROOT / "hmr4d/configs/demo_gvhmr_model/siga24_release.yaml") 9 | gvhmr_demo_pl: DemoPL = instantiate(cfg.model, _recursive_=False) 10 | gvhmr_demo_pl.load_pretrained_model(PROJ_ROOT / "inputs/checkpoints/gvhmr/gvhmr_siga24_release.ckpt") 11 | return gvhmr_demo_pl.eval() 12 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from hydra.core.config_store import ConfigStore 3 | from hydra_zen import builds 4 | 5 | import argparse 6 | from hydra import compose, initialize_config_module 7 | import os 8 | 9 | os.environ["HYDRA_FULL_ERROR"] = "1" 10 | 11 | MainStore = ConfigStore.instance() 12 | 13 | 14 | def register_store_gvhmr(): 15 | """Register group options to MainStore""" 16 | from . import store_gvhmr 17 | 18 | 19 | def parse_args_to_cfg(): 20 | """ 21 | Use minimal Hydra API to parse args and return cfg. 22 | This function don't do _run_hydra which create log file hierarchy. 
23 | """ 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--config-name", "-cn", default="train") 26 | parser.add_argument( 27 | "overrides", 28 | nargs="*", 29 | help="Any key=value arguments to override config values (use dots for.nested=overrides)", 30 | ) 31 | args = parser.parse_args() 32 | 33 | # Cfg 34 | with initialize_config_module(version_base="1.3", config_module=f"hmr4d.configs"): 35 | cfg = compose(config_name=args.config_name, overrides=args.overrides) 36 | 37 | return cfg 38 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/data/mocap/testY.yaml: -------------------------------------------------------------------------------- 1 | # definition of lightning datamodule (dataset + dataloader) 2 | _target_: hmr4d.datamodule.mocap_trainX_testY.DataModule 3 | 4 | dataset_opts: 5 | test: ${test_datasets} 6 | 7 | loader_opts: 8 | test: 9 | batch_size: 1 10 | num_workers: 0 11 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/data/mocap/trainX_testY.yaml: -------------------------------------------------------------------------------- 1 | # definition of lightning datamodule (dataset + dataloader) 2 | _target_: hmr4d.datamodule.mocap_trainX_testY.DataModule 3 | 4 | dataset_opts: 5 | train: ${train_datasets} 6 | val: ${test_datasets} 7 | 8 | loader_opts: 9 | train: 10 | batch_size: 32 11 | num_workers: 8 12 | val: 13 | batch_size: 1 14 | num_workers: 1 15 | 16 | limit_each_trainset: null -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/demo.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - model: gvhmr/gvhmr_pl_demo 4 | - network: gvhmr/relative_transformer 5 | - endecoder: gvhmr/v1_amass_local_bedlam_cam 6 | 7 | pipeline: 8 | _target_: hmr4d.model.gvhmr.pipeline.gvhmr_pipeline.Pipeline 9 | args_denoiser3d: ${network} 10 | args: 11 | endecoder_opt: ${endecoder} 12 | normalize_cam_angvel: True 13 | weights: null 14 | static_conf: null 15 | 16 | ckpt_path: inputs/checkpoints/gvhmr/gvhmr_siga24_release.ckpt 17 | 18 | # ================================ # 19 | # global setting # 20 | # ================================ # 21 | 22 | video_name: ??? 
23 | output_root: outputs/demo 24 | output_dir: "${output_root}/${video_name}" 25 | preprocess_dir: ${output_dir}/preprocess 26 | video_path: "${output_dir}/0_input_video.mp4" 27 | 28 | # Options 29 | static_cam: False 30 | verbose: False 31 | 32 | paths: 33 | bbx: ${preprocess_dir}/bbx.pt 34 | bbx_xyxy_video_overlay: ${preprocess_dir}/bbx_xyxy_video_overlay.mp4 35 | vit_features: ${preprocess_dir}/vit_features.pt 36 | vitpose: ${preprocess_dir}/vitpose.pt 37 | vitpose_video_overlay: ${preprocess_dir}/vitpose_video_overlay.mp4 38 | hmr4d_results: ${output_dir}/hmr4d_results.pt 39 | incam_video: ${output_dir}/1_incam.mp4 40 | global_video: ${output_dir}/2_global.mp4 41 | incam_global_horiz_video: ${output_dir}/${video_name}_3_incam_global_horiz.mp4 42 | slam: ${preprocess_dir}/slam_results.pt 43 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/exp/gvhmr/mixed/mixed.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /data: mocap/trainX_testY 4 | - override /model: gvhmr/gvhmr_pl 5 | - override /endecoder: gvhmr/v1_amass_local_bedlam_cam 6 | - override /optimizer: adamw_2e-4 7 | - override /scheduler_cfg: epoch_half_200_350 8 | - override /train_datasets: 9 | - pure_motion_amass/v11 10 | - imgfeat_bedlam/v2 11 | - imgfeat_h36m/v1 12 | - imgfeat_3dpw/v1 13 | - override /test_datasets: 14 | - emdb1/v1_fliptest 15 | - emdb2/v1_fliptest 16 | - rich/all 17 | - 3dpw/fliptest 18 | - override /callbacks: 19 | - simple_ckpt_saver/every10e_top100 20 | - prog_bar/prog_reporter_every0.1 21 | - train_speed_timer/base 22 | - lr_monitor/pl 23 | - metric_emdb1 24 | - metric_emdb2 25 | - metric_rich 26 | - metric_3dpw 27 | - override /network: gvhmr/relative_transformer 28 | 29 | exp_name_base: mixed 30 | exp_name_var: "" 31 | exp_name: ${exp_name_base}${exp_name_var} 32 | data_name: mocap_mixed_v1 33 | 34 | pipeline: 35 | _target_: hmr4d.model.gvhmr.pipeline.gvhmr_pipeline.Pipeline 36 | args_denoiser3d: ${network} 37 | args: 38 | endecoder_opt: ${endecoder} 39 | normalize_cam_angvel: True 40 | weights: 41 | cr_j3d: 500. 42 | transl_c: 1. 43 | cr_verts: 500. 44 | j2d: 1000. 45 | verts2d: 1000. 46 | 47 | transl_w: 1. 48 | static_conf_bce: 1. 
49 | 50 | static_conf: 51 | vel_thr: 0.15 52 | 53 | data: 54 | loader_opts: 55 | train: 56 | batch_size: 128 57 | num_workers: 12 58 | 59 | pl_trainer: 60 | precision: 16-mixed 61 | log_every_n_steps: 50 62 | gradient_clip_val: 0.5 63 | max_epochs: 500 64 | check_val_every_n_epoch: 10 65 | devices: 2 66 | 67 | logger: 68 | _target_: pytorch_lightning.loggers.TensorBoardLogger 69 | save_dir: ${output_dir} # /save_dir/name/version/sub_dir 70 | name: "" 71 | version: "tb" # merge name and version 72 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/global/debug/debug_train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | data_name: debug 4 | exp_name: debug 5 | 6 | # data: 7 | # limit_each_trainset: 40 8 | # loader_opts: 9 | # train: 10 | # batch_size: 4 11 | # num_workers: 0 12 | # val: 13 | # batch_size: 1 14 | # num_workers: 0 15 | 16 | pl_trainer: 17 | limit_train_batches: 32 18 | limit_val_batches: 2 19 | check_val_every_n_epoch: 3 20 | enable_checkpointing: False 21 | devices: 1 22 | 23 | callbacks: 24 | model_checkpoint: null 25 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/global/debug/debug_train_limit_data.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | data_name: debug 4 | exp_name: debug 5 | 6 | data: 7 | limit_each_trainset: 40 8 | loader_opts: 9 | train: 10 | batch_size: 4 11 | num_workers: 0 12 | val: 13 | batch_size: 1 14 | num_workers: 0 15 | 16 | pl_trainer: 17 | limit_val_batches: 2 18 | check_val_every_n_epoch: 3 19 | enable_checkpointing: False 20 | devices: 1 21 | 22 | callbacks: 23 | model_checkpoint: null 24 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_3dpw.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /data: mocap/testY 4 | - override /test_datasets: 5 | - 3dpw/fliptest 6 | - override /callbacks: 7 | - metric_3dpw 8 | - _self_ 9 | 10 | task: test 11 | data_name: test_mocap 12 | ckpt_path: ??? # will not override previous setting if already set 13 | 14 | # lightning utilities 15 | pl_trainer: 16 | devices: 1 17 | logger: null 18 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_3dpw_emdb_rich.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /data: mocap/testY 4 | - override /test_datasets: 5 | - rich/all 6 | - emdb1/v1_fliptest 7 | - emdb2/v1_fliptest 8 | - 3dpw/fliptest 9 | - override /callbacks: 10 | - metric_rich 11 | - metric_emdb1 12 | - metric_emdb2 13 | - metric_3dpw 14 | - _self_ 15 | 16 | task: test 17 | data_name: test_mocap 18 | ckpt_path: ??? 
# will not override previous setting if already set 19 | 20 | # lightning utilities 21 | pl_trainer: 22 | devices: 1 23 | logger: null 24 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_emdb.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /data: mocap/testY 4 | - override /test_datasets: 5 | - emdb1/v1_fliptest 6 | - emdb2/v1_fliptest 7 | - override /callbacks: 8 | - metric_emdb1 9 | - metric_emdb2 10 | - _self_ 11 | 12 | task: test 13 | data_name: test_mocap 14 | ckpt_path: ??? # will not override previous setting if already set 15 | 16 | # lightning utilities 17 | pl_trainer: 18 | devices: 1 19 | logger: null 20 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_rich.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /data: mocap/testY 4 | - override /test_datasets: 5 | - rich/all 6 | - override /callbacks: 7 | - metric_rich 8 | - _self_ 9 | 10 | task: test 11 | data_name: test_mocap 12 | ckpt_path: ??? # will not override previous setting if already set 13 | 14 | # lightning utilities 15 | pl_trainer: 16 | devices: 1 17 | logger: null 18 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # enable color logging 2 | defaults: 3 | - override hydra_logging: colorlog 4 | - override job_logging: colorlog 5 | 6 | job_logging: 7 | formatters: 8 | simple: 9 | datefmt: '%m/%d %H:%M:%S' 10 | format: '[%(asctime)s][%(levelname)s] %(message)s' 11 | colorlog: 12 | datefmt: '%m/%d %H:%M:%S' 13 | format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] %(message)s' 14 | handlers: 15 | file: 16 | filename: ${output_dir}/${hydra.job.name}.log 17 | 18 | run: 19 | dir: ${output_dir} -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/siga24_release.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | _target_: hmr4d.model.gvhmr.pipeline.gvhmr_pipeline.Pipeline 3 | args_denoiser3d: ${network} 4 | args: 5 | endecoder_opt: ${endecoder} 6 | normalize_cam_angvel: true 7 | weights: null 8 | static_conf: null 9 | model: 10 | _target_: hmr4d.model.gvhmr.gvhmr_pl_demo.DemoPL 11 | pipeline: ${pipeline} 12 | network: 13 | _target_: hmr4d.network.gvhmr.relative_transformer.NetworkEncoderRoPEV2 14 | output_dim: 151 15 | max_len: 120 16 | kp2d_mapping: linear_v2 17 | cliffcam_dim: 3 18 | cam_angvel_dim: 6 19 | imgseq_dim: 1024 20 | f_imgseq_filter: null 21 | cond_ver: v1 22 | latent_dim: 512 23 | num_layers: 12 24 | num_heads: 8 25 | mlp_ratio: 4.0 26 | pred_cam_ver: v2 27 | pred_cam_dim: 3 28 | static_conf_dim: 6 29 | pred_coco17_dim: 0 30 | dropout: 0.1 31 | avgbeta: true 32 | endecoder: 33 | _target_: hmr4d.model.gvhmr.utils.endecoder.EnDecoder 34 | stats_name: MM_V1_AMASS_LOCAL_BEDLAM_CAM 35 | noise_pose_k: 10 36 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/store_gvhmr.py: -------------------------------------------------------------------------------- 1 | # Dataset 2 | import hmr4d.dataset.pure_motion.amass 3 | import 
hmr4d.dataset.emdb.emdb_motion_test 4 | import hmr4d.dataset.rich.rich_motion_test 5 | import hmr4d.dataset.threedpw.threedpw_motion_test 6 | import hmr4d.dataset.threedpw.threedpw_motion_train 7 | import hmr4d.dataset.bedlam.bedlam 8 | import hmr4d.dataset.h36m.h36m 9 | 10 | # Trainer: Model Optimizer Loss 11 | import hmr4d.model.gvhmr.gvhmr_pl 12 | import hmr4d.model.gvhmr.utils.endecoder 13 | import hmr4d.model.common_utils.optimizer 14 | import hmr4d.model.common_utils.scheduler_cfg 15 | 16 | # Metric 17 | import hmr4d.model.gvhmr.callbacks.metric_emdb 18 | import hmr4d.model.gvhmr.callbacks.metric_rich 19 | import hmr4d.model.gvhmr.callbacks.metric_3dpw 20 | 21 | 22 | # PL Callbacks 23 | import hmr4d.utils.callbacks.simple_ckpt_saver 24 | import hmr4d.utils.callbacks.train_speed_timer 25 | import hmr4d.utils.callbacks.prog_bar 26 | import hmr4d.utils.callbacks.lr_monitor 27 | 28 | # Networks 29 | import hmr4d.network.gvhmr.relative_transformer 30 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/train.yaml: -------------------------------------------------------------------------------- 1 | # ================================ # 2 | # override # 3 | # ================================ # 4 | # specify default configuration; the order determines the override order 5 | defaults: 6 | - _self_ 7 | # pytorch-lightning 8 | - data: ??? 9 | - model: ??? 10 | - callbacks: null 11 | 12 | # system 13 | - hydra: default 14 | 15 | # utility groups that changes a lot 16 | - pipeline: null 17 | - network: null 18 | - optimizer: null 19 | - scheduler_cfg: default 20 | - train_datasets: null 21 | - test_datasets: null 22 | - endecoder: null # normalize/unnormalize data 23 | - refiner: null 24 | 25 | # global-override 26 | - exp: ??? # set "data, model and callbacks" in yaml 27 | - global/task: null # dump/test 28 | - global/hsearch: null # hyper-param search 29 | - global/debug: null # debug mode 30 | 31 | # ================================ # 32 | # global setting # 33 | # ================================ # 34 | # expirement information 35 | task: fit # [fit, predict] 36 | exp_name: ??? 37 | data_name: ??? 
38 | 39 | # utilities in the entry file 40 | output_dir: "outputs/${data_name}/${exp_name}" 41 | ckpt_path: null 42 | resume_mode: null 43 | seed: 42 44 | 45 | # lightning default settings 46 | pl_trainer: 47 | devices: 1 48 | num_sanity_val_steps: 0 # disable sanity check 49 | precision: 32 50 | inference_mode: False 51 | 52 | logger: null 53 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/bedlam/resource/vname2lwh.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/dataset/bedlam/resource/vname2lwh.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/bedlam/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from pathlib import Path 4 | 5 | resource_dir = Path(__file__).parent / "resource" 6 | 7 | 8 | def mid2vname(mid): 9 | """vname = {scene}/{seq}, Note that it ends with .mp4""" 10 | # mid example: "inputs/bedlam/bedlam_download/20221011_1_250_batch01hand_closeup_suburb_a/mp4/seq_000001.mp4-rp_emma_posed_008" 11 | # -> vname: 20221011_1_250_batch01hand_closeup_suburb_a/seq_000001.mp4 12 | scene = mid.split("/")[-3] 13 | seq = mid.split("/")[-1].split("-")[0] 14 | vname = f"{scene}/{seq}" 15 | return vname 16 | 17 | 18 | def mid2featname(mid): 19 | """featname = {scene}/{seqsubj}, Note that it ends with .pt (extra)""" 20 | # mid example: "inputs/bedlam/bedlam_download/20221011_1_250_batch01hand_closeup_suburb_a/mp4/seq_000001.mp4-rp_emma_posed_008" 21 | # -> featname: 20221011_1_250_batch01hand_closeup_suburb_a/seq_000001.mp4-rp_emma_posed_008.pt 22 | scene = mid.split("/")[-3] 23 | seqsubj = mid.split("/")[-1] 24 | featname = f"{scene}/{seqsubj}.pt" 25 | return featname 26 | 27 | 28 | def featname2mid(featname): 29 | """reverse func of mid2featname, Note that it removes .pt (extra)""" 30 | # featname example: 20221011_1_250_batch01hand_closeup_suburb_a/seq_000001.mp4-rp_emma_posed_008.pt 31 | # -> mid: inputs/bedlam/bedlam_download/20221011_1_250_batch01hand_closeup_suburb_a/mp4/seq_000001.mp4-rp_emma_posed_008 32 | scene = featname.split("/")[0] 33 | seqsubj = featname.split("/")[1].strip(".pt") 34 | mid = f"inputs/bedlam/bedlam_download/{scene}/mp4/{seqsubj}" 35 | return mid 36 | 37 | 38 | def load_vname2lwh(): 39 | return torch.load(resource_dir / "vname2lwh.pt") 40 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/h36m/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | from pathlib import Path 4 | from collections import defaultdict 5 | import pickle 6 | import torch 7 | 8 | RESOURCE_FOLDER = Path(__file__).resolve().parent / "resource" 9 | 10 | camera_idx_to_name = {0: "54138969", 1: "55011271", 2: "58860488", 3: "60457274"} 11 | 12 | 13 | def get_vid(pkl_path, cam_id): 14 | """.../S6/Posing 1.pkl, 54138969 -> S6@Posing_1@54138969""" 15 | sub_id, fn = pkl_path.split("/")[-2:] 16 | vid = f"{sub_id}@{fn.split('.')[0].replace(' ', '_')}@{cam_id}" 17 | return vid 18 | 19 | 20 | def get_raw_pkl_paths(h36m_raw_root): 21 | smpl_param_dir = h36m_raw_root / "neutrSMPL_H3.6" 22 | pkl_paths = [] 23 | for train_sub in ["S1", "S5", "S6", "S7", "S8"]: 24 | for pth in (smpl_param_dir / train_sub).glob("*.pkl"): 25 | 
if "aligned" not in str(pth): # Use world sequence only 26 | pkl_paths.append(str(pth)) 27 | 28 | return pkl_paths 29 | 30 | 31 | def get_cam_KRts(): 32 | """ 33 | Returns: 34 | Ks (torch.Tensor): {cam_id: 3x3} 35 | Rts (torch.Tensor): {subj_id: {cam_id: 4x4}} 36 | """ 37 | # this file is copied from https://github.com/karfly/human36m-camera-parameters 38 | cameras_path = RESOURCE_FOLDER / "camera-parameters.json" 39 | with open(cameras_path, "r") as f: 40 | cameras = json.load(f) 41 | 42 | # 4 camera ids: '54138969', '55011271', '58860488', '60457274' 43 | Ks = {} 44 | for cam in cameras["intrinsics"]: 45 | Ks[cam] = torch.tensor(cameras["intrinsics"][cam]["calibration_matrix"]).float() 46 | 47 | # extrinsics 48 | extrinsics = cameras["extrinsics"] 49 | Rts = defaultdict(dict) 50 | for subj in extrinsics: 51 | for cam in extrinsics[subj]: 52 | Rt = torch.eye(4) 53 | Rt[:3, :3] = torch.tensor(extrinsics[subj][cam]["R"]) 54 | Rt[:3, [3]] = torch.tensor(extrinsics[subj][cam]["t"]) / 1000 55 | Rts[subj][cam] = Rt.float() 56 | 57 | return Ks, Rts 58 | 59 | 60 | def parse_raw_pkl(pkl_path, to_50hz=True): 61 | """ 62 | raw_pkl @ 200Hz, where video @ 50Hz. 63 | the frames should be divided by 4, and mannually align with the video. 64 | """ 65 | with open(str(pkl_path), "rb") as f: 66 | data = pickle.load(f, encoding="bytes") 67 | poses = torch.from_numpy(data[b"poses"]).float() 68 | betas = torch.from_numpy(data[b"betas"]).float() 69 | trans = torch.from_numpy(data[b"trans"]).float() 70 | assert poses.shape[0] == trans.shape[0] 71 | if to_50hz: 72 | poses = poses[::4] 73 | trans = trans[::4] 74 | 75 | seq_length = poses.shape[0] # 50FPS 76 | smpl_params = { 77 | "body_pose": poses[:, 3:], 78 | "betas": betas[None].expand(seq_length, -1), 79 | "global_orient": poses[:, :3], 80 | "transl": trans, 81 | } 82 | return smpl_params 83 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/imgfeat_motion/base_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils import data 3 | import numpy as np 4 | from pathlib import Path 5 | from hmr4d.utils.pylogger import Log 6 | 7 | 8 | class ImgfeatMotionDatasetBase(data.Dataset): 9 | def __init__(self): 10 | super().__init__() 11 | self._load_dataset() 12 | self._get_idx2meta() # -> Set self.idx2meta 13 | 14 | def __len__(self): 15 | return len(self.idx2meta) 16 | 17 | def _load_dataset(self): 18 | raise NotImplemented 19 | 20 | def _get_idx2meta(self): 21 | raise NotImplemented 22 | 23 | def _load_data(self, idx): 24 | raise NotImplemented 25 | 26 | def _process_data(self, data, idx): 27 | raise NotImplemented 28 | 29 | def __getitem__(self, idx): 30 | data = self._load_data(idx) 31 | data = self._process_data(data, idx) 32 | return data 33 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/pure_motion/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from pytorch3d.transforms import ( 4 | axis_angle_to_matrix, 5 | matrix_to_axis_angle, 6 | matrix_to_rotation_6d, 7 | rotation_6d_to_matrix, 8 | ) 9 | from einops import rearrange 10 | 11 | 12 | def aa_to_r6d(x): 13 | return matrix_to_rotation_6d(axis_angle_to_matrix(x)) 14 | 15 | 16 | def r6d_to_aa(x): 17 | return matrix_to_axis_angle(rotation_6d_to_matrix(x)) 18 | 19 | 20 | def interpolate_smpl_params(smpl_params, tgt_len): 21 | 
""" 22 | smpl_params['body_pose'] (L, 63) 23 | tgt_len: L->L' 24 | """ 25 | betas = smpl_params["betas"] 26 | body_pose = smpl_params["body_pose"] 27 | global_orient = smpl_params["global_orient"] # (L, 3) 28 | transl = smpl_params["transl"] # (L, 3) 29 | 30 | # Interpolate 31 | body_pose = rearrange(aa_to_r6d(body_pose.reshape(-1, 21, 3)), "l j c -> c j l") 32 | body_pose = F.interpolate(body_pose, tgt_len, mode="linear", align_corners=True) 33 | body_pose = r6d_to_aa(rearrange(body_pose, "c j l -> l j c")).reshape(-1, 63) 34 | 35 | # although this should be the same as above, we do it for consistency 36 | betas = rearrange(betas, "l c -> c 1 l") 37 | betas = F.interpolate(betas, tgt_len, mode="linear", align_corners=True) 38 | betas = rearrange(betas, "c 1 l -> l c") 39 | 40 | global_orient = rearrange(aa_to_r6d(global_orient.reshape(-1, 1, 3)), "l j c -> c j l") 41 | global_orient = F.interpolate(global_orient, tgt_len, mode="linear", align_corners=True) 42 | global_orient = r6d_to_aa(rearrange(global_orient, "c j l -> l j c")).reshape(-1, 3) 43 | 44 | transl = rearrange(transl, "l c -> c 1 l") 45 | transl = F.interpolate(transl, tgt_len, mode="linear", align_corners=True) 46 | transl = rearrange(transl, "c 1 l -> l c") 47 | 48 | return {"body_pose": body_pose, "betas": betas, "global_orient": global_orient, "transl": transl} 49 | 50 | 51 | def rotate_around_axis(global_orient, transl, axis="y"): 52 | """Global coordinate augmentation. Random rotation around y-axis""" 53 | angle = torch.rand(1) * 2 * torch.pi 54 | if axis == "y": 55 | aa = torch.tensor([0.0, angle, 0.0]).float().unsqueeze(0) 56 | rmat = axis_angle_to_matrix(aa) 57 | 58 | global_orient = matrix_to_axis_angle(rmat @ axis_angle_to_matrix(global_orient)) 59 | transl = (rmat.squeeze(0) @ transl.T).T 60 | return global_orient, transl 61 | 62 | 63 | def augment_betas(betas, std=0.1): 64 | noise = torch.normal(mean=torch.zeros(10), std=torch.ones(10) * std) 65 | betas_aug = betas + noise[None] 66 | return betas_aug 67 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/rich/resource/cam2params.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/dataset/rich/resource/cam2params.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/rich/resource/val.txt: -------------------------------------------------------------------------------- 1 | sequence_name capture_name scan_name id moving_cam gender scene action/scene-interaction subjects view_id 2 | ParkingLot1_002_stretching2 ParkingLot1 scan_camcoord 002 X male V V V 0,1,2,3,4,5,6,7 3 | ParkingLot1_002_burpee1 ParkingLot1 scan_camcoord 002 X male V V V 0,1,2,3,4,5,6,7 4 | ParkingLot1_002_burpee2 ParkingLot1 scan_camcoord 002 X male V V V 0,1,2,3,4,5,6,7 5 | ParkingLot1_004_pushup1 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7 6 | ParkingLot1_004_eating2 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7 7 | ParkingLot1_004_phonetalk2 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7 8 | ParkingLot1_004_takingphotos2 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7 9 | ParkingLot1_004_stretching2 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7 10 | ParkingLot1_005_overfence2 ParkingLot1 scan_camcoord 005 X male V V V 0,1,2,3,4,5,6,7 11 | ParkingLot1_005_pushup1 ParkingLot1 
scan_camcoord 005 X male V V V 0,1,2,3,4,5,6,7 12 | ParkingLot1_005_burpeejump1 ParkingLot1 scan_camcoord 005 X male V V V 0,1,2,3,4,5,6,7 13 | ParkingLot1_007_burpee2 ParkingLot1 scan_camcoord 007 X male V V V 0,1,2,3,4,5,6,7 14 | ParkingLot2_008_eating2 ParkingLot2 scan_camcoord 008 V male V V V 0,1,2,3,4,5 15 | ParkingLot2_008_burpeejump2 ParkingLot2 scan_camcoord 008 V male V V V 0,1,2,3,4,5 16 | ParkingLot2_014_overfence1 ParkingLot2 scan_camcoord 014 X male V V V 0,1,2,3,4,5 17 | ParkingLot2_014_eating2 ParkingLot2 scan_camcoord 014 X male V V V 0,1,2,3,4,5 18 | ParkingLot2_016_phonetalk5 ParkingLot2 scan_camcoord 016 V female V V V 0,1,2,3,4,5 19 | Pavallion_002_sidebalancerun Pavallion scan_camcoord 002 V male V V V 0,1,2,3,4,5,6 20 | Pavallion_013_sidebalancerun Pavallion scan_camcoord 013 X female V V V 0,1,2,3,4,5,6 21 | Pavallion_018_sidebalancerun Pavallion scan_camcoord 018 V female V V V 0,1,2,3,4,5,6 22 | LectureHall_018_wipingtable1 LectureHall scan_chair_scene_camcoord 018 X female V V V 0,2,4,5,6 23 | LectureHall_020_wipingchairs1 LectureHall scan_chair_scene_camcoord 020 X male V V V 0,1,2,3,4,5,6 24 | LectureHall_003_wipingchairs1 LectureHall scan_chair_scene_camcoord 003 X male V V V 0,1,2,3,4,5,6 25 | Pavallion_000_yoga1 Pavallion scan_camcoord 000 X male V X V 0,1,2,3,4,5,6 26 | Pavallion_002_yoga1 Pavallion scan_camcoord 002 V male V X V 0,1,2,3,4,5,6 27 | Pavallion_003_yoga1 Pavallion scan_camcoord 003 V male V X V 0,1,2,3,4,5,6 28 | Pavallion_006_yoga1 Pavallion scan_camcoord 006 V male V X V 0,1,2,3,4,5,6 29 | Pavallion_018_yoga1 Pavallion scan_camcoord 018 V female V X V 0,1,2,3,4,5,6 -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/rich/resource/w2az_sahmr.json: -------------------------------------------------------------------------------- 1 | {"BBQ_scan_camcoord": [[0.9989829107564298, 0.03367618890797693, -0.029984301180211045, 0.0008183751635392625], [0.03414262169451401, -0.1305975871406019, 0.9908473906797644, -0.005059823133706893], [0.02945208652127451, -0.9908633531086326, -0.13161455111748036, 1.4054905296083466], [0.0, 0.0, 0.0, 1.0]], "Gym_scan_camcoord": [[0.9932599733260449, -0.07628732032461205, 0.0872632233306122, -0.047601130084306706], [-0.10233962102690007, -0.22374853741942266, 0.9692590953768503, -0.04091804681182174], [-0.05441716049582774, -0.9716567484252654, -0.23004768176013274, 1.537911791136788], [0.0, 0.0, 0.0, 1.0]], "Gym_scan_table_camcoord": [[0.9974451989415423, -0.06250743213795668, 0.03458172980064169, 0.02231858470834599], [-0.04804912583358893, -0.22882402250236075, 0.972281259838159, 0.039081886755815726], [-0.05286167435026744, -0.9714588965331274, -0.2312428501197992, 1.5421821446346522], [0.0, 0.0, 0.0, 1.0]], "LectureHall_scan_chair_scene_camcoord": [[0.9992930513998263, 0.030087515976743376, -0.0225419343977731, 0.001998908749589632], [0.030705594681969043, -0.30721111058653017, 0.9511458878570781, -0.025811963513866963], [0.021692484396004613, -0.9511656401040444, -0.307917783192506, 2.060346184503773], [0.0, 0.0, 0.0, 1.0]], "LectureHall_scan_yoga_scene_camcoord": [[0.9993358324246812, 0.03030060260429296, -0.020242715082476024, -0.003510046042036605], [0.028600729415016745, -0.3079667078507395, 0.9509671419836329, -0.01748548118379142], [0.022580795137075255, -0.9509144968594153, -0.3086287856852993, 2.0424701474796567], [0.0, 0.0, 0.0, 1.0]], "ParkingLot1_scan_camcoord": [[0.9989627324729327, -0.03724260727951709, 0.02620013994738054, 
0.0070941466745699025], [-0.03091587075252664, -0.13228243926883107, 0.9907298144280939, -0.0274920377236923], [-0.03343154297742938, -0.9905121627037764, -0.13329661462331338, 1.3859200914120975], [0.0, 0.0, 0.0, 1.0]], "ParkingLot2_scan_camcoord": [[0.9989532636786039, -0.04044665659892979, 0.021364572447267097, 0.01646827411554571], [-0.026687287930043047, -0.13600581518076985, 0.9903485279940424, 0.030197722289598695], [-0.03715058073335097, -0.9898820567153364, -0.13694286452455984, 1.4372015171546513], [0.0, 0.0, 0.0, 1.0]], "Pavallion_scan_camcoord": [[0.9971864096076799, 0.05693557331723671, -0.048760690979605295, 0.0012478238054067193], [0.05746407703876882, -0.16289761936471214, 0.9849681443861059, -0.006002953831755452], [0.04813672552068054, -0.9849988355812122, -0.16571104235928033, 1.7638454838942128], [0.0, 0.0, 0.0, 1.0]]} -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/threedpw/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | from pathlib import Path 4 | from collections import defaultdict 5 | import pickle 6 | import torch 7 | import joblib 8 | 9 | RESOURCE_FOLDER = Path(__file__).resolve().parent / "resource" 10 | 11 | 12 | def read_raw_pkl(pkl_path): 13 | with open(pkl_path, "rb") as f: 14 | data = pickle.load(f, encoding="bytes") 15 | 16 | num_subjects = len(data[b"poses"]) 17 | F = data[b"poses"][0].shape[0] 18 | smpl_params = [] 19 | for i in range(num_subjects): 20 | smpl_params.append( 21 | { 22 | "body_pose": torch.from_numpy(data[b"poses"][i][:, 3:72]).float(), # (F, 69) 23 | "betas": torch.from_numpy(data[b"betas"][i][:10]).repeat(F, 1).float(), # (F, 10) 24 | "global_orient": torch.from_numpy(data[b"poses"][i][:, :3]).float(), # (F, 3) 25 | "transl": torch.from_numpy(data[b"trans"][i]).float(), # (F, 3) 26 | } 27 | ) 28 | genders = ["male" if g == "m" else "female" for g in data[b"genders"]] 29 | campose_valid = [torch.from_numpy(v).bool() for v in data[b"campose_valid"]] 30 | 31 | seq_name = data[b"sequence"] 32 | K_fullimg = torch.from_numpy(data[b"cam_intrinsics"]).float() 33 | T_w2c = torch.from_numpy(data[b"cam_poses"]).float() 34 | 35 | return_data = { 36 | "sequence": seq_name, # 'courtyard_bodyScannerMotions_00' 37 | "K_fullimg": K_fullimg, # (3, 3), not 55FoV 38 | "T_w2c": T_w2c, # (F, 4, 4) 39 | "smpl_params": smpl_params, # list of dict 40 | "genders": genders, # list of str 41 | "campose_valid": campose_valid, # list of bool-array 42 | # "jointPositions": data[b'jointPositions'], # SMPL, 24x3 43 | # "poses2d": data[b"poses2d"], # COCO, 3x18(?) 
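        # T_w2c holds per-frame world-to-camera extrinsics; together with K_fullimg it lets
        # the world-frame SMPL parameters above be projected into each frame of the moving camera.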
44 | } 45 | return return_data 46 | 47 | 48 | def load_and_convert_wham_pth(pth): 49 | """ 50 | Convert to {vid: DataDict} style, Add smpl_params_incam 51 | """ 52 | # load 53 | wham_labels_raw = joblib.load(pth) 54 | # convert it to {vid: DataDict} style 55 | wham_labels = {} 56 | for i, vid in enumerate(wham_labels_raw["vid"]): 57 | wham_labels[vid] = {k: wham_labels_raw[k][i] for k in wham_labels_raw} 58 | 59 | # convert pose and betas as smpl_params_incam (without transl) 60 | for vid in wham_labels: 61 | pose = wham_labels[vid]["pose"] 62 | global_orient = pose[:, :3] # (F, 3) 63 | body_pose = pose[:, 3:] # (F, 69) 64 | betas = wham_labels[vid]["betas"] # (F, 10), all frames are the same 65 | wham_labels[vid]["smpl_params_incam"] = { 66 | "body_pose": body_pose.float(), # (F, 69) 67 | "betas": betas.float(), # (F, 10) 68 | "global_orient": global_orient.float(), # (F, 3) 69 | } 70 | 71 | return wham_labels 72 | 73 | 74 | # Neural-Annot utils 75 | 76 | 77 | def na_cam_param_to_K_fullimg(cam_param): 78 | K = torch.eye(3) 79 | K[[0, 1], [0, 1]] = torch.tensor(cam_param["focal"]) 80 | K[[0, 1], [2, 2]] = torch.tensor(cam_param["princpt"]) 81 | return K 82 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/model/common_utils/optimizer.py: -------------------------------------------------------------------------------- 1 | from torch.optim import AdamW, Adam 2 | from hmr4d.configs import MainStore, builds 3 | 4 | 5 | optimizer_cfgs = { 6 | "adam_1e-3": builds(Adam, lr=1e-3, zen_partial=True), 7 | "adam_2e-4": builds(Adam, lr=2e-4, zen_partial=True), 8 | "adamw_2e-4": builds(AdamW, lr=2e-4, zen_partial=True), 9 | "adamw_1e-4": builds(AdamW, lr=1e-4, zen_partial=True), 10 | "adamw_5e-5": builds(AdamW, lr=5e-5, zen_partial=True), 11 | "adamw_1e-5": builds(AdamW, lr=1e-5, zen_partial=True), 12 | # zero-shot text-to-image generation 13 | "adamw_1e-3_dalle": builds(AdamW, lr=1e-3, weight_decay=1e-4, zen_partial=True), 14 | } 15 | 16 | for name, cfg in optimizer_cfgs.items(): 17 | MainStore.store(name=name, node=cfg, group=f"optimizer") 18 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/model/common_utils/scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from bisect import bisect_right 3 | 4 | 5 | class WarmupMultiStepLR(torch.optim.lr_scheduler.LRScheduler): 6 | def __init__(self, optimizer, milestones, warmup=0, gamma=0.1, last_epoch=-1, verbose="deprecated"): 7 | """Assume optimizer does not change lr; Scheduler is called epoch-based""" 8 | self.milestones = milestones 9 | self.warmup = warmup 10 | assert warmup < milestones[0] 11 | self.gamma = gamma 12 | super().__init__(optimizer, last_epoch, verbose) 13 | 14 | def get_lr(self): 15 | base_lrs = self.base_lrs # base lr for each groups 16 | n_groups = len(base_lrs) 17 | comming_epoch = self.last_epoch # the lr will be set for the comming epoch, starts from 0 18 | 19 | # add extra warmup 20 | if comming_epoch < self.warmup: 21 | # e.g. comming_epoch [0, 1, 2] for warmup == 3 22 | # lr should be base_lr * (last_epoch+1) / (warmup + 1), e.g. 
[0.25, 0.5, 0.75] * base_lr 23 | lr_factor = (self.last_epoch + 1) / (self.warmup + 1) 24 | return [base_lrs[i] * lr_factor for i in range(n_groups)] 25 | else: 26 | # bisect_right([3,5,7], 0) -> 0; bisect_right([3,5,7], 5) -> 2 27 | p = bisect_right(self.milestones, comming_epoch) 28 | lr_factor = self.gamma**p 29 | return [base_lrs[i] * lr_factor for i in range(n_groups)] 30 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/model/common_utils/scheduler_cfg.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig, ListConfig 2 | from hmr4d.configs import MainStore, builds 3 | 4 | # do not perform scheduling 5 | default = DictConfig({"scheduler": None}) 6 | MainStore.store(name="default", node=default, group=f"scheduler_cfg") 7 | 8 | 9 | # epoch-based 10 | def epoch_half_by(milestones=[100, 200, 300]): 11 | return DictConfig( 12 | { 13 | "scheduler": { 14 | "_target_": "torch.optim.lr_scheduler.MultiStepLR", 15 | "milestones": milestones, 16 | "gamma": 0.5, 17 | }, 18 | "interval": "epoch", 19 | "frequency": 1, 20 | } 21 | ) 22 | 23 | 24 | MainStore.store(name="epoch_half_100_200_300", node=epoch_half_by([100, 200, 300]), group=f"scheduler_cfg") 25 | MainStore.store(name="epoch_half_100_200", node=epoch_half_by([100, 200]), group=f"scheduler_cfg") 26 | MainStore.store(name="epoch_half_200_350", node=epoch_half_by([200, 350]), group=f"scheduler_cfg") 27 | MainStore.store(name="epoch_half_300", node=epoch_half_by([300]), group=f"scheduler_cfg") 28 | 29 | 30 | # epoch-based 31 | def warmup_epoch_half_by(warmup=10, milestones=[100, 200, 300]): 32 | return DictConfig( 33 | { 34 | "scheduler": { 35 | "_target_": "hmr4d.model.common_utils.scheduler.WarmupMultiStepLR", 36 | "milestones": milestones, 37 | "warmup": warmup, 38 | "gamma": 0.5, 39 | }, 40 | "interval": "epoch", 41 | "frequency": 1, 42 | } 43 | ) 44 | 45 | 46 | MainStore.store(name="warmup_5_epoch_half_200_350", node=warmup_epoch_half_by(5, [200, 350]), group=f"scheduler_cfg") 47 | MainStore.store(name="warmup_10_epoch_half_200_350", node=warmup_epoch_half_by(10, [200, 350]), group=f"scheduler_cfg") 48 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/model/gvhmr/gvhmr_pl_demo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from hydra.utils import instantiate 4 | from hmr4d.utils.pylogger import Log 5 | from hmr4d.configs import MainStore, builds 6 | 7 | from hmr4d.utils.geo.hmr_cam import normalize_kp2d 8 | 9 | 10 | class DemoPL(pl.LightningModule): 11 | def __init__(self, pipeline): 12 | super().__init__() 13 | self.pipeline = instantiate(pipeline, _recursive_=False) 14 | 15 | @torch.no_grad() 16 | def predict(self, data, static_cam=False): 17 | """auto add batch dim 18 | data: { 19 | "length": int, or Torch.Tensor, 20 | "kp2d": (F, 3) 21 | "bbx_xys": (F, 3) 22 | "K_fullimg": (F, 3, 3) 23 | "cam_angvel": (F, 3) 24 | "f_imgseq": (F, 3, 256, 256) 25 | } 26 | 27 | """ 28 | # ROPE inference 29 | batch = { 30 | "length": data["length"][None], 31 | "obs": normalize_kp2d(data["kp2d"], data["bbx_xys"])[None], 32 | "bbx_xys": data["bbx_xys"][None], 33 | "K_fullimg": data["K_fullimg"][None], 34 | "cam_angvel": data["cam_angvel"][None], 35 | "f_imgseq": data["f_imgseq"][None], 36 | } 37 | batch = {k: v.cuda() for k, v in batch.items()} 38 | outputs = self.pipeline.forward(batch, 
train=False, postproc=True, static_cam=static_cam) 39 | 40 | pred = { 41 | "smpl_params_global": {k: v[0] for k, v in outputs["pred_smpl_params_global"].items()}, 42 | "smpl_params_incam": {k: v[0] for k, v in outputs["pred_smpl_params_incam"].items()}, 43 | "K_fullimg": data["K_fullimg"], 44 | "net_outputs": outputs, # intermediate outputs 45 | } 46 | return pred 47 | 48 | def load_pretrained_model(self, ckpt_path): 49 | """Load pretrained checkpoint, and assign each weight to the corresponding part.""" 50 | Log.info(f"[PL-Trainer] Loading ckpt type: {ckpt_path}") 51 | 52 | state_dict = torch.load(ckpt_path, "cpu")["state_dict"] 53 | missing, unexpected = self.load_state_dict(state_dict, strict=False) 54 | if len(missing) > 0: 55 | Log.warn(f"Missing keys: {missing}") 56 | if len(unexpected) > 0: 57 | Log.warn(f"Unexpected keys: {unexpected}") 58 | 59 | 60 | MainStore.store(name="gvhmr_pl_demo", node=builds(DemoPL, pipeline="${pipeline}"), group="model/gvhmr") 61 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/base_arch/embeddings/rotary_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from einops import repeat, rearrange 4 | from torch.cuda.amp import autocast 5 | 6 | 7 | def rotate_half(x): 8 | x = rearrange(x, "... (d r) -> ... d r", r=2) 9 | x1, x2 = x.unbind(dim=-1) 10 | x = torch.stack((-x2, x1), dim=-1) 11 | return rearrange(x, "... d r -> ... (d r)") 12 | 13 | 14 | @autocast(enabled=False) 15 | def apply_rotary_emb(freqs, t, start_index=0, scale=1.0, seq_dim=-2): 16 | if t.ndim == 3: 17 | seq_len = t.shape[seq_dim] 18 | freqs = freqs[-seq_len:].to(t) 19 | 20 | rot_dim = freqs.shape[-1] 21 | end_index = start_index + rot_dim 22 | 23 | assert ( 24 | rot_dim <= t.shape[-1] 25 | ), f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}" 26 | 27 | t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:] 28 | t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale) 29 | return torch.cat((t_left, t, t_right), dim=-1) 30 | 31 | 32 | def get_encoding(d_model, max_seq_len=4096): 33 | """Return: (L, D)""" 34 | t = torch.arange(max_seq_len).float() 35 | freqs = 1.0 / (10000 ** (torch.arange(0, d_model, 2).float() / d_model)) 36 | freqs = torch.einsum("i, j -> i j", t, freqs) 37 | freqs = repeat(freqs, "i j -> i (j r)", r=2) 38 | return freqs 39 | 40 | 41 | class ROPE(nn.Module): 42 | """Minimal impl of a lang-style positional encoding.""" 43 | 44 | def __init__(self, d_model, max_seq_len=4096): 45 | super().__init__() 46 | self.d_model = d_model 47 | self.max_seq_len = max_seq_len 48 | 49 | # Pre-cache a freqs tensor 50 | encoding = get_encoding(d_model, max_seq_len) 51 | self.register_buffer("encoding", encoding, False) 52 | 53 | def rotate_queries_or_keys(self, x): 54 | """ 55 | Args: 56 | x : (B, H, L, D) 57 | Returns: 58 | rotated_x: (B, H, L, D) 59 | """ 60 | 61 | seq_len, d_model = x.shape[-2:] 62 | assert d_model == self.d_model 63 | 64 | # encoding: (L, D)s 65 | if seq_len > self.max_seq_len: 66 | encoding = get_encoding(d_model, seq_len).to(x) 67 | else: 68 | encoding = self.encoding[:seq_len] 69 | 70 | # encoding: (L, D) 71 | # x: (B, H, L, D) 72 | rotated_x = apply_rotary_emb(encoding, x, seq_dim=-2) 73 | 74 | return rotated_x 75 | -------------------------------------------------------------------------------- 
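A minimal sketch of how this rotary embedding is consumed downstream (the import path follows the file location above; tensor sizes are illustrative):

import torch
from hmr4d.network.base_arch.embeddings.rotary_embedding import ROPE

rope = ROPE(d_model=64, max_seq_len=128)
q = torch.randn(2, 8, 16, 64)  # (B, H, L, head_dim)
k = torch.randn(2, 8, 16, 64)
q_rot = rope.rotate_queries_or_keys(q)
k_rot = rope.rotate_queries_or_keys(k)
# After rotation, the attention logits depend on the relative offset (i - j) rather than on
# absolute positions, which is the property RoPEAttention in the next file relies on.
logits = torch.einsum("bhic,bhjc->bhij", q_rot, k_rot) / 8.0  # sqrt(head_dim) = 8

--------------------------------------------------------------------------------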
/eval/GVHMR/hmr4d/network/base_arch/transformer/encoder_rope.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | from timm.models.vision_transformer import Mlp 6 | from typing import Optional, Tuple 7 | from einops import einsum, rearrange, repeat 8 | from hmr4d.network.base_arch.embeddings.rotary_embedding import ROPE 9 | 10 | 11 | class RoPEAttention(nn.Module): 12 | def __init__(self, embed_dim, num_heads, dropout=0.1): 13 | super().__init__() 14 | self.embed_dim = embed_dim 15 | self.num_heads = num_heads 16 | self.head_dim = embed_dim // num_heads 17 | 18 | self.rope = ROPE(self.head_dim, max_seq_len=4096) 19 | 20 | self.query = nn.Linear(embed_dim, embed_dim) 21 | self.key = nn.Linear(embed_dim, embed_dim) 22 | self.value = nn.Linear(embed_dim, embed_dim) 23 | self.dropout = nn.Dropout(dropout) 24 | self.proj = nn.Linear(embed_dim, embed_dim) 25 | 26 | def forward(self, x, attn_mask=None, key_padding_mask=None): 27 | # x: (B, L, C) 28 | # attn_mask: (L, L) 29 | # key_padding_mask: (B, L) 30 | B, L, _ = x.shape 31 | xq, xk, xv = self.query(x), self.key(x), self.value(x) 32 | 33 | xq = xq.reshape(B, L, self.num_heads, -1).transpose(1, 2) 34 | xk = xk.reshape(B, L, self.num_heads, -1).transpose(1, 2) 35 | xv = xv.reshape(B, L, self.num_heads, -1).transpose(1, 2) 36 | 37 | xq = self.rope.rotate_queries_or_keys(xq) # B, N, L, C 38 | xk = self.rope.rotate_queries_or_keys(xk) # B, N, L, C 39 | 40 | attn_score = einsum(xq, xk, "b n i c, b n j c -> b n i j") / math.sqrt(self.head_dim) 41 | if attn_mask is not None: 42 | attn_mask = attn_mask.reshape(1, 1, L, L).expand(B, self.num_heads, -1, -1) 43 | attn_score = attn_score.masked_fill(attn_mask, float("-inf")) 44 | if key_padding_mask is not None: 45 | key_padding_mask = key_padding_mask.reshape(B, 1, 1, L).expand(-1, self.num_heads, L, -1) 46 | attn_score = attn_score.masked_fill(key_padding_mask, float("-inf")) 47 | 48 | attn_score = torch.softmax(attn_score, dim=-1) 49 | attn_score = self.dropout(attn_score) 50 | output = einsum(attn_score, xv, "b n i j, b n j c -> b n i c") # B, N, L, C 51 | output = output.transpose(1, 2).reshape(B, L, -1) # B, L, C 52 | output = self.proj(output) # B, L, C 53 | return output 54 | 55 | 56 | class EncoderRoPEBlock(nn.Module): 57 | def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, dropout=0.1, **block_kwargs): 58 | super().__init__() 59 | self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6) 60 | self.attn = RoPEAttention(hidden_size, num_heads, dropout) 61 | self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6) 62 | mlp_hidden_dim = int(hidden_size * mlp_ratio) 63 | approx_gelu = lambda: nn.GELU(approximate="tanh") 64 | self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=dropout) 65 | 66 | self.gate_msa = nn.Parameter(torch.zeros(1, 1, hidden_size)) 67 | self.gate_mlp = nn.Parameter(torch.zeros(1, 1, hidden_size)) 68 | 69 | # Zero-out adaLN modulation layers 70 | nn.init.constant_(self.gate_msa, 0) 71 | nn.init.constant_(self.gate_mlp, 0) 72 | 73 | def forward(self, x, attn_mask=None, tgt_key_padding_mask=None): 74 | x = x + self.gate_msa * self._sa_block( 75 | self.norm1(x), attn_mask=attn_mask, key_padding_mask=tgt_key_padding_mask 76 | ) 77 | x = x + self.gate_mlp * self.mlp(self.norm2(x)) 78 | return x 79 | 80 | def _sa_block(self, x, attn_mask=None, key_padding_mask=None): 81 
| # x: (B, L, C) 82 | x = self.attn(x, attn_mask=attn_mask, key_padding_mask=key_padding_mask) 83 | return x 84 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/base_arch/transformer/layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def zero_module(module): 7 | """ 8 | Zero out the parameters of a module and return it. 9 | """ 10 | for p in module.parameters(): 11 | p.detach().zero_() 12 | return module 13 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .hmr2 import HMR2 3 | from pathlib import Path 4 | from .configs import get_config 5 | from hmr4d import PROJ_ROOT 6 | 7 | HMR2A_CKPT = PROJ_ROOT / f"inputs/checkpoints/hmr2/epoch=10-step=25000.ckpt" # this is HMR2.0a, follow WHAM 8 | 9 | 10 | def load_hmr2(checkpoint_path=HMR2A_CKPT): 11 | model_cfg = str((Path(__file__).parent / "configs/model_config.yaml").resolve()) 12 | model_cfg = get_config(model_cfg) 13 | 14 | # Override some config values, to crop bbox correctly 15 | if (model_cfg.MODEL.BACKBONE.TYPE == "vit") and ("BBOX_SHAPE" not in model_cfg.MODEL): 16 | model_cfg.defrost() 17 | assert ( 18 | model_cfg.MODEL.IMAGE_SIZE == 256 19 | ), f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone" 20 | model_cfg.MODEL.BBOX_SHAPE = [192, 256] # (W, H) 21 | model_cfg.freeze() 22 | 23 | # Setup model and Load weights. 24 | # model = HMR2.load_from_checkpoint(checkpoint_path, strict=False, cfg=model_cfg) 25 | model = HMR2(model_cfg) 26 | 27 | state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] 28 | keys = [k for k in state_dict.keys() if k.split(".")[0] in ["backbone", "smpl_head"]] 29 | state_dict = {k: v for k, v in state_dict.items() if k in keys} 30 | model.load_state_dict(state_dict, strict=True) 31 | 32 | return model 33 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/network/hmr2/components/__init__.py -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/configs/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | from yacs.config import CfgNode as CN 4 | from pathlib import Path 5 | 6 | # CACHE_DIR = os.path.join(os.environ.get("HOME"), "Code/4D-Humans/cache") 7 | # CACHE_DIR_4DHUMANS = os.path.join(CACHE_DIR, "4DHumans") 8 | 9 | 10 | def to_lower(x: Dict) -> Dict: 11 | """ 12 | Convert all dictionary keys to lowercase 13 | Args: 14 | x (dict): Input dictionary 15 | Returns: 16 | dict: Output dictionary with all keys converted to lowercase 17 | """ 18 | return {k.lower(): v for k, v in x.items()} 19 | 20 | 21 | _C = CN(new_allowed=True) 22 | 23 | _C.GENERAL = CN(new_allowed=True) 24 | _C.GENERAL.RESUME = True 25 | _C.GENERAL.TIME_TO_RUN = 3300 26 | _C.GENERAL.VAL_STEPS = 100 27 | _C.GENERAL.LOG_STEPS = 100 28 | _C.GENERAL.CHECKPOINT_STEPS = 20000 29 | _C.GENERAL.CHECKPOINT_DIR = "checkpoints" 
30 | _C.GENERAL.SUMMARY_DIR = "tensorboard" 31 | _C.GENERAL.NUM_GPUS = 1 32 | _C.GENERAL.NUM_WORKERS = 4 33 | _C.GENERAL.MIXED_PRECISION = True 34 | _C.GENERAL.ALLOW_CUDA = True 35 | _C.GENERAL.PIN_MEMORY = False 36 | _C.GENERAL.DISTRIBUTED = False 37 | _C.GENERAL.LOCAL_RANK = 0 38 | _C.GENERAL.USE_SYNCBN = False 39 | _C.GENERAL.WORLD_SIZE = 1 40 | 41 | _C.TRAIN = CN(new_allowed=True) 42 | _C.TRAIN.NUM_EPOCHS = 100 43 | _C.TRAIN.BATCH_SIZE = 32 44 | _C.TRAIN.SHUFFLE = True 45 | _C.TRAIN.WARMUP = False 46 | _C.TRAIN.NORMALIZE_PER_IMAGE = False 47 | _C.TRAIN.CLIP_GRAD = False 48 | _C.TRAIN.CLIP_GRAD_VALUE = 1.0 49 | _C.LOSS_WEIGHTS = CN(new_allowed=True) 50 | 51 | _C.DATASETS = CN(new_allowed=True) 52 | 53 | _C.MODEL = CN(new_allowed=True) 54 | _C.MODEL.IMAGE_SIZE = 224 55 | 56 | _C.EXTRA = CN(new_allowed=True) 57 | _C.EXTRA.FOCAL_LENGTH = 5000 58 | 59 | _C.DATASETS.CONFIG = CN(new_allowed=True) 60 | _C.DATASETS.CONFIG.SCALE_FACTOR = 0.3 61 | _C.DATASETS.CONFIG.ROT_FACTOR = 30 62 | _C.DATASETS.CONFIG.TRANS_FACTOR = 0.02 63 | _C.DATASETS.CONFIG.COLOR_SCALE = 0.2 64 | _C.DATASETS.CONFIG.ROT_AUG_RATE = 0.6 65 | _C.DATASETS.CONFIG.TRANS_AUG_RATE = 0.5 66 | _C.DATASETS.CONFIG.DO_FLIP = True 67 | _C.DATASETS.CONFIG.FLIP_AUG_RATE = 0.5 68 | _C.DATASETS.CONFIG.EXTREME_CROP_AUG_RATE = 0.10 69 | 70 | 71 | def default_config() -> CN: 72 | """ 73 | Get a yacs CfgNode object with the default config values. 74 | """ 75 | # Return a clone so that the defaults will not be altered 76 | # This is for the "local variable" use pattern 77 | return _C.clone() 78 | 79 | 80 | def dataset_config(name="datasets_tar.yaml") -> CN: 81 | """ 82 | Get dataset config file 83 | Returns: 84 | CfgNode: Dataset config as a yacs CfgNode object. 85 | """ 86 | cfg = CN(new_allowed=True) 87 | config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), name) 88 | cfg.merge_from_file(config_file) 89 | cfg.freeze() 90 | return cfg 91 | 92 | 93 | def dataset_eval_config() -> CN: 94 | return dataset_config("datasets_eval.yaml") 95 | 96 | 97 | def get_config(config_file: str, merge: bool = True) -> CN: 98 | """ 99 | Read a config file and optionally merge it with the default config file. 100 | Args: 101 | config_file (str): Path to config file. 102 | merge (bool): Whether to merge with the default config or not. 103 | Returns: 104 | CfgNode: Config as a yacs CfgNode object. 
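    Example (mirrors how load_hmr2 builds its config; the path is relative to this package):
        cfg = get_config(str(Path(__file__).parent / "model_config.yaml"))
        cfg.MODEL.IMAGE_SIZE  # -> 256, overriding the default of 224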
105 | """ 106 | if merge: 107 | cfg = default_config() 108 | else: 109 | cfg = CN(new_allowed=True) 110 | cfg.merge_from_file(config_file) 111 | 112 | # ---- Update ---- # 113 | cfg.SMPL.MODEL_PATH = cfg.SMPL.MODEL_PATH # Not used 114 | cfg.SMPL.JOINT_REGRESSOR_EXTRA = cfg.SMPL.JOINT_REGRESSOR_EXTRA # Not Used 115 | cfg.SMPL.MEAN_PARAMS = str(Path(__file__).parent / "smpl_mean_params.npz") 116 | # ---------------- # 117 | 118 | cfg.freeze() 119 | return cfg 120 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/configs/model_config.yaml: -------------------------------------------------------------------------------- 1 | task_name: train 2 | tags: 3 | - dev 4 | train: true 5 | test: false 6 | ckpt_path: null 7 | seed: null 8 | DATASETS: 9 | TRAIN: 10 | H36M-TRAIN: 11 | WEIGHT: 0.3 12 | MPII-TRAIN: 13 | WEIGHT: 0.1 14 | COCO-TRAIN-2014: 15 | WEIGHT: 0.4 16 | MPI-INF-TRAIN: 17 | WEIGHT: 0.2 18 | VAL: 19 | COCO-VAL: 20 | WEIGHT: 1.0 21 | MOCAP: CMU-MOCAP 22 | CONFIG: 23 | SCALE_FACTOR: 0.3 24 | ROT_FACTOR: 30 25 | TRANS_FACTOR: 0.02 26 | COLOR_SCALE: 0.2 27 | ROT_AUG_RATE: 0.6 28 | TRANS_AUG_RATE: 0.5 29 | DO_FLIP: true 30 | FLIP_AUG_RATE: 0.5 31 | EXTREME_CROP_AUG_RATE: 0.1 32 | trainer: 33 | _target_: pytorch_lightning.Trainer 34 | default_root_dir: ${paths.output_dir} 35 | accelerator: gpu 36 | devices: 8 37 | deterministic: false 38 | num_sanity_val_steps: 0 39 | log_every_n_steps: ${GENERAL.LOG_STEPS} 40 | val_check_interval: ${GENERAL.VAL_STEPS} 41 | precision: 16 42 | max_steps: ${GENERAL.TOTAL_STEPS} 43 | move_metrics_to_cpu: true 44 | limit_val_batches: 1 45 | track_grad_norm: 2 46 | strategy: ddp 47 | num_nodes: 1 48 | sync_batchnorm: true 49 | paths: 50 | root_dir: ${oc.env:PROJECT_ROOT} 51 | data_dir: ${paths.root_dir}/data/ 52 | log_dir: /fsx/shubham/code/hmr2023/logs_hydra/ 53 | output_dir: ${hydra:runtime.output_dir} 54 | work_dir: ${hydra:runtime.cwd} 55 | extras: 56 | ignore_warnings: false 57 | enforce_tags: true 58 | print_config: true 59 | exp_name: 3001d 60 | SMPL: 61 | MODEL_PATH: data/smpl 62 | GENDER: neutral 63 | NUM_BODY_JOINTS: 23 64 | JOINT_REGRESSOR_EXTRA: data/SMPL_to_J19.pkl 65 | MEAN_PARAMS: data/smpl_mean_params.npz 66 | EXTRA: 67 | FOCAL_LENGTH: 5000 68 | NUM_LOG_IMAGES: 4 69 | NUM_LOG_SAMPLES_PER_IMAGE: 8 70 | PELVIS_IND: 39 71 | MODEL: 72 | IMAGE_SIZE: 256 73 | IMAGE_MEAN: 74 | - 0.485 75 | - 0.456 76 | - 0.406 77 | IMAGE_STD: 78 | - 0.229 79 | - 0.224 80 | - 0.225 81 | BACKBONE: 82 | TYPE: vit 83 | FREEZE: true 84 | NUM_LAYERS: 50 85 | OUT_CHANNELS: 2048 86 | ADD_NECK: false 87 | FLOW: 88 | DIM: 144 89 | NUM_LAYERS: 4 90 | CONTEXT_FEATURES: 2048 91 | LAYER_HIDDEN_FEATURES: 1024 92 | LAYER_DEPTH: 2 93 | FC_HEAD: 94 | NUM_FEATURES: 1024 95 | SMPL_HEAD: 96 | TYPE: transformer_decoder 97 | IN_CHANNELS: 2048 98 | TRANSFORMER_DECODER: 99 | depth: 6 100 | heads: 8 101 | mlp_dim: 1024 102 | dim_head: 64 103 | dropout: 0.0 104 | emb_dropout: 0.0 105 | norm: layer 106 | context_dim: 1280 107 | GENERAL: 108 | TOTAL_STEPS: 100000 109 | LOG_STEPS: 100 110 | VAL_STEPS: 100 111 | CHECKPOINT_STEPS: 1000 112 | CHECKPOINT_SAVE_TOP_K: -1 113 | NUM_WORKERS: 6 114 | PREFETCH_FACTOR: 2 115 | TRAIN: 116 | LR: 0.0001 117 | WEIGHT_DECAY: 0.0001 118 | BATCH_SIZE: 512 119 | LOSS_REDUCTION: mean 120 | NUM_TRAIN_SAMPLES: 2 121 | NUM_TEST_SAMPLES: 64 122 | POSE_2D_NOISE_RATIO: 0.01 123 | SMPL_PARAM_NOISE_RATIO: 0.005 124 | LOSS_WEIGHTS: 125 | KEYPOINTS_3D: 0.05 126 | KEYPOINTS_2D: 0.01 127 | GLOBAL_ORIENT: 0.001 128 | 
BODY_POSE: 0.001 129 | BETAS: 0.0005 130 | ADVERSARIAL: 0.0005 131 | local: {} 132 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/configs/smpl_mean_params.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/network/hmr2/configs/smpl_mean_params.npz -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/hmr2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from yacs.config import CfgNode 4 | from .vit import ViT 5 | from .smpl_head import SMPLTransformerDecoderHead 6 | 7 | from pytorch3d.transforms import matrix_to_axis_angle 8 | from hmr4d.utils.geo.hmr_cam import compute_transl_full_cam 9 | 10 | 11 | class HMR2(pl.LightningModule): 12 | def __init__(self, cfg: CfgNode): 13 | super().__init__() 14 | self.cfg = cfg 15 | self.backbone = ViT( 16 | img_size=(256, 192), 17 | patch_size=16, 18 | embed_dim=1280, 19 | depth=32, 20 | num_heads=16, 21 | ratio=1, 22 | use_checkpoint=False, 23 | mlp_ratio=4, 24 | qkv_bias=True, 25 | drop_path_rate=0.55, 26 | ) 27 | self.smpl_head = SMPLTransformerDecoderHead(cfg) 28 | 29 | def forward(self, batch, feat_mode=True): 30 | """this file has been modified 31 | Args: 32 | feat_mode: default True, as we only need the feature token output for the HMR4D project; 33 | when False, the full process of HMR2 will be executed. 34 | """ 35 | # Backbone 36 | x = batch["img"][:, :, :, 32:-32] 37 | vit_feats = self.backbone(x) 38 | 39 | # Output head 40 | if feat_mode: 41 | token_out = self.smpl_head(vit_feats, only_return_token_out=True) # (B, 1024) 42 | return token_out 43 | 44 | # return full process 45 | pred_smpl_params, pred_cam, _, token_out = self.smpl_head(vit_feats, only_return_token_out=False) 46 | output = {} 47 | output["token_out"] = token_out 48 | output["smpl_params"] = { 49 | "body_pose": matrix_to_axis_angle(pred_smpl_params["body_pose"]).flatten(-2), # (B, 23, 3) 50 | "betas": pred_smpl_params["betas"], # (B, 10) 51 | "global_orient": matrix_to_axis_angle(pred_smpl_params["global_orient"])[:, 0], # (B, 3) 52 | "transl": compute_transl_full_cam(pred_cam, batch["bbx_xys"], batch["K_fullimg"]), # (B, 3) 53 | } 54 | 55 | return output 56 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/utils/preproc.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | from pathlib import Path 5 | 6 | IMAGE_MEAN = torch.tensor([0.485, 0.456, 0.406]) 7 | IMAGE_STD = torch.tensor([0.229, 0.224, 0.225]) 8 | 9 | 10 | def expand_to_aspect_ratio(input_shape, target_aspect_ratio=[192, 256]): 11 | """Increase the size of the bounding box to match the target shape.""" 12 | if target_aspect_ratio is None: 13 | return input_shape 14 | 15 | try: 16 | w, h = input_shape 17 | except (ValueError, TypeError): 18 | return input_shape 19 | 20 | w_t, h_t = target_aspect_ratio 21 | if h / w < h_t / w_t: 22 | h_new = max(w * h_t / w_t, h) 23 | w_new = w 24 | else: 25 | h_new = h 26 | w_new = max(h * w_t / h_t, w) 27 | if h_new < h or w_new < w: 28 | breakpoint() 29 | return np.array([w_new, h_new]) 30 | 31 | 32 | def crop_and_resize(img, bbx_xy, bbx_s, dst_size=256, enlarge_ratio=1.2): 
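    # The warp is defined by three point correspondences: the top-left corner, the top-right
    # corner, and the center of the enlarged square box are mapped to the matching points of a
    # dst_size x dst_size patch, so a single cv2.warpAffine does the crop and the resize
    # (regions outside the image are zero-padded by the default border mode).
    # Illustrative call (values are hypothetical):
    #   img_crop, bbx_xys = crop_and_resize(img, bbx_xy=np.array([320.0, 240.0]), bbx_s=180.0)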
33 | """ 34 | Args: 35 | img: (H, W, 3) 36 | bbx_xy: (2,) 37 | bbx_s: scalar 38 | """ 39 | hs = bbx_s * enlarge_ratio / 2 40 | src = np.stack( 41 | [ 42 | bbx_xy - hs, # left-up corner 43 | bbx_xy + np.array([hs, -hs]), # right-up corner 44 | bbx_xy, # center 45 | ] 46 | ).astype(np.float32) 47 | dst = np.array([[0, 0], [dst_size - 1, 0], [dst_size / 2 - 0.5, dst_size / 2 - 0.5]], dtype=np.float32) 48 | A = cv2.getAffineTransform(src, dst) 49 | 50 | img_crop = cv2.warpAffine(img, A, (dst_size, dst_size), flags=cv2.INTER_LINEAR) 51 | bbx_xys_final = np.array([*bbx_xy, bbx_s * enlarge_ratio]) 52 | return img_crop, bbx_xys_final 53 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/utils/smpl_wrapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pickle 4 | from typing import Optional 5 | import smplx 6 | from smplx.lbs import vertices2joints 7 | from smplx.utils import SMPLOutput 8 | 9 | 10 | class SMPL(smplx.SMPLLayer): 11 | def __init__(self, *args, joint_regressor_extra: Optional[str] = None, update_hips: bool = False, **kwargs): 12 | """ 13 | Extension of the official SMPL implementation to support more joints. 14 | Args: 15 | Same as SMPLLayer. 16 | joint_regressor_extra (str): Path to extra joint regressor. 17 | """ 18 | super(SMPL, self).__init__(*args, **kwargs) 19 | smpl_to_openpose = [24, 12, 17, 19, 21, 16, 18, 20, 0, 2, 5, 8, 1, 4, 7, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34] 20 | 21 | if joint_regressor_extra is not None: 22 | self.register_buffer( 23 | "joint_regressor_extra", 24 | torch.tensor(pickle.load(open(joint_regressor_extra, "rb"), encoding="latin1"), dtype=torch.float32), 25 | ) 26 | self.register_buffer("joint_map", torch.tensor(smpl_to_openpose, dtype=torch.long)) 27 | self.update_hips = update_hips 28 | 29 | def forward(self, *args, **kwargs) -> SMPLOutput: 30 | """ 31 | Run forward pass. Same as SMPL and also append an extra set of joints if joint_regressor_extra is specified. 32 | """ 33 | smpl_output = super(SMPL, self).forward(*args, **kwargs) 34 | joints = smpl_output.joints[:, self.joint_map, :] 35 | if self.update_hips: 36 | joints[:, [9, 12]] = ( 37 | joints[:, [9, 12]] 38 | + 0.25 * (joints[:, [9, 12]] - joints[:, [12, 9]]) 39 | + 0.5 * (joints[:, [8]] - 0.5 * (joints[:, [9, 12]] + joints[:, [12, 9]])) 40 | ) 41 | if hasattr(self, "joint_regressor_extra"): 42 | extra_joints = vertices2joints(self.joint_regressor_extra, smpl_output.vertices) 43 | joints = torch.cat([joints, extra_joints], dim=1) 44 | smpl_output.joints = joints 45 | return smpl_output 46 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | Contents of this folder are modified from HuMoR repository. 
-------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .body_model import BodyModel 2 | from .body_model_smplh import BodyModelSMPLH 3 | from .body_model_smplx import BodyModelSMPLX 4 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/body_model_smplh.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import smplx 4 | 5 | kwargs_disable_member_var = { 6 | "create_body_pose": False, 7 | "create_betas": False, 8 | "create_global_orient": False, 9 | "create_transl": False, 10 | "create_left_hand_pose": False, 11 | "create_right_hand_pose": False, 12 | } 13 | 14 | 15 | class BodyModelSMPLH(nn.Module): 16 | """Support Batch inference""" 17 | 18 | def __init__(self, model_path, **kwargs): 19 | super().__init__() 20 | # enable flexible batchsize, handle missing variable at forward() 21 | kwargs.update(kwargs_disable_member_var) 22 | self.bm = smplx.create(model_path=model_path, **kwargs) 23 | self.faces = self.bm.faces 24 | self.is_smpl = kwargs.get("model_type", "smpl") == "smpl" 25 | if not self.is_smpl: 26 | self.hand_pose_dim = self.bm.num_pca_comps if self.bm.use_pca else 3 * self.bm.NUM_HAND_JOINTS 27 | 28 | # For fast computing of skeleton under beta 29 | shapedirs = self.bm.shapedirs # (V, 3, 10) 30 | J_regressor = self.bm.J_regressor[:22, :] # (22, V) 31 | v_template = self.bm.v_template # (V, 3) 32 | J_template = J_regressor @ v_template # (22, 3) 33 | J_shapedirs = torch.einsum("jv, vcd -> jcd", J_regressor, shapedirs) # (22, 3, 10) 34 | self.register_buffer("J_template", J_template, False) 35 | self.register_buffer("J_shapedirs", J_shapedirs, False) 36 | 37 | def forward( 38 | self, 39 | betas=None, 40 | global_orient=None, 41 | transl=None, 42 | body_pose=None, 43 | left_hand_pose=None, 44 | right_hand_pose=None, 45 | **kwargs 46 | ): 47 | 48 | device, dtype = self.bm.shapedirs.device, self.bm.shapedirs.dtype 49 | 50 | model_vars = [betas, global_orient, body_pose, transl, left_hand_pose, right_hand_pose] 51 | batch_size = 1 52 | for var in model_vars: 53 | if var is None: 54 | continue 55 | batch_size = max(batch_size, len(var)) 56 | 57 | if global_orient is None: 58 | global_orient = torch.zeros([batch_size, 3], dtype=dtype, device=device) 59 | if body_pose is None: 60 | body_pose = ( 61 | torch.zeros(3 * self.bm.NUM_BODY_JOINTS, device=device, dtype=dtype)[None] 62 | .expand(batch_size, -1) 63 | .contiguous() 64 | ) 65 | if not self.is_smpl: 66 | if left_hand_pose is None: 67 | left_hand_pose = ( 68 | torch.zeros(self.hand_pose_dim, device=device, dtype=dtype)[None] 69 | .expand(batch_size, -1) 70 | .contiguous() 71 | ) 72 | if right_hand_pose is None: 73 | right_hand_pose = ( 74 | torch.zeros(self.hand_pose_dim, device=device, dtype=dtype)[None] 75 | .expand(batch_size, -1) 76 | .contiguous() 77 | ) 78 | if betas is None: 79 | betas = torch.zeros([batch_size, self.bm.num_betas], dtype=dtype, device=device) 80 | if transl is None: 81 | transl = torch.zeros([batch_size, 3], dtype=dtype, device=device) 82 | 83 | bm_out = self.bm( 84 | betas=betas, 85 | global_orient=global_orient, 86 | body_pose=body_pose, 87 | left_hand_pose=left_hand_pose, 88 | right_hand_pose=right_hand_pose, 89 | transl=transl, 90 | **kwargs 91 | ) 92 | 93 | return bm_out 94 | 95 | def get_skeleton(self, 
betas): 96 | """betas: (*, 10) -> skeleton_beta: (*, 22, 3)""" 97 | skeleton_beta = self.J_template + torch.einsum("...d, jcd -> ...jc", betas, self.J_shapedirs) # (22, 3) 98 | return skeleton_beta 99 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/coco_aug_dict.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/coco_aug_dict.pth -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/seg_part_info.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/seg_part_info.npy -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/smpl_3dpw14_J_regressor_sparse.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smpl_3dpw14_J_regressor_sparse.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/smpl_coco17_J_regressor.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smpl_coco17_J_regressor.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/smpl_neutral_J_regressor.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smpl_neutral_J_regressor.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/smplx2smpl_sparse.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smplx2smpl_sparse.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/smplx_verts437.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smplx_verts437.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/callbacks/lr_monitor.py: -------------------------------------------------------------------------------- 1 | from pytorch_lightning.callbacks import LearningRateMonitor 2 | from hmr4d.configs import builds, MainStore 3 | 4 | 5 | MainStore.store(name="pl", node=builds(LearningRateMonitor), group="callbacks/lr_monitor") 6 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/callbacks/train_speed_timer.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | from pytorch_lightning.utilities import 
rank_zero_only 3 | from time import time 4 | from collections import deque 5 | 6 | from hmr4d.configs import MainStore, builds 7 | 8 | 9 | class TrainSpeedTimer(pl.Callback): 10 | def __init__(self, N_avg=5): 11 | """ 12 | This callback times the training speed (averge over recent 5 iterations) 13 | 1. Data waiting time: this should be small, otherwise the data loading should be improved 14 | 2. Single batch time: this is the time for one batch of training (excluding data waiting) 15 | """ 16 | super().__init__() 17 | self.last_batch_end = None 18 | self.this_batch_start = None 19 | 20 | # time queues for averaging 21 | self.data_waiting_time_queue = deque(maxlen=N_avg) 22 | self.single_batch_time_queue = deque(maxlen=N_avg) 23 | 24 | @rank_zero_only 25 | def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): 26 | """Count the time of data waiting""" 27 | if self.last_batch_end is not None: 28 | # This should be small, otherwise the data loading should be improved 29 | data_waiting = time() - self.last_batch_end 30 | 31 | # Average the time 32 | self.data_waiting_time_queue.append(data_waiting) 33 | average_time = sum(self.data_waiting_time_queue) / len(self.data_waiting_time_queue) 34 | 35 | # Log to prog-bar 36 | pl_module.log( 37 | "train_timer/data_waiting", average_time, on_step=True, on_epoch=False, prog_bar=True, logger=True 38 | ) 39 | 40 | self.this_batch_start = time() 41 | 42 | @rank_zero_only 43 | def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): 44 | # Effective training time elapsed (excluding data waiting) 45 | single_batch = time() - self.this_batch_start 46 | 47 | # Average the time 48 | self.single_batch_time_queue.append(single_batch) 49 | average_time = sum(self.single_batch_time_queue) / len(self.single_batch_time_queue) 50 | 51 | # Log iter time 52 | pl_module.log( 53 | "train_timer/single_batch", average_time, on_step=True, on_epoch=False, prog_bar=False, logger=True 54 | ) 55 | 56 | # Set timer for counting data waiting 57 | self.last_batch_end = time() 58 | 59 | @rank_zero_only 60 | def on_train_epoch_end(self, trainer, pl_module): 61 | # Reset the timer 62 | self.last_batch_end = None 63 | self.this_batch_start = None 64 | # Clear the queue 65 | self.data_waiting_time_queue.clear() 66 | self.single_batch_time_queue.clear() 67 | 68 | 69 | group_name = "callbacks/train_speed_timer" 70 | base = builds(TrainSpeedTimer, populate_full_signature=True) 71 | MainStore.store(name="base", node=base, group=group_name) 72 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/geo/flip_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pytorch3d.transforms import axis_angle_to_matrix, matrix_to_axis_angle 3 | 4 | 5 | def flip_heatmap_coco17(output_flipped): 6 | assert output_flipped.ndim == 4, "output_flipped should be [B, J, H, W]" 7 | shape_ori = output_flipped.shape 8 | channels = 1 9 | output_flipped = output_flipped.reshape(shape_ori[0], -1, channels, shape_ori[2], shape_ori[3]) 10 | output_flipped_back = output_flipped.clone() 11 | 12 | # Swap left-right parts 13 | for left, right in [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]: 14 | output_flipped_back[:, left, ...] = output_flipped[:, right, ...] 15 | output_flipped_back[:, right, ...] = output_flipped[:, left, ...] 
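    # With the left/right joint channels swapped, reshape back and mirror along the width
    # axis below to undo the horizontal test-time flip.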
16 | output_flipped_back = output_flipped_back.reshape(shape_ori) 17 | # Flip horizontally 18 | output_flipped_back = output_flipped_back.flip(3) 19 | return output_flipped_back 20 | 21 | 22 | def flip_bbx_xys(bbx_xys, w): 23 | """ 24 | bbx_xys: (F, 3) 25 | """ 26 | bbx_xys_flip = bbx_xys.clone() 27 | bbx_xys_flip[:, 0] = w - bbx_xys_flip[:, 0] 28 | return bbx_xys_flip 29 | 30 | 31 | def flip_kp2d_coco17(kp2d, w): 32 | """Flip keypoints.""" 33 | kp2d = kp2d.clone() 34 | flipped_parts = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] 35 | kp2d = kp2d[..., flipped_parts, :] 36 | kp2d[..., 0] = w - kp2d[..., 0] 37 | return kp2d 38 | 39 | 40 | def flip_smplx_params(smplx_params): 41 | """Flip pose. 42 | The flipping is based on SMPLX parameters. 43 | """ 44 | rotation = torch.cat([smplx_params["global_orient"], smplx_params["body_pose"]], dim=1) 45 | 46 | BN = rotation.shape[0] 47 | pose = rotation.reshape(BN, -1).transpose(0, 1) 48 | 49 | SMPL_JOINTS_FLIP_PERM = [0, 2, 1, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12, 14, 13, 15, 17, 16, 19, 18, 21, 20] # , 23, 22] 50 | SMPL_POSE_FLIP_PERM = [] 51 | for i in SMPL_JOINTS_FLIP_PERM: 52 | SMPL_POSE_FLIP_PERM.append(3 * i) 53 | SMPL_POSE_FLIP_PERM.append(3 * i + 1) 54 | SMPL_POSE_FLIP_PERM.append(3 * i + 2) 55 | 56 | pose = pose[SMPL_POSE_FLIP_PERM] 57 | 58 | # we also negate the second and the third dimension of the axis-angle 59 | pose[1::3] = -pose[1::3] 60 | pose[2::3] = -pose[2::3] 61 | pose = pose.transpose(0, 1).reshape(BN, -1, 3) 62 | 63 | smplx_params_flipped = smplx_params.copy() 64 | smplx_params_flipped["global_orient"] = pose[:, :1] 65 | smplx_params_flipped["body_pose"] = pose[:, 1:] 66 | return smplx_params_flipped 67 | 68 | 69 | def avg_smplx_aa(aa1, aa2): 70 | def avg_rot(rot): 71 | # input [B,...,3,3] --> output [...,3,3] 72 | rot = rot.mean(dim=0) 73 | U, _, V = torch.svd(rot) 74 | rot = U @ V.transpose(-1, -2) 75 | return rot 76 | 77 | B, J3 = aa1.shape 78 | aa1 = aa1.reshape(B, -1, 3) 79 | aa2 = aa2.reshape(B, -1, 3) 80 | 81 | R1 = axis_angle_to_matrix(aa1) 82 | R2 = axis_angle_to_matrix(aa2) 83 | R_avg = avg_rot(torch.stack([R1, R2])) 84 | aa_avg = matrix_to_axis_angle(R_avg).reshape(B, -1) 85 | 86 | return aa_avg 87 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/geo/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def axis_rotate_to_matrix(angle, axis="x"): 5 | """Get rotation matrix for rotating around one axis 6 | Args: 7 | angle: (N, 1) 8 | Returns: 9 | R: (N, 3, 3) 10 | """ 11 | if isinstance(angle, float): 12 | angle = torch.tensor([angle], dtype=torch.float) 13 | 14 | c = torch.cos(angle) 15 | s = torch.sin(angle) 16 | z = torch.zeros_like(angle) 17 | o = torch.ones_like(angle) 18 | if axis == "x": 19 | R = torch.stack([o, z, z, z, c, -s, z, s, c], dim=1).view(-1, 3, 3) 20 | elif axis == "y": 21 | R = torch.stack([c, z, s, z, o, z, -s, z, c], dim=1).view(-1, 3, 3) 22 | else: 23 | assert axis == "z" 24 | R = torch.stack([c, -s, z, s, c, z, z, z, o], dim=1).view(-1, 3, 3) 25 | return R 26 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from hmr4d.utils.preproc.tracker import Tracker 3 | from hmr4d.utils.preproc.vitfeat_extractor import Extractor 4 | from hmr4d.utils.preproc.vitpose import VitPoseExtractor 5 | from 
hmr4d.utils.preproc.slam import SLAMModel 6 | except: 7 | pass 8 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/slam.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import time 3 | import torch 4 | from multiprocessing import Process, Queue 5 | 6 | try: 7 | from dpvo.utils import Timer 8 | from dpvo.dpvo import DPVO 9 | from dpvo.config import cfg 10 | except: 11 | pass 12 | 13 | 14 | from hmr4d import PROJ_ROOT 15 | from hmr4d.utils.geo.hmr_cam import estimate_focal_length 16 | 17 | 18 | class SLAMModel(object): 19 | def __init__(self, video_path, width, height, intrinsics=None, stride=1, skip=0, buffer=2048, resize=0.5): 20 | """ 21 | Args: 22 | intrinsics: [fx, fy, cx, cy] 23 | """ 24 | if intrinsics is None: 25 | print("Estimating focal length") 26 | focal_length = estimate_focal_length(width, height) 27 | intrinsics = torch.tensor([focal_length, focal_length, width / 2.0, height / 2.0]) 28 | else: 29 | intrinsics = intrinsics.clone() 30 | 31 | self.dpvo_cfg = str(PROJ_ROOT / "third-party/DPVO/config/default.yaml") 32 | self.dpvo_ckpt = "inputs/checkpoints/dpvo/dpvo.pth" 33 | 34 | self.buffer = buffer 35 | self.times = [] 36 | self.slam = None 37 | self.queue = Queue(maxsize=8) 38 | self.reader = Process(target=video_stream, args=(self.queue, video_path, intrinsics, stride, skip, resize)) 39 | self.reader.start() 40 | 41 | def track(self): 42 | (t, image, intrinsics) = self.queue.get() 43 | 44 | if t < 0: 45 | return False 46 | 47 | image = torch.from_numpy(image).permute(2, 0, 1).cuda() 48 | intrinsics = intrinsics.cuda() # [fx, fy, cx, cy] 49 | 50 | if self.slam is None: 51 | cfg.merge_from_file(self.dpvo_cfg) 52 | cfg.BUFFER_SIZE = self.buffer 53 | self.slam = DPVO(cfg, self.dpvo_ckpt, ht=image.shape[1], wd=image.shape[2], viz=False) 54 | 55 | with Timer("SLAM", enabled=False): 56 | t = time.time() 57 | self.slam(t, image, intrinsics) 58 | self.times.append(time.time() - t) 59 | 60 | return True 61 | 62 | def process(self): 63 | for _ in range(12): 64 | self.slam.update() 65 | 66 | self.reader.join() 67 | return self.slam.terminate()[0] 68 | 69 | 70 | def video_stream(queue, imagedir, intrinsics, stride, skip=0, resize=0.5): 71 | """video generator""" 72 | assert len(intrinsics) == 4, "intrinsics should be [fx, fy, cx, cy]" 73 | 74 | cap = cv2.VideoCapture(imagedir) 75 | t = 0 76 | for _ in range(skip): 77 | ret, image = cap.read() 78 | 79 | while True: 80 | # Capture frame-by-frame 81 | for _ in range(stride): 82 | ret, image = cap.read() 83 | # if frame is read correctly ret is True 84 | if not ret: 85 | break 86 | 87 | if not ret: 88 | break 89 | 90 | image = cv2.resize(image, None, fx=resize, fy=resize, interpolation=cv2.INTER_AREA) 91 | h, w, _ = image.shape 92 | image = image[: h - h % 16, : w - w % 16] 93 | 94 | intrinsics_ = intrinsics.clone() * resize 95 | queue.put((t, image, intrinsics_)) 96 | 97 | t += 1 98 | 99 | queue.put((-1, image, intrinsics)) # -1 will terminate the process 100 | cap.release() 101 | 102 | # wait for the queue to be empty, otherwise the process will end immediately 103 | while not queue.empty(): 104 | time.sleep(1) 105 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitfeat_extractor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from hmr4d.network.hmr2 import load_hmr2, HMR2 3 | 4 | 5 | from 
hmr4d.utils.video_io_utils import read_video_np 6 | import cv2 7 | import numpy as np 8 | 9 | from hmr4d.network.hmr2.utils.preproc import crop_and_resize, IMAGE_MEAN, IMAGE_STD 10 | from tqdm import tqdm 11 | 12 | 13 | def get_batch(input_path, bbx_xys, img_ds=0.5, img_dst_size=256, path_type="video"): 14 | if path_type == "video": 15 | imgs = read_video_np(input_path, scale=img_ds) 16 | elif path_type == "image": 17 | imgs = cv2.imread(str(input_path))[..., ::-1] 18 | imgs = cv2.resize(imgs, (0, 0), fx=img_ds, fy=img_ds) 19 | imgs = imgs[None] 20 | elif path_type == "np": 21 | assert isinstance(input_path, np.ndarray) 22 | assert img_ds == 1.0 # this is safe 23 | imgs = input_path 24 | 25 | gt_center = bbx_xys[:, :2] 26 | gt_bbx_size = bbx_xys[:, 2] 27 | 28 | # Blur image to avoid aliasing artifacts 29 | if True: 30 | gt_bbx_size_ds = gt_bbx_size * img_ds 31 | ds_factors = ((gt_bbx_size_ds * 1.0) / img_dst_size / 2.0).numpy() 32 | imgs = np.stack( 33 | [ 34 | # gaussian(v, sigma=(d - 1) / 2, channel_axis=2, preserve_range=True) if d > 1.1 else v 35 | cv2.GaussianBlur(v, (5, 5), (d - 1) / 2) if d > 1.1 else v 36 | for v, d in zip(imgs, ds_factors) 37 | ] 38 | ) 39 | 40 | # Output 41 | imgs_list = [] 42 | bbx_xys_ds_list = [] 43 | for i in range(len(imgs)): 44 | img, bbx_xys_ds = crop_and_resize( 45 | imgs[i], 46 | gt_center[i] * img_ds, 47 | gt_bbx_size[i] * img_ds, 48 | img_dst_size, 49 | enlarge_ratio=1.0, 50 | ) 51 | imgs_list.append(img) 52 | bbx_xys_ds_list.append(bbx_xys_ds) 53 | imgs = torch.from_numpy(np.stack(imgs_list)) # (F, 256, 256, 3), RGB 54 | bbx_xys = torch.from_numpy(np.stack(bbx_xys_ds_list)) / img_ds # (F, 3) 55 | 56 | imgs = ((imgs / 255.0 - IMAGE_MEAN) / IMAGE_STD).permute(0, 3, 1, 2) # (F, 3, 256, 256 57 | return imgs, bbx_xys 58 | 59 | 60 | class Extractor: 61 | def __init__(self, tqdm_leave=True): 62 | self.extractor: HMR2 = load_hmr2().cuda().eval() 63 | self.tqdm_leave = tqdm_leave 64 | 65 | def extract_video_features(self, video_path, bbx_xys, img_ds=0.5): 66 | """ 67 | img_ds makes the image smaller, which is useful for faster processing 68 | """ 69 | # Get the batch 70 | if isinstance(video_path, str): 71 | imgs, bbx_xys = get_batch(video_path, bbx_xys, img_ds=img_ds) 72 | else: 73 | assert isinstance(video_path, torch.Tensor) 74 | imgs = video_path 75 | 76 | # Inference 77 | F, _, H, W = imgs.shape # (F, 3, H, W) 78 | imgs = imgs.cuda() 79 | batch_size = 16 # 5GB GPU memory, occupies all CUDA cores of 3090 80 | features = [] 81 | for j in tqdm(range(0, F, batch_size), desc="HMR2 Feature", leave=self.tqdm_leave): 82 | imgs_batch = imgs[j : j + batch_size] 83 | 84 | with torch.no_grad(): 85 | feature = self.extractor({"img": imgs_batch}) 86 | features.append(feature.detach().cpu()) 87 | 88 | features = torch.cat(features, dim=0).clone() # (F, 1024) 89 | return features 90 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .src.vitpose_infer.model_builder import build_model 2 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/__init__.py -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/__init__.py -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # from .alexnet import AlexNet 3 | # from .cpm import CPM 4 | # from .hourglass import HourglassNet 5 | # from .hourglass_ae import HourglassAENet 6 | # from .hrformer import HRFormer 7 | # from .hrnet import HRNet 8 | # from .litehrnet import LiteHRNet 9 | # from .mobilenet_v2 import MobileNetV2 10 | # from .mobilenet_v3 import MobileNetV3 11 | # from .mspn import MSPN 12 | # from .regnet import RegNet 13 | # from .resnest import ResNeSt 14 | # from .resnet import ResNet, ResNetV1d 15 | # from .resnext import ResNeXt 16 | # from .rsn import RSN 17 | # from .scnet import SCNet 18 | # from .seresnet import SEResNet 19 | # from .seresnext import SEResNeXt 20 | # from .shufflenet_v1 import ShuffleNetV1 21 | # from .shufflenet_v2 import ShuffleNetV2 22 | # from .tcn import TCN 23 | # from .v2v_net import V2VNet 24 | # from .vgg import VGG 25 | # from .vipnas_mbv3 import ViPNAS_MobileNetV3 26 | # from .vipnas_resnet import ViPNAS_ResNet 27 | from .vit import ViT 28 | 29 | # __all__ = [ 30 | # 'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2', 31 | # 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet', 32 | # 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', 33 | # 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', 34 | # 'LiteHRNet', 'V2VNet', 'HRFormer', 'ViT' 35 | # ] 36 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/alexnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | 4 | from ..builder import BACKBONES 5 | from .base_backbone import BaseBackbone 6 | 7 | 8 | @BACKBONES.register_module() 9 | class AlexNet(BaseBackbone): 10 | """`AlexNet `__ backbone. 11 | 12 | The input for AlexNet is a 224x224 RGB image. 13 | 14 | Args: 15 | num_classes (int): number of classes for classification. 16 | The default value is -1, which uses the backbone as 17 | a feature extractor without the top classifier. 
18 | """ 19 | 20 | def __init__(self, num_classes=-1): 21 | super().__init__() 22 | self.num_classes = num_classes 23 | self.features = nn.Sequential( 24 | nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), 25 | nn.ReLU(inplace=True), 26 | nn.MaxPool2d(kernel_size=3, stride=2), 27 | nn.Conv2d(64, 192, kernel_size=5, padding=2), 28 | nn.ReLU(inplace=True), 29 | nn.MaxPool2d(kernel_size=3, stride=2), 30 | nn.Conv2d(192, 384, kernel_size=3, padding=1), 31 | nn.ReLU(inplace=True), 32 | nn.Conv2d(384, 256, kernel_size=3, padding=1), 33 | nn.ReLU(inplace=True), 34 | nn.Conv2d(256, 256, kernel_size=3, padding=1), 35 | nn.ReLU(inplace=True), 36 | nn.MaxPool2d(kernel_size=3, stride=2), 37 | ) 38 | if self.num_classes > 0: 39 | self.classifier = nn.Sequential( 40 | nn.Dropout(), 41 | nn.Linear(256 * 6 * 6, 4096), 42 | nn.ReLU(inplace=True), 43 | nn.Dropout(), 44 | nn.Linear(4096, 4096), 45 | nn.ReLU(inplace=True), 46 | nn.Linear(4096, num_classes), 47 | ) 48 | 49 | def forward(self, x): 50 | 51 | x = self.features(x) 52 | if self.num_classes > 0: 53 | x = x.view(x.size(0), 256 * 6 * 6) 54 | x = self.classifier(x) 55 | 56 | return x 57 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/test_torch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Net(nn.Module): 7 | 8 | def __init__(self): 9 | super(Net, self).__init__() 10 | # 1 input image channel, 6 output channels, 5x5 square convolution 11 | # kernel 12 | self.conv1 = nn.Conv2d(1, 6, 5) 13 | self.conv2 = nn.Conv2d(6, 16, 5) 14 | # an affine operation: y = Wx + b 15 | self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension 16 | self.fc2 = nn.Linear(120, 84) 17 | self.fc3 = nn.Linear(84, 10) 18 | 19 | def forward(self, x): 20 | # Max pooling over a (2, 2) window 21 | x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) 22 | # If the size is a square, you can specify with a single number 23 | x = F.max_pool2d(F.relu(self.conv2(x)), 2) 24 | x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension 25 | x = F.relu(self.fc1(x)) 26 | x = F.relu(self.fc2(x)) 27 | x = self.fc3(x) 28 | return x 29 | 30 | 31 | net = Net() 32 | # print(net) 33 | 34 | net.train() 35 | 36 | input = torch.randn(1, 1, 32, 32) 37 | # out = net(input) 38 | # print(out) 39 | output = net(input) 40 | target = torch.randn(10) # a dummy target, for example 41 | target = target.view(1, -1) # make it the same shape as output 42 | criterion = nn.MSELoss() 43 | 44 | # loss = criterion(output.cuda(), target.cuda()) 45 | 46 | import torch.optim as optim 47 | 48 | # create your optimizer 49 | optimizer = optim.SGD(net.parameters(), lr=0.01) 50 | 51 | # in your training loop: 52 | optimizer.zero_grad() # zero the gradient buffers 53 | output = net(input) 54 | loss = criterion(output, target) 55 | 56 | loss.backward() 57 | 58 | optimizer.step() 59 | 60 | # print(loss) -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .channel_shuffle import channel_shuffle 3 | from .inverted_residual import InvertedResidual 4 | from .make_divisible import make_divisible 5 | from .se_layer import SELayer 6 | from .utils import load_checkpoint 7 | 8 | __all__ = [ 9 | 'channel_shuffle', 'make_divisible', 'InvertedResidual', 'SELayer', 10 | 'load_checkpoint' 11 | ] 12 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/channel_shuffle.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | 5 | def channel_shuffle(x, groups): 6 | """Channel Shuffle operation. 7 | 8 | This function enables cross-group information flow for multiple groups 9 | convolution layers. 10 | 11 | Args: 12 | x (Tensor): The input tensor. 13 | groups (int): The number of groups to divide the input tensor 14 | in the channel dimension. 15 | 16 | Returns: 17 | Tensor: The output tensor after channel shuffle operation. 18 | """ 19 | 20 | batch_size, num_channels, height, width = x.size() 21 | assert (num_channels % groups == 0), ('num_channels should be ' 22 | 'divisible by groups') 23 | channels_per_group = num_channels // groups 24 | 25 | x = x.view(batch_size, groups, channels_per_group, height, width) 26 | x = torch.transpose(x, 1, 2).contiguous() 27 | x = x.view(batch_size, -1, height, width) 28 | 29 | return x 30 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/make_divisible.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def make_divisible(value, divisor, min_value=None, min_ratio=0.9): 3 | """Make divisible function. 4 | 5 | This function rounds the channel number down to the nearest value that can 6 | be divisible by the divisor. 7 | 8 | Args: 9 | value (int): The original channel number. 10 | divisor (int): The divisor to fully divide the channel number. 11 | min_value (int, optional): The minimum value of the output channel. 12 | Default: None, means that the minimum value equal to the divisor. 13 | min_ratio (float, optional): The minimum ratio of the rounded channel 14 | number to the original channel number. Default: 0.9. 15 | Returns: 16 | int: The modified output channel number 17 | """ 18 | 19 | if min_value is None: 20 | min_value = divisor 21 | new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) 22 | # Make sure that round down does not go down by more than (1-min_ratio). 23 | if new_value < min_ratio * value: 24 | new_value += divisor 25 | return new_value 26 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/se_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import mmcv 3 | import torch.nn as nn 4 | from mmcv.cnn import ConvModule 5 | 6 | 7 | class SELayer(nn.Module): 8 | """Squeeze-and-Excitation Module. 9 | 10 | Args: 11 | channels (int): The input (and output) channels of the SE layer. 12 | ratio (int): Squeeze ratio in SELayer, the intermediate channel will be 13 | ``int(channels/ratio)``. Default: 16. 
14 | conv_cfg (None or dict): Config dict for convolution layer. 15 | Default: None, which means using conv2d. 16 | act_cfg (dict or Sequence[dict]): Config dict for activation layer. 17 | If act_cfg is a dict, two activation layers will be configurated 18 | by this dict. If act_cfg is a sequence of dicts, the first 19 | activation layer will be configurated by the first dict and the 20 | second activation layer will be configurated by the second dict. 21 | Default: (dict(type='ReLU'), dict(type='Sigmoid')) 22 | """ 23 | 24 | def __init__(self, 25 | channels, 26 | ratio=16, 27 | conv_cfg=None, 28 | act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): 29 | super().__init__() 30 | if isinstance(act_cfg, dict): 31 | act_cfg = (act_cfg, act_cfg) 32 | assert len(act_cfg) == 2 33 | assert mmcv.is_tuple_of(act_cfg, dict) 34 | self.global_avgpool = nn.AdaptiveAvgPool2d(1) 35 | self.conv1 = ConvModule( 36 | in_channels=channels, 37 | out_channels=int(channels / ratio), 38 | kernel_size=1, 39 | stride=1, 40 | conv_cfg=conv_cfg, 41 | act_cfg=act_cfg[0]) 42 | self.conv2 = ConvModule( 43 | in_channels=int(channels / ratio), 44 | out_channels=channels, 45 | kernel_size=1, 46 | stride=1, 47 | conv_cfg=conv_cfg, 48 | act_cfg=act_cfg[1]) 49 | 50 | def forward(self, x): 51 | out = self.global_avgpool(x) 52 | out = self.conv1(out) 53 | out = self.conv2(out) 54 | return x * out 55 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from collections import OrderedDict 3 | 4 | from mmcv.runner.checkpoint import _load_checkpoint, load_state_dict 5 | 6 | 7 | def load_checkpoint(model, 8 | filename, 9 | map_location='cpu', 10 | strict=False, 11 | logger=None): 12 | """Load checkpoint from a file or URI. 13 | 14 | Args: 15 | model (Module): Module to load checkpoint. 16 | filename (str): Accept local filepath, URL, ``torchvision://xxx``, 17 | ``open-mmlab://xxx``. 18 | map_location (str): Same as :func:`torch.load`. 19 | strict (bool): Whether to allow different params for the model and 20 | checkpoint. 21 | logger (:mod:`logging.Logger` or None): The logger for error message. 22 | 23 | Returns: 24 | dict or OrderedDict: The loaded checkpoint. 25 | """ 26 | checkpoint = _load_checkpoint(filename, map_location) 27 | # OrderedDict is a subclass of dict 28 | if not isinstance(checkpoint, dict): 29 | raise RuntimeError( 30 | f'No state_dict found in checkpoint file {filename}') 31 | # get state_dict from checkpoint 32 | if 'state_dict' in checkpoint: 33 | state_dict_tmp = checkpoint['state_dict'] 34 | else: 35 | state_dict_tmp = checkpoint 36 | 37 | state_dict = OrderedDict() 38 | # strip prefix of state_dict 39 | for k, v in state_dict_tmp.items(): 40 | if k.startswith('module.backbone.'): 41 | state_dict[k[16:]] = v 42 | elif k.startswith('module.'): 43 | state_dict[k[7:]] = v 44 | elif k.startswith('backbone.'): 45 | state_dict[k[9:]] = v 46 | else: 47 | state_dict[k] = v 48 | # load state_dict 49 | load_state_dict(model, state_dict, strict, logger) 50 | return checkpoint 51 | 52 | 53 | def get_state_dict(filename, map_location='cpu'): 54 | """Get state_dict from a file or URI. 55 | 56 | Args: 57 | filename (str): Accept local filepath, URL, ``torchvision://xxx``, 58 | ``open-mmlab://xxx``. 59 | map_location (str): Same as :func:`torch.load`. 
60 | 61 | Returns: 62 | OrderedDict: The state_dict. 63 | """ 64 | checkpoint = _load_checkpoint(filename, map_location) 65 | # OrderedDict is a subclass of dict 66 | if not isinstance(checkpoint, dict): 67 | raise RuntimeError( 68 | f'No state_dict found in checkpoint file {filename}') 69 | # get state_dict from checkpoint 70 | if 'state_dict' in checkpoint: 71 | state_dict_tmp = checkpoint['state_dict'] 72 | else: 73 | state_dict_tmp = checkpoint 74 | 75 | state_dict = OrderedDict() 76 | # strip prefix of state_dict 77 | for k, v in state_dict_tmp.items(): 78 | if k.startswith('module.backbone.'): 79 | state_dict[k[16:]] = v 80 | elif k.startswith('module.'): 81 | state_dict[k[7:]] = v 82 | elif k.startswith('backbone.'): 83 | state_dict[k[9:]] = v 84 | else: 85 | state_dict[k] = v 86 | 87 | return state_dict 88 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/configs/coco/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/configs/coco/__init__.py -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # from .ae_higher_resolution_head import AEHigherResolutionHead 3 | # from .ae_multi_stage_head import AEMultiStageHead 4 | # from .ae_simple_head import AESimpleHead 5 | # from .deconv_head import DeconvHead 6 | # from .deeppose_regression_head import DeepposeRegressionHead 7 | # from .hmr_head import HMRMeshHead 8 | # from .interhand_3d_head import Interhand3DHead 9 | # from .temporal_regression_head import TemporalRegressionHead 10 | from .topdown_heatmap_base_head import TopdownHeatmapBaseHead 11 | # from .topdown_heatmap_multi_stage_head import (TopdownHeatmapMSMUHead, 12 | # TopdownHeatmapMultiStageHead) 13 | from .topdown_heatmap_simple_head import TopdownHeatmapSimpleHead 14 | # from .vipnas_heatmap_simple_head import ViPNASHeatmapSimpleHead 15 | # from .voxelpose_head import CuboidCenterHead, CuboidPoseHead 16 | 17 | # __all__ = [ 18 | # 'TopdownHeatmapSimpleHead', 'TopdownHeatmapMultiStageHead', 19 | # 'TopdownHeatmapMSMUHead', 'TopdownHeatmapBaseHead', 20 | # 'AEHigherResolutionHead', 'AESimpleHead', 'AEMultiStageHead', 21 | # 'DeepposeRegressionHead', 'TemporalRegressionHead', 'Interhand3DHead', 22 | # 'HMRMeshHead', 'DeconvHead', 'ViPNASHeatmapSimpleHead', 'CuboidCenterHead', 23 | # 'CuboidPoseHead' 24 | # ] 25 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/hmr_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from mmcv.cnn import xavier_init 6 | 7 | from ..builder import HEADS 8 | from ..utils.geometry import rot6d_to_rotmat 9 | 10 | 11 | @HEADS.register_module() 12 | class HMRMeshHead(nn.Module): 13 | """SMPL parameters regressor head of simple baseline. "End-to-end Recovery 14 | of Human Shape and Pose", CVPR'2018. 
15 | 16 | Args: 17 | in_channels (int): Number of input channels 18 | smpl_mean_params (str): The file name of the mean SMPL parameters 19 | n_iter (int): The iterations of estimating delta parameters 20 | """ 21 | 22 | def __init__(self, in_channels, smpl_mean_params=None, n_iter=3): 23 | super().__init__() 24 | 25 | self.in_channels = in_channels 26 | self.n_iter = n_iter 27 | 28 | npose = 24 * 6 29 | nbeta = 10 30 | ncam = 3 31 | hidden_dim = 1024 32 | 33 | self.fc1 = nn.Linear(in_channels + npose + nbeta + ncam, hidden_dim) 34 | self.drop1 = nn.Dropout() 35 | self.fc2 = nn.Linear(hidden_dim, hidden_dim) 36 | self.drop2 = nn.Dropout() 37 | self.decpose = nn.Linear(hidden_dim, npose) 38 | self.decshape = nn.Linear(hidden_dim, nbeta) 39 | self.deccam = nn.Linear(hidden_dim, ncam) 40 | 41 | # Load mean SMPL parameters 42 | if smpl_mean_params is None: 43 | init_pose = torch.zeros([1, npose]) 44 | init_shape = torch.zeros([1, nbeta]) 45 | init_cam = torch.FloatTensor([[1, 0, 0]]) 46 | else: 47 | mean_params = np.load(smpl_mean_params) 48 | init_pose = torch.from_numpy( 49 | mean_params['pose'][:]).unsqueeze(0).float() 50 | init_shape = torch.from_numpy( 51 | mean_params['shape'][:]).unsqueeze(0).float() 52 | init_cam = torch.from_numpy( 53 | mean_params['cam']).unsqueeze(0).float() 54 | self.register_buffer('init_pose', init_pose) 55 | self.register_buffer('init_shape', init_shape) 56 | self.register_buffer('init_cam', init_cam) 57 | 58 | def forward(self, x): 59 | """Forward function. 60 | 61 | x is the image feature map and is expected to be in shape (batch size x 62 | channel number x height x width) 63 | """ 64 | batch_size = x.shape[0] 65 | # extract the global feature vector by average along 66 | # spatial dimension. 67 | x = x.mean(dim=-1).mean(dim=-1) 68 | 69 | init_pose = self.init_pose.expand(batch_size, -1) 70 | init_shape = self.init_shape.expand(batch_size, -1) 71 | init_cam = self.init_cam.expand(batch_size, -1) 72 | 73 | pred_pose = init_pose 74 | pred_shape = init_shape 75 | pred_cam = init_cam 76 | for _ in range(self.n_iter): 77 | xc = torch.cat([x, pred_pose, pred_shape, pred_cam], 1) 78 | xc = self.fc1(xc) 79 | xc = self.drop1(xc) 80 | xc = self.fc2(xc) 81 | xc = self.drop2(xc) 82 | pred_pose = self.decpose(xc) + pred_pose 83 | pred_shape = self.decshape(xc) + pred_shape 84 | pred_cam = self.deccam(xc) + pred_cam 85 | 86 | pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3) 87 | out = (pred_rotmat, pred_shape, pred_cam) 88 | return out 89 | 90 | def init_weights(self): 91 | """Initialize model weights.""" 92 | xavier_init(self.decpose, gain=0.01) 93 | xavier_init(self.decshape, gain=0.01) 94 | xavier_init(self.deccam, gain=0.01) 95 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/model_builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # from configs.coco.ViTPose_base_coco_256x192 import model 4 | from .heads.topdown_heatmap_simple_head import TopdownHeatmapSimpleHead 5 | 6 | # import TopdownHeatmapSimpleHead 7 | from .backbones import ViT 8 | 9 | # print(model) 10 | import torch 11 | from functools import partial 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from importlib import import_module 15 | 16 | 17 | def build_model(model_name, checkpoint=None): 18 | try: 19 | path = ".configs.coco." 
+ model_name 20 | mod = import_module(path, package="src.vitpose_infer") 21 | 22 | model = getattr(mod, "model") 23 | # from path import model 24 | except: 25 | raise ValueError("not a correct config") 26 | 27 | head = TopdownHeatmapSimpleHead( 28 | in_channels=model["keypoint_head"]["in_channels"], 29 | out_channels=model["keypoint_head"]["out_channels"], 30 | num_deconv_filters=model["keypoint_head"]["num_deconv_filters"], 31 | num_deconv_kernels=model["keypoint_head"]["num_deconv_kernels"], 32 | num_deconv_layers=model["keypoint_head"]["num_deconv_layers"], 33 | extra=model["keypoint_head"]["extra"], 34 | ) 35 | # print(head) 36 | backbone = ViT( 37 | img_size=model["backbone"]["img_size"], 38 | patch_size=model["backbone"]["patch_size"], 39 | embed_dim=model["backbone"]["embed_dim"], 40 | depth=model["backbone"]["depth"], 41 | num_heads=model["backbone"]["num_heads"], 42 | ratio=model["backbone"]["ratio"], 43 | mlp_ratio=model["backbone"]["mlp_ratio"], 44 | qkv_bias=model["backbone"]["qkv_bias"], 45 | drop_path_rate=model["backbone"]["drop_path_rate"], 46 | ) 47 | 48 | class VitPoseModel(nn.Module): 49 | def __init__(self, backbone, keypoint_head): 50 | super(VitPoseModel, self).__init__() 51 | self.backbone = backbone 52 | self.keypoint_head = keypoint_head 53 | 54 | def forward(self, x): 55 | x = self.backbone(x) 56 | x = self.keypoint_head(x) 57 | return x 58 | 59 | pose = VitPoseModel(backbone, head) 60 | if checkpoint is not None: 61 | check = torch.load(checkpoint) 62 | 63 | pose.load_state_dict(check["state_dict"]) 64 | return pose 65 | 66 | 67 | # pose = build_model('ViTPose_base_coco_256x192','./models/vitpose-b-multi-coco.pth') 68 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/__init__.py -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/convert_to_trt.py: -------------------------------------------------------------------------------- 1 | from torch2trt import TRTModule,torch2trt 2 | from builder import build_model 3 | import torch 4 | pose = build_model('ViTPose_base_coco_256x192','./models/vitpose-b.pth') 5 | pose.cuda().eval() 6 | 7 | x = torch.ones(1,3,256,192).cuda() 8 | net_trt = torch2trt(pose, [x],max_batch_size=10, fp16_mode=True) 9 | torch.save(net_trt.state_dict(), 'vitpose_trt.pth') -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/inference_test.py: -------------------------------------------------------------------------------- 1 | from builder import build_model 2 | import torch 3 | from ViTPose_trt import TRTModule_ViTPose 4 | # pose = TRTModule_ViTPose(path='pose_higher_hrnet_w32_512.engine',device='cuda:0') 5 | pose = build_model('ViTPose_base_coco_256x192','./models/vitpose-b.pth') 6 | pose.cuda().eval() 7 | if pose.training: 8 | print('train') 9 | else: 10 | print('eval') 11 | device = torch.device("cuda") 12 | # pose.to(device) 13 | dummy_input = torch.randn(10, 3,256,192, dtype=torch.float).to(device) 14 | repetitions=100 15 | total_time = 0 16 | starter, ender = 
torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) 17 | with torch.no_grad(): 18 | for rep in range(repetitions): 19 | # starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) 20 | starter.record() 21 | # for k in range(10): 22 | _ = pose(dummy_input) 23 | ender.record() 24 | torch.cuda.synchronize() 25 | curr_time = starter.elapsed_time(ender)/1000 26 | total_time += curr_time 27 | Throughput = repetitions*10/total_time 28 | print('Final Throughput:',Throughput) 29 | print('Total time',total_time) -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/logger_helper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | class CustomFormatter(logging.Formatter): 4 | 5 | grey = "\x1b[38;20m" 6 | yellow = "\x1b[33;20m" 7 | red = "\x1b[31;20m" 8 | bold_red = "\x1b[31;1m" 9 | reset = "\x1b[0m" 10 | format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)" 11 | 12 | FORMATS = { 13 | logging.DEBUG: grey + format + reset, 14 | logging.INFO: grey + format + reset, 15 | logging.WARNING: yellow + format + reset, 16 | logging.ERROR: red + format + reset, 17 | logging.CRITICAL: bold_red + format + reset 18 | } 19 | 20 | def format(self, record): 21 | log_fmt = self.FORMATS.get(record.levelno) 22 | formatter = logging.Formatter(log_fmt) 23 | return formatter.format(record) -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/timerr.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class Timer(object): 5 | """A simple timer.""" 6 | def __init__(self): 7 | self.total_time = 0. 8 | self.calls = 0 9 | self.start_time = 0. 10 | self.diff = 0. 11 | self.average_time = 0. 12 | 13 | self.duration = 0. 14 | 15 | def tic(self): 16 | # using time.time instead of time.clock because time time.clock 17 | # does not normalize for multithreading 18 | self.start_time = time.time() 19 | 20 | def toc(self, average=True): 21 | self.diff = time.time() - self.start_time 22 | self.total_time += self.diff 23 | self.calls += 1 24 | self.average_time = self.total_time / self.calls 25 | if average: 26 | self.duration = self.average_time 27 | else: 28 | self.duration = self.diff 29 | return self.duration 30 | 31 | def clear(self): 32 | self.total_time = 0. 33 | self.calls = 0 34 | self.start_time = 0. 35 | self.diff = 0. 36 | self.average_time = 0. 37 | self.duration = 0. 
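# Usage sketch (illustrative only; `do_work` is a hypothetical placeholder, not part of this module):
#   timer = Timer()
#   for _ in range(10):
#       timer.tic()
#       do_work()
#       timer.toc()                  # average=True (default): returns the running average in seconds
#   print(timer.average_time)        # mean seconds per tic/toc pair across the 10 iterations
#   # pass average=False to toc() to get the duration of the current interval instead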
-------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/pylogger.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | import logging 3 | import torch 4 | from colorlog import ColoredFormatter 5 | 6 | 7 | def sync_time(): 8 | torch.cuda.synchronize() 9 | return time() 10 | 11 | 12 | Log = logging.getLogger() 13 | Log.time = time 14 | Log.sync_time = sync_time 15 | 16 | # Set default 17 | Log.setLevel(logging.INFO) 18 | ch = logging.StreamHandler() 19 | ch.setLevel(logging.INFO) 20 | # Use colorlog 21 | formatstring = "[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] %(message)s" 22 | datefmt = "%m/%d %H:%M:%S" 23 | ch.setFormatter(ColoredFormatter(formatstring, datefmt=datefmt)) 24 | 25 | Log.addHandler(ch) 26 | # Log.info("Init-Logger") 27 | 28 | 29 | def timer(sync_cuda=False, mem=False, loop=1): 30 | """ 31 | Args: 32 | func: function 33 | sync_cuda: bool, whether to synchronize cuda 34 | mem: bool, whether to log memory 35 | """ 36 | 37 | def decorator(func): 38 | def wrapper(*args, **kwargs): 39 | if mem: 40 | start_mem = torch.cuda.memory_allocated() / 1024**2 41 | if sync_cuda: 42 | torch.cuda.synchronize() 43 | 44 | start = Log.time() 45 | for _ in range(loop): 46 | result = func(*args, **kwargs) 47 | 48 | if sync_cuda: 49 | torch.cuda.synchronize() 50 | if loop == 1: 51 | message = f"{func.__name__} took {Log.time() - start:.3f} s." 52 | else: 53 | message = f"{func.__name__} took {((Log.time() - start))/loop:.3f} s. (loop={loop})" 54 | 55 | if mem: 56 | end_mem = torch.cuda.memory_allocated() / 1024**2 57 | end_max_mem = torch.cuda.max_memory_allocated() / 1024**2 58 | message += f" Start_Mem {start_mem:.1f} Max {end_max_mem:.1f} MB" 59 | Log.info(message) 60 | 61 | return result 62 | 63 | return wrapper 64 | 65 | return decorator 66 | 67 | 68 | def timed(fn): 69 | """example usage: timed(lambda: model(inp))""" 70 | start = torch.cuda.Event(enable_timing=True) 71 | end = torch.cuda.Event(enable_timing=True) 72 | start.record() 73 | result = fn() 74 | end.record() 75 | torch.cuda.synchronize() 76 | return result, start.elapsed_time(end) / 1000 77 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/vis/README.md: -------------------------------------------------------------------------------- 1 | ## Pytorch3D Renderer 2 | 3 | Example: 4 | ```python 5 | from hmr4d.utils.vis.renderer import Renderer 6 | import imageio 7 | 8 | fps = 30 9 | focal_length = data["cam_int"][0][0, 0] 10 | width, height = img_hw 11 | faces = smplh[data["gender"]].bm.faces 12 | renderer = Renderer(width, height, focal_length, "cuda", faces) 13 | writer = imageio.get_writer("tmp_debug.mp4", fps=fps, mode="I", format="FFMPEG", macro_block_size=1) 14 | 15 | for i in tqdm(range(length)): 16 | img = np.zeros((height, width, 3), dtype=np.uint8) 17 | img = renderer.render_mesh(smplh_out.vertices[i].cuda(), img) 18 | writer.append_data(img) 19 | writer.close() 20 | ``` -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/vis/renderer_utils.py: -------------------------------------------------------------------------------- 1 | from hmr4d.utils.vis.renderer import Renderer 2 | from tqdm import tqdm 3 | import numpy as np 4 | 5 | 6 | def simple_render_mesh(render_dict): 7 | """Render an camera-space mesh, blank background""" 8 | width, height, focal_length = render_dict["whf"] 9 | faces 
= render_dict["faces"] 10 | verts = render_dict["verts"] 11 | 12 | renderer = Renderer(width, height, focal_length, device="cuda", faces=faces) 13 | outputs = [] 14 | for i in tqdm(range(len(verts)), desc=f"Rendering"): 15 | img = renderer.render_mesh(verts[i].cuda(), colors=[0.8, 0.8, 0.8]) 16 | outputs.append(img) 17 | outputs = np.stack(outputs, axis=0) 18 | return outputs 19 | 20 | 21 | def simple_render_mesh_background(render_dict, VI=50, colors=[0.8, 0.8, 0.8]): 22 | """Render a camera-space mesh composited over the provided background frames""" 23 | K = render_dict["K"] 24 | faces = render_dict["faces"] 25 | verts = render_dict["verts"] 26 | background = render_dict["background"] 27 | N_frames = len(verts) 28 | if len(background.shape) == 3: 29 | background = [background] * N_frames 30 | height, width = background[0].shape[:2] 31 | 32 | renderer = Renderer(width, height, device="cuda", faces=faces, K=K) 33 | outputs = [] 34 | for i in tqdm(range(len(verts)), desc=f"Rendering"): 35 | img = renderer.render_mesh(verts[i].cuda(), colors=colors, background=background[i], VI=VI) 36 | outputs.append(img) 37 | outputs = np.stack(outputs, axis=0) 38 | return outputs 39 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/vis/rich_logger.py: -------------------------------------------------------------------------------- 1 | from pytorch_lightning.utilities import rank_zero_only 2 | from omegaconf import DictConfig, OmegaConf 3 | import rich 4 | import rich.tree 5 | import rich.syntax 6 | from hmr4d.utils.pylogger import Log 7 | 8 | 9 | @rank_zero_only 10 | def print_cfg(cfg: DictConfig, use_rich: bool = False): 11 | if use_rich: 12 | print_order = ("data", "model", "callbacks", "logger", "pl_trainer") 13 | style = "dim" 14 | tree = rich.tree.Tree("CONFIG", style=style, guide_style=style) 15 | 16 | # add fields from `print_order` to queue 17 | # add all the other fields to queue (not specified in `print_order`) 18 | queue = [] 19 | for field in print_order: 20 | queue.append(field) if field in cfg else Log.warn(f"Field '{field}' not found in config. 
Skipping.") 21 | for field in cfg: 22 | if field not in queue: 23 | queue.append(field) 24 | 25 | # generate config tree from queue 26 | for field in queue: 27 | branch = tree.add(field, style=style, guide_style=style) 28 | config_group = cfg[field] 29 | if isinstance(config_group, DictConfig): 30 | branch_content = OmegaConf.to_yaml(config_group, resolve=False) 31 | else: 32 | branch_content = str(config_group) 33 | branch.add(rich.syntax.Syntax(branch_content, "yaml")) 34 | rich.print(tree) 35 | else: 36 | Log.info(OmegaConf.to_yaml(cfg, resolve=False)) 37 | -------------------------------------------------------------------------------- /eval/GVHMR/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | include = '\.pyi?$' 4 | exclude = ''' 5 | /( 6 | \.git 7 | | \.hg 8 | | \.mypy_cache 9 | | \.tox 10 | | \.venv 11 | | _build 12 | | buck-out 13 | | build 14 | | dist 15 | )/ 16 | ''' 17 | -------------------------------------------------------------------------------- /eval/GVHMR/pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "exclude": [ 3 | "./inputs", 4 | "./outputs" 5 | ], 6 | "typeCheckingMode": "off", 7 | } 8 | -------------------------------------------------------------------------------- /eval/GVHMR/requirements.txt: -------------------------------------------------------------------------------- 1 | # PyTorch 2 | --extra-index-url https://download.pytorch.org/whl/cu121 3 | torch==2.3.0+cu121 4 | torchvision==0.18.0+cu121 5 | timm==0.9.12 # For HMR2.0a feature extraction 6 | 7 | # Lightning + Hydra 8 | lightning==2.3.0 9 | hydra-core==1.3 10 | hydra-zen 11 | hydra_colorlog 12 | rich 13 | 14 | # Common utilities 15 | numpy==1.23.5 16 | jupyter 17 | matplotlib 18 | ipdb 19 | setuptools>=68.0 20 | black 21 | tensorboardX 22 | opencv-python 23 | ffmpeg-python 24 | scikit-image 25 | termcolor 26 | einops 27 | imageio==2.34.1 28 | av # imageio[pyav], improved performance over imageio[ffmpeg] 29 | joblib 30 | 31 | # Diffusion 32 | # diffusers[torch]==0.19.3 33 | # transformers==4.31.0 34 | 35 | # 3D-Vision 36 | pytorch3d @ https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt230/pytorch3d-0.7.6-cp310-cp310-linux_x86_64.whl 37 | trimesh 38 | chumpy 39 | smplx 40 | # open3d==0.17.0 41 | wis3d 42 | 43 | # 2D-Pose 44 | ultralytics==8.2.42 # YOLO 45 | cython_bbox 46 | lapx -------------------------------------------------------------------------------- /eval/GVHMR/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name="gvhmr", 6 | version="1.0.0", 7 | packages=find_packages(), 8 | author="Zehong Shen", 9 | description=["GVHMR training and inference"], 10 | url="https://github.com/zju3dv/GVHMR", 11 | ) 12 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/demo/demo_folder.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | from tqdm import tqdm 4 | from hmr4d.utils.pylogger import Log 5 | import subprocess 6 | import os 7 | 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("-f", "--folder", type=str) 12 | parser.add_argument("-d", "--output_root", type=str, default=None) 13 | parser.add_argument("-s", "--static_cam", 
action="store_true", help="If true, skip DPVO") 14 | args = parser.parse_args() 15 | 16 | output_root = args.output_root 17 | 18 | sub_folders = os.listdir(args.folder) 19 | mp4_paths = [] 20 | for sub_folder in sub_folders: 21 | files = os.listdir(os.path.join(args.folder, sub_folder)) 22 | for file in files: 23 | if file.endswith('.mp4'): 24 | mp4_path = os.path.join(args.folder, sub_folder, file) 25 | mp4_paths.append(mp4_path) 26 | 27 | # Run demo.py for each .mp4 file 28 | Log.info(f"Found {len(mp4_paths)} .mp4 files in {args.folder}") 29 | for mp4_path in tqdm(mp4_paths): 30 | try: 31 | command = ["python", "tools/demo/demo.py", "--video", str(mp4_path)] 32 | if output_root is not None: 33 | command += ["--output_root", output_root] 34 | if args.static_cam: 35 | command += ["-s"] 36 | Log.info(f"Running: {' '.join(command)}") 37 | subprocess.run(command, env=dict(os.environ), check=True) 38 | except: 39 | continue 40 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/train.py: -------------------------------------------------------------------------------- 1 | import hydra 2 | import pytorch_lightning as pl 3 | from omegaconf import DictConfig, OmegaConf 4 | from pytorch_lightning.callbacks.checkpoint import Checkpoint 5 | 6 | from hmr4d.utils.pylogger import Log 7 | from hmr4d.configs import register_store_gvhmr 8 | from hmr4d.utils.vis.rich_logger import print_cfg 9 | from hmr4d.utils.net_utils import load_pretrained_model, get_resume_ckpt_path 10 | 11 | 12 | def get_callbacks(cfg: DictConfig) -> list: 13 | """Parse and instantiate all the callbacks in the config.""" 14 | if not hasattr(cfg, "callbacks") or cfg.callbacks is None: 15 | return None 16 | # Handle special callbacks 17 | enable_checkpointing = cfg.pl_trainer.get("enable_checkpointing", True) 18 | # Instantiate all the callbacks 19 | callbacks = [] 20 | for callback in cfg.callbacks.values(): 21 | if callback is not None: 22 | cb = hydra.utils.instantiate(callback, _recursive_=False) 23 | # skip when disable checkpointing and the callback is Checkpoint 24 | if not enable_checkpointing and isinstance(cb, Checkpoint): 25 | continue 26 | else: 27 | callbacks.append(cb) 28 | return callbacks 29 | 30 | 31 | def train(cfg: DictConfig) -> None: 32 | """Train/Test""" 33 | Log.info(f"[Exp Name]: {cfg.exp_name}") 34 | if cfg.task == "fit": 35 | Log.info(f"[GPU x Batch] = {cfg.pl_trainer.devices} x {cfg.data.loader_opts.train.batch_size}") 36 | pl.seed_everything(cfg.seed) 37 | 38 | # preparation 39 | datamodule: pl.LightningDataModule = hydra.utils.instantiate(cfg.data, _recursive_=False) 40 | model: pl.LightningModule = hydra.utils.instantiate(cfg.model, _recursive_=False) 41 | if cfg.ckpt_path is not None: 42 | load_pretrained_model(model, cfg.ckpt_path) 43 | 44 | # PL callbacks and logger 45 | callbacks = get_callbacks(cfg) 46 | has_ckpt_cb = any([isinstance(cb, Checkpoint) for cb in callbacks]) 47 | if not has_ckpt_cb and cfg.pl_trainer.get("enable_checkpointing", True): 48 | Log.warning("No checkpoint-callback found. 
Disabling PL auto checkpointing.") 49 | cfg.pl_trainer = {**cfg.pl_trainer, "enable_checkpointing": False} 50 | logger = hydra.utils.instantiate(cfg.logger, _recursive_=False) 51 | 52 | # PL-Trainer 53 | if cfg.task == "test": 54 | Log.info("Test mode forces full-precision.") 55 | cfg.pl_trainer = {**cfg.pl_trainer, "precision": 32} 56 | trainer = pl.Trainer( 57 | accelerator="gpu", 58 | logger=logger if logger is not None else False, 59 | callbacks=callbacks, 60 | **cfg.pl_trainer, 61 | ) 62 | 63 | if cfg.task == "fit": 64 | resume_path = None 65 | if cfg.resume_mode is not None: 66 | resume_path = get_resume_ckpt_path(cfg.resume_mode, ckpt_dir=cfg.callbacks.model_checkpoint.dirpath) 67 | Log.info(f"Resume training from {resume_path}") 68 | Log.info("Start Fitting...") 69 | trainer.fit(model, datamodule.train_dataloader(), datamodule.val_dataloader(), ckpt_path=resume_path) 70 | elif cfg.task == "test": 71 | Log.info("Start Testing...") 72 | trainer.test(model, datamodule.test_dataloader()) 73 | else: 74 | raise ValueError(f"Unknown task: {cfg.task}") 75 | 76 | Log.info("End of script.") 77 | 78 | 79 | @hydra.main(version_base="1.3", config_path="../hmr4d/configs", config_name="train") 80 | def main(cfg) -> None: 81 | print_cfg(cfg, use_rich=True) 82 | train(cfg) 83 | 84 | 85 | if __name__ == "__main__": 86 | register_store_gvhmr() 87 | main() 88 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/unitest/make_hydra_cfg.py: -------------------------------------------------------------------------------- 1 | from hmr4d.configs import parse_args_to_cfg, register_store_gvhmr 2 | from hmr4d.utils.vis.rich_logger import print_cfg 3 | 4 | if __name__ == "__main__": 5 | register_store_gvhmr() 6 | cfg = parse_args_to_cfg() 7 | print_cfg(cfg, use_rich=True) 8 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/unitest/run_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | from tqdm import tqdm 4 | 5 | 6 | def get_dataset(DATA_TYPE): 7 | if DATA_TYPE == "BEDLAM_V2": 8 | from hmr4d.dataset.bedlam.bedlam import BedlamDatasetV2 9 | 10 | return BedlamDatasetV2() 11 | 12 | if DATA_TYPE == "3DPW_TRAIN": 13 | from hmr4d.dataset.threedpw.threedpw_motion_train import ThreedpwSmplDataset 14 | 15 | return ThreedpwSmplDataset() 16 | 17 | if __name__ == "__main__": 18 | DATA_TYPE = "3DPW_TRAIN" 19 | dataset = get_dataset(DATA_TYPE) 20 | print(len(dataset)) 21 | 22 | data = dataset[0] 23 | 24 | from hmr4d.datamodule.mocap_trainX_testY import collate_fn 25 | 26 | loader = DataLoader( 27 | dataset, 28 | shuffle=False, 29 | num_workers=0, 30 | persistent_workers=False, 31 | pin_memory=False, 32 | batch_size=1, 33 | collate_fn=collate_fn, 34 | ) 35 | i = 0 36 | for batch in tqdm(loader): 37 | i += 1 38 | # if i == 20: 39 | # raise AssertionError 40 | # time.sleep(0.2) 41 | pass 42 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/video/merge_folder.py: -------------------------------------------------------------------------------- 1 | """This script globs two folders, checks that their mp4 files match one-to-one, then calls merge_horizontal.py (or merge_vertical.py with --vertical) to merge them pair by pair""" 2 | 3 | import os 4 | import argparse 5 | from pathlib import Path 6 | 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("input_dir1", type=str) 11 | 
parser.add_argument("input_dir2", type=str) 12 | parser.add_argument("output_dir", type=str) 13 | parser.add_argument("--vertical", action="store_true") # By default use horizontal 14 | args = parser.parse_args() 15 | 16 | # Check input 17 | input_dir1 = Path(args.input_dir1) 18 | input_dir2 = Path(args.input_dir2) 19 | assert input_dir1.exists() 20 | assert input_dir2.exists() 21 | video_paths1 = sorted(input_dir1.glob("*.mp4")) 22 | video_paths2 = sorted(input_dir2.glob("*.mp4")) 23 | assert len(video_paths1) == len(video_paths2) 24 | for path1, path2 in zip(video_paths1, video_paths2): 25 | assert path1.stem == path2.stem 26 | 27 | # Merge to output 28 | output_dir = Path(args.output_dir) 29 | output_dir.mkdir(parents=True, exist_ok=True) 30 | 31 | for path1, path2 in zip(video_paths1, video_paths2): 32 | out_path = output_dir / f"{path1.stem}.mp4" 33 | in_paths = [str(path1), str(path2)] 34 | print(f"Merging {in_paths} to {out_path}") 35 | if args.vertical: 36 | os.system(f"python tools/video/merge_vertical.py {' '.join(in_paths)} -o {out_path}") 37 | else: 38 | os.system(f"python tools/video/merge_horizontal.py {' '.join(in_paths)} -o {out_path}") 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/video/merge_horizontal.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from hmr4d.utils.video_io_utils import merge_videos_horizontal 3 | 4 | 5 | def parse_args(): 6 | """python tools/video/merge_horizontal.py a.mp4 b.mp4 c.mp4 -o out.mp4""" 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("input_videos", nargs="+", help="Input video paths") 9 | parser.add_argument("-o", "--output", type=str, required=True, help="Output video path") 10 | return parser.parse_args() 11 | 12 | 13 | if __name__ == "__main__": 14 | args = parse_args() 15 | merge_videos_horizontal(args.input_videos, args.output) 16 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/video/merge_vertical.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from hmr4d.utils.video_io_utils import merge_videos_vertical 3 | 4 | 5 | def parse_args(): 6 | """python tools/video/merge_vertical.py a.mp4 b.mp4 c.mp4 -o out.mp4""" 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("input_videos", nargs="+", help="Input video paths") 9 | parser.add_argument("-o", "--output", type=str, required=True, help="Output video path") 10 | return parser.parse_args() 11 | 12 | 13 | if __name__ == "__main__": 14 | args = parse_args() 15 | merge_videos_vertical(args.input_videos, args.output) 16 | -------------------------------------------------------------------------------- /eval/common_metrics_on_video_quality/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ -------------------------------------------------------------------------------- /eval/common_metrics_on_video_quality/calculate_clip.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from PIL import Image 3 | import torch 4 | from transformers import CLIPProcessor, CLIPModel 5 | import json 6 | import os 7 | from tqdm import tqdm 8 | import torch 9 | import clip 10 | from PIL import Image 11 | import cv2 12 | import numpy as np 13 | import os 14 | import argparse 15 | 16 | device = "cuda" if 
torch.cuda.is_available() else "cpu" 17 | 18 | model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device) 19 | processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") 20 | 21 | def get_video_scores(video_path, prompt): 22 | video = cv2.VideoCapture(video_path) 23 | texts = [prompt] 24 | clip_score_list = [] 25 | while True: 26 | ret, frame = video.read() 27 | 28 | if ret: 29 | image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 30 | inputs = processor(text=texts, images=[image], return_tensors="pt", padding=True, truncation=True).to(device) 31 | logits_per_image = model(**inputs).logits_per_image 32 | clip_score = logits_per_image.item() 33 | clip_score_list.append(clip_score) 34 | else: 35 | break 36 | 37 | video.release() 38 | return sum(clip_score_list) / len(clip_score_list) 39 | 40 | 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("-v_f", "--videos_folder", type=str) 43 | args = parser.parse_args() 44 | 45 | videos_folder_path = args.videos_folder 46 | prompts_path = '/ytech_m2v2_hdd/fuxiao/scenectrl/common_metrics_on_video_quality/eval_prompts.json' 47 | with open(prompts_path, "r", encoding="utf-8") as f: prompts_dict = json.load(f) 48 | 49 | sub_folders = os.listdir(videos_folder_path) 50 | videos_name = [] 51 | for sub_folder in sub_folders: 52 | files = os.listdir(os.path.join(videos_folder_path, sub_folder)) 53 | for file in files: 54 | if file.endswith('.mp4'): 55 | video_name = os.path.join(sub_folder, file) 56 | videos_name.append(video_name) 57 | 58 | num_videos = len(videos_name) 59 | 60 | prompts = [] 61 | video_paths = [] 62 | for video_name in videos_name: 63 | prompt = prompts_dict[video_name.split('/')[0]] 64 | video_path = os.path.join(videos_folder_path, video_name) 65 | prompts.append(prompt) 66 | video_paths.append(video_path) 67 | 68 | import csv 69 | CLIP_T = True 70 | if CLIP_T: 71 | scores = [] 72 | for i in tqdm(range(num_videos)): 73 | # Load the video 74 | video_path = video_paths[i] 75 | 76 | # Prepare the text prompt 77 | texts = prompts[i] 78 | score = get_video_scores(video_path, texts) 79 | scores.append(score) 80 | 81 | print(f"CLIP-SIM: {sum(scores)/len(scores)/100.}") 82 | #### CLIP-T #### 83 | # basemodel: 33.44 -------------------------------------------------------------------------------- /eval/common_metrics_on_video_quality/calculate_fvd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from tqdm import tqdm 4 | 5 | def trans(x): 6 | # if greyscale images add channel 7 | if x.shape[-3] == 1: 8 | x = x.repeat(1, 1, 3, 1, 1) 9 | 10 | # permute BTCHW -> BCTHW 11 | x = x.permute(0, 2, 1, 3, 4) 12 | 13 | return x 14 | 15 | def calculate_fvd(videos1, videos2, device, method='styleganv'): 16 | 17 | if method == 'styleganv': 18 | from fvd.styleganv.fvd import get_fvd_feats, frechet_distance, load_i3d_pretrained 19 | elif method == 'videogpt': 20 | from fvd.videogpt.fvd import load_i3d_pretrained 21 | from fvd.videogpt.fvd import get_fvd_logits as get_fvd_feats 22 | from fvd.videogpt.fvd import frechet_distance 23 | 24 | print("calculate_fvd...") 25 | 26 | # videos [batch_size, timestamps, channel, h, w] 27 | 28 | assert videos1.shape == videos2.shape 29 | 30 | i3d = load_i3d_pretrained(device=device) 31 | fvd_results = [] 32 | 33 | # support grayscale input, if grayscale -> channel*3 34 | # BTCHW -> BCTHW 35 | # videos -> [batch_size, channel, timestamps, h, w] 36 | 37 | videos1 = trans(videos1) 38 | videos2 = trans(videos2) 39 | 40 | 
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/calculate_lpips.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
from tqdm import tqdm

import lpips

spatial = True  # Return a spatial map of perceptual distance.

# Linearly calibrated models (LPIPS)
loss_fn = lpips.LPIPS(net='alex', spatial=spatial)  # Can also set net = 'squeeze' or 'vgg'
# loss_fn = lpips.LPIPS(net='alex', spatial=spatial, lpips=False)  # Can also set net = 'squeeze' or 'vgg'


def trans(x):
    # if greyscale images, add a channel dimension
    if x.shape[-3] == 1:
        x = x.repeat(1, 1, 3, 1, 1)

    # value range [0, 1] -> [-1, 1]
    x = x * 2 - 1

    return x


def calculate_lpips(videos1, videos2, device):
    # images should be RGB and, IMPORTANT, normalized to [-1, 1]
    print("calculate_lpips...")

    assert videos1.shape == videos2.shape

    # videos [batch_size, timestamps, channel, h, w]

    # support grayscale input: if grayscale -> channel*3
    # value range [0, 1] -> [-1, 1]
    videos1 = trans(videos1)
    videos2 = trans(videos2)

    lpips_results = []

    for video_num in tqdm(range(videos1.shape[0])):
        # get a video
        # video [timestamps, channel, h, w]
        video1 = videos1[video_num]
        video2 = videos2[video_num]

        lpips_results_of_a_video = []
        for clip_timestamp in range(len(video1)):
            # get an image
            # img [channel, h, w] tensor
            img1 = video1[clip_timestamp].unsqueeze(0).to(device)
            img2 = video2[clip_timestamp].unsqueeze(0).to(device)

            loss_fn.to(device)

            # LPIPS between the two frames at this timestamp
            lpips_results_of_a_video.append(loss_fn.forward(img1, img2).mean().detach().cpu().tolist())
        lpips_results.append(lpips_results_of_a_video)

    lpips_results = np.array(lpips_results)

    lpips_mean = {}
    lpips_std = {}

    # per-timestamp mean/std over the batch
    for clip_timestamp in range(len(video1)):
        lpips_mean[clip_timestamp] = np.mean(lpips_results[:, clip_timestamp])
        lpips_std[clip_timestamp] = np.std(lpips_results[:, clip_timestamp])

    result = {
        "value": lpips_mean,
        "value_std": lpips_std,
        "video_setting": video1.shape,
        "video_setting_name": "time, channel, height, width",
    }

    return result


# test code / usage example

def main():
    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.ones(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    device = torch.device("cuda")
    # device = torch.device("cpu")

    import json
    result = calculate_lpips(videos1, videos2, device)
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()
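The result above is per-timestamp: result["value"][t] is the mean LPIPS over the batch at frame t. To report a single LPIPS number, average over frames as well. A small sketch, assuming calculate_lpips from the file above is in scope and the inputs are BTCHW tensors already scaled to [0, 1]:

import numpy as np
import torch

# placeholder batch: 2 videos, 8 frames each, 3x64x64, values in [0, 1]
videos1 = torch.rand(2, 8, 3, 64, 64)
videos2 = torch.rand(2, 8, 3, 64, 64)

result = calculate_lpips(videos1, videos2, torch.device("cpu"))
overall = float(np.mean(list(result["value"].values())))
print(f"LPIPS averaged over batch and frames: {overall:.4f}")
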
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/calculate_psnr.py:
--------------------------------------------------------------------------------
import math

import numpy as np
import torch
from tqdm import tqdm


def img_psnr(img1, img2):
    # images are expected in [0, 1], so the peak signal is 1
    # compute mse
    # mse = np.mean((img1-img2)**2)
    mse = np.mean((img1 / 1.0 - img2 / 1.0) ** 2)
    # compute psnr
    if mse < 1e-10:
        return 100
    psnr = 20 * math.log10(1 / math.sqrt(mse))
    return psnr


def trans(x):
    return x


def calculate_psnr(videos1, videos2):
    print("calculate_psnr...")

    # videos [batch_size, timestamps, channel, h, w]
    assert videos1.shape == videos2.shape

    videos1 = trans(videos1)
    videos2 = trans(videos2)

    psnr_results = []

    for video_num in tqdm(range(videos1.shape[0])):
        # get a video
        # video [timestamps, channel, h, w]
        video1 = videos1[video_num]
        video2 = videos2[video_num]

        psnr_results_of_a_video = []
        for clip_timestamp in range(len(video1)):
            # get an image
            # img [channel, h, w] numpy
            img1 = video1[clip_timestamp].numpy()
            img2 = video2[clip_timestamp].numpy()

            # PSNR between the two frames at this timestamp
            psnr_results_of_a_video.append(img_psnr(img1, img2))

        psnr_results.append(psnr_results_of_a_video)

    psnr_results = np.array(psnr_results)

    psnr = {}
    psnr_std = {}

    # per-timestamp mean/std over the batch
    for clip_timestamp in range(len(video1)):
        psnr[clip_timestamp] = np.mean(psnr_results[:, clip_timestamp])
        psnr_std[clip_timestamp] = np.std(psnr_results[:, clip_timestamp])

    result = {
        "value": psnr,
        "value_std": psnr_std,
        "video_setting": video1.shape,
        "video_setting_name": "time, channel, height, width",
    }

    return result


# test code / usage example

def main():
    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)

    import json
    result = calculate_psnr(videos1, videos2)
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()
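img_psnr above takes the peak signal to be 1, so frames must be in [0, 1]; uint8 frames passed in directly would shift the result by 20*log10(255), roughly 48 dB. A small sketch of preparing a uint8 frame pair (placeholder random arrays) before calling img_psnr from the file above:

import numpy as np

frame1_u8 = np.random.randint(0, 256, (3, 64, 64), dtype=np.uint8)  # placeholder frame
frame2_u8 = np.random.randint(0, 256, (3, 64, 64), dtype=np.uint8)  # placeholder frame

# rescale to [0, 1] before computing PSNR
frame1 = frame1_u8.astype(np.float64) / 255.0
frame2 = frame2_u8.astype(np.float64) / 255.0
print(img_psnr(frame1, frame2))
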
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/download_eval_visual.sh:
--------------------------------------------------------------------------------
gdown https://drive.google.com/uc\?id\=1U2hd6qvwKLfp7c8yGgcTqdqrP_lKJElB
gdown https://drive.google.com/uc\?id\=1jMH2-ZC0ZBgtqej5Sp-E5ebBIX7mk3Xz
gdown https://drive.google.com/uc\?id\=1kfdCDA5koYh9g3IkCCHb4XPch2CJAwek

unzip fvd.zip
unzip eval_sets.zip
unzip base_t2v_eval_sets.zip

mv eval_sets eval_folder/
mv base_t2v_eval_sets eval_folder/

rm -rf *.zip
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/eval_visual.sh:
--------------------------------------------------------------------------------
basedir=eval_folder
folder1_path=${basedir}/base_t2v_eval_sets
folder2_path=${basedir}/eval_sets

# calculate FVD
python calculate_fvd_styleganv.py -v1_f ${folder1_path} -v2_f ${folder2_path}

# calculate FID
python -m pytorch_fid ${basedir}/eval_1 ${basedir}/eval_2

# calculate CLIP-SIM
python calculate_clip.py -v_f ${folder2_path}

rm -rf ${basedir}/eval_1
rm -rf ${basedir}/eval_2
--------------------------------------------------------------------------------
/imgs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/imgs/logo.png
--------------------------------------------------------------------------------
/imgs/vis_objstraj.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/imgs/vis_objstraj.png
--------------------------------------------------------------------------------