├── CogVideo
├── .github
│ ├── ISSUE_TEMPLATE
│ │ ├── bug_report.yaml
│ │ └── feature-request.yaml
│ └── PULL_REQUEST_TEMPLATE
│ │ └── pr_template.md
├── .gitignore
├── LICENSE
├── MODEL_LICENSE
├── README.md
├── README_ja.md
├── README_zh.md
├── download.sh
├── finetune
│ ├── README.md
│ ├── README_ja.md
│ ├── README_zh.md
│ ├── accelerate_config_machine_single.yaml
│ ├── accelerate_config_machine_single_debug.yaml
│ ├── finetune_single_rank_injector.sh
│ ├── finetune_single_rank_lora.sh
│ ├── hostfile.txt
│ ├── models
│ │ ├── attention.py
│ │ ├── attention_processor.py
│ │ ├── cogvideox_transformer_3d.py
│ │ ├── embeddings.py
│ │ ├── pipeline_cogvideox.py
│ │ ├── pipeline_output.py
│ │ └── utils.py
│ ├── train_cogvideox_injector.py
│ └── train_cogvideox_lora.py
├── inference
│ ├── 3dtrajmaster_inference.py
│ ├── entity_zoo.txt
│ ├── location_zoo.txt
│ ├── output_example
│ │ ├── 1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.mp4
│ │ ├── 1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.txt
│ │ ├── 1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.mp4
│ │ ├── 1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.txt
│ │ ├── 1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.mp4
│ │ ├── 1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.txt
│ │ ├── 1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.mp4
│ │ ├── 1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.txt
│ │ ├── 1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.mp4
│ │ ├── 1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.txt
│ │ ├── 1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.mp4
│ │ ├── 1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.txt
│ │ ├── 1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.mp4
│ │ ├── 1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.txt
│ │ ├── 1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.mp4
│ │ ├── 1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.txt
│ │ ├── 1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.mp4
│ │ ├── 1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.txt
│ │ ├── 1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.mp4
│ │ ├── 1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.txt
│ │ ├── 2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.mp4
│ │ ├── 2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.txt
│ │ ├── 2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.mp4
│ │ ├── 2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.txt
│ │ ├── 2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.mp4
│ │ ├── 2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.txt
│ │ ├── 2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.mp4
│ │ ├── 2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.txt
│ │ ├── 2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.mp4
│ │ ├── 2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.txt
│ │ ├── 2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.mp4
│ │ ├── 2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.txt
│ │ ├── 2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.mp4
│ │ ├── 2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.txt
│ │ ├── 2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.mp4
│ │ ├── 2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.txt
│ │ ├── 2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.mp4
│ │ ├── 2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.txt
│ │ ├── 2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.mp4
│ │ ├── 2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.txt
│ │ ├── 3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.mp4
│ │ ├── 3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.txt
│ │ ├── 3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.mp4
│ │ ├── 3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.txt
│ │ ├── 3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.mp4
│ │ ├── 3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.txt
│ │ ├── 3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.mp4
│ │ ├── 3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.txt
│ │ ├── 3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.mp4
│ │ ├── 3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.txt
│ │ ├── 3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.mp4
│ │ ├── 3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.txt
│ │ ├── 3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.mp4
│ │ ├── 3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.txt
│ │ ├── 3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.mp4
│ │ ├── 3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.txt
│ │ ├── 3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.mp4
│ │ ├── 3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.txt
│ │ ├── 3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.mp4
│ │ └── 3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.txt
│ └── test_sets.json
├── pyproject.toml
├── requirements.txt
├── tools
│ ├── caption
│ │ ├── README.md
│ │ ├── README_ja.md
│ │ ├── README_zh.md
│ │ ├── assests
│ │ │ ├── CogVLM2-Caption-example.png
│ │ │ └── cogvlm2-video-example.png
│ │ ├── requirements.txt
│ │ └── video_caption.py
│ ├── convert_weight_sat2hf.py
│ ├── export_sat_lora_weight.py
│ ├── llm_flux_cogvideox
│ │ ├── generate.sh
│ │ ├── gradio_page.py
│ │ └── llm_flux_cogvideox.py
│ ├── load_cogvideox_lora.py
│ ├── parallel_inference
│ │ ├── parallel_inference_xdit.py
│ │ └── run.sh
│ ├── replicate
│ │ ├── cog.yaml
│ │ ├── predict_i2v.py
│ │ └── predict_t2v.py
│ └── venhancer
│ │ ├── README.md
│ │ ├── README_ja.md
│ │ └── README_zh.md
└── weights
│ └── put weights here.txt
├── README.md
├── dataset
├── load_dataset.py
├── traj_vis
│ ├── D_loc1_61_t3n13_003d_Hemi12_1.json
│ ├── Hemi12_transforms.json
│ └── location_data_desert.json
├── utils.py
└── vis_trajectory.py
├── eval
├── GVHMR
│ ├── .gitignore
│ ├── .gitmodules
│ ├── LICENSE
│ ├── README.md
│ ├── docs
│ │ ├── INSTALL.md
│ │ └── example_video
│ │ │ ├── project_teaser.gif
│ │ │ └── tennis.mp4
│ ├── download_eval_pose.sh
│ ├── eval.sh
│ ├── hmr4d
│ │ ├── __init__.py
│ │ ├── build_gvhmr.py
│ │ ├── configs
│ │ │ ├── __init__.py
│ │ │ ├── data
│ │ │ │ └── mocap
│ │ │ │ │ ├── testY.yaml
│ │ │ │ │ └── trainX_testY.yaml
│ │ │ ├── demo.yaml
│ │ │ ├── exp
│ │ │ │ └── gvhmr
│ │ │ │ │ └── mixed
│ │ │ │ │ └── mixed.yaml
│ │ │ ├── global
│ │ │ │ ├── debug
│ │ │ │ │ ├── debug_train.yaml
│ │ │ │ │ └── debug_train_limit_data.yaml
│ │ │ │ └── task
│ │ │ │ │ └── gvhmr
│ │ │ │ │ ├── test_3dpw.yaml
│ │ │ │ │ ├── test_3dpw_emdb_rich.yaml
│ │ │ │ │ ├── test_emdb.yaml
│ │ │ │ │ └── test_rich.yaml
│ │ │ ├── hydra
│ │ │ │ └── default.yaml
│ │ │ ├── siga24_release.yaml
│ │ │ ├── store_gvhmr.py
│ │ │ └── train.yaml
│ │ ├── datamodule
│ │ │ └── mocap_trainX_testY.py
│ │ ├── dataset
│ │ │ ├── bedlam
│ │ │ │ ├── bedlam.py
│ │ │ │ ├── resource
│ │ │ │ │ └── vname2lwh.pt
│ │ │ │ └── utils.py
│ │ │ ├── emdb
│ │ │ │ ├── emdb_motion_test.py
│ │ │ │ └── utils.py
│ │ │ ├── h36m
│ │ │ │ ├── camera-parameters.json
│ │ │ │ ├── h36m.py
│ │ │ │ └── utils.py
│ │ │ ├── imgfeat_motion
│ │ │ │ └── base_dataset.py
│ │ │ ├── pure_motion
│ │ │ │ ├── amass.py
│ │ │ │ ├── base_dataset.py
│ │ │ │ ├── cam_traj_utils.py
│ │ │ │ └── utils.py
│ │ │ ├── rich
│ │ │ │ ├── resource
│ │ │ │ │ ├── cam2params.pt
│ │ │ │ │ ├── seqname2imgrange.json
│ │ │ │ │ ├── test.txt
│ │ │ │ │ ├── train.txt
│ │ │ │ │ ├── val.txt
│ │ │ │ │ └── w2az_sahmr.json
│ │ │ │ ├── rich_motion_test.py
│ │ │ │ └── rich_utils.py
│ │ │ └── threedpw
│ │ │ │ ├── threedpw_motion_test.py
│ │ │ │ ├── threedpw_motion_train.py
│ │ │ │ └── utils.py
│ │ ├── model
│ │ │ ├── common_utils
│ │ │ │ ├── optimizer.py
│ │ │ │ ├── scheduler.py
│ │ │ │ └── scheduler_cfg.py
│ │ │ └── gvhmr
│ │ │ │ ├── callbacks
│ │ │ │ ├── metric_3dpw.py
│ │ │ │ ├── metric_emdb.py
│ │ │ │ └── metric_rich.py
│ │ │ │ ├── gvhmr_pl.py
│ │ │ │ ├── gvhmr_pl_demo.py
│ │ │ │ ├── pipeline
│ │ │ │ └── gvhmr_pipeline.py
│ │ │ │ └── utils
│ │ │ │ ├── endecoder.py
│ │ │ │ ├── postprocess.py
│ │ │ │ └── stats_compose.py
│ │ ├── network
│ │ │ ├── base_arch
│ │ │ │ ├── embeddings
│ │ │ │ │ └── rotary_embedding.py
│ │ │ │ └── transformer
│ │ │ │ │ ├── encoder_rope.py
│ │ │ │ │ └── layer.py
│ │ │ ├── gvhmr
│ │ │ │ └── relative_transformer.py
│ │ │ └── hmr2
│ │ │ │ ├── __init__.py
│ │ │ │ ├── components
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pose_transformer.py
│ │ │ │ └── t_cond_mlp.py
│ │ │ │ ├── configs
│ │ │ │ ├── __init__.py
│ │ │ │ ├── model_config.yaml
│ │ │ │ └── smpl_mean_params.npz
│ │ │ │ ├── hmr2.py
│ │ │ │ ├── smpl_head.py
│ │ │ │ ├── utils
│ │ │ │ ├── geometry.py
│ │ │ │ ├── preproc.py
│ │ │ │ └── smpl_wrapper.py
│ │ │ │ └── vit.py
│ │ └── utils
│ │ │ ├── body_model
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── body_model.py
│ │ │ ├── body_model_smplh.py
│ │ │ ├── body_model_smplx.py
│ │ │ ├── coco_aug_dict.pth
│ │ │ ├── min_lbs.py
│ │ │ ├── seg_part_info.npy
│ │ │ ├── smpl_3dpw14_J_regressor_sparse.pt
│ │ │ ├── smpl_coco17_J_regressor.pt
│ │ │ ├── smpl_lite.py
│ │ │ ├── smpl_neutral_J_regressor.pt
│ │ │ ├── smpl_vert_segmentation.json
│ │ │ ├── smplx2smpl_sparse.pt
│ │ │ ├── smplx_lite.py
│ │ │ ├── smplx_verts437.pt
│ │ │ └── utils.py
│ │ │ ├── callbacks
│ │ │ ├── lr_monitor.py
│ │ │ ├── prog_bar.py
│ │ │ ├── simple_ckpt_saver.py
│ │ │ └── train_speed_timer.py
│ │ │ ├── comm
│ │ │ └── gather.py
│ │ │ ├── eval
│ │ │ └── eval_utils.py
│ │ │ ├── geo
│ │ │ ├── augment_noisy_pose.py
│ │ │ ├── flip_utils.py
│ │ │ ├── hmr_cam.py
│ │ │ ├── hmr_global.py
│ │ │ ├── quaternion.py
│ │ │ └── transforms.py
│ │ │ ├── geo_transform.py
│ │ │ ├── ik
│ │ │ └── ccd_ik.py
│ │ │ ├── kpts
│ │ │ └── kp2d_utils.py
│ │ │ ├── matrix.py
│ │ │ ├── net_utils.py
│ │ │ ├── preproc
│ │ │ ├── __init__.py
│ │ │ ├── slam.py
│ │ │ ├── tracker.py
│ │ │ ├── vitfeat_extractor.py
│ │ │ ├── vitpose.py
│ │ │ └── vitpose_pytorch
│ │ │ │ ├── __init__.py
│ │ │ │ └── src
│ │ │ │ └── vitpose_infer
│ │ │ │ ├── __init__.py
│ │ │ │ ├── builder
│ │ │ │ ├── __init__.py
│ │ │ │ ├── backbones
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── alexnet.py
│ │ │ │ │ ├── cpm.py
│ │ │ │ │ ├── hourglass.py
│ │ │ │ │ ├── hourglass_ae.py
│ │ │ │ │ ├── hrformer.py
│ │ │ │ │ ├── litehrnet.py
│ │ │ │ │ ├── mobilenet_v2.py
│ │ │ │ │ ├── mobilenet_v3.py
│ │ │ │ │ ├── mspn.py
│ │ │ │ │ ├── regnet.py
│ │ │ │ │ ├── resnest.py
│ │ │ │ │ ├── resnext.py
│ │ │ │ │ ├── rsn.py
│ │ │ │ │ ├── scnet.py
│ │ │ │ │ ├── seresnet.py
│ │ │ │ │ ├── seresnext.py
│ │ │ │ │ ├── shufflenet_v1.py
│ │ │ │ │ ├── shufflenet_v2.py
│ │ │ │ │ ├── tcn.py
│ │ │ │ │ ├── test_torch.py
│ │ │ │ │ ├── utils
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── channel_shuffle.py
│ │ │ │ │ │ ├── inverted_residual.py
│ │ │ │ │ │ ├── make_divisible.py
│ │ │ │ │ │ ├── se_layer.py
│ │ │ │ │ │ └── utils.py
│ │ │ │ │ ├── vgg.py
│ │ │ │ │ ├── vipnas_mbv3.py
│ │ │ │ │ ├── vipnas_resnet.py
│ │ │ │ │ └── vit.py
│ │ │ │ ├── configs
│ │ │ │ │ └── coco
│ │ │ │ │ │ ├── ViTPose_base_coco_256x192.py
│ │ │ │ │ │ ├── ViTPose_base_simple_coco_256x192.py
│ │ │ │ │ │ ├── ViTPose_huge_coco_256x192.py
│ │ │ │ │ │ ├── ViTPose_huge_simple_coco_256x192.py
│ │ │ │ │ │ ├── ViTPose_large_coco_256x192.py
│ │ │ │ │ │ ├── ViTPose_large_simple_coco_256x192.py
│ │ │ │ │ │ └── __init__.py
│ │ │ │ ├── heads
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── deconv_head.py
│ │ │ │ │ ├── deeppose_regression_head.py
│ │ │ │ │ ├── hmr_head.py
│ │ │ │ │ ├── interhand_3d_head.py
│ │ │ │ │ ├── temporal_regression_head.py
│ │ │ │ │ ├── topdown_heatmap_base_head.py
│ │ │ │ │ ├── topdown_heatmap_multi_stage_head.py
│ │ │ │ │ ├── topdown_heatmap_simple_head.py
│ │ │ │ │ ├── vipnas_heatmap_simple_head.py
│ │ │ │ │ └── voxelpose_head.py
│ │ │ │ └── model_builder.py
│ │ │ │ ├── model_builder.py
│ │ │ │ └── pose_utils
│ │ │ │ ├── ViTPose_trt.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── convert_to_trt.py
│ │ │ │ ├── general_utils.py
│ │ │ │ ├── inference_test.py
│ │ │ │ ├── logger_helper.py
│ │ │ │ ├── pose_utils.py
│ │ │ │ ├── pose_viz.py
│ │ │ │ ├── timerr.py
│ │ │ │ └── visualizer.py
│ │ │ ├── pylogger.py
│ │ │ ├── seq_utils.py
│ │ │ ├── smplx_utils.py
│ │ │ ├── video_io_utils.py
│ │ │ ├── vis
│ │ │ ├── README.md
│ │ │ ├── cv2_utils.py
│ │ │ ├── renderer.py
│ │ │ ├── renderer_tools.py
│ │ │ ├── renderer_utils.py
│ │ │ └── rich_logger.py
│ │ │ └── wis3d_utils.py
│ ├── pyproject.toml
│ ├── pyrightconfig.json
│ ├── requirements.txt
│ ├── setup.py
│ └── tools
│ │ ├── demo
│ │ ├── colab_demo.ipynb
│ │ ├── demo.py
│ │ └── demo_folder.py
│ │ ├── eval_pose.py
│ │ ├── train.py
│ │ ├── unitest
│ │ ├── make_hydra_cfg.py
│ │ └── run_dataset.py
│ │ └── video
│ │ ├── merge_folder.py
│ │ ├── merge_horizontal.py
│ │ └── merge_vertical.py
└── common_metrics_on_video_quality
│ ├── .gitignore
│ ├── README.md
│ ├── calculate_clip.py
│ ├── calculate_fvd.py
│ ├── calculate_fvd_styleganv.py
│ ├── calculate_lpips.py
│ ├── calculate_psnr.py
│ ├── calculate_ssim.py
│ ├── download_eval_visual.sh
│ ├── eval_prompts.json
│ └── eval_visual.sh
└── imgs
├── logo.png
└── vis_objstraj.png
/CogVideo/.github/ISSUE_TEMPLATE/bug_report.yaml:
--------------------------------------------------------------------------------
1 | name: "\U0001F41B Bug Report"
2 | description: Submit a bug report to help us improve CogVideoX / 提交一个 Bug 问题报告来帮助我们改进 CogVideoX 开源模型
3 | body:
4 | - type: textarea
5 | id: system-info
6 | attributes:
7 | label: System Info / 系統信息
8 | description: Your operating environment / 您的运行环境信息
9 | placeholder: Include CUDA version, Diffusers version, Python version, operating system, and hardware information (if you suspect a hardware problem)... / 包括Cuda版本,Diffusers,Python版本,操作系统,硬件信息(如果您怀疑是硬件方面的问题)...
10 | validations:
11 | required: true
12 |
13 | - type: checkboxes
14 | id: information-scripts-examples
15 | attributes:
16 | label: Information / 问题信息
17 | description: 'The problem arises when using: / 问题出现在'
18 | options:
19 | - label: "The official example scripts / 官方的示例脚本"
20 | - label: "My own modified scripts / 我自己修改的脚本和任务"
21 |
22 | - type: textarea
23 | id: reproduction
24 | validations:
25 | required: true
26 | attributes:
27 | label: Reproduction / 复现过程
28 | description: |
29 | Please provide a code example that reproduces the problem you encountered, preferably with a minimal reproduction unit.
30 | If you have code snippets, error messages, stack traces, please provide them here as well.
31 | Please format your code correctly using code tags. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
32 | Do not use screenshots, as they are difficult to read and (more importantly) do not allow others to copy and paste your code.
33 |
34 | 请提供能重现您遇到的问题的代码示例,最好是最小复现单元。
35 | 如果您有代码片段、错误信息、堆栈跟踪,也请在此提供。
36 | 请使用代码标签正确格式化您的代码。请参见 https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
37 | 请勿使用截图,因为截图难以阅读,而且(更重要的是)不允许他人复制粘贴您的代码。
38 | placeholder: |
39 | Steps to reproduce the behavior/复现Bug的步骤:
40 |
41 | 1.
42 | 2.
43 | 3.
44 |
45 | - type: textarea
46 | id: expected-behavior
47 | validations:
48 | required: true
49 | attributes:
50 | label: Expected behavior / 期待表现
51 | description: "A clear and concise description of what you would expect to happen. /简单描述您期望发生的事情。"
--------------------------------------------------------------------------------
/CogVideo/.github/ISSUE_TEMPLATE/feature-request.yaml:
--------------------------------------------------------------------------------
1 | name: "\U0001F680 Feature request"
2 | description: Submit a request for a new CogVideoX feature / 提交一个新的 CogVideoX开源模型的功能建议
3 | labels: [ "feature" ]
4 | body:
5 | - type: textarea
6 | id: feature-request
7 | validations:
8 | required: true
9 | attributes:
10 | label: Feature request / 功能建议
11 | description: |
12 | A brief description of the proposed feature. Links to relevant papers and code are welcome.
13 | 对功能建议的简述。最好提供对应的论文和代码链接。
14 |
15 | - type: textarea
16 | id: motivation
17 | validations:
18 | required: true
19 | attributes:
20 | label: Motivation / 动机
21 | description: |
22 | Your motivation for making the suggestion. If that motivation is related to another GitHub issue, link to it here.
23 | 您提出建议的动机。如果该动机与另一个 GitHub 问题有关,请在此处提供对应的链接。
24 |
25 | - type: textarea
26 | id: contribution
27 | validations:
28 | required: true
29 | attributes:
30 | label: Your contribution / 您的贡献
31 | description: |
32 |
33 | Your PR link, or any other link that shows how you can help.
34 | 您的PR链接或者其他您能提供帮助的链接。
--------------------------------------------------------------------------------
/CogVideo/.github/PULL_REQUEST_TEMPLATE/pr_template.md:
--------------------------------------------------------------------------------
1 | # Raise valuable PR / 提出有价值的PR
2 |
3 | ## Caution / 注意事项:
4 | Users should keep the following points in mind when submitting PRs:
5 |
6 | 1. Ensure that your code meets the requirements in the [specification](../../resources/contribute.md).
7 | 2. The proposed PR should be focused; if it contains multiple ideas or optimizations, split them into separate PRs.
8 |
9 | 用户在提交PR时候应该注意以下几点:
10 |
11 | 1. 确保您的代码符合 [规范](../../resources/contribute_zh.md) 中的要求。
12 | 2. 提出的PR应该具有针对性,如果具有多个不同的想法和优化方案,应该分配到不同的PR中。
13 |
14 | ## PRs that should not be proposed / 不应该提出的PR
15 |
16 | If a developer proposes a PR about any of the following, it may be closed or rejected.
17 | 
18 | 1. PRs that do not describe the proposed improvement.
19 | 2. PRs that combine multiple issues of different types.
20 | 3. PRs that largely duplicate an already existing PR.
21 |
22 | 如果开发者提出关于以下方面的PR,则可能会被直接关闭或拒绝通过。
23 |
24 | 1. 没有说明改进方案的。
25 | 2. 多个不同类型的问题合并在一个PR中的。
26 | 3. 提出的PR与已经存在的PR高度重复的。
27 |
28 |
29 | # Check your PR / 检查您的PR
30 | - [ ] Have you read the Contributor Guidelines, Pull Request section? / 您是否阅读了贡献者指南、Pull Request 部分?
31 | - [ ] Has this been discussed/approved via a Github issue or forum? If so, add a link. / 是否通过 Github 问题或论坛讨论/批准过?如果是,请添加链接。
32 | - [ ] Did you make sure you updated the documentation with your changes? Here are the Documentation Guidelines, and here are the Documentation Formatting Tips. /您是否确保根据您的更改更新了文档?这里是文档指南,这里是文档格式化技巧。
33 | - [ ] Did you write new required tests? / 您是否编写了新的必要测试?
34 | - [ ] Does your PR address only one issue? / 您的PR是否仅针对一个问题
--------------------------------------------------------------------------------
/CogVideo/.gitignore:
--------------------------------------------------------------------------------
1 | *__pycache__/
2 | samples*/
3 | runs/
4 | checkpoints/
5 | master_ip
6 | logs/
7 | *.DS_Store
8 | .idea
9 | output*
10 | test*
--------------------------------------------------------------------------------
/CogVideo/download.sh:
--------------------------------------------------------------------------------
1 | mkdir CogVideoX-2b-sat
2 | cd CogVideoX-2b-sat
3 | wget https://cloud.tsinghua.edu.cn/f/fdba7608a49c463ba754/?dl=1
4 | mv 'index.html?dl=1' vae.zip
5 | unzip vae.zip
6 | wget https://cloud.tsinghua.edu.cn/f/556a3e1329e74f1bac45/?dl=1
7 | mv 'index.html?dl=1' transformer.zip
8 | unzip transformer.zip
--------------------------------------------------------------------------------
/CogVideo/finetune/accelerate_config_machine_single.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | deepspeed_config:
4 | gradient_accumulation_steps: 1
5 | gradient_clipping: 1.0
6 | offload_optimizer_device: none
7 | offload_param_device: none
8 | zero3_init_flag: false
9 | zero_stage: 2
10 | distributed_type: DEEPSPEED
11 | downcast_bf16: 'no'
12 | enable_cpu_affinity: false
13 | machine_rank: 0
14 | main_training_function: main
15 | dynamo_backend: 'no'
16 | mixed_precision: 'no'
17 | num_machines: 1
18 | num_processes: 8
19 | rdzv_backend: static
20 | same_network: true
21 | tpu_env: []
22 | tpu_use_cluster: false
23 | tpu_use_sudo: false
24 | use_cpu: false
--------------------------------------------------------------------------------
/CogVideo/finetune/accelerate_config_machine_single_debug.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | deepspeed_config:
4 | gradient_accumulation_steps: 1
5 | gradient_clipping: 1.0
6 | offload_optimizer_device: none
7 | offload_param_device: none
8 | zero3_init_flag: false
9 | zero_stage: 2
10 | distributed_type: DEEPSPEED
11 | downcast_bf16: 'no'
12 | enable_cpu_affinity: false
13 | machine_rank: 0
14 | main_training_function: main
15 | dynamo_backend: 'no'
16 | mixed_precision: 'no'
17 | num_machines: 1
18 | num_processes: 1
19 | rdzv_backend: static
20 | same_network: true
21 | tpu_env: []
22 | tpu_use_cluster: false
23 | tpu_use_sudo: false
24 | use_cpu: false
--------------------------------------------------------------------------------
/CogVideo/finetune/finetune_single_rank_injector.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export MODEL_PATH="/m2v_intern/fuxiao/CogVideo-release/weights/cogvideox-5b" # Change it to CogVideoX-5B path
4 | export TRANSFORMER_PATH="" # Resume from pretrained injector checkpoint
5 | export LORA_PATH="/m2v_intern/fuxiao/CogVideo-release/weights/lora" # Change it to pretrained lora path
6 | export CACHE_PATH="~/.cache"
7 | export DATASET_PATH="/ytech_m2v2_hdd/fuxiao/360Motion-Dataset" # Change it to 360-Motion Dataset path
8 | export OUTPUT_PATH="injector"
9 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
10 | export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7,"
11 |
12 | # If you are not using 8 GPUs, change `num_processes` in `accelerate_config_machine_single.yaml` to match your GPU count
13 | accelerate launch --config_file accelerate_config_machine_single.yaml --multi_gpu \
14 | train_cogvideox_injector.py \
15 | --gradient_checkpointing \
16 | --pretrained_model_name_or_path $MODEL_PATH \
17 | --lora_path $LORA_PATH \
18 | --cache_dir $CACHE_PATH \
19 | --enable_tiling \
20 | --enable_slicing \
21 | --finetune_init \
22 | --instance_data_root $DATASET_PATH \
23 | --validation_prompt "a woman with short black wavy hair, lean figure, a green and yellow plaid shirt, dark brown pants, and black suede shoes and a robotic gazelle with a sturdy aluminum frame, an agile build, articulated legs and curved, metallic horns are moving in the city" \
24 | --validation_prompt_separator ::: \
25 | --num_validation_videos 1 \
26 | --validation_epochs 1 \
27 | --block_interval 2 \
28 | --seed 42 \
29 | --lora_scale 1.0 \
30 | --mixed_precision bf16 \
31 | --output_dir $OUTPUT_PATH \
32 | --height 480 \
33 | --width 720 \
34 | --fps 8 \
35 | --max_num_frames 49 \
36 | --skip_frames_start 0 \
37 | --skip_frames_end 0 \
38 | --train_batch_size 1 \
39 | --num_train_epochs 1000 \
40 | --checkpointing_steps 4000 \
41 | --gradient_accumulation_steps 1 \
42 | --learning_rate 1e-4 \
43 | --lr_scheduler cosine_with_restarts \
44 | --lr_warmup_steps 200 \
45 | --lr_num_cycles 1 \
46 | --enable_slicing \
47 | --enable_tiling \
48 | --gradient_checkpointing \
49 | --optimizer AdamW \
50 | --adam_beta1 0.9 \
51 | --adam_beta2 0.95 \
52 | --max_grad_norm 1.0 \
53 | --allow_tf32 \
54 | --report_to wandb
55 |
56 | # --resume_from_checkpoint $TRANSFORMER_PATH \
--------------------------------------------------------------------------------
/CogVideo/finetune/finetune_single_rank_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export MODEL_PATH="/m2v_intern/fuxiao/CogVideo-release/weights/cogvideox-5b" # Change it to CogVideoX-5B path
4 | export CACHE_PATH="~/.cache"
5 | export DATASET_PATH="/ytech_m2v2_hdd/fuxiao/360Motion-Dataset" # Change it to 360-Motion Dataset path
6 | export OUTPUT_PATH="lora"
7 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
8 | export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7,"
9 |
10 | # If you are not using 8 GPUs, change `num_processes` in `accelerate_config_machine_single.yaml` to match your GPU count
11 | accelerate launch --config_file accelerate_config_machine_single.yaml --multi_gpu \
12 | train_cogvideox_lora.py \
13 | --gradient_checkpointing \
14 | --pretrained_model_name_or_path $MODEL_PATH \
15 | --cache_dir $CACHE_PATH \
16 | --enable_tiling \
17 | --enable_slicing \
18 | --instance_data_root $DATASET_PATH \
19 | --validation_prompt "a woman with short black wavy hair, lean figure, a green and yellow plaid shirt, dark brown pants, and black suede shoes and a robotic gazelle with a sturdy aluminum frame, an agile build, articulated legs and curved, metallic horns are moving in the city" \
20 | --validation_prompt_separator ::: \
21 | --num_validation_videos 1 \
22 | --validation_epochs 1 \
23 | --seed 42 \
24 | --rank 32 \
25 | --lora_alpha 32 \
26 | --mixed_precision bf16 \
27 | --output_dir $OUTPUT_PATH \
28 | --height 480 \
29 | --width 720 \
30 | --fps 8 \
31 | --max_num_frames 49 \
32 | --skip_frames_start 0 \
33 | --skip_frames_end 0 \
34 | --train_batch_size 2 \
35 | --num_train_epochs 1000 \
36 | --checkpointing_steps 1000 \
37 | --gradient_accumulation_steps 1 \
38 | --learning_rate 3e-4 \
39 | --lr_scheduler cosine_with_restarts \
40 | --lr_warmup_steps 200 \
41 | --lr_num_cycles 1 \
42 | --enable_slicing \
43 | --enable_tiling \
44 | --gradient_checkpointing \
45 | --optimizer AdamW \
46 | --adam_beta1 0.9 \
47 | --adam_beta2 0.95 \
48 | --max_grad_norm 1.0 \
49 | --allow_tf32 \
50 | --report_to wandb
51 |
--------------------------------------------------------------------------------
/CogVideo/finetune/hostfile.txt:
--------------------------------------------------------------------------------
1 | node1 slots=8
2 | node2 slots=8
--------------------------------------------------------------------------------
/CogVideo/finetune/models/pipeline_output.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | import torch
4 |
5 | from diffusers.utils import BaseOutput
6 |
7 |
8 | @dataclass
9 | class CogVideoXPipelineOutput(BaseOutput):
10 | r"""
11 | Output class for CogVideo pipelines.
12 |
13 | Args:
14 | frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
15 | List of video outputs - It can be a nested list of length `batch_size`, with each sub-list containing
16 | denoised PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of shape
17 | `(batch_size, num_frames, channels, height, width)`.
18 | """
19 |
20 | frames: torch.Tensor
21 |
--------------------------------------------------------------------------------
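
For orientation, a minimal usage sketch of the dataclass above. This is illustrative only: it assumes torch and diffusers are installed and the repo root is on PYTHONPATH; the tensor shape follows the docstring, and dict-style access comes from diffusers' BaseOutput.

import torch
from finetune.models.pipeline_output import CogVideoXPipelineOutput

# Shape convention from the docstring: (batch_size, num_frames, channels, height, width)
frames = torch.zeros(1, 49, 3, 480, 720)
out = CogVideoXPipelineOutput(frames=frames)

print(out.frames.shape)     # attribute access
print(out["frames"].shape)  # BaseOutput also supports dict-style access
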
/CogVideo/inference/location_zoo.txt:
--------------------------------------------------------------------------------
1 | [
2 | 'fjord',
3 | 'sunset beach',
4 | 'cave',
5 | 'snowy tundra',
6 | 'prairie',
7 | 'asian town',
8 | 'rainforest',
9 | 'canyon',
10 | 'savanna',
11 | 'urban rooftop garden',
12 | 'swamp',
13 | 'riverbank',
14 | 'coral reef',
15 | 'volcanic landscape',
16 | 'wind farm',
17 | 'town street',
18 | 'night city square',
19 | 'mall lobby',
20 | 'glacier',
21 | 'seaside street',
22 | 'gymnastics room',
23 | 'abandoned factory',
24 | 'autumn forest',
25 | 'mountain village',
26 | 'coastal harbor',
27 | 'ancient ruins',
28 | 'modern metropolis',
29 | 'desert',
30 | 'forest',
31 | 'city',
32 | 'snowy street',
33 | 'park',
34 | ]
--------------------------------------------------------------------------------
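
The location_zoo.txt file above is written as a Python list literal, so it can be loaded with ast.literal_eval. A minimal, hypothetical loader sketch (not necessarily how 3dtrajmaster_inference.py reads it; the relative path assumes the repo root as the working directory):

import ast
from pathlib import Path

def load_zoo(path: str) -> list:
    # Parse a zoo file written as a Python list literal (single quotes and a trailing comma are fine).
    return ast.literal_eval(Path(path).read_text())

locations = load_zoo("CogVideo/inference/location_zoo.txt")
print(len(locations), locations[:3])  # 32 ['fjord', 'sunset beach', 'cave']
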
/CogVideo/inference/output_example/1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.txt:
--------------------------------------------------------------------------------
1 | D_loc1_541_t1n37_021d_Hemi12_1
2 | a rabbit with a body covered in soft fur, quick hops, and a playful demeanor, showcasing its energy
3 | urban rooftop garden
4 |
--------------------------------------------------------------------------------
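
Judging from this and the following output_example .txt files, the first line is the trajectory ID, each middle line is one entity description, and the last non-empty line is the location. A hypothetical parser, inferred from these examples rather than from any documented format:

from pathlib import Path

def parse_prompt_txt(path: str) -> dict:
    # Format inferred from the examples in output_example/: trajectory ID, entity lines, location.
    lines = [ln.strip() for ln in Path(path).read_text().splitlines() if ln.strip()]
    return {"traj_id": lines[0], "entities": lines[1:-1], "location": lines[-1]}

info = parse_prompt_txt(
    "CogVideo/inference/output_example/"
    "1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.txt"
)
print(info["traj_id"], "->", info["location"])
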
/CogVideo/inference/output_example/1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.txt:
--------------------------------------------------------------------------------
1 | D_loc1_66_t1n36_0042_Hemi12_1
2 | a fire spirit with long, twisting flames resembling flowing red and orange hair, a bright yellow core
3 | park
4 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.txt:
--------------------------------------------------------------------------------
1 | D_loc1_81_t1n42_0051_Hemi12_1
2 | a pickup truck with rugged dark green paint, extended cab, raised suspension, and a modest cargo bed cover
3 | wind farm
4 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.txt:
--------------------------------------------------------------------------------
1 | D_loc2_17_t1n8_0011_Hemi12_1
2 | a disaster rescue robot with reinforced limbs, advanced AI, and a rugged body designed to navigate
3 | sunset beach
4 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.txt:
--------------------------------------------------------------------------------
1 | D_loc2_482_t1n48_01e2_Hemi12_1
2 | a man with short spiky blonde hair, slim build, a black trench coat, blue jeans, and brown hiking shoes
3 | riverbank
4 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.txt:
--------------------------------------------------------------------------------
1 | D_loc3_323_t1n15_0143_Hemi12_1
2 | a cloud creature with billowing white and gray plumes forming a soft, rounded body, wisps of darker fog
3 | coral reef
4 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.txt:
--------------------------------------------------------------------------------
1 | D_loc3_568_t1n3_0238_Hemi12_1
2 | a woman with long straight black hair, toned build, a blue denim jacket, light gray leggings, and black slip-on shoes
3 | cave
4 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.txt:
--------------------------------------------------------------------------------
1 | D_loc4_1174_t1n9_0496_Hemi12_1
2 | a polar bear with thick white fur, strong paws, and a black nose, embodying the essence of the Arctic
3 | mall lobby
4 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.txt:
--------------------------------------------------------------------------------
1 | D_loc5_1210_t1n34_04ba_Hemi12_1
2 | a moose with a body covered in thick brown fur, massive antlers, and a bulky frame
3 | rainforest
4 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.txt:
--------------------------------------------------------------------------------
1 | D_loc5_440_t1n35_01b8_Hemi12_1
2 | a dolphin with sleek grey skin, a curved dorsal fin, and intelligent, playful eyes, reflecting its nature
3 | sunset beach
4 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.txt:
--------------------------------------------------------------------------------
1 | D_loc1_1276_t2n30_04fc_Hemi12_1
2 | a man with short curly red hair, average build, a black leather jacket, dark blue cargo pants, and white sneakers
3 | a fox with sleek russet fur, a bushy tail tipped with black, and bright green and cunning eyes
4 | sunset beach
5 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.txt:
--------------------------------------------------------------------------------
1 | D_loc1_806_t2n2_0326_Hemi12_1
2 | a porcupine with a body covered in spiky brown quills, a small nose, and curious eyes
3 | a woman with long straight black hair, toned build, a blue denim jacket, light gray leggings, and black slip-on shoes
4 | coral reef
5 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.txt:
--------------------------------------------------------------------------------
1 | D_loc1_886_t2n25_0376_Hemi12_1
2 | a man with medium-length straight brown hair, tall and slender, a gray crew-neck t-shirt, beige trousers, dark green sneakers
3 | a wolf with thick silver-gray fur, alert golden eyes, and a lean yet strong body, exuding confidence and boldness
4 | urban rooftop garden
5 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.txt:
--------------------------------------------------------------------------------
1 | D_loc2_1442_t2n36_05a2_Hemi12_1
2 | a storm entity with dark swirling clouds as a body, streaks of electric blue lightning shooting across it
3 | a surveillance drone robot with extendable camera arms, thermal vision, and a stealth black body
4 | swamp
5 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.txt:
--------------------------------------------------------------------------------
1 | D_loc5_1010_t2n2_03f2_Hemi12_1
2 | a man with short curly red hair, average build, a black leather jacket, dark blue cargo pants, and white sneakers
3 | a woman with long wavy blonde hair, petite figure, a red floral dress, white sandals, and a yellow shoulder bag
4 | mall lobby
5 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.txt:
--------------------------------------------------------------------------------
1 | D_loc5_1095_t2n37_0447_Hemi12_1
2 | a companion robot with a friendly digital face, a smooth white exterior, and social interaction algorithms
3 | a man with short straight black hair, tall and lean build, a navy blue sweater, khaki shorts, and brown sandals
4 | sunset beach
5 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.txt:
--------------------------------------------------------------------------------
1 | D_loc5_120_t2n37_0078_Hemi12_1
2 | a compact electric vehicle with a silver finish, aerodynamic profile, and efficient battery
3 | a fox with sleek russet fur, a bushy tail tipped with black, and bright green and cunning eyes
4 | night city square
5 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.txt:
--------------------------------------------------------------------------------
1 | D_loc5_1290_t2n36_050a_Hemi12_1
2 | a firefighting robot with a water cannon arm, heat sensors, and durable red-and-silver exterior
3 | a penguin with a body covered in smooth black-and-white feathers, short wings, and webbed feet
4 | swamp
5 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.txt:
--------------------------------------------------------------------------------
1 | D_loc5_1440_t2n35_05a0_Hemi12_1
2 | a fire spirit with long, twisting flames resembling flowing red and orange hair, a bright yellow core
3 | a moose with a body covered in thick brown fur, massive antlers, and a bulky frame
4 | forest
5 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.txt:
--------------------------------------------------------------------------------
1 | D_loc5_65_t2n23_0041_Hemi12_1
2 | a woman with shoulder-length wavy brown hair, slim build, a green parka, black leggings, and gray hiking boots
3 | a parrot with bright red, blue, and yellow feathers, a curved beak, and sharp intelligent eyes
4 | snowy tundra
5 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.txt:
--------------------------------------------------------------------------------
1 | D_loc1_1041_t3n22_0411_Hemi12_1
2 | a storm entity with dark swirling clouds as a body, streaks of electric blue lightning shooting across it
3 | a regal lion with a thick, flowing golden mane, sharp brown eyes, and a powerful muscular frame
4 | a man with short straight black hair, tall and lean build, a navy blue sweater, khaki shorts, and brown sandals
5 | swamp
6 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.txt:
--------------------------------------------------------------------------------
1 | D_loc1_1226_t3n24_04ca_Hemi12_1
2 | a woman with short blonde hair, slim athletic build, a red leather jacket, dark blue jeans, and white sneakers
3 | a private jet with a shiny silver body, elongated wings, a slim nose, and a compact rear stabilizer
4 | a wolf with a body covered in thick silver fur, sharp ears, and piercing yellow eyes, showcasing its alertness
5 | prairie
6 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.txt:
--------------------------------------------------------------------------------
1 | D_loc1_176_t3n26_00b0_Hemi12_1
2 | a horse with chestnut brown fur, muscular legs, a slim neck, and a flowing mane, exuding strength and grace
3 | a flamingo with a body covered in pink feathers, long slender legs, and a gracefully curved neck
4 | a wolf with thick silver-gray fur, alert golden eyes, and a lean yet strong body, exuding confidence and boldness
5 | abandoned factory
6 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.txt:
--------------------------------------------------------------------------------
1 | D_loc1_196_t3n32_00c4_Hemi12_1
2 | a man with short spiky blonde hair, slim build, a black trench coat, blue jeans, and brown hiking shoes
3 | a polar bear with thick white fur, strong paws, and a black nose, embodying the essence of the Arctic
4 | a deer with sleek tan fur, long slender legs, a graceful neck, and tiny antlers atop its head
5 | desert
6 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.txt:
--------------------------------------------------------------------------------
1 | D_loc1_536_t3n1_0218_Hemi12_1
2 | a tiger with a pristine white coat marked by bold black stripes, bright blue eyes, and a graceful, poised form
3 | a firefighting robot with a water cannon arm, heat sensors, and durable red-and-silver exterior
4 | a sporty roadster with a convertible top, silver trim, and a powerful engine
5 | snowy street
6 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.txt:
--------------------------------------------------------------------------------
1 | D_loc2_1287_t3n5_0507_Hemi12_1
2 | a panda with a body covered in fluffy black-and-white fur, a round face, and gentle eyes, radiating warmth
3 | a man with short straight black hair, tall and lean build, a navy blue sweater, khaki shorts, and brown sandals
4 | an industrial welding robot with articulated arms, a laser precision welder, and heat-resistant shields
5 | urban rooftop garden
6 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.txt:
--------------------------------------------------------------------------------
1 | D_loc2_1392_t3n4_0570_Hemi12_1
2 | a fluttering butterfly with intricate wing patterns, vivid colors, and graceful flight
3 | a man with buzz-cut blonde hair, stocky build, a gray zip-up sweater, black shorts, and red basketball shoes
4 | a giraffe with golden-yellow fur, long legs, a tall slender neck, and patches of brown spots, exuding elegance and calm
5 | volcanic landscape
6 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.txt:
--------------------------------------------------------------------------------
1 | D_loc3_1473_t3n23_05c1_Hemi12_1
2 | a firefighting robot with a water cannon arm, heat sensors, and durable red-and-silver exterior
3 | a crocodile with a body covered in scaly green skin, a powerful tail, and sharp teeth
4 | a rabbit with a body covered in soft fur, quick hops, and a playful demeanor, showcasing its energy
5 | coastal harbor
6 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.txt:
--------------------------------------------------------------------------------
1 | D_loc4_849_t3n28_0351_Hemi12_1
2 | a man with short black wavy hair, lean figure, a green and yellow plaid shirt, dark brown pants, and black suede shoes
3 | a sedan with a sleek metallic silver body, long wheelbase, a low-profile hood, and a small rear spoiler
4 | a gazelle with a body covered in sleek tan fur, long legs, and elegant curved horns, showcasing its grace
5 | desert
6 |
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.mp4
--------------------------------------------------------------------------------
/CogVideo/inference/output_example/3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.txt:
--------------------------------------------------------------------------------
1 | D_loc5_865_t3n34_0361_Hemi12_1
2 | a man with a shaved head, broad shoulders, a gray graphic t-shirt, dark jeans, and brown leather boots
3 | a foggy apparition with pale gray wisps drifting together in a soft, undefined form, tiny white sparkles
4 | a jaguar with a golden-yellow coat dotted with intricate black rosettes, deep green eyes, and a muscular build
5 | fjord
6 |
--------------------------------------------------------------------------------
/CogVideo/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.ruff]
2 | line-length = 119
3 |
4 | [tool.ruff.lint]
5 | # Never enforce `E501` (line length violations).
6 | ignore = ["C901", "E501", "E741", "F402", "F823"]
7 | select = ["C", "E", "F", "I", "W"]
8 |
9 | # Ignore import violations in all `__init__.py` files.
10 | [tool.ruff.lint.per-file-ignores]
11 | "__init__.py" = ["E402", "F401", "F403", "F811"]
12 |
13 | [tool.ruff.lint.isort]
14 | lines-after-imports = 2
15 |
16 | [tool.ruff.format]
17 | # Like Black, use double quotes for strings.
18 | quote-style = "double"
19 |
20 | # Like Black, indent with spaces, rather than tabs.
21 | indent-style = "space"
22 |
23 | # Like Black, respect magic trailing commas.
24 | skip-magic-trailing-comma = false
25 |
26 | # Like Black, automatically detect the appropriate line ending.
27 | line-ending = "auto"
28 |
--------------------------------------------------------------------------------
/CogVideo/requirements.txt:
--------------------------------------------------------------------------------
1 | diffusers==0.31.0
2 | accelerate==1.1.1
3 | transformers==4.46.2
4 | numpy==1.26.0
5 | # torch==2.5.0
6 | # torchvision==0.20.0
7 | sentencepiece==0.2.0
8 | SwissArmyTransformer==0.4.12
9 | gradio==5.5.0
10 | imageio==2.35.1
11 | imageio-ffmpeg==0.5.1
12 | openai==1.54.0
13 | moviepy==1.0.3
14 | scikit-video==1.1.11
15 | opencv-python
16 | peft==0.12.0
17 | decord
18 | wandb
--------------------------------------------------------------------------------
/CogVideo/tools/caption/README.md:
--------------------------------------------------------------------------------
1 | # Video Caption
2 |
3 | Typically, most video data does not come with corresponding descriptive text, so it is necessary to convert the video
4 | data into textual descriptions to provide the essential training data for text-to-video models.
5 |
6 | ## Update and News
7 | - 🔥🔥 **News**: ```2024/9/19```: The caption model used in the CogVideoX training process to convert video data into text
8 | descriptions, [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption), is now open-source. Feel
9 | free to download and use it.
10 |
11 |
12 | ## Video Caption via CogVLM2-Caption
13 |
14 | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-llama3-caption) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-caption/)
15 |
16 | CogVLM2-Caption is a video captioning model used to generate training data for the CogVideoX model.
17 |
18 | ### Install
19 | ```shell
20 | pip install -r requirements.txt
21 | ```
22 |
23 | ### Usage
24 |
25 | ```shell
26 | python video_caption.py
27 | ```
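For orientation, the sketch below illustrates the kind of work `video_caption.py` does before the model is called: uniformly sampling frames from a video with `decord` and loading the CogVLM2-Caption checkpoint with `transformers`. This is a minimal sketch, not the official implementation; the frame count, dtype, and file name are illustrative assumptions, and the exact prompt formatting and `generate` call are checkpoint-specific, so follow the model card or `video_caption.py` itself for those steps.

```python
# Minimal sketch (not the official script): uniform frame sampling with decord
# plus loading the CogVLM2-Caption checkpoint via transformers.
import numpy as np
import torch
from decord import VideoReader, cpu
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "THUDM/cogvlm2-llama3-caption"

def sample_frames(video_path: str, num_frames: int = 24) -> np.ndarray:
    """Uniformly sample RGB frames as a (num_frames, H, W, 3) uint8 array."""
    vr = VideoReader(video_path, ctx=cpu(0))
    indices = np.linspace(0, len(vr) - 1, num_frames).astype(int)
    return vr.get_batch(indices).asnumpy()

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
).eval().to("cuda")

frames = sample_frames("example.mp4")  # hypothetical input video
# Building the multimodal inputs and calling model.generate() is
# checkpoint-specific; see the model card or video_caption.py for that part.
```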
28 |
29 | Example:
30 |
31 |
32 |
33 |
34 | ## Video Caption via CogVLM2-Video
35 |
36 | [Code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-video-llama3-chat) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat) | 📑 [Blog](https://cogvlm2-video.github.io/) | [💬 Online Demo](http://cogvlm2-online.cogviewai.cn:7868/)
37 |
38 | CogVLM2-Video is a versatile video understanding model equipped with timestamp-based question answering capabilities.
39 | Users can input prompts such as `Please describe this video in detail.` to the model to obtain a detailed video caption:
40 |
41 |
42 |
43 |
44 | Users can use the provided [code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) to load the model or configure a RESTful API to generate video captions.
45 |
46 | ## Citation
47 |
48 | 🌟 If you find our work helpful, please leave us a star and cite our paper.
49 |
50 | CogVLM2-Caption:
51 | ```
52 | @article{yang2024cogvideox,
53 | title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
54 | author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others},
55 | journal={arXiv preprint arXiv:2408.06072},
56 | year={2024}
57 | }
58 | ```
59 | CogVLM2-Video:
60 | ```
61 | @article{hong2024cogvlm2,
62 | title={CogVLM2: Visual Language Models for Image and Video Understanding},
63 | author={Hong, Wenyi and Wang, Weihan and Ding, Ming and Yu, Wenmeng and Lv, Qingsong and Wang, Yan and Cheng, Yean and Huang, Shiyu and Ji, Junhui and Xue, Zhao and others},
64 | journal={arXiv preprint arXiv:2408.16500},
65 | year={2024}
66 | }
67 | ```
--------------------------------------------------------------------------------
/CogVideo/tools/caption/README_ja.md:
--------------------------------------------------------------------------------
1 | # Video Caption
2 |
3 | Typically, most video data does not come with corresponding descriptive text, so it needs to be converted into textual descriptions to provide the training data required by text-to-video models.
4 |
5 | ## Update and News
6 | - 🔥🔥 **News**: ```2024/9/19```: The caption model used in the CogVideoX training process to convert video data into text
7 | descriptions, [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption), is now open-source. Feel
8 | free to download and use it.
9 | ## Video Caption via CogVLM2-Caption
10 |
11 | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-llama3-caption) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-caption/)
12 |
13 | CogVLM2-Caption is a video captioning model used to generate training data for the CogVideoX model.
14 |
15 | ### Install
16 | ```shell
17 | pip install -r requirements.txt
18 | ```
19 |
20 | ### Usage
21 | ```shell
22 | python video_caption.py
23 | ```
24 |
25 | Example:
26 |
27 |
28 |
29 |
30 |
31 |
32 | ## Video Caption via CogVLM2-Video
33 |
34 | [Code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-video-llama3-chat) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat) | 📑 [Blog](https://cogvlm2-video.github.io/) | [💬 Online Demo](http://cogvlm2-online.cogviewai.cn:7868/)
35 |
36 |
37 | CogVLM2-Video is a versatile video understanding model equipped with timestamp-based question answering capabilities. Users can input prompts such as `Please describe this video in detail.` to the model to obtain a detailed video caption:
38 |
39 |
40 |
41 |
42 | Users can use the provided [code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) to load the model or configure a RESTful API to generate video captions.
43 |
44 | ## Citation
45 |
46 | 🌟 If you find our work helpful, please leave us a star and cite our paper.
47 |
48 | CogVLM2-Caption:
49 | ```
50 | @article{yang2024cogvideox,
51 | title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
52 | author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others},
53 | journal={arXiv preprint arXiv:2408.06072},
54 | year={2024}
55 | }
56 | ```
57 | CogVLM2-Video:
58 | ```
59 | @article{hong2024cogvlm2,
60 | title={CogVLM2: Visual Language Models for Image and Video Understanding},
61 | author={Hong, Wenyi and Wang, Weihan and Ding, Ming and Yu, Wenmeng and Lv, Qingsong and Wang, Yan and Cheng, Yean and Huang, Shiyu and Ji, Junhui and Xue, Zhao and others},
62 | journal={arXiv preprint arXiv:2408.16500},
63 | year={2024}
64 | }
65 | ```
66 |
--------------------------------------------------------------------------------
/CogVideo/tools/caption/README_zh.md:
--------------------------------------------------------------------------------
1 | # Video Caption
2 |
3 | Typically, most video data does not come with corresponding descriptive text, so it needs to be converted into textual descriptions to provide the training data required by text-to-video models.
4 |
5 | ## Project Updates
6 | - 🔥🔥 **News**: ```2024/9/19```: The caption model used in the CogVideoX training process to convert video data into text
7 | descriptions, [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption),
8 | is now open-source. Feel free to download and use it.
9 |
10 | ## Video Caption via CogVLM2-Caption
11 |
12 | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-llama3-caption) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-caption/)
13 |
14 | CogVLM2-Caption is a video captioning model used to generate training data for the CogVideoX model.
15 |
16 | ### Install Dependencies
17 | ```shell
18 | pip install -r requirements.txt
19 | ```
20 |
21 | ### Run the Caption Model
22 |
23 | ```shell
24 | python video_caption.py
25 | ```
26 |
27 | Example:
28 |
29 |
30 |
31 |
32 | ## Video Caption via CogVLM2-Video
33 |
34 | [Code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-video-llama3-chat) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat) | 📑 [Blog](https://cogvlm2-video.github.io/) | [💬 Online Demo](http://cogvlm2-online.cogviewai.cn:7868/)
35 |
36 | CogVLM2-Video is a versatile video understanding model equipped with timestamp-based question answering capabilities. Users can input prompts such as `Describe this video in detail.` to the model to obtain a detailed video caption:
37 |
38 |
39 |
40 |
41 |
42 |
43 | Users can use the provided [code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) to load the model or configure a RESTful API to generate video captions.
44 |
45 |
46 | ## Citation
47 |
48 | 🌟 If you find our work helpful, please leave us a star and cite our paper.
49 |
50 | CogVLM2-Caption:
51 | ```
52 | @article{yang2024cogvideox,
53 | title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
54 | author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others},
55 | journal={arXiv preprint arXiv:2408.06072},
56 | year={2024}
57 | }
58 | ```
59 | CogVLM2-Video:
60 | ```
61 | @article{hong2024cogvlm2,
62 | title={CogVLM2: Visual Language Models for Image and Video Understanding},
63 | author={Hong, Wenyi and Wang, Weihan and Ding, Ming and Yu, Wenmeng and Lv, Qingsong and Wang, Yan and Cheng, Yean and Huang, Shiyu and Ji, Junhui and Xue, Zhao and others},
64 | journal={arXiv preprint arXiv:2408.16500},
65 | year={2024}
66 | }
67 | ```
--------------------------------------------------------------------------------
/CogVideo/tools/caption/assests/CogVLM2-Caption-example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/tools/caption/assests/CogVLM2-Caption-example.png
--------------------------------------------------------------------------------
/CogVideo/tools/caption/assests/cogvlm2-video-example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/tools/caption/assests/cogvlm2-video-example.png
--------------------------------------------------------------------------------
/CogVideo/tools/caption/requirements.txt:
--------------------------------------------------------------------------------
1 | decord>=0.6.0
2 | # Per https://download.pytorch.org/whl/torch/, the Python version should be within [3.8, 3.11]
3 | torch==2.1.0
4 | torchvision==0.16.0
5 | pytorchvideo==0.1.5
6 | xformers
7 | transformers==4.42.4
8 | #git+https://github.com/huggingface/transformers.git
9 | huggingface-hub>=0.23.0
10 | pillow
11 | chainlit>=1.0
12 | pydantic>=2.7.1
13 | timm>=0.9.16
14 | openai>=1.30.1
15 | loguru>=0.7.2
16 | pydantic>=2.7.1
17 | einops
18 | sse-starlette>=2.1.0
19 | flask
20 | gunicorn
21 | gevent
22 | requests
23 | gradio
--------------------------------------------------------------------------------
/CogVideo/tools/export_sat_lora_weight.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict
2 | import torch
3 | import argparse
4 | from diffusers.loaders.lora_base import LoraBaseMixin
5 | from diffusers.models.modeling_utils import load_state_dict
6 |
7 |
8 | def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
9 | state_dict = saved_dict
10 | if "model" in saved_dict.keys():
11 | state_dict = state_dict["model"]
12 | if "module" in saved_dict.keys():
13 | state_dict = state_dict["module"]
14 | if "state_dict" in saved_dict.keys():
15 | state_dict = state_dict["state_dict"]
16 | return state_dict
17 |
18 | LORA_KEYS_RENAME = {
19 |
20 | 'attention.query_key_value.matrix_A.0': 'attn1.to_q.lora_A.weight',
21 | 'attention.query_key_value.matrix_A.1': 'attn1.to_k.lora_A.weight',
22 | 'attention.query_key_value.matrix_A.2': 'attn1.to_v.lora_A.weight',
23 | 'attention.query_key_value.matrix_B.0': 'attn1.to_q.lora_B.weight',
24 | 'attention.query_key_value.matrix_B.1': 'attn1.to_k.lora_B.weight',
25 | 'attention.query_key_value.matrix_B.2': 'attn1.to_v.lora_B.weight',
26 | 'attention.dense.matrix_A.0': 'attn1.to_out.0.lora_A.weight',
27 | 'attention.dense.matrix_B.0': 'attn1.to_out.0.lora_B.weight'
28 | }
29 |
30 |
31 |
32 | PREFIX_KEY = "model.diffusion_model."
33 | SAT_UNIT_KEY = "layers"
34 | LORA_PREFIX_KEY = "transformer_blocks"
35 |
36 |
37 |
38 | def export_lora_weight(ckpt_path, lora_save_directory):
39 |
40 | merge_original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True))
41 |
42 |
43 | lora_state_dict = {}
44 | for key in list(merge_original_state_dict.keys()):
45 | new_key = key[len(PREFIX_KEY) :]
46 | for special_key, lora_keys in LORA_KEYS_RENAME.items():
47 | if new_key.endswith(special_key):
48 | new_key = new_key.replace(special_key, lora_keys)
49 | new_key = new_key.replace(SAT_UNIT_KEY, LORA_PREFIX_KEY)
50 |
51 | lora_state_dict[new_key] = merge_original_state_dict[key]
52 |
53 |
54 |
55 | # final length should be 240
56 | if len(lora_state_dict) != 240:
57 | raise ValueError("lora_state_dict length is not 240")
58 |
59 |
60 |
61 | LoraBaseMixin.write_lora_layers(
62 | state_dict=lora_state_dict,
63 | save_directory=lora_save_directory,
64 | is_main_process=True,
65 | weight_name=None,
66 | save_function=None,
67 | safe_serialization=True
68 | )
69 |
70 |
71 | def get_args():
72 | parser = argparse.ArgumentParser()
73 | parser.add_argument(
74 | "--sat_pt_path", type=str, required=True, help="Path to original sat transformer checkpoint"
75 | )
76 | parser.add_argument("--lora_save_directory", type=str, required=True, help="Path where converted lora should be saved")
77 | return parser.parse_args()
78 |
79 |
80 | if __name__ == "__main__":
81 | args = get_args()
82 |
83 | export_lora_weight(args.sat_pt_path, args.lora_save_directory)
84 |
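Usage note (a sketch, not part of the original file): the 240-key check above corresponds to 30 transformer blocks × 8 LoRA tensors, which matches the CogVideoX-2b transformer, so the exported directory would typically be attached to that pipeline. This assumes a diffusers release whose `CogVideoXPipeline` supports `load_lora_weights` (0.30+); all paths below are placeholders.

```python
# Sketch: convert the SAT LoRA checkpoint, then attach the exported weights
# to a diffusers CogVideoX pipeline. Paths are placeholders.
#
#   python tools/export_sat_lora_weight.py \
#       --sat_pt_path /path/to/sat/mp_rank_00_model_states.pt \
#       --lora_save_directory /path/to/exported_lora
import torch
from diffusers import CogVideoXPipeline

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16)
pipe.load_lora_weights("/path/to/exported_lora")  # reads the safetensors written above
pipe.to("cuda")
```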
--------------------------------------------------------------------------------
/CogVideo/tools/llm_flux_cogvideox/generate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | NUM_VIDEOS=10
4 | INFERENCE_STEPS=50
5 | GUIDANCE_SCALE=7.0
6 | OUTPUT_DIR_PREFIX="outputs/gpu_"
7 | LOG_DIR_PREFIX="logs/gpu_"
8 |
9 | VIDEO_MODEL_PATH="/share/official_pretrains/hf_home/CogVideoX-5b-I2V"
10 | LLM_MODEL_PATH="/share/home/zyx/Models/Meta-Llama-3.1-8B-Instruct"
11 | IMAGE_MODEL_PATH="/share/home/zyx/Models/FLUX.1-dev"
12 |
13 | #VIDEO_MODEL_PATH="THUDM/CogVideoX-5B-I2V"
14 | #LLM_MODEL_PATH="THUDM/glm-4-9b-chat"
15 | #IMAGE_MODEL_PATH="black-forest-labs/FLUX.1-dev"
16 |
17 | CUDA_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}
18 |
19 | IFS=',' read -r -a GPU_ARRAY <<< "$CUDA_DEVICES"
20 |
21 | for i in "${!GPU_ARRAY[@]}"
22 | do
23 | GPU=${GPU_ARRAY[$i]}
24 | echo "Starting task on GPU $GPU..."
25 | CUDA_VISIBLE_DEVICES=$GPU nohup python3 llm_flux_cogvideox.py \
26 | --caption_generator_model_id $LLM_MODEL_PATH \
27 | --image_generator_model_id $IMAGE_MODEL_PATH \
28 | --model_path $VIDEO_MODEL_PATH \
29 | --num_videos $NUM_VIDEOS \
30 | --image_generator_num_inference_steps $INFERENCE_STEPS \
31 | --guidance_scale $GUIDANCE_SCALE \
32 | --use_dynamic_cfg \
33 | --output_dir ${OUTPUT_DIR_PREFIX}${GPU} \
34 | > ${LOG_DIR_PREFIX}${GPU}.log 2>&1 &
35 | done
--------------------------------------------------------------------------------
/CogVideo/tools/parallel_inference/run.sh:
--------------------------------------------------------------------------------
1 | set -x
2 |
3 | export PYTHONPATH=$PWD:$PYTHONPATH
4 |
5 | # Select the model type
6 | # The model is downloaded to a specified location on disk,
7 | # or you can simply use the model's ID on Hugging Face,
8 | # which will then be downloaded to the default cache path on Hugging Face.
9 |
10 | export MODEL_TYPE="CogVideoX"
11 | # Configuration for different model types
12 | # script, model_id, inference_step
13 | declare -A MODEL_CONFIGS=(
14 | ["CogVideoX"]="parallel_inference_xdit.py /cfs/dit/CogVideoX-2b 20"
15 | )
16 |
17 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
18 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}"
19 | export SCRIPT MODEL_ID INFERENCE_STEP
20 | else
21 | echo "Invalid MODEL_TYPE: $MODEL_TYPE"
22 | exit 1
23 | fi
24 |
25 | mkdir -p ./results
26 |
27 | # task args
28 | if [ "$MODEL_TYPE" = "CogVideoX" ]; then
29 | TASK_ARGS="--height 480 --width 720 --num_frames 9"
30 | fi
31 |
32 | # CogVideoX asserts sp_degree == ulysses_degree*ring_degree <= 2. Also, do not set the pipefusion degree.
33 | if [ "$MODEL_TYPE" = "CogVideoX" ]; then
34 | N_GPUS=4
35 | PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 1"
36 | CFG_ARGS="--use_cfg_parallel"
37 | fi
38 |
39 |
40 | torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
41 | --model $MODEL_ID \
42 | $PARALLEL_ARGS \
43 | $TASK_ARGS \
44 | $PIPEFUSION_ARGS \
45 | $OUTPUT_ARGS \
46 | --num_inference_steps $INFERENCE_STEP \
47 | --warmup_steps 0 \
48 | --prompt "A small dog." \
49 | $CFG_ARGS \
50 | $PARALLEL_VAE \
51 | $COMPILE_FLAG
52 |
--------------------------------------------------------------------------------
/CogVideo/tools/replicate/cog.yaml:
--------------------------------------------------------------------------------
1 | # Configuration for Cog ⚙️
2 | # Reference: https://cog.run/yaml
3 |
4 | build:
5 | # set to true if your model requires a GPU
6 | gpu: true
7 |
8 | # a list of ubuntu apt packages to install
9 | system_packages:
10 | - "libgl1-mesa-glx"
11 | - "libglib2.0-0"
12 |
13 | # python version in the form '3.11' or '3.11.4'
14 | python_version: "3.11"
15 |
16 | # a list of packages in the format <package>==<version>
17 | python_packages:
18 | - diffusers>=0.30.3
19 | - accelerate>=0.34.2
20 | - transformers>=4.44.2
21 | - numpy==1.26.0
22 | - torch>=2.4.0
23 | - torchvision>=0.19.0
24 | - sentencepiece>=0.2.0
25 | - SwissArmyTransformer>=0.4.12
26 | - imageio>=2.35.1
27 | - imageio-ffmpeg>=0.5.1
28 | - openai>=1.45.0
29 | - moviepy>=1.0.3
30 | - pillow==9.5.0
31 | - pydantic==1.10.7
32 | run:
33 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
34 |
35 | # predict.py defines how predictions are run on your model
36 | predict: "predict_t2v.py:Predictor"
37 | # predict: "predict_i2v.py:Predictor"
38 |
--------------------------------------------------------------------------------
/CogVideo/tools/replicate/predict_i2v.py:
--------------------------------------------------------------------------------
1 | # Prediction interface for Cog ⚙️
2 | # https://cog.run/python
3 |
4 | import os
5 | import subprocess
6 | import time
7 | import torch
8 | from diffusers import CogVideoXImageToVideoPipeline
9 | from diffusers.utils import export_to_video, load_image
10 | from cog import BasePredictor, Input, Path
11 |
12 |
13 | MODEL_CACHE = "model_cache_i2v"
14 | MODEL_URL = (
15 | f"https://weights.replicate.delivery/default/THUDM/CogVideo/{MODEL_CACHE}.tar"
16 | )
17 | os.environ["HF_DATASETS_OFFLINE"] = "1"
18 | os.environ["TRANSFORMERS_OFFLINE"] = "1"
19 | os.environ["HF_HOME"] = MODEL_CACHE
20 | os.environ["TORCH_HOME"] = MODEL_CACHE
21 | os.environ["HF_DATASETS_CACHE"] = MODEL_CACHE
22 | os.environ["TRANSFORMERS_CACHE"] = MODEL_CACHE
23 | os.environ["HUGGINGFACE_HUB_CACHE"] = MODEL_CACHE
24 |
25 |
26 | def download_weights(url, dest):
27 | start = time.time()
28 | print("downloading url: ", url)
29 | print("downloading to: ", dest)
30 | subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
31 | print("downloading took: ", time.time() - start)
32 |
33 |
34 | class Predictor(BasePredictor):
35 | def setup(self) -> None:
36 | """Load the model into memory to make running multiple predictions efficient"""
37 |
38 | if not os.path.exists(MODEL_CACHE):
39 | download_weights(MODEL_URL, MODEL_CACHE)
40 |
41 | # model_id: THUDM/CogVideoX-5b-I2V
42 | self.pipe = CogVideoXImageToVideoPipeline.from_pretrained(
43 | MODEL_CACHE, torch_dtype=torch.bfloat16
44 | ).to("cuda")
45 |
46 | self.pipe.enable_model_cpu_offload()
47 | self.pipe.vae.enable_tiling()
48 |
49 | def predict(
50 | self,
51 | prompt: str = Input(
52 | description="Input prompt", default="Starry sky slowly rotating."
53 | ),
54 | image: Path = Input(description="Input image"),
55 | num_inference_steps: int = Input(
56 | description="Number of denoising steps", ge=1, le=500, default=50
57 | ),
58 | guidance_scale: float = Input(
59 | description="Scale for classifier-free guidance", ge=1, le=20, default=6
60 | ),
61 | num_frames: int = Input(
62 | description="Number of frames for the output video", default=49
63 | ),
64 | seed: int = Input(
65 | description="Random seed. Leave blank to randomize the seed", default=None
66 | ),
67 | ) -> Path:
68 | """Run a single prediction on the model"""
69 |
70 | if seed is None:
71 | seed = int.from_bytes(os.urandom(2), "big")
72 | print(f"Using seed: {seed}")
73 |
74 | img = load_image(image=str(image))
75 |
76 | video = self.pipe(
77 | prompt=prompt,
78 | image=img,
79 | num_videos_per_prompt=1,
80 | num_inference_steps=num_inference_steps,
81 | num_frames=num_frames,
82 | guidance_scale=guidance_scale,
83 | generator=torch.Generator(device="cuda").manual_seed(seed),
84 | ).frames[0]
85 |
86 | out_path = "/tmp/out.mp4"
87 |
88 | export_to_video(video, out_path, fps=8)
89 | return Path(out_path)
90 |
--------------------------------------------------------------------------------
/CogVideo/tools/replicate/predict_t2v.py:
--------------------------------------------------------------------------------
1 | # Prediction interface for Cog ⚙️
2 | # https://cog.run/python
3 |
4 | import os
5 | import subprocess
6 | import time
7 | import torch
8 | from diffusers import CogVideoXPipeline
9 | from diffusers.utils import export_to_video
10 | from cog import BasePredictor, Input, Path
11 |
12 |
13 | MODEL_CACHE = "model_cache"
14 | MODEL_URL = (
15 | f"https://weights.replicate.delivery/default/THUDM/CogVideo/{MODEL_CACHE}.tar"
16 | )
17 | os.environ["HF_DATASETS_OFFLINE"] = "1"
18 | os.environ["TRANSFORMERS_OFFLINE"] = "1"
19 | os.environ["HF_HOME"] = MODEL_CACHE
20 | os.environ["TORCH_HOME"] = MODEL_CACHE
21 | os.environ["HF_DATASETS_CACHE"] = MODEL_CACHE
22 | os.environ["TRANSFORMERS_CACHE"] = MODEL_CACHE
23 | os.environ["HUGGINGFACE_HUB_CACHE"] = MODEL_CACHE
24 |
25 |
26 | def download_weights(url, dest):
27 | start = time.time()
28 | print("downloading url: ", url)
29 | print("downloading to: ", dest)
30 | subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
31 | print("downloading took: ", time.time() - start)
32 |
33 |
34 | class Predictor(BasePredictor):
35 | def setup(self) -> None:
36 | """Load the model into memory to make running multiple predictions efficient"""
37 |
38 | if not os.path.exists(MODEL_CACHE):
39 | download_weights(MODEL_URL, MODEL_CACHE)
40 |
41 | # model_id: THUDM/CogVideoX-5b
42 | self.pipe = CogVideoXPipeline.from_pretrained(
43 | MODEL_CACHE,
44 | torch_dtype=torch.bfloat16,
45 | ).to("cuda")
46 |
47 | self.pipe.enable_model_cpu_offload()
48 | self.pipe.vae.enable_tiling()
49 |
50 | def predict(
51 | self,
52 | prompt: str = Input(
53 | description="Input prompt",
54 | default="A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance.",
55 | ),
56 | num_inference_steps: int = Input(
57 | description="Number of denoising steps", ge=1, le=500, default=50
58 | ),
59 | guidance_scale: float = Input(
60 | description="Scale for classifier-free guidance", ge=1, le=20, default=6
61 | ),
62 | num_frames: int = Input(
63 | description="Number of frames for the output video", default=49
64 | ),
65 | seed: int = Input(
66 | description="Random seed. Leave blank to randomize the seed", default=None
67 | ),
68 | ) -> Path:
69 | """Run a single prediction on the model"""
70 |
71 | if seed is None:
72 | seed = int.from_bytes(os.urandom(2), "big")
73 | print(f"Using seed: {seed}")
74 |
75 | video = self.pipe(
76 | prompt=prompt,
77 | num_videos_per_prompt=1,
78 | num_inference_steps=num_inference_steps,
79 | num_frames=num_frames,
80 | guidance_scale=guidance_scale,
81 | generator=torch.Generator(device="cuda").manual_seed(seed),
82 | ).frames[0]
83 |
84 | out_path = "/tmp/out.mp4"
85 |
86 | export_to_video(video, out_path, fps=8)
87 | return Path(out_path)
88 |
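For a quick local smoke test outside the Cog runtime (a sketch, assuming `model_cache/` is already populated, the `cog` package is installed, and a CUDA GPU is available), the predictor class can be called directly:

```python
# Sketch: exercise the predictor once without `cog predict`.
from predict_t2v import Predictor

predictor = Predictor()
predictor.setup()  # loads CogVideoXPipeline from the local model_cache
out_path = predictor.predict(
    prompt="A panda strumming a tiny guitar in a bamboo forest.",
    num_inference_steps=50,
    guidance_scale=6.0,
    num_frames=49,
    seed=42,
)
print(f"Video written to {out_path}")
```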
--------------------------------------------------------------------------------
/CogVideo/weights/put weights here.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/weights/put weights here.txt
--------------------------------------------------------------------------------
/dataset/traj_vis/Hemi12_transforms.json:
--------------------------------------------------------------------------------
1 | {
2 | "C_01_35mm": "[-0.8622445326446021 -0.497817113029644 -0.09334070869305826 0] [0.49999999999999994 -0.8660254037844387 0.0 0] [-0.08083542493543144 -0.04667035434652912 0.9956342260592881 0] [692.820323027551 399.99999999999994 0.0 1]",
3 | "C_02_35mm": "[-0.49781711302964426 -0.862244532644602 -0.09334070869305827 0] [0.8660254037844386 -0.5000000000000002 0.0 0] [-0.04667035434652916 -0.08083542493543144 0.9956342260592881 0] [400.0000000000001 692.8203230275509 0.0 1]",
4 | "C_03_35mm": "[-1.6011019497192098e-16 -0.9956342260592881 -0.09334070869305827 0] [1.0 -1.6081226496766366e-16 0.0 0] [-1.5010330778617594e-17 -0.09334070869305827 0.9956342260592881 0] [4.898587196589413e-14 800.0 0.0 1]",
5 | "C_04_35mm": "[0.49781711302964377 -0.8622445326446022 -0.09334070869305827 0] [0.8660254037844388 0.4999999999999997 0.0 0] [0.04667035434652911 -0.08083542493543147 0.9956342260592881 0] [-399.99999999999983 692.820323027551 0.0 1]",
6 | "C_05_35mm": "[0.8622445326446021 -0.4978171130296439 -0.09334070869305826 0] [0.49999999999999983 0.8660254037844387 0.0 0] [0.08083542493543144 -0.046670354346529115 0.9956342260592881 0] [-692.820323027551 399.99999999999994 0.0 1]",
7 | "C_06_35mm": "[0.9956342260592881 -1.2193002680650596e-16 -0.09334070869305827 0] [1.2246467991473532e-16 1.0 0.0 0] [0.09334070869305827 -1.1430940013109933e-17 0.9956342260592881 0] [-800.0 9.797174393178826e-14 0.0 1]",
8 | "C_07_35mm": "[0.862244532644602 0.49781711302964415 -0.09334070869305827 0] [-0.5000000000000001 0.8660254037844386 0.0 0] [0.08083542493543144 0.04667035434652914 0.9956342260592881 0] [-692.8203230275509 -400.0000000000001 0.0 1]",
9 | "C_08_35mm": "[0.4978171130296444 0.8622445326446019 -0.09334070869305827 0] [-0.8660254037844385 0.5000000000000003 0.0 0] [0.046670354346529164 0.08083542493543144 0.9956342260592881 0] [-400.00000000000034 -692.8203230275508 0.0 1]",
10 | "C_09_35mm": "[2.820402217784269e-16 0.9956342260592881 -0.09334070869305827 0] [-1.0 2.83276944882399e-16 0.0 0] [2.6441270791727528e-17 0.09334070869305827 0.9956342260592881 0] [-1.4695761589768238e-13 -800.0 0.0 1]",
11 | "C_10_35mm": "[-0.49781711302964426 0.862244532644602 -0.09334070869305827 0] [-0.8660254037844386 -0.5000000000000002 0.0 0] [-0.04667035434652916 0.08083542493543144 0.9956342260592881 0] [400.0000000000001 -692.8203230275509 0.0 1]",
12 | "C_11_35mm": "[-0.8622445326446019 0.4978171130296444 -0.09334070869305827 0] [-0.5000000000000003 -0.8660254037844385 0.0 0] [-0.08083542493543144 0.046670354346529164 0.9956342260592881 0] [692.8203230275507 -400.00000000000034 0.0 1]",
13 | "C_12_35mm": "[-0.9956342260592881 1.2193002680650596e-16 -0.09334070869305827 0] [-1.2246467991473532e-16 -1.0 0.0 0] [-0.09334070869305827 1.1430940013109933e-17 0.9956342260592881 0] [800.0 -1.9594348786357651e-13 0.0 1]"
14 | }
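Each value above appears to encode a per-camera 4x4 transform as four bracketed rows of four floats, with the translation in the last row (row-vector convention). A minimal parsing sketch under that assumption:

```python
# Sketch: parse Hemi12_transforms.json into 4x4 numpy matrices.
import json
import re
import numpy as np

def parse_transform(value: str) -> np.ndarray:
    rows = re.findall(r"\[([^\]]+)\]", value)  # four "[a b c d]" groups
    mat = np.array([[float(x) for x in row.split()] for row in rows])
    assert mat.shape == (4, 4)
    return mat

with open("dataset/traj_vis/Hemi12_transforms.json") as f:
    cameras = {name: parse_transform(v) for name, v in json.load(f).items()}

print(cameras["C_01_35mm"][3, :3])  # translation row of the first camera
```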
--------------------------------------------------------------------------------
/eval/GVHMR/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "third-party/DPVO"]
2 | path = third-party/DPVO
3 | url = https://github.com/princeton-vl/DPVO.git
4 |
--------------------------------------------------------------------------------
/eval/GVHMR/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2022-2023 3D Vision Group at the State Key Lab of CAD&CG,
2 | Zhejiang University. All Rights Reserved.
3 |
4 | For more information see
5 | If you use this software, please cite the corresponding publications
6 | listed on the above website.
7 |
8 | Permission to use, copy, modify and distribute this software and its
9 | documentation for educational, research and non-profit purposes only.
10 | Any modification based on this work must be open-source and prohibited
11 | for commercial use.
12 | You must retain, in the source form of any derivative works that you
13 | distribute, all copyright, patent, trademark, and attribution notices
14 | from the source form of this work.
15 |
16 | For commercial uses of this software, please send email to xwzhou@zju.edu.cn
--------------------------------------------------------------------------------
/eval/GVHMR/README.md:
--------------------------------------------------------------------------------
1 | # GVHMR: World-Grounded Human Motion Recovery via Gravity-View Coordinates
2 | ### [Project Page](https://zju3dv.github.io/gvhmr) | [Paper](https://arxiv.org/abs/2409.06662)
3 |
4 | > World-Grounded Human Motion Recovery via Gravity-View Coordinates
5 | > [Zehong Shen](https://zehongs.github.io/)\*,
6 | [Huaijin Pi](https://phj128.github.io/)\*,
7 | [Yan Xia](https://isshikihugh.github.io/scholar),
8 | [Zhi Cen](https://scholar.google.com/citations?user=Xyy-uFMAAAAJ),
9 | [Sida Peng](https://pengsida.net/)†,
10 | [Zechen Hu](https://zju3dv.github.io/gvhmr),
11 | [Hujun Bao](http://www.cad.zju.edu.cn/home/bao/),
12 | [Ruizhen Hu](https://csse.szu.edu.cn/staff/ruizhenhu/),
13 | [Xiaowei Zhou](https://xzhou.me/)
14 | > SIGGRAPH Asia 2024
15 |
16 |
17 |
18 |
19 |
20 | ## Setup
21 |
22 | Please see [installation](docs/INSTALL.md) for details.
23 |
24 | ## Quick Start
25 |
26 | ### [Google Colab demo for GVHMR](https://colab.research.google.com/drive/1N9WSchizHv2bfQqkE9Wuiegw_OT7mtGj?usp=sharing)
27 |
28 | ### [HuggingFace demo for GVHMR](https://huggingface.co/spaces/LittleFrog/GVHMR)
29 |
30 | ### Demo
31 | Demo entries are provided in `tools/demo`. Use `-s` to skip visual odometry if you know the camera is static; otherwise the camera motion will be estimated by DPVO.
32 | We also provide a script `demo_folder.py` to run inference on an entire folder.
33 | ```shell
34 | python tools/demo/demo.py --video=docs/example_video/tennis.mp4 -s
35 | python tools/demo/demo_folder.py -f inputs/demo/folder_in -d outputs/demo/folder_out -s
36 | ```
37 |
38 | ### Reproduce
39 | 1. **Test**:
40 | To reproduce the 3DPW, RICH, and EMDB results in a single run, use the following command:
41 | ```shell
42 | python tools/train.py global/task=gvhmr/test_3dpw_emdb_rich exp=gvhmr/mixed/mixed ckpt_path=inputs/checkpoints/gvhmr/gvhmr_siga24_release.ckpt
43 | ```
44 | To test individual datasets, change `global/task` to `gvhmr/test_3dpw`, `gvhmr/test_rich`, or `gvhmr/test_emdb`.
45 |
46 | 2. **Train**:
47 | To train the model, use the following command:
48 | ```shell
49 | # The gvhmr_siga24_release.ckpt was trained on 2x4090 GPUs for 420 epochs; note that different GPU settings may lead to different results.
50 | python tools/train.py exp=gvhmr/mixed/mixed
51 | ```
52 | Note that during training we do not apply the post-processing used in the test script, so the global metric results will differ (but they remain suitable for comparison with baseline methods).
53 |
54 | # Citation
55 |
56 | If you find this code useful for your research, please use the following BibTeX entry.
57 |
58 | ```
59 | @inproceedings{shen2024gvhmr,
60 | title={World-Grounded Human Motion Recovery via Gravity-View Coordinates},
61 | author={Shen, Zehong and Pi, Huaijin and Xia, Yan and Cen, Zhi and Peng, Sida and Hu, Zechen and Bao, Hujun and Hu, Ruizhen and Zhou, Xiaowei},
62 | booktitle={SIGGRAPH Asia Conference Proceedings},
63 | year={2024}
64 | }
65 | ```
66 |
67 | # Acknowledgement
68 |
69 | We thank the authors of
70 | [WHAM](https://github.com/yohanshin/WHAM),
71 | [4D-Humans](https://github.com/shubham-goel/4D-Humans),
72 | and [ViTPose-Pytorch](https://github.com/gpastal24/ViTPose-Pytorch) for their great works, without which our project/code would not be possible.
73 |
--------------------------------------------------------------------------------
/eval/GVHMR/docs/INSTALL.md:
--------------------------------------------------------------------------------
1 | # Install
2 |
3 | ## Environment
4 |
5 | ```bash
6 | git clone https://github.com/zju3dv/GVHMR --recursive
7 | cd GVHMR
8 |
9 | conda create -y -n gvhmr python=3.10
10 | conda activate gvhmr
11 | pip install -r requirements.txt
12 | pip install -e .
13 | # To use gvhmr as an editable install in another repo, try adding "python.analysis.extraPaths": ["path/to/your/package"] to settings.json (VS Code)
14 |
15 | # DPVO
16 | cd third-party/DPVO
17 | wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.zip
18 | unzip eigen-3.4.0.zip -d thirdparty && rm -rf eigen-3.4.0.zip
19 | pip install torch-scatter -f "https://data.pyg.org/whl/torch-2.3.0+cu121.html"
20 | pip install numba pypose
21 | export CUDA_HOME=/usr/local/cuda-12.1/
22 | export PATH=$PATH:/usr/local/cuda-12.1/bin/
23 | pip install -e .
24 | ```
25 |
26 | ## Inputs & Outputs
27 |
28 | ```bash
29 | mkdir inputs
30 | mkdir outputs
31 | ```
32 |
33 | **Weights**
34 |
35 | ```bash
36 | mkdir -p inputs/checkpoints
37 |
38 | # 1. You need to sign up to download [SMPL](https://smpl.is.tue.mpg.de/) and [SMPLX](https://smpl-x.is.tue.mpg.de/), and the checkpoints should be placed in the following structure:
39 |
40 | inputs/checkpoints/
41 | ├── body_models/smplx/
42 | │ └── SMPLX_{GENDER}.npz # SMPLX (We predict SMPLX params + evaluation)
43 | └── body_models/smpl/
44 | └── SMPL_{GENDER}.pkl # SMPL (rendering and evaluation)
45 |
46 | # 2. Download other pretrained models from Google-Drive (By downloading, you agree to the corresponding licences): https://drive.google.com/drive/folders/1eebJ13FUEXrKBawHpJroW0sNSxLjh9xD?usp=drive_link
47 |
48 | inputs/checkpoints/
49 | ├── dpvo/
50 | │ └── dpvo.pth
51 | ├── gvhmr/
52 | │ └── gvhmr_siga24_release.ckpt
53 | ├── hmr2/
54 | │ └── epoch=10-step=25000.ckpt
55 | ├── vitpose/
56 | │ └── vitpose-h-multi-coco.pth
57 | └── yolo/
58 | └── yolov8x.pt
59 | ```
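A small sketch (not part of the original instructions) to verify that the checkpoints listed above ended up in the expected locations; run it from the repository root:

```python
# Sketch: confirm the pretrained checkpoints are where GVHMR expects them.
from pathlib import Path

EXPECTED = [
    "inputs/checkpoints/dpvo/dpvo.pth",
    "inputs/checkpoints/gvhmr/gvhmr_siga24_release.ckpt",
    "inputs/checkpoints/hmr2/epoch=10-step=25000.ckpt",
    "inputs/checkpoints/vitpose/vitpose-h-multi-coco.pth",
    "inputs/checkpoints/yolo/yolov8x.pt",
]

missing = [p for p in EXPECTED if not Path(p).exists()]
print("All checkpoints found." if not missing else f"Missing: {missing}")
```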
60 |
61 | **Data**
62 |
63 | We provide preprocessed data for training and evaluation.
64 | Note that we do not distribute the original datasets; you need to download them (annotations, videos, etc.) from the original websites.
65 | *We're unable to provide the original data due to license restrictions.*
66 | By downloading the preprocessed data, you agree to the original datasets' terms of use and to use the data for research purposes only.
67 |
68 | You can download them from [Google-Drive](https://drive.google.com/drive/folders/10sEef1V_tULzddFxzCmDUpsIqfv7eP-P?usp=drive_link). Please place them in the "inputs" folder and execute the following commands:
69 |
70 | ```bash
71 | cd inputs
72 | # Train
73 | tar -xzvf AMASS_hmr4d_support.tar.gz
74 | tar -xzvf BEDLAM_hmr4d_support.tar.gz
75 | tar -xzvf H36M_hmr4d_support.tar.gz
76 | # Test
77 | tar -xzvf 3DPW_hmr4d_support.tar.gz
78 | tar -xzvf EMDB_hmr4d_support.tar.gz
79 | tar -xzvf RICH_hmr4d_support.tar.gz
80 |
81 | # The folder structure should be like this:
82 | inputs/
83 | ├── AMASS/hmr4d_support/
84 | ├── BEDLAM/hmr4d_support/
85 | ├── H36M/hmr4d_support/
86 | ├── 3DPW/hmr4d_support/
87 | ├── EMDB/hmr4d_support/
88 | └── RICH/hmr4d_support/
89 | ```
90 |
--------------------------------------------------------------------------------
/eval/GVHMR/docs/example_video/project_teaser.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/docs/example_video/project_teaser.gif
--------------------------------------------------------------------------------
/eval/GVHMR/docs/example_video/tennis.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/docs/example_video/tennis.mp4
--------------------------------------------------------------------------------
/eval/GVHMR/download_eval_pose.sh:
--------------------------------------------------------------------------------
1 | gdown https://drive.google.com/uc\?id\=1jMH2-ZC0ZBgtqej5Sp-E5ebBIX7mk3Xz
2 | gdown https://drive.google.com/uc\?id\=1iFcPSlcKb_rDNJ85UPoThdl22BqR2Xgh
3 |
4 | unzip eval_sets.zip
5 | rm -rf eval_sets.zip
--------------------------------------------------------------------------------
/eval/GVHMR/eval.sh:
--------------------------------------------------------------------------------
1 | python tools/demo/demo_folder.py -f eval_sets -d outputs/eval_sets_gvhmr -s
2 | python tools/eval_pose.py -f outputs/eval_sets_gvhmr_v2
3 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | PROJ_ROOT = Path(__file__).resolve().parents[1]
5 |
6 |
7 | def os_chdir_to_proj_root():
8 | """useful for running notebooks in different directories."""
9 | os.chdir(PROJ_ROOT)
10 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/build_gvhmr.py:
--------------------------------------------------------------------------------
1 | from omegaconf import OmegaConf
2 | from hmr4d import PROJ_ROOT
3 | from hydra.utils import instantiate
4 | from hmr4d.model.gvhmr.gvhmr_pl_demo import DemoPL
5 |
6 |
7 | def build_gvhmr_demo():
8 | cfg = OmegaConf.load(PROJ_ROOT / "hmr4d/configs/demo_gvhmr_model/siga24_release.yaml")
9 | gvhmr_demo_pl: DemoPL = instantiate(cfg.model, _recursive_=False)
10 | gvhmr_demo_pl.load_pretrained_model(PROJ_ROOT / "inputs/checkpoints/gvhmr/gvhmr_siga24_release.ckpt")
11 | return gvhmr_demo_pl.eval()
12 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/__init__.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from hydra.core.config_store import ConfigStore
3 | from hydra_zen import builds
4 |
5 | import argparse
6 | from hydra import compose, initialize_config_module
7 | import os
8 |
9 | os.environ["HYDRA_FULL_ERROR"] = "1"
10 |
11 | MainStore = ConfigStore.instance()
12 |
13 |
14 | def register_store_gvhmr():
15 | """Register group options to MainStore"""
16 | from . import store_gvhmr
17 |
18 |
19 | def parse_args_to_cfg():
20 | """
21 | Use minimal Hydra API to parse args and return cfg.
22 | This function does not call _run_hydra, which creates the log-file hierarchy.
23 | """
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument("--config-name", "-cn", default="train")
26 | parser.add_argument(
27 | "overrides",
28 | nargs="*",
29 | help="Any key=value arguments to override config values (use dots for nested.overrides)",
30 | )
31 | args = parser.parse_args()
32 |
33 | # Cfg
34 | with initialize_config_module(version_base="1.3", config_module="hmr4d.configs"):
35 | cfg = compose(config_name=args.config_name, overrides=args.overrides)
36 |
37 | return cfg
38 |
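As a usage sketch (the script name and overrides are illustrative, not part of the module), `parse_args_to_cfg` can drive any entry script with standard Hydra-style `key=value` overrides; dumping the composed config with OmegaConf is a convenient sanity check:

```python
# Sketch: compose a config via parse_args_to_cfg and print it as YAML.
# Example invocation:  python inspect_cfg.py -cn demo video_name=tennis static_cam=True
from omegaconf import OmegaConf
from hmr4d.configs import parse_args_to_cfg, register_store_gvhmr

if __name__ == "__main__":
    register_store_gvhmr()     # register dataset/model/callback groups first
    cfg = parse_args_to_cfg()  # honors --config-name / -cn plus key=value overrides
    print(OmegaConf.to_yaml(cfg))
```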
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/data/mocap/testY.yaml:
--------------------------------------------------------------------------------
1 | # definition of lightning datamodule (dataset + dataloader)
2 | _target_: hmr4d.datamodule.mocap_trainX_testY.DataModule
3 |
4 | dataset_opts:
5 | test: ${test_datasets}
6 |
7 | loader_opts:
8 | test:
9 | batch_size: 1
10 | num_workers: 0
11 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/data/mocap/trainX_testY.yaml:
--------------------------------------------------------------------------------
1 | # definition of lightning datamodule (dataset + dataloader)
2 | _target_: hmr4d.datamodule.mocap_trainX_testY.DataModule
3 |
4 | dataset_opts:
5 | train: ${train_datasets}
6 | val: ${test_datasets}
7 |
8 | loader_opts:
9 | train:
10 | batch_size: 32
11 | num_workers: 8
12 | val:
13 | batch_size: 1
14 | num_workers: 1
15 |
16 | limit_each_trainset: null
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/demo.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - _self_
3 | - model: gvhmr/gvhmr_pl_demo
4 | - network: gvhmr/relative_transformer
5 | - endecoder: gvhmr/v1_amass_local_bedlam_cam
6 |
7 | pipeline:
8 | _target_: hmr4d.model.gvhmr.pipeline.gvhmr_pipeline.Pipeline
9 | args_denoiser3d: ${network}
10 | args:
11 | endecoder_opt: ${endecoder}
12 | normalize_cam_angvel: True
13 | weights: null
14 | static_conf: null
15 |
16 | ckpt_path: inputs/checkpoints/gvhmr/gvhmr_siga24_release.ckpt
17 |
18 | # ================================ #
19 | # global setting #
20 | # ================================ #
21 |
22 | video_name: ???
23 | output_root: outputs/demo
24 | output_dir: "${output_root}/${video_name}"
25 | preprocess_dir: ${output_dir}/preprocess
26 | video_path: "${output_dir}/0_input_video.mp4"
27 |
28 | # Options
29 | static_cam: False
30 | verbose: False
31 |
32 | paths:
33 | bbx: ${preprocess_dir}/bbx.pt
34 | bbx_xyxy_video_overlay: ${preprocess_dir}/bbx_xyxy_video_overlay.mp4
35 | vit_features: ${preprocess_dir}/vit_features.pt
36 | vitpose: ${preprocess_dir}/vitpose.pt
37 | vitpose_video_overlay: ${preprocess_dir}/vitpose_video_overlay.mp4
38 | hmr4d_results: ${output_dir}/hmr4d_results.pt
39 | incam_video: ${output_dir}/1_incam.mp4
40 | global_video: ${output_dir}/2_global.mp4
41 | incam_global_horiz_video: ${output_dir}/${video_name}_3_incam_global_horiz.mp4
42 | slam: ${preprocess_dir}/slam_results.pt
43 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/exp/gvhmr/mixed/mixed.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | defaults:
3 | - override /data: mocap/trainX_testY
4 | - override /model: gvhmr/gvhmr_pl
5 | - override /endecoder: gvhmr/v1_amass_local_bedlam_cam
6 | - override /optimizer: adamw_2e-4
7 | - override /scheduler_cfg: epoch_half_200_350
8 | - override /train_datasets:
9 | - pure_motion_amass/v11
10 | - imgfeat_bedlam/v2
11 | - imgfeat_h36m/v1
12 | - imgfeat_3dpw/v1
13 | - override /test_datasets:
14 | - emdb1/v1_fliptest
15 | - emdb2/v1_fliptest
16 | - rich/all
17 | - 3dpw/fliptest
18 | - override /callbacks:
19 | - simple_ckpt_saver/every10e_top100
20 | - prog_bar/prog_reporter_every0.1
21 | - train_speed_timer/base
22 | - lr_monitor/pl
23 | - metric_emdb1
24 | - metric_emdb2
25 | - metric_rich
26 | - metric_3dpw
27 | - override /network: gvhmr/relative_transformer
28 |
29 | exp_name_base: mixed
30 | exp_name_var: ""
31 | exp_name: ${exp_name_base}${exp_name_var}
32 | data_name: mocap_mixed_v1
33 |
34 | pipeline:
35 | _target_: hmr4d.model.gvhmr.pipeline.gvhmr_pipeline.Pipeline
36 | args_denoiser3d: ${network}
37 | args:
38 | endecoder_opt: ${endecoder}
39 | normalize_cam_angvel: True
40 | weights:
41 | cr_j3d: 500.
42 | transl_c: 1.
43 | cr_verts: 500.
44 | j2d: 1000.
45 | verts2d: 1000.
46 |
47 | transl_w: 1.
48 | static_conf_bce: 1.
49 |
50 | static_conf:
51 | vel_thr: 0.15
52 |
53 | data:
54 | loader_opts:
55 | train:
56 | batch_size: 128
57 | num_workers: 12
58 |
59 | pl_trainer:
60 | precision: 16-mixed
61 | log_every_n_steps: 50
62 | gradient_clip_val: 0.5
63 | max_epochs: 500
64 | check_val_every_n_epoch: 10
65 | devices: 2
66 |
67 | logger:
68 | _target_: pytorch_lightning.loggers.TensorBoardLogger
69 | save_dir: ${output_dir} # /save_dir/name/version/sub_dir
70 | name: ""
71 | version: "tb" # merge name and version
72 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/global/debug/debug_train.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | data_name: debug
4 | exp_name: debug
5 |
6 | # data:
7 | # limit_each_trainset: 40
8 | # loader_opts:
9 | # train:
10 | # batch_size: 4
11 | # num_workers: 0
12 | # val:
13 | # batch_size: 1
14 | # num_workers: 0
15 |
16 | pl_trainer:
17 | limit_train_batches: 32
18 | limit_val_batches: 2
19 | check_val_every_n_epoch: 3
20 | enable_checkpointing: False
21 | devices: 1
22 |
23 | callbacks:
24 | model_checkpoint: null
25 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/global/debug/debug_train_limit_data.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | data_name: debug
4 | exp_name: debug
5 |
6 | data:
7 | limit_each_trainset: 40
8 | loader_opts:
9 | train:
10 | batch_size: 4
11 | num_workers: 0
12 | val:
13 | batch_size: 1
14 | num_workers: 0
15 |
16 | pl_trainer:
17 | limit_val_batches: 2
18 | check_val_every_n_epoch: 3
19 | enable_checkpointing: False
20 | devices: 1
21 |
22 | callbacks:
23 | model_checkpoint: null
24 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_3dpw.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | defaults:
3 | - override /data: mocap/testY
4 | - override /test_datasets:
5 | - 3dpw/fliptest
6 | - override /callbacks:
7 | - metric_3dpw
8 | - _self_
9 |
10 | task: test
11 | data_name: test_mocap
12 | ckpt_path: ??? # will not override previous setting if already set
13 |
14 | # lightning utilities
15 | pl_trainer:
16 | devices: 1
17 | logger: null
18 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_3dpw_emdb_rich.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | defaults:
3 | - override /data: mocap/testY
4 | - override /test_datasets:
5 | - rich/all
6 | - emdb1/v1_fliptest
7 | - emdb2/v1_fliptest
8 | - 3dpw/fliptest
9 | - override /callbacks:
10 | - metric_rich
11 | - metric_emdb1
12 | - metric_emdb2
13 | - metric_3dpw
14 | - _self_
15 |
16 | task: test
17 | data_name: test_mocap
18 | ckpt_path: ??? # will not override previous setting if already set
19 |
20 | # lightning utilities
21 | pl_trainer:
22 | devices: 1
23 | logger: null
24 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_emdb.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | defaults:
3 | - override /data: mocap/testY
4 | - override /test_datasets:
5 | - emdb1/v1_fliptest
6 | - emdb2/v1_fliptest
7 | - override /callbacks:
8 | - metric_emdb1
9 | - metric_emdb2
10 | - _self_
11 |
12 | task: test
13 | data_name: test_mocap
14 | ckpt_path: ??? # will not override previous setting if already set
15 |
16 | # lightning utilities
17 | pl_trainer:
18 | devices: 1
19 | logger: null
20 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_rich.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | defaults:
3 | - override /data: mocap/testY
4 | - override /test_datasets:
5 | - rich/all
6 | - override /callbacks:
7 | - metric_rich
8 | - _self_
9 |
10 | task: test
11 | data_name: test_mocap
12 | ckpt_path: ??? # will not override previous setting if already set
13 |
14 | # lightning utilities
15 | pl_trainer:
16 | devices: 1
17 | logger: null
18 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/hydra/default.yaml:
--------------------------------------------------------------------------------
1 | # enable color logging
2 | defaults:
3 | - override hydra_logging: colorlog
4 | - override job_logging: colorlog
5 |
6 | job_logging:
7 | formatters:
8 | simple:
9 | datefmt: '%m/%d %H:%M:%S'
10 | format: '[%(asctime)s][%(levelname)s] %(message)s'
11 | colorlog:
12 | datefmt: '%m/%d %H:%M:%S'
13 | format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] %(message)s'
14 | handlers:
15 | file:
16 | filename: ${output_dir}/${hydra.job.name}.log
17 |
18 | run:
19 | dir: ${output_dir}
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/siga24_release.yaml:
--------------------------------------------------------------------------------
1 | pipeline:
2 | _target_: hmr4d.model.gvhmr.pipeline.gvhmr_pipeline.Pipeline
3 | args_denoiser3d: ${network}
4 | args:
5 | endecoder_opt: ${endecoder}
6 | normalize_cam_angvel: true
7 | weights: null
8 | static_conf: null
9 | model:
10 | _target_: hmr4d.model.gvhmr.gvhmr_pl_demo.DemoPL
11 | pipeline: ${pipeline}
12 | network:
13 | _target_: hmr4d.network.gvhmr.relative_transformer.NetworkEncoderRoPEV2
14 | output_dim: 151
15 | max_len: 120
16 | kp2d_mapping: linear_v2
17 | cliffcam_dim: 3
18 | cam_angvel_dim: 6
19 | imgseq_dim: 1024
20 | f_imgseq_filter: null
21 | cond_ver: v1
22 | latent_dim: 512
23 | num_layers: 12
24 | num_heads: 8
25 | mlp_ratio: 4.0
26 | pred_cam_ver: v2
27 | pred_cam_dim: 3
28 | static_conf_dim: 6
29 | pred_coco17_dim: 0
30 | dropout: 0.1
31 | avgbeta: true
32 | endecoder:
33 | _target_: hmr4d.model.gvhmr.utils.endecoder.EnDecoder
34 | stats_name: MM_V1_AMASS_LOCAL_BEDLAM_CAM
35 | noise_pose_k: 10
36 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/store_gvhmr.py:
--------------------------------------------------------------------------------
1 | # Dataset
2 | import hmr4d.dataset.pure_motion.amass
3 | import hmr4d.dataset.emdb.emdb_motion_test
4 | import hmr4d.dataset.rich.rich_motion_test
5 | import hmr4d.dataset.threedpw.threedpw_motion_test
6 | import hmr4d.dataset.threedpw.threedpw_motion_train
7 | import hmr4d.dataset.bedlam.bedlam
8 | import hmr4d.dataset.h36m.h36m
9 |
10 | # Trainer: Model Optimizer Loss
11 | import hmr4d.model.gvhmr.gvhmr_pl
12 | import hmr4d.model.gvhmr.utils.endecoder
13 | import hmr4d.model.common_utils.optimizer
14 | import hmr4d.model.common_utils.scheduler_cfg
15 |
16 | # Metric
17 | import hmr4d.model.gvhmr.callbacks.metric_emdb
18 | import hmr4d.model.gvhmr.callbacks.metric_rich
19 | import hmr4d.model.gvhmr.callbacks.metric_3dpw
20 |
21 |
22 | # PL Callbacks
23 | import hmr4d.utils.callbacks.simple_ckpt_saver
24 | import hmr4d.utils.callbacks.train_speed_timer
25 | import hmr4d.utils.callbacks.prog_bar
26 | import hmr4d.utils.callbacks.lr_monitor
27 |
28 | # Networks
29 | import hmr4d.network.gvhmr.relative_transformer
30 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/configs/train.yaml:
--------------------------------------------------------------------------------
1 | # ================================ #
2 | # override #
3 | # ================================ #
4 | # specify default configuration; the order determines the override order
5 | defaults:
6 | - _self_
7 | # pytorch-lightning
8 | - data: ???
9 | - model: ???
10 | - callbacks: null
11 |
12 | # system
13 | - hydra: default
14 |
15 |   # utility groups that change a lot
16 | - pipeline: null
17 | - network: null
18 | - optimizer: null
19 | - scheduler_cfg: default
20 | - train_datasets: null
21 | - test_datasets: null
22 | - endecoder: null # normalize/unnormalize data
23 | - refiner: null
24 |
25 | # global-override
26 | - exp: ??? # set "data, model and callbacks" in yaml
27 | - global/task: null # dump/test
28 | - global/hsearch: null # hyper-param search
29 | - global/debug: null # debug mode
30 |
31 | # ================================ #
32 | # global setting #
33 | # ================================ #
34 | # experiment information
35 | task: fit # [fit, predict]
36 | exp_name: ???
37 | data_name: ???
38 |
39 | # utilities in the entry file
40 | output_dir: "outputs/${data_name}/${exp_name}"
41 | ckpt_path: null
42 | resume_mode: null
43 | seed: 42
44 |
45 | # lightning default settings
46 | pl_trainer:
47 | devices: 1
48 | num_sanity_val_steps: 0 # disable sanity check
49 | precision: 32
50 | inference_mode: False
51 |
52 | logger: null
53 |
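The `???` entries above are OmegaConf/Hydra mandatory-missing markers: they must be filled by a later config in the defaults list (e.g. the `exp` group) or on the command line before they are read. A minimal editorial sketch of that behaviour, using OmegaConf directly with a made-up key rather than the repo's entry point:

from omegaconf import OmegaConf
from omegaconf.errors import MissingMandatoryValue

cfg = OmegaConf.create({"exp_name": "???", "seed": 42})  # mirrors the '???' markers above
try:
    _ = cfg.exp_name  # reading a missing mandatory value raises
except MissingMandatoryValue:
    print("exp_name must be provided, e.g. exp_name=my_run on the command line")
print(cfg.seed)  # 42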
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/dataset/bedlam/resource/vname2lwh.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/dataset/bedlam/resource/vname2lwh.pt
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/dataset/bedlam/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from pathlib import Path
4 |
5 | resource_dir = Path(__file__).parent / "resource"
6 |
7 |
8 | def mid2vname(mid):
9 | """vname = {scene}/{seq}, Note that it ends with .mp4"""
10 | # mid example: "inputs/bedlam/bedlam_download/20221011_1_250_batch01hand_closeup_suburb_a/mp4/seq_000001.mp4-rp_emma_posed_008"
11 | # -> vname: 20221011_1_250_batch01hand_closeup_suburb_a/seq_000001.mp4
12 | scene = mid.split("/")[-3]
13 | seq = mid.split("/")[-1].split("-")[0]
14 | vname = f"{scene}/{seq}"
15 | return vname
16 |
17 |
18 | def mid2featname(mid):
19 | """featname = {scene}/{seqsubj}, Note that it ends with .pt (extra)"""
20 | # mid example: "inputs/bedlam/bedlam_download/20221011_1_250_batch01hand_closeup_suburb_a/mp4/seq_000001.mp4-rp_emma_posed_008"
21 | # -> featname: 20221011_1_250_batch01hand_closeup_suburb_a/seq_000001.mp4-rp_emma_posed_008.pt
22 | scene = mid.split("/")[-3]
23 | seqsubj = mid.split("/")[-1]
24 | featname = f"{scene}/{seqsubj}.pt"
25 | return featname
26 |
27 |
28 | def featname2mid(featname):
29 | """reverse func of mid2featname, Note that it removes .pt (extra)"""
30 | # featname example: 20221011_1_250_batch01hand_closeup_suburb_a/seq_000001.mp4-rp_emma_posed_008.pt
31 | # -> mid: inputs/bedlam/bedlam_download/20221011_1_250_batch01hand_closeup_suburb_a/mp4/seq_000001.mp4-rp_emma_posed_008
32 | scene = featname.split("/")[0]
33 |     seqsubj = featname.split("/")[1].removesuffix(".pt")  # drop the trailing ".pt" suffix only
34 | mid = f"inputs/bedlam/bedlam_download/{scene}/mp4/{seqsubj}"
35 | return mid
36 |
37 |
38 | def load_vname2lwh():
39 | return torch.load(resource_dir / "vname2lwh.pt")
40 |
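A quick round-trip of the naming helpers above, using the example path from the docstrings (an editorial sketch, not part of the repository):

from hmr4d.dataset.bedlam.utils import mid2vname, mid2featname, featname2mid

mid = (
    "inputs/bedlam/bedlam_download/20221011_1_250_batch01hand_closeup_suburb_a"
    "/mp4/seq_000001.mp4-rp_emma_posed_008"
)
print(mid2vname(mid))     # 20221011_1_250_batch01hand_closeup_suburb_a/seq_000001.mp4
print(mid2featname(mid))  # 20221011_1_250_batch01hand_closeup_suburb_a/seq_000001.mp4-rp_emma_posed_008.pt
assert featname2mid(mid2featname(mid)) == mid  # featname2mid inverts mid2featname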
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/dataset/h36m/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | from pathlib import Path
4 | from collections import defaultdict
5 | import pickle
6 | import torch
7 |
8 | RESOURCE_FOLDER = Path(__file__).resolve().parent / "resource"
9 |
10 | camera_idx_to_name = {0: "54138969", 1: "55011271", 2: "58860488", 3: "60457274"}
11 |
12 |
13 | def get_vid(pkl_path, cam_id):
14 | """.../S6/Posing 1.pkl, 54138969 -> S6@Posing_1@54138969"""
15 | sub_id, fn = pkl_path.split("/")[-2:]
16 | vid = f"{sub_id}@{fn.split('.')[0].replace(' ', '_')}@{cam_id}"
17 | return vid
18 |
19 |
20 | def get_raw_pkl_paths(h36m_raw_root):
21 | smpl_param_dir = h36m_raw_root / "neutrSMPL_H3.6"
22 | pkl_paths = []
23 | for train_sub in ["S1", "S5", "S6", "S7", "S8"]:
24 | for pth in (smpl_param_dir / train_sub).glob("*.pkl"):
25 | if "aligned" not in str(pth): # Use world sequence only
26 | pkl_paths.append(str(pth))
27 |
28 | return pkl_paths
29 |
30 |
31 | def get_cam_KRts():
32 | """
33 | Returns:
34 | Ks (torch.Tensor): {cam_id: 3x3}
35 | Rts (torch.Tensor): {subj_id: {cam_id: 4x4}}
36 | """
37 | # this file is copied from https://github.com/karfly/human36m-camera-parameters
38 | cameras_path = RESOURCE_FOLDER / "camera-parameters.json"
39 | with open(cameras_path, "r") as f:
40 | cameras = json.load(f)
41 |
42 | # 4 camera ids: '54138969', '55011271', '58860488', '60457274'
43 | Ks = {}
44 | for cam in cameras["intrinsics"]:
45 | Ks[cam] = torch.tensor(cameras["intrinsics"][cam]["calibration_matrix"]).float()
46 |
47 | # extrinsics
48 | extrinsics = cameras["extrinsics"]
49 | Rts = defaultdict(dict)
50 | for subj in extrinsics:
51 | for cam in extrinsics[subj]:
52 | Rt = torch.eye(4)
53 | Rt[:3, :3] = torch.tensor(extrinsics[subj][cam]["R"])
54 | Rt[:3, [3]] = torch.tensor(extrinsics[subj][cam]["t"]) / 1000
55 | Rts[subj][cam] = Rt.float()
56 |
57 | return Ks, Rts
58 |
59 |
60 | def parse_raw_pkl(pkl_path, to_50hz=True):
61 | """
62 | raw_pkl @ 200Hz, where video @ 50Hz.
63 |     the frames should be subsampled by 4 (200Hz -> 50Hz) and manually aligned with the video.
64 | """
65 | with open(str(pkl_path), "rb") as f:
66 | data = pickle.load(f, encoding="bytes")
67 | poses = torch.from_numpy(data[b"poses"]).float()
68 | betas = torch.from_numpy(data[b"betas"]).float()
69 | trans = torch.from_numpy(data[b"trans"]).float()
70 | assert poses.shape[0] == trans.shape[0]
71 | if to_50hz:
72 | poses = poses[::4]
73 | trans = trans[::4]
74 |
75 | seq_length = poses.shape[0] # 50FPS
76 | smpl_params = {
77 | "body_pose": poses[:, 3:],
78 | "betas": betas[None].expand(seq_length, -1),
79 | "global_orient": poses[:, :3],
80 | "transl": trans,
81 | }
82 | return smpl_params
83 |
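A small sketch of the video-id convention encoded by get_vid above; the path prefix is hypothetical, since only the last two components are used:

from hmr4d.dataset.h36m.utils import get_vid, camera_idx_to_name

pkl_path = "inputs/h36m/neutrSMPL_H3.6/S6/Posing 1.pkl"  # hypothetical location
vid = get_vid(pkl_path, camera_idx_to_name[0])
assert vid == "S6@Posing_1@54138969"  # {subject}@{action}@{camera}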
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/dataset/imgfeat_motion/base_dataset.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils import data
3 | import numpy as np
4 | from pathlib import Path
5 | from hmr4d.utils.pylogger import Log
6 |
7 |
8 | class ImgfeatMotionDatasetBase(data.Dataset):
9 | def __init__(self):
10 | super().__init__()
11 | self._load_dataset()
12 | self._get_idx2meta() # -> Set self.idx2meta
13 |
14 | def __len__(self):
15 | return len(self.idx2meta)
16 |
17 | def _load_dataset(self):
18 |         raise NotImplementedError
19 |
20 | def _get_idx2meta(self):
21 |         raise NotImplementedError
22 |
23 | def _load_data(self, idx):
24 |         raise NotImplementedError
25 |
26 | def _process_data(self, data, idx):
27 |         raise NotImplementedError
28 |
29 | def __getitem__(self, idx):
30 | data = self._load_data(idx)
31 | data = self._process_data(data, idx)
32 | return data
33 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/dataset/pure_motion/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from pytorch3d.transforms import (
4 | axis_angle_to_matrix,
5 | matrix_to_axis_angle,
6 | matrix_to_rotation_6d,
7 | rotation_6d_to_matrix,
8 | )
9 | from einops import rearrange
10 |
11 |
12 | def aa_to_r6d(x):
13 | return matrix_to_rotation_6d(axis_angle_to_matrix(x))
14 |
15 |
16 | def r6d_to_aa(x):
17 | return matrix_to_axis_angle(rotation_6d_to_matrix(x))
18 |
19 |
20 | def interpolate_smpl_params(smpl_params, tgt_len):
21 | """
22 | smpl_params['body_pose'] (L, 63)
23 | tgt_len: L->L'
24 | """
25 | betas = smpl_params["betas"]
26 | body_pose = smpl_params["body_pose"]
27 | global_orient = smpl_params["global_orient"] # (L, 3)
28 | transl = smpl_params["transl"] # (L, 3)
29 |
30 | # Interpolate
31 | body_pose = rearrange(aa_to_r6d(body_pose.reshape(-1, 21, 3)), "l j c -> c j l")
32 | body_pose = F.interpolate(body_pose, tgt_len, mode="linear", align_corners=True)
33 | body_pose = r6d_to_aa(rearrange(body_pose, "c j l -> l j c")).reshape(-1, 63)
34 |
35 |     # betas are typically constant over the sequence, so this interpolation is effectively a no-op; done for consistency
36 | betas = rearrange(betas, "l c -> c 1 l")
37 | betas = F.interpolate(betas, tgt_len, mode="linear", align_corners=True)
38 | betas = rearrange(betas, "c 1 l -> l c")
39 |
40 | global_orient = rearrange(aa_to_r6d(global_orient.reshape(-1, 1, 3)), "l j c -> c j l")
41 | global_orient = F.interpolate(global_orient, tgt_len, mode="linear", align_corners=True)
42 | global_orient = r6d_to_aa(rearrange(global_orient, "c j l -> l j c")).reshape(-1, 3)
43 |
44 | transl = rearrange(transl, "l c -> c 1 l")
45 | transl = F.interpolate(transl, tgt_len, mode="linear", align_corners=True)
46 | transl = rearrange(transl, "c 1 l -> l c")
47 |
48 | return {"body_pose": body_pose, "betas": betas, "global_orient": global_orient, "transl": transl}
49 |
50 |
51 | def rotate_around_axis(global_orient, transl, axis="y"):
52 | """Global coordinate augmentation. Random rotation around y-axis"""
53 | angle = torch.rand(1) * 2 * torch.pi
54 |     assert axis == "y", "only y-axis rotation is currently supported"
55 |     aa = torch.tensor([0.0, angle, 0.0]).float().unsqueeze(0)
56 |     rmat = axis_angle_to_matrix(aa)
57 |
58 | global_orient = matrix_to_axis_angle(rmat @ axis_angle_to_matrix(global_orient))
59 | transl = (rmat.squeeze(0) @ transl.T).T
60 | return global_orient, transl
61 |
62 |
63 | def augment_betas(betas, std=0.1):
64 | noise = torch.normal(mean=torch.zeros(10), std=torch.ones(10) * std)
65 | betas_aug = betas + noise[None]
66 | return betas_aug
67 |
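A sketch of two helpers above, resampling a sequence in time and jittering the shape parameters; random tensors stand in for real SMPL parameters, and the shapes follow the docstring (pytorch3d and einops are needed, as in the module itself):

import torch
from hmr4d.dataset.pure_motion.utils import interpolate_smpl_params, augment_betas

smpl_params = {
    "body_pose": torch.randn(120, 63),
    "betas": torch.randn(120, 10),
    "global_orient": torch.randn(120, 3),
    "transl": torch.randn(120, 3),
}
resampled = interpolate_smpl_params(smpl_params, tgt_len=90)  # 120 frames -> 90 frames
assert resampled["body_pose"].shape == (90, 63)

noisy_betas = augment_betas(resampled["betas"], std=0.1)      # small shape-noise augmentation
assert noisy_betas.shape == (90, 10)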
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/dataset/rich/resource/cam2params.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/dataset/rich/resource/cam2params.pt
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/dataset/rich/resource/val.txt:
--------------------------------------------------------------------------------
1 | sequence_name capture_name scan_name id moving_cam gender scene action/scene-interaction subjects view_id
2 | ParkingLot1_002_stretching2 ParkingLot1 scan_camcoord 002 X male V V V 0,1,2,3,4,5,6,7
3 | ParkingLot1_002_burpee1 ParkingLot1 scan_camcoord 002 X male V V V 0,1,2,3,4,5,6,7
4 | ParkingLot1_002_burpee2 ParkingLot1 scan_camcoord 002 X male V V V 0,1,2,3,4,5,6,7
5 | ParkingLot1_004_pushup1 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7
6 | ParkingLot1_004_eating2 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7
7 | ParkingLot1_004_phonetalk2 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7
8 | ParkingLot1_004_takingphotos2 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7
9 | ParkingLot1_004_stretching2 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7
10 | ParkingLot1_005_overfence2 ParkingLot1 scan_camcoord 005 X male V V V 0,1,2,3,4,5,6,7
11 | ParkingLot1_005_pushup1 ParkingLot1 scan_camcoord 005 X male V V V 0,1,2,3,4,5,6,7
12 | ParkingLot1_005_burpeejump1 ParkingLot1 scan_camcoord 005 X male V V V 0,1,2,3,4,5,6,7
13 | ParkingLot1_007_burpee2 ParkingLot1 scan_camcoord 007 X male V V V 0,1,2,3,4,5,6,7
14 | ParkingLot2_008_eating2 ParkingLot2 scan_camcoord 008 V male V V V 0,1,2,3,4,5
15 | ParkingLot2_008_burpeejump2 ParkingLot2 scan_camcoord 008 V male V V V 0,1,2,3,4,5
16 | ParkingLot2_014_overfence1 ParkingLot2 scan_camcoord 014 X male V V V 0,1,2,3,4,5
17 | ParkingLot2_014_eating2 ParkingLot2 scan_camcoord 014 X male V V V 0,1,2,3,4,5
18 | ParkingLot2_016_phonetalk5 ParkingLot2 scan_camcoord 016 V female V V V 0,1,2,3,4,5
19 | Pavallion_002_sidebalancerun Pavallion scan_camcoord 002 V male V V V 0,1,2,3,4,5,6
20 | Pavallion_013_sidebalancerun Pavallion scan_camcoord 013 X female V V V 0,1,2,3,4,5,6
21 | Pavallion_018_sidebalancerun Pavallion scan_camcoord 018 V female V V V 0,1,2,3,4,5,6
22 | LectureHall_018_wipingtable1 LectureHall scan_chair_scene_camcoord 018 X female V V V 0,2,4,5,6
23 | LectureHall_020_wipingchairs1 LectureHall scan_chair_scene_camcoord 020 X male V V V 0,1,2,3,4,5,6
24 | LectureHall_003_wipingchairs1 LectureHall scan_chair_scene_camcoord 003 X male V V V 0,1,2,3,4,5,6
25 | Pavallion_000_yoga1 Pavallion scan_camcoord 000 X male V X V 0,1,2,3,4,5,6
26 | Pavallion_002_yoga1 Pavallion scan_camcoord 002 V male V X V 0,1,2,3,4,5,6
27 | Pavallion_003_yoga1 Pavallion scan_camcoord 003 V male V X V 0,1,2,3,4,5,6
28 | Pavallion_006_yoga1 Pavallion scan_camcoord 006 V male V X V 0,1,2,3,4,5,6
29 | Pavallion_018_yoga1 Pavallion scan_camcoord 018 V female V X V 0,1,2,3,4,5,6
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/dataset/rich/resource/w2az_sahmr.json:
--------------------------------------------------------------------------------
1 | {"BBQ_scan_camcoord": [[0.9989829107564298, 0.03367618890797693, -0.029984301180211045, 0.0008183751635392625], [0.03414262169451401, -0.1305975871406019, 0.9908473906797644, -0.005059823133706893], [0.02945208652127451, -0.9908633531086326, -0.13161455111748036, 1.4054905296083466], [0.0, 0.0, 0.0, 1.0]], "Gym_scan_camcoord": [[0.9932599733260449, -0.07628732032461205, 0.0872632233306122, -0.047601130084306706], [-0.10233962102690007, -0.22374853741942266, 0.9692590953768503, -0.04091804681182174], [-0.05441716049582774, -0.9716567484252654, -0.23004768176013274, 1.537911791136788], [0.0, 0.0, 0.0, 1.0]], "Gym_scan_table_camcoord": [[0.9974451989415423, -0.06250743213795668, 0.03458172980064169, 0.02231858470834599], [-0.04804912583358893, -0.22882402250236075, 0.972281259838159, 0.039081886755815726], [-0.05286167435026744, -0.9714588965331274, -0.2312428501197992, 1.5421821446346522], [0.0, 0.0, 0.0, 1.0]], "LectureHall_scan_chair_scene_camcoord": [[0.9992930513998263, 0.030087515976743376, -0.0225419343977731, 0.001998908749589632], [0.030705594681969043, -0.30721111058653017, 0.9511458878570781, -0.025811963513866963], [0.021692484396004613, -0.9511656401040444, -0.307917783192506, 2.060346184503773], [0.0, 0.0, 0.0, 1.0]], "LectureHall_scan_yoga_scene_camcoord": [[0.9993358324246812, 0.03030060260429296, -0.020242715082476024, -0.003510046042036605], [0.028600729415016745, -0.3079667078507395, 0.9509671419836329, -0.01748548118379142], [0.022580795137075255, -0.9509144968594153, -0.3086287856852993, 2.0424701474796567], [0.0, 0.0, 0.0, 1.0]], "ParkingLot1_scan_camcoord": [[0.9989627324729327, -0.03724260727951709, 0.02620013994738054, 0.0070941466745699025], [-0.03091587075252664, -0.13228243926883107, 0.9907298144280939, -0.0274920377236923], [-0.03343154297742938, -0.9905121627037764, -0.13329661462331338, 1.3859200914120975], [0.0, 0.0, 0.0, 1.0]], "ParkingLot2_scan_camcoord": [[0.9989532636786039, -0.04044665659892979, 0.021364572447267097, 0.01646827411554571], [-0.026687287930043047, -0.13600581518076985, 0.9903485279940424, 0.030197722289598695], [-0.03715058073335097, -0.9898820567153364, -0.13694286452455984, 1.4372015171546513], [0.0, 0.0, 0.0, 1.0]], "Pavallion_scan_camcoord": [[0.9971864096076799, 0.05693557331723671, -0.048760690979605295, 0.0012478238054067193], [0.05746407703876882, -0.16289761936471214, 0.9849681443861059, -0.006002953831755452], [0.04813672552068054, -0.9849988355812122, -0.16571104235928033, 1.7638454838942128], [0.0, 0.0, 0.0, 1.0]]}
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/dataset/threedpw/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | from pathlib import Path
4 | from collections import defaultdict
5 | import pickle
6 | import torch
7 | import joblib
8 |
9 | RESOURCE_FOLDER = Path(__file__).resolve().parent / "resource"
10 |
11 |
12 | def read_raw_pkl(pkl_path):
13 | with open(pkl_path, "rb") as f:
14 | data = pickle.load(f, encoding="bytes")
15 |
16 | num_subjects = len(data[b"poses"])
17 | F = data[b"poses"][0].shape[0]
18 | smpl_params = []
19 | for i in range(num_subjects):
20 | smpl_params.append(
21 | {
22 | "body_pose": torch.from_numpy(data[b"poses"][i][:, 3:72]).float(), # (F, 69)
23 | "betas": torch.from_numpy(data[b"betas"][i][:10]).repeat(F, 1).float(), # (F, 10)
24 | "global_orient": torch.from_numpy(data[b"poses"][i][:, :3]).float(), # (F, 3)
25 | "transl": torch.from_numpy(data[b"trans"][i]).float(), # (F, 3)
26 | }
27 | )
28 | genders = ["male" if g == "m" else "female" for g in data[b"genders"]]
29 | campose_valid = [torch.from_numpy(v).bool() for v in data[b"campose_valid"]]
30 |
31 | seq_name = data[b"sequence"]
32 | K_fullimg = torch.from_numpy(data[b"cam_intrinsics"]).float()
33 | T_w2c = torch.from_numpy(data[b"cam_poses"]).float()
34 |
35 | return_data = {
36 | "sequence": seq_name, # 'courtyard_bodyScannerMotions_00'
37 | "K_fullimg": K_fullimg, # (3, 3), not 55FoV
38 | "T_w2c": T_w2c, # (F, 4, 4)
39 | "smpl_params": smpl_params, # list of dict
40 | "genders": genders, # list of str
41 | "campose_valid": campose_valid, # list of bool-array
42 | # "jointPositions": data[b'jointPositions'], # SMPL, 24x3
43 | # "poses2d": data[b"poses2d"], # COCO, 3x18(?)
44 | }
45 | return return_data
46 |
47 |
48 | def load_and_convert_wham_pth(pth):
49 | """
50 | Convert to {vid: DataDict} style, Add smpl_params_incam
51 | """
52 | # load
53 | wham_labels_raw = joblib.load(pth)
54 | # convert it to {vid: DataDict} style
55 | wham_labels = {}
56 | for i, vid in enumerate(wham_labels_raw["vid"]):
57 | wham_labels[vid] = {k: wham_labels_raw[k][i] for k in wham_labels_raw}
58 |
59 | # convert pose and betas as smpl_params_incam (without transl)
60 | for vid in wham_labels:
61 | pose = wham_labels[vid]["pose"]
62 | global_orient = pose[:, :3] # (F, 3)
63 | body_pose = pose[:, 3:] # (F, 69)
64 | betas = wham_labels[vid]["betas"] # (F, 10), all frames are the same
65 | wham_labels[vid]["smpl_params_incam"] = {
66 | "body_pose": body_pose.float(), # (F, 69)
67 | "betas": betas.float(), # (F, 10)
68 | "global_orient": global_orient.float(), # (F, 3)
69 | }
70 |
71 | return wham_labels
72 |
73 |
74 | # Neural-Annot utils
75 |
76 |
77 | def na_cam_param_to_K_fullimg(cam_param):
78 | K = torch.eye(3)
79 | K[[0, 1], [0, 1]] = torch.tensor(cam_param["focal"])
80 | K[[0, 1], [2, 2]] = torch.tensor(cam_param["princpt"])
81 | return K
82 |
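A sketch of the Neural-Annot intrinsics helper at the bottom of this file; the focal/principal-point values below are invented for illustration:

from hmr4d.dataset.threedpw.utils import na_cam_param_to_K_fullimg

cam_param = {"focal": [1961.1, 1969.8], "princpt": [540.0, 960.0]}  # hypothetical values
K = na_cam_param_to_K_fullimg(cam_param)
# K = [[fx, 0, cx],
#      [0, fy, cy],
#      [0,  0,  1]]
print(K)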
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/model/common_utils/optimizer.py:
--------------------------------------------------------------------------------
1 | from torch.optim import AdamW, Adam
2 | from hmr4d.configs import MainStore, builds
3 |
4 |
5 | optimizer_cfgs = {
6 | "adam_1e-3": builds(Adam, lr=1e-3, zen_partial=True),
7 | "adam_2e-4": builds(Adam, lr=2e-4, zen_partial=True),
8 | "adamw_2e-4": builds(AdamW, lr=2e-4, zen_partial=True),
9 | "adamw_1e-4": builds(AdamW, lr=1e-4, zen_partial=True),
10 | "adamw_5e-5": builds(AdamW, lr=5e-5, zen_partial=True),
11 | "adamw_1e-5": builds(AdamW, lr=1e-5, zen_partial=True),
12 | # zero-shot text-to-image generation
13 | "adamw_1e-3_dalle": builds(AdamW, lr=1e-3, weight_decay=1e-4, zen_partial=True),
14 | }
15 |
16 | for name, cfg in optimizer_cfgs.items():
17 | MainStore.store(name=name, node=cfg, group=f"optimizer")
18 |
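Assuming the `builds` re-exported by hmr4d.configs is hydra-zen's `builds`, each node above instantiates into a partial that only needs the model parameters. A rough sketch of how such a node would be used:

import torch
from hydra.utils import instantiate
from hmr4d.model.common_utils.optimizer import optimizer_cfgs

opt_fn = instantiate(optimizer_cfgs["adamw_2e-4"])         # behaves like functools.partial(AdamW, lr=2e-4)
optimizer = opt_fn(torch.nn.Linear(8, 8).parameters())     # lr and other kwargs are already bound
print(type(optimizer).__name__, optimizer.defaults["lr"])  # AdamW 0.0002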
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/model/common_utils/scheduler.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from bisect import bisect_right
3 |
4 |
5 | class WarmupMultiStepLR(torch.optim.lr_scheduler.LRScheduler):
6 | def __init__(self, optimizer, milestones, warmup=0, gamma=0.1, last_epoch=-1, verbose="deprecated"):
7 | """Assume optimizer does not change lr; Scheduler is called epoch-based"""
8 | self.milestones = milestones
9 | self.warmup = warmup
10 | assert warmup < milestones[0]
11 | self.gamma = gamma
12 | super().__init__(optimizer, last_epoch, verbose)
13 |
14 | def get_lr(self):
15 |         base_lrs = self.base_lrs  # base lr for each group
16 | n_groups = len(base_lrs)
17 |         coming_epoch = self.last_epoch  # the lr will be set for the coming epoch, starts from 0
18 |
19 |         # add extra warmup
20 |         if coming_epoch < self.warmup:
21 |             # e.g. coming_epoch [0, 1, 2] for warmup == 3
22 |             # lr should be base_lr * (last_epoch+1) / (warmup + 1), e.g. [0.25, 0.5, 0.75] * base_lr
23 |             lr_factor = (self.last_epoch + 1) / (self.warmup + 1)
24 |             return [base_lrs[i] * lr_factor for i in range(n_groups)]
25 |         else:
26 |             # bisect_right([3,5,7], 0) -> 0; bisect_right([3,5,7], 5) -> 2
27 |             p = bisect_right(self.milestones, coming_epoch)
28 | lr_factor = self.gamma**p
29 | return [base_lrs[i] * lr_factor for i in range(n_groups)]
30 |
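A worked sketch of the schedule produced by WarmupMultiStepLR (dummy optimizer, arbitrary hyper-parameters): the factor ramps up as 1/(warmup+1), 2/(warmup+1), ... during warmup, then is multiplied by gamma at each milestone.

import torch
from hmr4d.model.common_utils.scheduler import WarmupMultiStepLR

param = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.AdamW(param, lr=1.0)  # base lr of 1.0 so the factor is directly visible
sched = WarmupMultiStepLR(opt, milestones=[5, 8], warmup=3, gamma=0.1)

for epoch in range(10):
    # epochs 0-2: 0.25, 0.5, 0.75 (warmup); epochs 3-4: 1.0; epochs 5-7: 0.1; epochs 8-9: 0.01
    print(epoch, opt.param_groups[0]["lr"])
    opt.step()
    sched.step()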
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/model/common_utils/scheduler_cfg.py:
--------------------------------------------------------------------------------
1 | from omegaconf import DictConfig, ListConfig
2 | from hmr4d.configs import MainStore, builds
3 |
4 | # do not perform scheduling
5 | default = DictConfig({"scheduler": None})
6 | MainStore.store(name="default", node=default, group=f"scheduler_cfg")
7 |
8 |
9 | # epoch-based
10 | def epoch_half_by(milestones=[100, 200, 300]):
11 | return DictConfig(
12 | {
13 | "scheduler": {
14 | "_target_": "torch.optim.lr_scheduler.MultiStepLR",
15 | "milestones": milestones,
16 | "gamma": 0.5,
17 | },
18 | "interval": "epoch",
19 | "frequency": 1,
20 | }
21 | )
22 |
23 |
24 | MainStore.store(name="epoch_half_100_200_300", node=epoch_half_by([100, 200, 300]), group=f"scheduler_cfg")
25 | MainStore.store(name="epoch_half_100_200", node=epoch_half_by([100, 200]), group=f"scheduler_cfg")
26 | MainStore.store(name="epoch_half_200_350", node=epoch_half_by([200, 350]), group=f"scheduler_cfg")
27 | MainStore.store(name="epoch_half_300", node=epoch_half_by([300]), group=f"scheduler_cfg")
28 |
29 |
30 | # epoch-based
31 | def warmup_epoch_half_by(warmup=10, milestones=[100, 200, 300]):
32 | return DictConfig(
33 | {
34 | "scheduler": {
35 | "_target_": "hmr4d.model.common_utils.scheduler.WarmupMultiStepLR",
36 | "milestones": milestones,
37 | "warmup": warmup,
38 | "gamma": 0.5,
39 | },
40 | "interval": "epoch",
41 | "frequency": 1,
42 | }
43 | )
44 |
45 |
46 | MainStore.store(name="warmup_5_epoch_half_200_350", node=warmup_epoch_half_by(5, [200, 350]), group=f"scheduler_cfg")
47 | MainStore.store(name="warmup_10_epoch_half_200_350", node=warmup_epoch_half_by(10, [200, 350]), group=f"scheduler_cfg")
48 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/model/gvhmr/gvhmr_pl_demo.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pytorch_lightning as pl
3 | from hydra.utils import instantiate
4 | from hmr4d.utils.pylogger import Log
5 | from hmr4d.configs import MainStore, builds
6 |
7 | from hmr4d.utils.geo.hmr_cam import normalize_kp2d
8 |
9 |
10 | class DemoPL(pl.LightningModule):
11 | def __init__(self, pipeline):
12 | super().__init__()
13 | self.pipeline = instantiate(pipeline, _recursive_=False)
14 |
15 | @torch.no_grad()
16 | def predict(self, data, static_cam=False):
17 | """auto add batch dim
18 | data: {
19 | "length": int, or Torch.Tensor,
20 | "kp2d": (F, 3)
21 | "bbx_xys": (F, 3)
22 | "K_fullimg": (F, 3, 3)
23 | "cam_angvel": (F, 3)
24 | "f_imgseq": (F, 3, 256, 256)
25 | }
26 |
27 | """
28 | # ROPE inference
29 | batch = {
30 | "length": data["length"][None],
31 | "obs": normalize_kp2d(data["kp2d"], data["bbx_xys"])[None],
32 | "bbx_xys": data["bbx_xys"][None],
33 | "K_fullimg": data["K_fullimg"][None],
34 | "cam_angvel": data["cam_angvel"][None],
35 | "f_imgseq": data["f_imgseq"][None],
36 | }
37 | batch = {k: v.cuda() for k, v in batch.items()}
38 | outputs = self.pipeline.forward(batch, train=False, postproc=True, static_cam=static_cam)
39 |
40 | pred = {
41 | "smpl_params_global": {k: v[0] for k, v in outputs["pred_smpl_params_global"].items()},
42 | "smpl_params_incam": {k: v[0] for k, v in outputs["pred_smpl_params_incam"].items()},
43 | "K_fullimg": data["K_fullimg"],
44 | "net_outputs": outputs, # intermediate outputs
45 | }
46 | return pred
47 |
48 | def load_pretrained_model(self, ckpt_path):
49 | """Load pretrained checkpoint, and assign each weight to the corresponding part."""
50 |         Log.info(f"[PL-Trainer] Loading ckpt: {ckpt_path}")
51 |
52 | state_dict = torch.load(ckpt_path, "cpu")["state_dict"]
53 | missing, unexpected = self.load_state_dict(state_dict, strict=False)
54 | if len(missing) > 0:
55 | Log.warn(f"Missing keys: {missing}")
56 | if len(unexpected) > 0:
57 | Log.warn(f"Unexpected keys: {unexpected}")
58 |
59 |
60 | MainStore.store(name="gvhmr_pl_demo", node=builds(DemoPL, pipeline="${pipeline}"), group="model/gvhmr")
61 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/network/base_arch/embeddings/rotary_embedding.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from einops import repeat, rearrange
4 | from torch.cuda.amp import autocast
5 |
6 |
7 | def rotate_half(x):
8 | x = rearrange(x, "... (d r) -> ... d r", r=2)
9 | x1, x2 = x.unbind(dim=-1)
10 | x = torch.stack((-x2, x1), dim=-1)
11 | return rearrange(x, "... d r -> ... (d r)")
12 |
13 |
14 | @autocast(enabled=False)
15 | def apply_rotary_emb(freqs, t, start_index=0, scale=1.0, seq_dim=-2):
16 | if t.ndim == 3:
17 | seq_len = t.shape[seq_dim]
18 | freqs = freqs[-seq_len:].to(t)
19 |
20 | rot_dim = freqs.shape[-1]
21 | end_index = start_index + rot_dim
22 |
23 | assert (
24 | rot_dim <= t.shape[-1]
25 | ), f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}"
26 |
27 | t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
28 | t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
29 | return torch.cat((t_left, t, t_right), dim=-1)
30 |
31 |
32 | def get_encoding(d_model, max_seq_len=4096):
33 | """Return: (L, D)"""
34 | t = torch.arange(max_seq_len).float()
35 | freqs = 1.0 / (10000 ** (torch.arange(0, d_model, 2).float() / d_model))
36 | freqs = torch.einsum("i, j -> i j", t, freqs)
37 | freqs = repeat(freqs, "i j -> i (j r)", r=2)
38 | return freqs
39 |
40 |
41 | class ROPE(nn.Module):
42 |     """Minimal implementation of a language-model-style rotary positional encoding (RoPE)."""
43 |
44 | def __init__(self, d_model, max_seq_len=4096):
45 | super().__init__()
46 | self.d_model = d_model
47 | self.max_seq_len = max_seq_len
48 |
49 | # Pre-cache a freqs tensor
50 | encoding = get_encoding(d_model, max_seq_len)
51 | self.register_buffer("encoding", encoding, False)
52 |
53 | def rotate_queries_or_keys(self, x):
54 | """
55 | Args:
56 | x : (B, H, L, D)
57 | Returns:
58 | rotated_x: (B, H, L, D)
59 | """
60 |
61 | seq_len, d_model = x.shape[-2:]
62 | assert d_model == self.d_model
63 |
64 |         # encoding: (L, D)
65 | if seq_len > self.max_seq_len:
66 | encoding = get_encoding(d_model, seq_len).to(x)
67 | else:
68 | encoding = self.encoding[:seq_len]
69 |
70 | # encoding: (L, D)
71 | # x: (B, H, L, D)
72 | rotated_x = apply_rotary_emb(encoding, x, seq_dim=-2)
73 |
74 | return rotated_x
75 |
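A shape-level sketch of the ROPE module above: the rotation is applied per head along the sequence dimension and preserves the input shape (head dimension and lengths chosen arbitrarily).

import torch
from hmr4d.network.base_arch.embeddings.rotary_embedding import ROPE

rope = ROPE(d_model=64, max_seq_len=4096)  # d_model here is the per-head dimension
q = torch.randn(2, 8, 120, 64)             # (B, H, L, head_dim)
q_rot = rope.rotate_queries_or_keys(q)
assert q_rot.shape == q.shape
# sequences longer than max_seq_len fall back to computing the encoding on the fly
q_long = torch.randn(1, 8, 5000, 64)
assert rope.rotate_queries_or_keys(q_long).shape == q_long.shape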
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/network/base_arch/transformer/encoder_rope.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import math
5 | from timm.models.vision_transformer import Mlp
6 | from typing import Optional, Tuple
7 | from einops import einsum, rearrange, repeat
8 | from hmr4d.network.base_arch.embeddings.rotary_embedding import ROPE
9 |
10 |
11 | class RoPEAttention(nn.Module):
12 | def __init__(self, embed_dim, num_heads, dropout=0.1):
13 | super().__init__()
14 | self.embed_dim = embed_dim
15 | self.num_heads = num_heads
16 | self.head_dim = embed_dim // num_heads
17 |
18 | self.rope = ROPE(self.head_dim, max_seq_len=4096)
19 |
20 | self.query = nn.Linear(embed_dim, embed_dim)
21 | self.key = nn.Linear(embed_dim, embed_dim)
22 | self.value = nn.Linear(embed_dim, embed_dim)
23 | self.dropout = nn.Dropout(dropout)
24 | self.proj = nn.Linear(embed_dim, embed_dim)
25 |
26 | def forward(self, x, attn_mask=None, key_padding_mask=None):
27 | # x: (B, L, C)
28 | # attn_mask: (L, L)
29 | # key_padding_mask: (B, L)
30 | B, L, _ = x.shape
31 | xq, xk, xv = self.query(x), self.key(x), self.value(x)
32 |
33 | xq = xq.reshape(B, L, self.num_heads, -1).transpose(1, 2)
34 | xk = xk.reshape(B, L, self.num_heads, -1).transpose(1, 2)
35 | xv = xv.reshape(B, L, self.num_heads, -1).transpose(1, 2)
36 |
37 | xq = self.rope.rotate_queries_or_keys(xq) # B, N, L, C
38 | xk = self.rope.rotate_queries_or_keys(xk) # B, N, L, C
39 |
40 | attn_score = einsum(xq, xk, "b n i c, b n j c -> b n i j") / math.sqrt(self.head_dim)
41 | if attn_mask is not None:
42 | attn_mask = attn_mask.reshape(1, 1, L, L).expand(B, self.num_heads, -1, -1)
43 | attn_score = attn_score.masked_fill(attn_mask, float("-inf"))
44 | if key_padding_mask is not None:
45 | key_padding_mask = key_padding_mask.reshape(B, 1, 1, L).expand(-1, self.num_heads, L, -1)
46 | attn_score = attn_score.masked_fill(key_padding_mask, float("-inf"))
47 |
48 | attn_score = torch.softmax(attn_score, dim=-1)
49 | attn_score = self.dropout(attn_score)
50 | output = einsum(attn_score, xv, "b n i j, b n j c -> b n i c") # B, N, L, C
51 | output = output.transpose(1, 2).reshape(B, L, -1) # B, L, C
52 | output = self.proj(output) # B, L, C
53 | return output
54 |
55 |
56 | class EncoderRoPEBlock(nn.Module):
57 | def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, dropout=0.1, **block_kwargs):
58 | super().__init__()
59 | self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6)
60 | self.attn = RoPEAttention(hidden_size, num_heads, dropout)
61 | self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6)
62 | mlp_hidden_dim = int(hidden_size * mlp_ratio)
63 | approx_gelu = lambda: nn.GELU(approximate="tanh")
64 | self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=dropout)
65 |
66 | self.gate_msa = nn.Parameter(torch.zeros(1, 1, hidden_size))
67 | self.gate_mlp = nn.Parameter(torch.zeros(1, 1, hidden_size))
68 |
69 | # Zero-out adaLN modulation layers
70 | nn.init.constant_(self.gate_msa, 0)
71 | nn.init.constant_(self.gate_mlp, 0)
72 |
73 | def forward(self, x, attn_mask=None, tgt_key_padding_mask=None):
74 | x = x + self.gate_msa * self._sa_block(
75 | self.norm1(x), attn_mask=attn_mask, key_padding_mask=tgt_key_padding_mask
76 | )
77 | x = x + self.gate_mlp * self.mlp(self.norm2(x))
78 | return x
79 |
80 | def _sa_block(self, x, attn_mask=None, key_padding_mask=None):
81 | # x: (B, L, C)
82 | x = self.attn(x, attn_mask=attn_mask, key_padding_mask=key_padding_mask)
83 | return x
84 |
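A forward-pass sketch for one EncoderRoPEBlock (timm is required for Mlp). Since gate_msa and gate_mlp are zero-initialised, a freshly constructed block starts out as an identity mapping.

import torch
from hmr4d.network.base_arch.transformer.encoder_rope import EncoderRoPEBlock

block = EncoderRoPEBlock(hidden_size=512, num_heads=8, mlp_ratio=4.0, dropout=0.1)
x = torch.randn(2, 120, 512)                     # (B, L, C)
padding = torch.zeros(2, 120, dtype=torch.bool)  # True marks padded positions
out = block(x, attn_mask=None, tgt_key_padding_mask=padding)
assert out.shape == x.shape
assert torch.allclose(out, x)                    # zero-initialised gates -> identity at init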
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/network/base_arch/transformer/layer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | def zero_module(module):
7 | """
8 | Zero out the parameters of a module and return it.
9 | """
10 | for p in module.parameters():
11 | p.detach().zero_()
12 | return module
13 |
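zero_module is the functional counterpart of the zero-initialised gates used elsewhere in this package: zeroing a layer lets a residual branch start as a no-op. A tiny sketch:

import torch.nn as nn
from hmr4d.network.base_arch.transformer.layer import zero_module

proj = zero_module(nn.Linear(256, 256))
assert all((p == 0).all() for p in proj.parameters())  # weights and bias are zeroed in place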
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/network/hmr2/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from .hmr2 import HMR2
3 | from pathlib import Path
4 | from .configs import get_config
5 | from hmr4d import PROJ_ROOT
6 |
7 | HMR2A_CKPT = PROJ_ROOT / "inputs/checkpoints/hmr2/epoch=10-step=25000.ckpt"  # this is HMR2.0a, following WHAM
8 |
9 |
10 | def load_hmr2(checkpoint_path=HMR2A_CKPT):
11 | model_cfg = str((Path(__file__).parent / "configs/model_config.yaml").resolve())
12 | model_cfg = get_config(model_cfg)
13 |
14 | # Override some config values, to crop bbox correctly
15 | if (model_cfg.MODEL.BACKBONE.TYPE == "vit") and ("BBOX_SHAPE" not in model_cfg.MODEL):
16 | model_cfg.defrost()
17 | assert (
18 | model_cfg.MODEL.IMAGE_SIZE == 256
19 | ), f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone"
20 | model_cfg.MODEL.BBOX_SHAPE = [192, 256] # (W, H)
21 | model_cfg.freeze()
22 |
23 | # Setup model and Load weights.
24 | # model = HMR2.load_from_checkpoint(checkpoint_path, strict=False, cfg=model_cfg)
25 | model = HMR2(model_cfg)
26 |
27 | state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"]
28 | keys = [k for k in state_dict.keys() if k.split(".")[0] in ["backbone", "smpl_head"]]
29 | state_dict = {k: v for k, v in state_dict.items() if k in keys}
30 | model.load_state_dict(state_dict, strict=True)
31 |
32 | return model
33 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/network/hmr2/components/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/network/hmr2/components/__init__.py
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/network/hmr2/configs/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Dict
3 | from yacs.config import CfgNode as CN
4 | from pathlib import Path
5 |
6 | # CACHE_DIR = os.path.join(os.environ.get("HOME"), "Code/4D-Humans/cache")
7 | # CACHE_DIR_4DHUMANS = os.path.join(CACHE_DIR, "4DHumans")
8 |
9 |
10 | def to_lower(x: Dict) -> Dict:
11 | """
12 | Convert all dictionary keys to lowercase
13 | Args:
14 | x (dict): Input dictionary
15 | Returns:
16 | dict: Output dictionary with all keys converted to lowercase
17 | """
18 | return {k.lower(): v for k, v in x.items()}
19 |
20 |
21 | _C = CN(new_allowed=True)
22 |
23 | _C.GENERAL = CN(new_allowed=True)
24 | _C.GENERAL.RESUME = True
25 | _C.GENERAL.TIME_TO_RUN = 3300
26 | _C.GENERAL.VAL_STEPS = 100
27 | _C.GENERAL.LOG_STEPS = 100
28 | _C.GENERAL.CHECKPOINT_STEPS = 20000
29 | _C.GENERAL.CHECKPOINT_DIR = "checkpoints"
30 | _C.GENERAL.SUMMARY_DIR = "tensorboard"
31 | _C.GENERAL.NUM_GPUS = 1
32 | _C.GENERAL.NUM_WORKERS = 4
33 | _C.GENERAL.MIXED_PRECISION = True
34 | _C.GENERAL.ALLOW_CUDA = True
35 | _C.GENERAL.PIN_MEMORY = False
36 | _C.GENERAL.DISTRIBUTED = False
37 | _C.GENERAL.LOCAL_RANK = 0
38 | _C.GENERAL.USE_SYNCBN = False
39 | _C.GENERAL.WORLD_SIZE = 1
40 |
41 | _C.TRAIN = CN(new_allowed=True)
42 | _C.TRAIN.NUM_EPOCHS = 100
43 | _C.TRAIN.BATCH_SIZE = 32
44 | _C.TRAIN.SHUFFLE = True
45 | _C.TRAIN.WARMUP = False
46 | _C.TRAIN.NORMALIZE_PER_IMAGE = False
47 | _C.TRAIN.CLIP_GRAD = False
48 | _C.TRAIN.CLIP_GRAD_VALUE = 1.0
49 | _C.LOSS_WEIGHTS = CN(new_allowed=True)
50 |
51 | _C.DATASETS = CN(new_allowed=True)
52 |
53 | _C.MODEL = CN(new_allowed=True)
54 | _C.MODEL.IMAGE_SIZE = 224
55 |
56 | _C.EXTRA = CN(new_allowed=True)
57 | _C.EXTRA.FOCAL_LENGTH = 5000
58 |
59 | _C.DATASETS.CONFIG = CN(new_allowed=True)
60 | _C.DATASETS.CONFIG.SCALE_FACTOR = 0.3
61 | _C.DATASETS.CONFIG.ROT_FACTOR = 30
62 | _C.DATASETS.CONFIG.TRANS_FACTOR = 0.02
63 | _C.DATASETS.CONFIG.COLOR_SCALE = 0.2
64 | _C.DATASETS.CONFIG.ROT_AUG_RATE = 0.6
65 | _C.DATASETS.CONFIG.TRANS_AUG_RATE = 0.5
66 | _C.DATASETS.CONFIG.DO_FLIP = True
67 | _C.DATASETS.CONFIG.FLIP_AUG_RATE = 0.5
68 | _C.DATASETS.CONFIG.EXTREME_CROP_AUG_RATE = 0.10
69 |
70 |
71 | def default_config() -> CN:
72 | """
73 | Get a yacs CfgNode object with the default config values.
74 | """
75 | # Return a clone so that the defaults will not be altered
76 | # This is for the "local variable" use pattern
77 | return _C.clone()
78 |
79 |
80 | def dataset_config(name="datasets_tar.yaml") -> CN:
81 | """
82 | Get dataset config file
83 | Returns:
84 | CfgNode: Dataset config as a yacs CfgNode object.
85 | """
86 | cfg = CN(new_allowed=True)
87 | config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), name)
88 | cfg.merge_from_file(config_file)
89 | cfg.freeze()
90 | return cfg
91 |
92 |
93 | def dataset_eval_config() -> CN:
94 | return dataset_config("datasets_eval.yaml")
95 |
96 |
97 | def get_config(config_file: str, merge: bool = True) -> CN:
98 | """
99 | Read a config file and optionally merge it with the default config file.
100 | Args:
101 | config_file (str): Path to config file.
102 | merge (bool): Whether to merge with the default config or not.
103 | Returns:
104 | CfgNode: Config as a yacs CfgNode object.
105 | """
106 | if merge:
107 | cfg = default_config()
108 | else:
109 | cfg = CN(new_allowed=True)
110 | cfg.merge_from_file(config_file)
111 |
112 | # ---- Update ---- #
113 | cfg.SMPL.MODEL_PATH = cfg.SMPL.MODEL_PATH # Not used
114 | cfg.SMPL.JOINT_REGRESSOR_EXTRA = cfg.SMPL.JOINT_REGRESSOR_EXTRA # Not Used
115 | cfg.SMPL.MEAN_PARAMS = str(Path(__file__).parent / "smpl_mean_params.npz")
116 | # ---------------- #
117 |
118 | cfg.freeze()
119 | return cfg
120 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/network/hmr2/configs/model_config.yaml:
--------------------------------------------------------------------------------
1 | task_name: train
2 | tags:
3 | - dev
4 | train: true
5 | test: false
6 | ckpt_path: null
7 | seed: null
8 | DATASETS:
9 | TRAIN:
10 | H36M-TRAIN:
11 | WEIGHT: 0.3
12 | MPII-TRAIN:
13 | WEIGHT: 0.1
14 | COCO-TRAIN-2014:
15 | WEIGHT: 0.4
16 | MPI-INF-TRAIN:
17 | WEIGHT: 0.2
18 | VAL:
19 | COCO-VAL:
20 | WEIGHT: 1.0
21 | MOCAP: CMU-MOCAP
22 | CONFIG:
23 | SCALE_FACTOR: 0.3
24 | ROT_FACTOR: 30
25 | TRANS_FACTOR: 0.02
26 | COLOR_SCALE: 0.2
27 | ROT_AUG_RATE: 0.6
28 | TRANS_AUG_RATE: 0.5
29 | DO_FLIP: true
30 | FLIP_AUG_RATE: 0.5
31 | EXTREME_CROP_AUG_RATE: 0.1
32 | trainer:
33 | _target_: pytorch_lightning.Trainer
34 | default_root_dir: ${paths.output_dir}
35 | accelerator: gpu
36 | devices: 8
37 | deterministic: false
38 | num_sanity_val_steps: 0
39 | log_every_n_steps: ${GENERAL.LOG_STEPS}
40 | val_check_interval: ${GENERAL.VAL_STEPS}
41 | precision: 16
42 | max_steps: ${GENERAL.TOTAL_STEPS}
43 | move_metrics_to_cpu: true
44 | limit_val_batches: 1
45 | track_grad_norm: 2
46 | strategy: ddp
47 | num_nodes: 1
48 | sync_batchnorm: true
49 | paths:
50 | root_dir: ${oc.env:PROJECT_ROOT}
51 | data_dir: ${paths.root_dir}/data/
52 | log_dir: /fsx/shubham/code/hmr2023/logs_hydra/
53 | output_dir: ${hydra:runtime.output_dir}
54 | work_dir: ${hydra:runtime.cwd}
55 | extras:
56 | ignore_warnings: false
57 | enforce_tags: true
58 | print_config: true
59 | exp_name: 3001d
60 | SMPL:
61 | MODEL_PATH: data/smpl
62 | GENDER: neutral
63 | NUM_BODY_JOINTS: 23
64 | JOINT_REGRESSOR_EXTRA: data/SMPL_to_J19.pkl
65 | MEAN_PARAMS: data/smpl_mean_params.npz
66 | EXTRA:
67 | FOCAL_LENGTH: 5000
68 | NUM_LOG_IMAGES: 4
69 | NUM_LOG_SAMPLES_PER_IMAGE: 8
70 | PELVIS_IND: 39
71 | MODEL:
72 | IMAGE_SIZE: 256
73 | IMAGE_MEAN:
74 | - 0.485
75 | - 0.456
76 | - 0.406
77 | IMAGE_STD:
78 | - 0.229
79 | - 0.224
80 | - 0.225
81 | BACKBONE:
82 | TYPE: vit
83 | FREEZE: true
84 | NUM_LAYERS: 50
85 | OUT_CHANNELS: 2048
86 | ADD_NECK: false
87 | FLOW:
88 | DIM: 144
89 | NUM_LAYERS: 4
90 | CONTEXT_FEATURES: 2048
91 | LAYER_HIDDEN_FEATURES: 1024
92 | LAYER_DEPTH: 2
93 | FC_HEAD:
94 | NUM_FEATURES: 1024
95 | SMPL_HEAD:
96 | TYPE: transformer_decoder
97 | IN_CHANNELS: 2048
98 | TRANSFORMER_DECODER:
99 | depth: 6
100 | heads: 8
101 | mlp_dim: 1024
102 | dim_head: 64
103 | dropout: 0.0
104 | emb_dropout: 0.0
105 | norm: layer
106 | context_dim: 1280
107 | GENERAL:
108 | TOTAL_STEPS: 100000
109 | LOG_STEPS: 100
110 | VAL_STEPS: 100
111 | CHECKPOINT_STEPS: 1000
112 | CHECKPOINT_SAVE_TOP_K: -1
113 | NUM_WORKERS: 6
114 | PREFETCH_FACTOR: 2
115 | TRAIN:
116 | LR: 0.0001
117 | WEIGHT_DECAY: 0.0001
118 | BATCH_SIZE: 512
119 | LOSS_REDUCTION: mean
120 | NUM_TRAIN_SAMPLES: 2
121 | NUM_TEST_SAMPLES: 64
122 | POSE_2D_NOISE_RATIO: 0.01
123 | SMPL_PARAM_NOISE_RATIO: 0.005
124 | LOSS_WEIGHTS:
125 | KEYPOINTS_3D: 0.05
126 | KEYPOINTS_2D: 0.01
127 | GLOBAL_ORIENT: 0.001
128 | BODY_POSE: 0.001
129 | BETAS: 0.0005
130 | ADVERSARIAL: 0.0005
131 | local: {}
132 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/network/hmr2/configs/smpl_mean_params.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/network/hmr2/configs/smpl_mean_params.npz
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/network/hmr2/hmr2.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pytorch_lightning as pl
3 | from yacs.config import CfgNode
4 | from .vit import ViT
5 | from .smpl_head import SMPLTransformerDecoderHead
6 |
7 | from pytorch3d.transforms import matrix_to_axis_angle
8 | from hmr4d.utils.geo.hmr_cam import compute_transl_full_cam
9 |
10 |
11 | class HMR2(pl.LightningModule):
12 | def __init__(self, cfg: CfgNode):
13 | super().__init__()
14 | self.cfg = cfg
15 | self.backbone = ViT(
16 | img_size=(256, 192),
17 | patch_size=16,
18 | embed_dim=1280,
19 | depth=32,
20 | num_heads=16,
21 | ratio=1,
22 | use_checkpoint=False,
23 | mlp_ratio=4,
24 | qkv_bias=True,
25 | drop_path_rate=0.55,
26 | )
27 | self.smpl_head = SMPLTransformerDecoderHead(cfg)
28 |
29 | def forward(self, batch, feat_mode=True):
30 | """this file has been modified
31 | Args:
32 | feat_mode: default True, as we only need the feature token output for the HMR4D project;
33 | when False, the full process of HMR2 will be executed.
34 | """
35 | # Backbone
36 | x = batch["img"][:, :, :, 32:-32]
37 | vit_feats = self.backbone(x)
38 |
39 | # Output head
40 | if feat_mode:
41 | token_out = self.smpl_head(vit_feats, only_return_token_out=True) # (B, 1024)
42 | return token_out
43 |
44 | # return full process
45 | pred_smpl_params, pred_cam, _, token_out = self.smpl_head(vit_feats, only_return_token_out=False)
46 | output = {}
47 | output["token_out"] = token_out
48 | output["smpl_params"] = {
49 | "body_pose": matrix_to_axis_angle(pred_smpl_params["body_pose"]).flatten(-2), # (B, 23, 3)
50 | "betas": pred_smpl_params["betas"], # (B, 10)
51 | "global_orient": matrix_to_axis_angle(pred_smpl_params["global_orient"])[:, 0], # (B, 3)
52 | "transl": compute_transl_full_cam(pred_cam, batch["bbx_xys"], batch["K_fullimg"]), # (B, 3)
53 | }
54 |
55 | return output
56 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/network/hmr2/utils/preproc.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | import torch
4 | from pathlib import Path
5 |
6 | IMAGE_MEAN = torch.tensor([0.485, 0.456, 0.406])
7 | IMAGE_STD = torch.tensor([0.229, 0.224, 0.225])
8 |
9 |
10 | def expand_to_aspect_ratio(input_shape, target_aspect_ratio=[192, 256]):
11 | """Increase the size of the bounding box to match the target shape."""
12 | if target_aspect_ratio is None:
13 | return input_shape
14 |
15 | try:
16 | w, h = input_shape
17 | except (ValueError, TypeError):
18 | return input_shape
19 |
20 | w_t, h_t = target_aspect_ratio
21 | if h / w < h_t / w_t:
22 | h_new = max(w * h_t / w_t, h)
23 | w_new = w
24 | else:
25 | h_new = h
26 | w_new = max(h * w_t / h_t, w)
27 | if h_new < h or w_new < w:
28 |         raise ValueError("expanded bbox should not be smaller than the input bbox")
29 | return np.array([w_new, h_new])
30 |
31 |
32 | def crop_and_resize(img, bbx_xy, bbx_s, dst_size=256, enlarge_ratio=1.2):
33 | """
34 | Args:
35 | img: (H, W, 3)
36 | bbx_xy: (2,)
37 | bbx_s: scalar
38 | """
39 | hs = bbx_s * enlarge_ratio / 2
40 | src = np.stack(
41 | [
42 | bbx_xy - hs, # left-up corner
43 | bbx_xy + np.array([hs, -hs]), # right-up corner
44 | bbx_xy, # center
45 | ]
46 | ).astype(np.float32)
47 | dst = np.array([[0, 0], [dst_size - 1, 0], [dst_size / 2 - 0.5, dst_size / 2 - 0.5]], dtype=np.float32)
48 | A = cv2.getAffineTransform(src, dst)
49 |
50 | img_crop = cv2.warpAffine(img, A, (dst_size, dst_size), flags=cv2.INTER_LINEAR)
51 | bbx_xys_final = np.array([*bbx_xy, bbx_s * enlarge_ratio])
52 | return img_crop, bbx_xys_final
53 |
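A sketch of the two helpers above on synthetic inputs (bounding-box values chosen arbitrarily); the returned bbx_xys records the centre and the enlarged square size actually used for the crop.

import numpy as np
from hmr4d.network.hmr2.utils.preproc import crop_and_resize, expand_to_aspect_ratio

img = np.zeros((720, 1280, 3), dtype=np.uint8)
bbx_xy, bbx_s = np.array([640.0, 360.0]), 300.0        # centre (x, y) and square size
img_crop, bbx_xys = crop_and_resize(img, bbx_xy, bbx_s, dst_size=256)
print(img_crop.shape, bbx_xys)                         # (256, 256, 3) [640. 360. 360.]

print(expand_to_aspect_ratio([100, 200], [192, 256]))  # widens to [150. 200.] for a 192x256 target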
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/network/hmr2/utils/smpl_wrapper.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import pickle
4 | from typing import Optional
5 | import smplx
6 | from smplx.lbs import vertices2joints
7 | from smplx.utils import SMPLOutput
8 |
9 |
10 | class SMPL(smplx.SMPLLayer):
11 | def __init__(self, *args, joint_regressor_extra: Optional[str] = None, update_hips: bool = False, **kwargs):
12 | """
13 | Extension of the official SMPL implementation to support more joints.
14 | Args:
15 | Same as SMPLLayer.
16 | joint_regressor_extra (str): Path to extra joint regressor.
17 | """
18 | super(SMPL, self).__init__(*args, **kwargs)
19 | smpl_to_openpose = [24, 12, 17, 19, 21, 16, 18, 20, 0, 2, 5, 8, 1, 4, 7, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
20 |
21 | if joint_regressor_extra is not None:
22 | self.register_buffer(
23 | "joint_regressor_extra",
24 | torch.tensor(pickle.load(open(joint_regressor_extra, "rb"), encoding="latin1"), dtype=torch.float32),
25 | )
26 | self.register_buffer("joint_map", torch.tensor(smpl_to_openpose, dtype=torch.long))
27 | self.update_hips = update_hips
28 |
29 | def forward(self, *args, **kwargs) -> SMPLOutput:
30 | """
31 | Run forward pass. Same as SMPL and also append an extra set of joints if joint_regressor_extra is specified.
32 | """
33 | smpl_output = super(SMPL, self).forward(*args, **kwargs)
34 | joints = smpl_output.joints[:, self.joint_map, :]
35 | if self.update_hips:
36 | joints[:, [9, 12]] = (
37 | joints[:, [9, 12]]
38 | + 0.25 * (joints[:, [9, 12]] - joints[:, [12, 9]])
39 | + 0.5 * (joints[:, [8]] - 0.5 * (joints[:, [9, 12]] + joints[:, [12, 9]]))
40 | )
41 | if hasattr(self, "joint_regressor_extra"):
42 | extra_joints = vertices2joints(self.joint_regressor_extra, smpl_output.vertices)
43 | joints = torch.cat([joints, extra_joints], dim=1)
44 | smpl_output.joints = joints
45 | return smpl_output
46 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/body_model/README.md:
--------------------------------------------------------------------------------
1 | # README
2 |
3 | Contents of this folder are modified from HuMoR repository.
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/body_model/__init__.py:
--------------------------------------------------------------------------------
1 | from .body_model import BodyModel
2 | from .body_model_smplh import BodyModelSMPLH
3 | from .body_model_smplx import BodyModelSMPLX
4 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/body_model/body_model_smplh.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import smplx
4 |
5 | kwargs_disable_member_var = {
6 | "create_body_pose": False,
7 | "create_betas": False,
8 | "create_global_orient": False,
9 | "create_transl": False,
10 | "create_left_hand_pose": False,
11 | "create_right_hand_pose": False,
12 | }
13 |
14 |
15 | class BodyModelSMPLH(nn.Module):
16 | """Support Batch inference"""
17 |
18 | def __init__(self, model_path, **kwargs):
19 | super().__init__()
20 | # enable flexible batchsize, handle missing variable at forward()
21 | kwargs.update(kwargs_disable_member_var)
22 | self.bm = smplx.create(model_path=model_path, **kwargs)
23 | self.faces = self.bm.faces
24 | self.is_smpl = kwargs.get("model_type", "smpl") == "smpl"
25 | if not self.is_smpl:
26 | self.hand_pose_dim = self.bm.num_pca_comps if self.bm.use_pca else 3 * self.bm.NUM_HAND_JOINTS
27 |
28 | # For fast computing of skeleton under beta
29 | shapedirs = self.bm.shapedirs # (V, 3, 10)
30 | J_regressor = self.bm.J_regressor[:22, :] # (22, V)
31 | v_template = self.bm.v_template # (V, 3)
32 | J_template = J_regressor @ v_template # (22, 3)
33 | J_shapedirs = torch.einsum("jv, vcd -> jcd", J_regressor, shapedirs) # (22, 3, 10)
34 | self.register_buffer("J_template", J_template, False)
35 | self.register_buffer("J_shapedirs", J_shapedirs, False)
36 |
37 | def forward(
38 | self,
39 | betas=None,
40 | global_orient=None,
41 | transl=None,
42 | body_pose=None,
43 | left_hand_pose=None,
44 | right_hand_pose=None,
45 | **kwargs
46 | ):
47 |
48 | device, dtype = self.bm.shapedirs.device, self.bm.shapedirs.dtype
49 |
50 | model_vars = [betas, global_orient, body_pose, transl, left_hand_pose, right_hand_pose]
51 | batch_size = 1
52 | for var in model_vars:
53 | if var is None:
54 | continue
55 | batch_size = max(batch_size, len(var))
56 |
57 | if global_orient is None:
58 | global_orient = torch.zeros([batch_size, 3], dtype=dtype, device=device)
59 | if body_pose is None:
60 | body_pose = (
61 | torch.zeros(3 * self.bm.NUM_BODY_JOINTS, device=device, dtype=dtype)[None]
62 | .expand(batch_size, -1)
63 | .contiguous()
64 | )
65 | if not self.is_smpl:
66 | if left_hand_pose is None:
67 | left_hand_pose = (
68 | torch.zeros(self.hand_pose_dim, device=device, dtype=dtype)[None]
69 | .expand(batch_size, -1)
70 | .contiguous()
71 | )
72 | if right_hand_pose is None:
73 | right_hand_pose = (
74 | torch.zeros(self.hand_pose_dim, device=device, dtype=dtype)[None]
75 | .expand(batch_size, -1)
76 | .contiguous()
77 | )
78 | if betas is None:
79 | betas = torch.zeros([batch_size, self.bm.num_betas], dtype=dtype, device=device)
80 | if transl is None:
81 | transl = torch.zeros([batch_size, 3], dtype=dtype, device=device)
82 |
83 | bm_out = self.bm(
84 | betas=betas,
85 | global_orient=global_orient,
86 | body_pose=body_pose,
87 | left_hand_pose=left_hand_pose,
88 | right_hand_pose=right_hand_pose,
89 | transl=transl,
90 | **kwargs
91 | )
92 |
93 | return bm_out
94 |
95 | def get_skeleton(self, betas):
96 | """betas: (*, 10) -> skeleton_beta: (*, 22, 3)"""
97 | skeleton_beta = self.J_template + torch.einsum("...d, jcd -> ...jc", betas, self.J_shapedirs) # (22, 3)
98 | return skeleton_beta
99 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/body_model/coco_aug_dict.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/coco_aug_dict.pth
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/body_model/seg_part_info.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/seg_part_info.npy
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/body_model/smpl_3dpw14_J_regressor_sparse.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smpl_3dpw14_J_regressor_sparse.pt
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/body_model/smpl_coco17_J_regressor.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smpl_coco17_J_regressor.pt
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/body_model/smpl_neutral_J_regressor.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smpl_neutral_J_regressor.pt
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/body_model/smplx2smpl_sparse.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smplx2smpl_sparse.pt
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/body_model/smplx_verts437.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smplx_verts437.pt
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/callbacks/lr_monitor.py:
--------------------------------------------------------------------------------
1 | from pytorch_lightning.callbacks import LearningRateMonitor
2 | from hmr4d.configs import builds, MainStore
3 |
4 |
5 | MainStore.store(name="pl", node=builds(LearningRateMonitor), group="callbacks/lr_monitor")
6 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/callbacks/train_speed_timer.py:
--------------------------------------------------------------------------------
1 | import pytorch_lightning as pl
2 | from pytorch_lightning.utilities import rank_zero_only
3 | from time import time
4 | from collections import deque
5 |
6 | from hmr4d.configs import MainStore, builds
7 |
8 |
9 | class TrainSpeedTimer(pl.Callback):
10 | def __init__(self, N_avg=5):
11 | """
12 |         This callback times the training speed (averaged over the most recent N_avg iterations):
13 |         1. Data waiting time: should be small; otherwise data loading needs to be improved
14 |         2. Single batch time: time spent on one training batch (excluding data waiting)
15 | """
16 | super().__init__()
17 | self.last_batch_end = None
18 | self.this_batch_start = None
19 |
20 | # time queues for averaging
21 | self.data_waiting_time_queue = deque(maxlen=N_avg)
22 | self.single_batch_time_queue = deque(maxlen=N_avg)
23 |
24 | @rank_zero_only
25 | def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
26 | """Count the time of data waiting"""
27 | if self.last_batch_end is not None:
28 | # This should be small, otherwise the data loading should be improved
29 | data_waiting = time() - self.last_batch_end
30 |
31 | # Average the time
32 | self.data_waiting_time_queue.append(data_waiting)
33 | average_time = sum(self.data_waiting_time_queue) / len(self.data_waiting_time_queue)
34 |
35 | # Log to prog-bar
36 | pl_module.log(
37 | "train_timer/data_waiting", average_time, on_step=True, on_epoch=False, prog_bar=True, logger=True
38 | )
39 |
40 | self.this_batch_start = time()
41 |
42 | @rank_zero_only
43 | def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
44 | # Effective training time elapsed (excluding data waiting)
45 | single_batch = time() - self.this_batch_start
46 |
47 | # Average the time
48 | self.single_batch_time_queue.append(single_batch)
49 | average_time = sum(self.single_batch_time_queue) / len(self.single_batch_time_queue)
50 |
51 | # Log iter time
52 | pl_module.log(
53 | "train_timer/single_batch", average_time, on_step=True, on_epoch=False, prog_bar=False, logger=True
54 | )
55 |
56 | # Set timer for counting data waiting
57 | self.last_batch_end = time()
58 |
59 | @rank_zero_only
60 | def on_train_epoch_end(self, trainer, pl_module):
61 | # Reset the timer
62 | self.last_batch_end = None
63 | self.this_batch_start = None
64 | # Clear the queue
65 | self.data_waiting_time_queue.clear()
66 | self.single_batch_time_queue.clear()
67 |
68 |
69 | group_name = "callbacks/train_speed_timer"
70 | base = builds(TrainSpeedTimer, populate_full_signature=True)
71 | MainStore.store(name="base", node=base, group=group_name)
72 |
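A usage sketch (not part of the file above): in the repo this callback is registered through the hydra-zen MainStore, but it can also be passed straight to a Lightning Trainer. The model and datamodule below are placeholders.

```python
import pytorch_lightning as pl
from hmr4d.utils.callbacks.train_speed_timer import TrainSpeedTimer

trainer = pl.Trainer(
    max_epochs=1,
    callbacks=[TrainSpeedTimer(N_avg=5)],  # logs train_timer/data_waiting and train_timer/single_batch
)
# trainer.fit(model, datamodule)  # model and datamodule are defined elsewhere
```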
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/geo/flip_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from pytorch3d.transforms import axis_angle_to_matrix, matrix_to_axis_angle
3 |
4 |
5 | def flip_heatmap_coco17(output_flipped):
6 | assert output_flipped.ndim == 4, "output_flipped should be [B, J, H, W]"
7 | shape_ori = output_flipped.shape
8 | channels = 1
9 | output_flipped = output_flipped.reshape(shape_ori[0], -1, channels, shape_ori[2], shape_ori[3])
10 | output_flipped_back = output_flipped.clone()
11 |
12 | # Swap left-right parts
13 | for left, right in [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]:
14 | output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
15 | output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
16 | output_flipped_back = output_flipped_back.reshape(shape_ori)
17 | # Flip horizontally
18 | output_flipped_back = output_flipped_back.flip(3)
19 | return output_flipped_back
20 |
21 |
22 | def flip_bbx_xys(bbx_xys, w):
23 | """
24 | bbx_xys: (F, 3)
25 | """
26 | bbx_xys_flip = bbx_xys.clone()
27 | bbx_xys_flip[:, 0] = w - bbx_xys_flip[:, 0]
28 | return bbx_xys_flip
29 |
30 |
31 | def flip_kp2d_coco17(kp2d, w):
32 | """Flip keypoints."""
33 | kp2d = kp2d.clone()
34 | flipped_parts = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
35 | kp2d = kp2d[..., flipped_parts, :]
36 | kp2d[..., 0] = w - kp2d[..., 0]
37 | return kp2d
38 |
39 |
40 | def flip_smplx_params(smplx_params):
41 | """Flip pose.
42 | The flipping is based on SMPLX parameters.
43 | """
44 | rotation = torch.cat([smplx_params["global_orient"], smplx_params["body_pose"]], dim=1)
45 |
46 | BN = rotation.shape[0]
47 | pose = rotation.reshape(BN, -1).transpose(0, 1)
48 |
49 | SMPL_JOINTS_FLIP_PERM = [0, 2, 1, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12, 14, 13, 15, 17, 16, 19, 18, 21, 20] # , 23, 22]
50 | SMPL_POSE_FLIP_PERM = []
51 | for i in SMPL_JOINTS_FLIP_PERM:
52 | SMPL_POSE_FLIP_PERM.append(3 * i)
53 | SMPL_POSE_FLIP_PERM.append(3 * i + 1)
54 | SMPL_POSE_FLIP_PERM.append(3 * i + 2)
55 |
56 | pose = pose[SMPL_POSE_FLIP_PERM]
57 |
58 | # we also negate the second and the third dimension of the axis-angle
59 | pose[1::3] = -pose[1::3]
60 | pose[2::3] = -pose[2::3]
61 | pose = pose.transpose(0, 1).reshape(BN, -1, 3)
62 |
63 | smplx_params_flipped = smplx_params.copy()
64 | smplx_params_flipped["global_orient"] = pose[:, :1]
65 | smplx_params_flipped["body_pose"] = pose[:, 1:]
66 | return smplx_params_flipped
67 |
68 |
69 | def avg_smplx_aa(aa1, aa2):
70 | def avg_rot(rot):
71 | # input [B,...,3,3] --> output [...,3,3]
72 | rot = rot.mean(dim=0)
73 | U, _, V = torch.svd(rot)
74 | rot = U @ V.transpose(-1, -2)
75 | return rot
76 |
77 | B, J3 = aa1.shape
78 | aa1 = aa1.reshape(B, -1, 3)
79 | aa2 = aa2.reshape(B, -1, 3)
80 |
81 | R1 = axis_angle_to_matrix(aa1)
82 | R2 = axis_angle_to_matrix(aa2)
83 | R_avg = avg_rot(torch.stack([R1, R2]))
84 | aa_avg = matrix_to_axis_angle(R_avg).reshape(B, -1)
85 |
86 | return aa_avg
87 |
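A quick sketch with dummy data (not part of the file above): flipping COCO-17 keypoints swaps left/right joints and mirrors x about the image width, so applying the flip twice recovers the input.

```python
import torch
from hmr4d.utils.geo.flip_utils import flip_kp2d_coco17

kp2d = torch.rand(8, 17, 2) * 640                                # (F, 17, 2) keypoints in a 640-px-wide image
kp2d_flip = flip_kp2d_coco17(kp2d, w=640)
assert torch.allclose(flip_kp2d_coco17(kp2d_flip, w=640), kp2d)  # the flip is an involution
```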
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/geo/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def axis_rotate_to_matrix(angle, axis="x"):
5 | """Get rotation matrix for rotating around one axis
6 | Args:
7 |         angle: rotation angle in radians; a float or a tensor of shape (N,) / (N, 1)
8 | Returns:
9 | R: (N, 3, 3)
10 | """
11 | if isinstance(angle, float):
12 | angle = torch.tensor([angle], dtype=torch.float)
13 |
14 | c = torch.cos(angle)
15 | s = torch.sin(angle)
16 | z = torch.zeros_like(angle)
17 | o = torch.ones_like(angle)
18 | if axis == "x":
19 | R = torch.stack([o, z, z, z, c, -s, z, s, c], dim=1).view(-1, 3, 3)
20 | elif axis == "y":
21 | R = torch.stack([c, z, s, z, o, z, -s, z, c], dim=1).view(-1, 3, 3)
22 | else:
23 | assert axis == "z"
24 | R = torch.stack([c, -s, z, s, c, z, z, z, o], dim=1).view(-1, 3, 3)
25 | return R
26 |
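A small sanity-check sketch for the helper above (illustration only): a 90° rotation about z maps the x-axis onto the y-axis.

```python
import math
import torch
from hmr4d.utils.geo.transforms import axis_rotate_to_matrix

R = axis_rotate_to_matrix(torch.tensor([math.pi / 2]), axis="z")  # (1, 3, 3)
print(R[0] @ torch.tensor([1.0, 0.0, 0.0]))                       # ~tensor([0., 1., 0.])
```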
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 | from hmr4d.utils.preproc.tracker import Tracker
3 | from hmr4d.utils.preproc.vitfeat_extractor import Extractor
4 | from hmr4d.utils.preproc.vitpose import VitPoseExtractor
5 | from hmr4d.utils.preproc.slam import SLAMModel
6 | except ImportError:  # optional preprocessing dependencies (e.g. DPVO) may be missing
7 |     pass
8 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/slam.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import time
3 | import torch
4 | from multiprocessing import Process, Queue
5 |
6 | try:
7 | from dpvo.utils import Timer
8 | from dpvo.dpvo import DPVO
9 | from dpvo.config import cfg
10 | except ImportError:  # DPVO is an optional dependency; SLAMModel cannot run without it
11 |     pass
12 |
13 |
14 | from hmr4d import PROJ_ROOT
15 | from hmr4d.utils.geo.hmr_cam import estimate_focal_length
16 |
17 |
18 | class SLAMModel(object):
19 | def __init__(self, video_path, width, height, intrinsics=None, stride=1, skip=0, buffer=2048, resize=0.5):
20 | """
21 | Args:
22 | intrinsics: [fx, fy, cx, cy]
23 | """
24 | if intrinsics is None:
25 | print("Estimating focal length")
26 | focal_length = estimate_focal_length(width, height)
27 | intrinsics = torch.tensor([focal_length, focal_length, width / 2.0, height / 2.0])
28 | else:
29 | intrinsics = intrinsics.clone()
30 |
31 | self.dpvo_cfg = str(PROJ_ROOT / "third-party/DPVO/config/default.yaml")
32 | self.dpvo_ckpt = "inputs/checkpoints/dpvo/dpvo.pth"
33 |
34 | self.buffer = buffer
35 | self.times = []
36 | self.slam = None
37 | self.queue = Queue(maxsize=8)
38 | self.reader = Process(target=video_stream, args=(self.queue, video_path, intrinsics, stride, skip, resize))
39 | self.reader.start()
40 |
41 | def track(self):
42 | (t, image, intrinsics) = self.queue.get()
43 |
44 | if t < 0:
45 | return False
46 |
47 | image = torch.from_numpy(image).permute(2, 0, 1).cuda()
48 | intrinsics = intrinsics.cuda() # [fx, fy, cx, cy]
49 |
50 | if self.slam is None:
51 | cfg.merge_from_file(self.dpvo_cfg)
52 | cfg.BUFFER_SIZE = self.buffer
53 | self.slam = DPVO(cfg, self.dpvo_ckpt, ht=image.shape[1], wd=image.shape[2], viz=False)
54 |
55 | with Timer("SLAM", enabled=False):
56 | t = time.time()
57 | self.slam(t, image, intrinsics)
58 | self.times.append(time.time() - t)
59 |
60 | return True
61 |
62 | def process(self):
63 | for _ in range(12):
64 | self.slam.update()
65 |
66 | self.reader.join()
67 | return self.slam.terminate()[0]
68 |
69 |
70 | def video_stream(queue, imagedir, intrinsics, stride, skip=0, resize=0.5):
71 | """video generator"""
72 | assert len(intrinsics) == 4, "intrinsics should be [fx, fy, cx, cy]"
73 |
74 | cap = cv2.VideoCapture(imagedir)
75 | t = 0
76 | for _ in range(skip):
77 | ret, image = cap.read()
78 |
79 | while True:
80 | # Capture frame-by-frame
81 | for _ in range(stride):
82 | ret, image = cap.read()
83 | # if frame is read correctly ret is True
84 | if not ret:
85 | break
86 |
87 | if not ret:
88 | break
89 |
90 | image = cv2.resize(image, None, fx=resize, fy=resize, interpolation=cv2.INTER_AREA)
91 | h, w, _ = image.shape
92 | image = image[: h - h % 16, : w - w % 16]
93 |
94 | intrinsics_ = intrinsics.clone() * resize
95 | queue.put((t, image, intrinsics_))
96 |
97 | t += 1
98 |
99 | queue.put((-1, image, intrinsics)) # -1 will terminate the process
100 | cap.release()
101 |
102 | # wait for the queue to be empty, otherwise the process will end immediately
103 | while not queue.empty():
104 | time.sleep(1)
105 |
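A driver-loop sketch (not part of the file above); it assumes DPVO and its checkpoint are installed as the class expects, and the video path is a placeholder.

```python
from hmr4d.utils.preproc.slam import SLAMModel

slam = SLAMModel("demo.mp4", width=1920, height=1080)  # placeholder video path
while slam.track():    # consume frames from the background reader process
    pass
traj = slam.process()  # camera trajectory returned by DPVO's terminate()
```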
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitfeat_extractor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from hmr4d.network.hmr2 import load_hmr2, HMR2
3 |
4 |
5 | from hmr4d.utils.video_io_utils import read_video_np
6 | import cv2
7 | import numpy as np
8 |
9 | from hmr4d.network.hmr2.utils.preproc import crop_and_resize, IMAGE_MEAN, IMAGE_STD
10 | from tqdm import tqdm
11 |
12 |
13 | def get_batch(input_path, bbx_xys, img_ds=0.5, img_dst_size=256, path_type="video"):
14 | if path_type == "video":
15 | imgs = read_video_np(input_path, scale=img_ds)
16 | elif path_type == "image":
17 | imgs = cv2.imread(str(input_path))[..., ::-1]
18 | imgs = cv2.resize(imgs, (0, 0), fx=img_ds, fy=img_ds)
19 | imgs = imgs[None]
20 | elif path_type == "np":
21 | assert isinstance(input_path, np.ndarray)
22 |         assert img_ds == 1.0  # raw ndarray input is assumed to already be at the target scale
23 | imgs = input_path
24 |
25 | gt_center = bbx_xys[:, :2]
26 | gt_bbx_size = bbx_xys[:, 2]
27 |
28 | # Blur image to avoid aliasing artifacts
29 | if True:
30 | gt_bbx_size_ds = gt_bbx_size * img_ds
31 | ds_factors = ((gt_bbx_size_ds * 1.0) / img_dst_size / 2.0).numpy()
32 | imgs = np.stack(
33 | [
34 | # gaussian(v, sigma=(d - 1) / 2, channel_axis=2, preserve_range=True) if d > 1.1 else v
35 | cv2.GaussianBlur(v, (5, 5), (d - 1) / 2) if d > 1.1 else v
36 | for v, d in zip(imgs, ds_factors)
37 | ]
38 | )
39 |
40 | # Output
41 | imgs_list = []
42 | bbx_xys_ds_list = []
43 | for i in range(len(imgs)):
44 | img, bbx_xys_ds = crop_and_resize(
45 | imgs[i],
46 | gt_center[i] * img_ds,
47 | gt_bbx_size[i] * img_ds,
48 | img_dst_size,
49 | enlarge_ratio=1.0,
50 | )
51 | imgs_list.append(img)
52 | bbx_xys_ds_list.append(bbx_xys_ds)
53 | imgs = torch.from_numpy(np.stack(imgs_list)) # (F, 256, 256, 3), RGB
54 | bbx_xys = torch.from_numpy(np.stack(bbx_xys_ds_list)) / img_ds # (F, 3)
55 |
56 |     imgs = ((imgs / 255.0 - IMAGE_MEAN) / IMAGE_STD).permute(0, 3, 1, 2)  # (F, 3, 256, 256)
57 | return imgs, bbx_xys
58 |
59 |
60 | class Extractor:
61 | def __init__(self, tqdm_leave=True):
62 | self.extractor: HMR2 = load_hmr2().cuda().eval()
63 | self.tqdm_leave = tqdm_leave
64 |
65 | def extract_video_features(self, video_path, bbx_xys, img_ds=0.5):
66 | """
67 | img_ds makes the image smaller, which is useful for faster processing
68 | """
69 | # Get the batch
70 | if isinstance(video_path, str):
71 | imgs, bbx_xys = get_batch(video_path, bbx_xys, img_ds=img_ds)
72 | else:
73 | assert isinstance(video_path, torch.Tensor)
74 | imgs = video_path
75 |
76 | # Inference
77 | F, _, H, W = imgs.shape # (F, 3, H, W)
78 | imgs = imgs.cuda()
79 |         batch_size = 16  # ~5 GB of GPU memory; enough to saturate an RTX 3090
80 | features = []
81 | for j in tqdm(range(0, F, batch_size), desc="HMR2 Feature", leave=self.tqdm_leave):
82 | imgs_batch = imgs[j : j + batch_size]
83 |
84 | with torch.no_grad():
85 | feature = self.extractor({"img": imgs_batch})
86 | features.append(feature.detach().cpu())
87 |
88 | features = torch.cat(features, dim=0).clone() # (F, 1024)
89 | return features
90 |
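A usage sketch (placeholder path and boxes; it needs a CUDA device and the HMR2 checkpoint that load_hmr2 expects). bbx_xys holds one (center-x, center-y, size) box per frame, as consumed by get_batch.

```python
import torch
from hmr4d.utils.preproc.vitfeat_extractor import Extractor

extractor = Extractor()
bbx_xys = torch.tensor([[960.0, 540.0, 512.0]]).repeat(100, 1)    # (F, 3), placeholder boxes
features = extractor.extract_video_features("demo.mp4", bbx_xys)  # (F, 1024) image features
```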
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/__init__.py:
--------------------------------------------------------------------------------
1 | from .src.vitpose_infer.model_builder import build_model
2 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/__init__.py
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/__init__.py
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | # from .alexnet import AlexNet
3 | # from .cpm import CPM
4 | # from .hourglass import HourglassNet
5 | # from .hourglass_ae import HourglassAENet
6 | # from .hrformer import HRFormer
7 | # from .hrnet import HRNet
8 | # from .litehrnet import LiteHRNet
9 | # from .mobilenet_v2 import MobileNetV2
10 | # from .mobilenet_v3 import MobileNetV3
11 | # from .mspn import MSPN
12 | # from .regnet import RegNet
13 | # from .resnest import ResNeSt
14 | # from .resnet import ResNet, ResNetV1d
15 | # from .resnext import ResNeXt
16 | # from .rsn import RSN
17 | # from .scnet import SCNet
18 | # from .seresnet import SEResNet
19 | # from .seresnext import SEResNeXt
20 | # from .shufflenet_v1 import ShuffleNetV1
21 | # from .shufflenet_v2 import ShuffleNetV2
22 | # from .tcn import TCN
23 | # from .v2v_net import V2VNet
24 | # from .vgg import VGG
25 | # from .vipnas_mbv3 import ViPNAS_MobileNetV3
26 | # from .vipnas_resnet import ViPNAS_ResNet
27 | from .vit import ViT
28 |
29 | # __all__ = [
30 | # 'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2',
31 | # 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet',
32 | # 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN',
33 | # 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3',
34 | # 'LiteHRNet', 'V2VNet', 'HRFormer', 'ViT'
35 | # ]
36 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/alexnet.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch.nn as nn
3 |
4 | from ..builder import BACKBONES
5 | from .base_backbone import BaseBackbone
6 |
7 |
8 | @BACKBONES.register_module()
9 | class AlexNet(BaseBackbone):
10 |     """AlexNet backbone.
11 |
12 | The input for AlexNet is a 224x224 RGB image.
13 |
14 | Args:
15 | num_classes (int): number of classes for classification.
16 | The default value is -1, which uses the backbone as
17 | a feature extractor without the top classifier.
18 | """
19 |
20 | def __init__(self, num_classes=-1):
21 | super().__init__()
22 | self.num_classes = num_classes
23 | self.features = nn.Sequential(
24 | nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
25 | nn.ReLU(inplace=True),
26 | nn.MaxPool2d(kernel_size=3, stride=2),
27 | nn.Conv2d(64, 192, kernel_size=5, padding=2),
28 | nn.ReLU(inplace=True),
29 | nn.MaxPool2d(kernel_size=3, stride=2),
30 | nn.Conv2d(192, 384, kernel_size=3, padding=1),
31 | nn.ReLU(inplace=True),
32 | nn.Conv2d(384, 256, kernel_size=3, padding=1),
33 | nn.ReLU(inplace=True),
34 | nn.Conv2d(256, 256, kernel_size=3, padding=1),
35 | nn.ReLU(inplace=True),
36 | nn.MaxPool2d(kernel_size=3, stride=2),
37 | )
38 | if self.num_classes > 0:
39 | self.classifier = nn.Sequential(
40 | nn.Dropout(),
41 | nn.Linear(256 * 6 * 6, 4096),
42 | nn.ReLU(inplace=True),
43 | nn.Dropout(),
44 | nn.Linear(4096, 4096),
45 | nn.ReLU(inplace=True),
46 | nn.Linear(4096, num_classes),
47 | )
48 |
49 | def forward(self, x):
50 |
51 | x = self.features(x)
52 | if self.num_classes > 0:
53 | x = x.view(x.size(0), 256 * 6 * 6)
54 | x = self.classifier(x)
55 |
56 | return x
57 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/test_torch.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class Net(nn.Module):
7 |
8 | def __init__(self):
9 | super(Net, self).__init__()
10 | # 1 input image channel, 6 output channels, 5x5 square convolution
11 | # kernel
12 | self.conv1 = nn.Conv2d(1, 6, 5)
13 | self.conv2 = nn.Conv2d(6, 16, 5)
14 | # an affine operation: y = Wx + b
15 | self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension
16 | self.fc2 = nn.Linear(120, 84)
17 | self.fc3 = nn.Linear(84, 10)
18 |
19 | def forward(self, x):
20 | # Max pooling over a (2, 2) window
21 | x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
22 | # If the size is a square, you can specify with a single number
23 | x = F.max_pool2d(F.relu(self.conv2(x)), 2)
24 | x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
25 | x = F.relu(self.fc1(x))
26 | x = F.relu(self.fc2(x))
27 | x = self.fc3(x)
28 | return x
29 |
30 |
31 | net = Net()
32 | # print(net)
33 |
34 | net.train()
35 |
36 | input = torch.randn(1, 1, 32, 32)
37 | # out = net(input)
38 | # print(out)
39 | output = net(input)
40 | target = torch.randn(10) # a dummy target, for example
41 | target = target.view(1, -1) # make it the same shape as output
42 | criterion = nn.MSELoss()
43 |
44 | # loss = criterion(output.cuda(), target.cuda())
45 |
46 | import torch.optim as optim
47 |
48 | # create your optimizer
49 | optimizer = optim.SGD(net.parameters(), lr=0.01)
50 |
51 | # in your training loop:
52 | optimizer.zero_grad() # zero the gradient buffers
53 | output = net(input)
54 | loss = criterion(output, target)
55 |
56 | loss.backward()
57 |
58 | optimizer.step()
59 |
60 | # print(loss)
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .channel_shuffle import channel_shuffle
3 | from .inverted_residual import InvertedResidual
4 | from .make_divisible import make_divisible
5 | from .se_layer import SELayer
6 | from .utils import load_checkpoint
7 |
8 | __all__ = [
9 | 'channel_shuffle', 'make_divisible', 'InvertedResidual', 'SELayer',
10 | 'load_checkpoint'
11 | ]
12 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/channel_shuffle.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 |
4 |
5 | def channel_shuffle(x, groups):
6 | """Channel Shuffle operation.
7 |
8 | This function enables cross-group information flow for multiple groups
9 | convolution layers.
10 |
11 | Args:
12 | x (Tensor): The input tensor.
13 | groups (int): The number of groups to divide the input tensor
14 | in the channel dimension.
15 |
16 | Returns:
17 | Tensor: The output tensor after channel shuffle operation.
18 | """
19 |
20 | batch_size, num_channels, height, width = x.size()
21 | assert (num_channels % groups == 0), ('num_channels should be '
22 | 'divisible by groups')
23 | channels_per_group = num_channels // groups
24 |
25 | x = x.view(batch_size, groups, channels_per_group, height, width)
26 | x = torch.transpose(x, 1, 2).contiguous()
27 | x = x.view(batch_size, -1, height, width)
28 |
29 | return x
30 |
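A shape-level illustration of the operation above: the channel count is preserved while channels are interleaved across the groups.

```python
import torch

x = torch.arange(2 * 8 * 4 * 4, dtype=torch.float32).reshape(2, 8, 4, 4)
y = channel_shuffle(x, groups=2)   # the function defined in this file
print(y.shape)                     # torch.Size([2, 8, 4, 4])
```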
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/make_divisible.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | def make_divisible(value, divisor, min_value=None, min_ratio=0.9):
3 | """Make divisible function.
4 |
5 |     This function rounds the channel number to the nearest value that can
6 | be divisible by the divisor.
7 |
8 | Args:
9 | value (int): The original channel number.
10 | divisor (int): The divisor to fully divide the channel number.
11 | min_value (int, optional): The minimum value of the output channel.
12 |             Default: None, which means the minimum value equals the divisor.
13 | min_ratio (float, optional): The minimum ratio of the rounded channel
14 | number to the original channel number. Default: 0.9.
15 | Returns:
16 | int: The modified output channel number
17 | """
18 |
19 | if min_value is None:
20 | min_value = divisor
21 | new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
22 | # Make sure that round down does not go down by more than (1-min_ratio).
23 | if new_value < min_ratio * value:
24 | new_value += divisor
25 | return new_value
26 |
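A few worked values for the function above (doctest-style illustration): the channel count is rounded to the nearest multiple of the divisor, clamped to min_value, and bumped up if rounding lost more than (1 - min_ratio) of the original.

```python
>>> make_divisible(30, 8)
32
>>> make_divisible(17, 8)   # 16 >= 0.9 * 17, so no upward correction
16
>>> make_divisible(6, 8)    # clamped up to min_value (= divisor)
8
```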
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/se_layer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import mmcv
3 | import torch.nn as nn
4 | from mmcv.cnn import ConvModule
5 |
6 |
7 | class SELayer(nn.Module):
8 | """Squeeze-and-Excitation Module.
9 |
10 | Args:
11 | channels (int): The input (and output) channels of the SE layer.
12 | ratio (int): Squeeze ratio in SELayer, the intermediate channel will be
13 | ``int(channels/ratio)``. Default: 16.
14 | conv_cfg (None or dict): Config dict for convolution layer.
15 | Default: None, which means using conv2d.
16 | act_cfg (dict or Sequence[dict]): Config dict for activation layer.
17 |             If act_cfg is a dict, two activation layers will be configured
18 |             by this dict. If act_cfg is a sequence of dicts, the first
19 |             activation layer will be configured by the first dict and the
20 |             second activation layer will be configured by the second dict.
21 | Default: (dict(type='ReLU'), dict(type='Sigmoid'))
22 | """
23 |
24 | def __init__(self,
25 | channels,
26 | ratio=16,
27 | conv_cfg=None,
28 | act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))):
29 | super().__init__()
30 | if isinstance(act_cfg, dict):
31 | act_cfg = (act_cfg, act_cfg)
32 | assert len(act_cfg) == 2
33 | assert mmcv.is_tuple_of(act_cfg, dict)
34 | self.global_avgpool = nn.AdaptiveAvgPool2d(1)
35 | self.conv1 = ConvModule(
36 | in_channels=channels,
37 | out_channels=int(channels / ratio),
38 | kernel_size=1,
39 | stride=1,
40 | conv_cfg=conv_cfg,
41 | act_cfg=act_cfg[0])
42 | self.conv2 = ConvModule(
43 | in_channels=int(channels / ratio),
44 | out_channels=channels,
45 | kernel_size=1,
46 | stride=1,
47 | conv_cfg=conv_cfg,
48 | act_cfg=act_cfg[1])
49 |
50 | def forward(self, x):
51 | out = self.global_avgpool(x)
52 | out = self.conv1(out)
53 | out = self.conv2(out)
54 | return x * out
55 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from collections import OrderedDict
3 |
4 | from mmcv.runner.checkpoint import _load_checkpoint, load_state_dict
5 |
6 |
7 | def load_checkpoint(model,
8 | filename,
9 | map_location='cpu',
10 | strict=False,
11 | logger=None):
12 | """Load checkpoint from a file or URI.
13 |
14 | Args:
15 | model (Module): Module to load checkpoint.
16 | filename (str): Accept local filepath, URL, ``torchvision://xxx``,
17 | ``open-mmlab://xxx``.
18 | map_location (str): Same as :func:`torch.load`.
19 | strict (bool): Whether to allow different params for the model and
20 | checkpoint.
21 | logger (:mod:`logging.Logger` or None): The logger for error message.
22 |
23 | Returns:
24 | dict or OrderedDict: The loaded checkpoint.
25 | """
26 | checkpoint = _load_checkpoint(filename, map_location)
27 | # OrderedDict is a subclass of dict
28 | if not isinstance(checkpoint, dict):
29 | raise RuntimeError(
30 | f'No state_dict found in checkpoint file {filename}')
31 | # get state_dict from checkpoint
32 | if 'state_dict' in checkpoint:
33 | state_dict_tmp = checkpoint['state_dict']
34 | else:
35 | state_dict_tmp = checkpoint
36 |
37 | state_dict = OrderedDict()
38 | # strip prefix of state_dict
39 | for k, v in state_dict_tmp.items():
40 | if k.startswith('module.backbone.'):
41 | state_dict[k[16:]] = v
42 | elif k.startswith('module.'):
43 | state_dict[k[7:]] = v
44 | elif k.startswith('backbone.'):
45 | state_dict[k[9:]] = v
46 | else:
47 | state_dict[k] = v
48 | # load state_dict
49 | load_state_dict(model, state_dict, strict, logger)
50 | return checkpoint
51 |
52 |
53 | def get_state_dict(filename, map_location='cpu'):
54 | """Get state_dict from a file or URI.
55 |
56 | Args:
57 | filename (str): Accept local filepath, URL, ``torchvision://xxx``,
58 | ``open-mmlab://xxx``.
59 | map_location (str): Same as :func:`torch.load`.
60 |
61 | Returns:
62 | OrderedDict: The state_dict.
63 | """
64 | checkpoint = _load_checkpoint(filename, map_location)
65 | # OrderedDict is a subclass of dict
66 | if not isinstance(checkpoint, dict):
67 | raise RuntimeError(
68 | f'No state_dict found in checkpoint file {filename}')
69 | # get state_dict from checkpoint
70 | if 'state_dict' in checkpoint:
71 | state_dict_tmp = checkpoint['state_dict']
72 | else:
73 | state_dict_tmp = checkpoint
74 |
75 | state_dict = OrderedDict()
76 | # strip prefix of state_dict
77 | for k, v in state_dict_tmp.items():
78 | if k.startswith('module.backbone.'):
79 | state_dict[k[16:]] = v
80 | elif k.startswith('module.'):
81 | state_dict[k[7:]] = v
82 | elif k.startswith('backbone.'):
83 | state_dict[k[9:]] = v
84 | else:
85 | state_dict[k] = v
86 |
87 | return state_dict
88 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/configs/coco/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/configs/coco/__init__.py
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | # from .ae_higher_resolution_head import AEHigherResolutionHead
3 | # from .ae_multi_stage_head import AEMultiStageHead
4 | # from .ae_simple_head import AESimpleHead
5 | # from .deconv_head import DeconvHead
6 | # from .deeppose_regression_head import DeepposeRegressionHead
7 | # from .hmr_head import HMRMeshHead
8 | # from .interhand_3d_head import Interhand3DHead
9 | # from .temporal_regression_head import TemporalRegressionHead
10 | from .topdown_heatmap_base_head import TopdownHeatmapBaseHead
11 | # from .topdown_heatmap_multi_stage_head import (TopdownHeatmapMSMUHead,
12 | # TopdownHeatmapMultiStageHead)
13 | from .topdown_heatmap_simple_head import TopdownHeatmapSimpleHead
14 | # from .vipnas_heatmap_simple_head import ViPNASHeatmapSimpleHead
15 | # from .voxelpose_head import CuboidCenterHead, CuboidPoseHead
16 |
17 | # __all__ = [
18 | # 'TopdownHeatmapSimpleHead', 'TopdownHeatmapMultiStageHead',
19 | # 'TopdownHeatmapMSMUHead', 'TopdownHeatmapBaseHead',
20 | # 'AEHigherResolutionHead', 'AESimpleHead', 'AEMultiStageHead',
21 | # 'DeepposeRegressionHead', 'TemporalRegressionHead', 'Interhand3DHead',
22 | # 'HMRMeshHead', 'DeconvHead', 'ViPNASHeatmapSimpleHead', 'CuboidCenterHead',
23 | # 'CuboidPoseHead'
24 | # ]
25 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/hmr_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | from mmcv.cnn import xavier_init
6 |
7 | from ..builder import HEADS
8 | from ..utils.geometry import rot6d_to_rotmat
9 |
10 |
11 | @HEADS.register_module()
12 | class HMRMeshHead(nn.Module):
13 | """SMPL parameters regressor head of simple baseline. "End-to-end Recovery
14 | of Human Shape and Pose", CVPR'2018.
15 |
16 | Args:
17 | in_channels (int): Number of input channels
18 | smpl_mean_params (str): The file name of the mean SMPL parameters
19 | n_iter (int): The iterations of estimating delta parameters
20 | """
21 |
22 | def __init__(self, in_channels, smpl_mean_params=None, n_iter=3):
23 | super().__init__()
24 |
25 | self.in_channels = in_channels
26 | self.n_iter = n_iter
27 |
28 | npose = 24 * 6
29 | nbeta = 10
30 | ncam = 3
31 | hidden_dim = 1024
32 |
33 | self.fc1 = nn.Linear(in_channels + npose + nbeta + ncam, hidden_dim)
34 | self.drop1 = nn.Dropout()
35 | self.fc2 = nn.Linear(hidden_dim, hidden_dim)
36 | self.drop2 = nn.Dropout()
37 | self.decpose = nn.Linear(hidden_dim, npose)
38 | self.decshape = nn.Linear(hidden_dim, nbeta)
39 | self.deccam = nn.Linear(hidden_dim, ncam)
40 |
41 | # Load mean SMPL parameters
42 | if smpl_mean_params is None:
43 | init_pose = torch.zeros([1, npose])
44 | init_shape = torch.zeros([1, nbeta])
45 | init_cam = torch.FloatTensor([[1, 0, 0]])
46 | else:
47 | mean_params = np.load(smpl_mean_params)
48 | init_pose = torch.from_numpy(
49 | mean_params['pose'][:]).unsqueeze(0).float()
50 | init_shape = torch.from_numpy(
51 | mean_params['shape'][:]).unsqueeze(0).float()
52 | init_cam = torch.from_numpy(
53 | mean_params['cam']).unsqueeze(0).float()
54 | self.register_buffer('init_pose', init_pose)
55 | self.register_buffer('init_shape', init_shape)
56 | self.register_buffer('init_cam', init_cam)
57 |
58 | def forward(self, x):
59 | """Forward function.
60 |
61 | x is the image feature map and is expected to be in shape (batch size x
62 | channel number x height x width)
63 | """
64 | batch_size = x.shape[0]
65 | # extract the global feature vector by average along
66 | # spatial dimension.
67 | x = x.mean(dim=-1).mean(dim=-1)
68 |
69 | init_pose = self.init_pose.expand(batch_size, -1)
70 | init_shape = self.init_shape.expand(batch_size, -1)
71 | init_cam = self.init_cam.expand(batch_size, -1)
72 |
73 | pred_pose = init_pose
74 | pred_shape = init_shape
75 | pred_cam = init_cam
76 | for _ in range(self.n_iter):
77 | xc = torch.cat([x, pred_pose, pred_shape, pred_cam], 1)
78 | xc = self.fc1(xc)
79 | xc = self.drop1(xc)
80 | xc = self.fc2(xc)
81 | xc = self.drop2(xc)
82 | pred_pose = self.decpose(xc) + pred_pose
83 | pred_shape = self.decshape(xc) + pred_shape
84 | pred_cam = self.deccam(xc) + pred_cam
85 |
86 | pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3)
87 | out = (pred_rotmat, pred_shape, pred_cam)
88 | return out
89 |
90 | def init_weights(self):
91 | """Initialize model weights."""
92 | xavier_init(self.decpose, gain=0.01)
93 | xavier_init(self.decshape, gain=0.01)
94 | xavier_init(self.deccam, gain=0.01)
95 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/model_builder.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | # from configs.coco.ViTPose_base_coco_256x192 import model
4 | from .heads.topdown_heatmap_simple_head import TopdownHeatmapSimpleHead
5 |
6 | # import TopdownHeatmapSimpleHead
7 | from .backbones import ViT
8 |
9 | # print(model)
10 | import torch
11 | from functools import partial
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | from importlib import import_module
15 |
16 |
17 | def build_model(model_name, checkpoint=None):
18 | try:
19 | path = ".configs.coco." + model_name
20 | mod = import_module(path, package="src.vitpose_infer")
21 |
22 | model = getattr(mod, "model")
23 | # from path import model
24 |     except Exception as e:
25 |         raise ValueError(f"Unknown ViTPose config: {model_name}") from e
26 |
27 | head = TopdownHeatmapSimpleHead(
28 | in_channels=model["keypoint_head"]["in_channels"],
29 | out_channels=model["keypoint_head"]["out_channels"],
30 | num_deconv_filters=model["keypoint_head"]["num_deconv_filters"],
31 | num_deconv_kernels=model["keypoint_head"]["num_deconv_kernels"],
32 | num_deconv_layers=model["keypoint_head"]["num_deconv_layers"],
33 | extra=model["keypoint_head"]["extra"],
34 | )
35 | # print(head)
36 | backbone = ViT(
37 | img_size=model["backbone"]["img_size"],
38 | patch_size=model["backbone"]["patch_size"],
39 | embed_dim=model["backbone"]["embed_dim"],
40 | depth=model["backbone"]["depth"],
41 | num_heads=model["backbone"]["num_heads"],
42 | ratio=model["backbone"]["ratio"],
43 | mlp_ratio=model["backbone"]["mlp_ratio"],
44 | qkv_bias=model["backbone"]["qkv_bias"],
45 | drop_path_rate=model["backbone"]["drop_path_rate"],
46 | )
47 |
48 | class VitPoseModel(nn.Module):
49 | def __init__(self, backbone, keypoint_head):
50 | super(VitPoseModel, self).__init__()
51 | self.backbone = backbone
52 | self.keypoint_head = keypoint_head
53 |
54 | def forward(self, x):
55 | x = self.backbone(x)
56 | x = self.keypoint_head(x)
57 | return x
58 |
59 | pose = VitPoseModel(backbone, head)
60 | if checkpoint is not None:
61 | check = torch.load(checkpoint)
62 |
63 | pose.load_state_dict(check["state_dict"])
64 | return pose
65 |
66 |
67 | # pose = build_model('ViTPose_base_coco_256x192','./models/vitpose-b-multi-coco.pth')
68 |
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/__init__.py
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/convert_to_trt.py:
--------------------------------------------------------------------------------
1 | from torch2trt import TRTModule,torch2trt
2 | from builder import build_model
3 | import torch
4 | pose = build_model('ViTPose_base_coco_256x192','./models/vitpose-b.pth')
5 | pose.cuda().eval()
6 |
7 | x = torch.ones(1,3,256,192).cuda()
8 | net_trt = torch2trt(pose, [x],max_batch_size=10, fp16_mode=True)
9 | torch.save(net_trt.state_dict(), 'vitpose_trt.pth')
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/inference_test.py:
--------------------------------------------------------------------------------
1 | from builder import build_model
2 | import torch
3 | from ViTPose_trt import TRTModule_ViTPose
4 | # pose = TRTModule_ViTPose(path='pose_higher_hrnet_w32_512.engine',device='cuda:0')
5 | pose = build_model('ViTPose_base_coco_256x192','./models/vitpose-b.pth')
6 | pose.cuda().eval()
7 | if pose.training:
8 | print('train')
9 | else:
10 | print('eval')
11 | device = torch.device("cuda")
12 | # pose.to(device)
13 | dummy_input = torch.randn(10, 3,256,192, dtype=torch.float).to(device)
14 | repetitions=100
15 | total_time = 0
16 | starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
17 | with torch.no_grad():
18 | for rep in range(repetitions):
19 | # starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
20 | starter.record()
21 | # for k in range(10):
22 | _ = pose(dummy_input)
23 | ender.record()
24 | torch.cuda.synchronize()
25 | curr_time = starter.elapsed_time(ender)/1000
26 | total_time += curr_time
27 | Throughput = repetitions*10/total_time
28 | print('Final Throughput:',Throughput)
29 | print('Total time',total_time)
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/logger_helper.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | class CustomFormatter(logging.Formatter):
4 |
5 | grey = "\x1b[38;20m"
6 | yellow = "\x1b[33;20m"
7 | red = "\x1b[31;20m"
8 | bold_red = "\x1b[31;1m"
9 | reset = "\x1b[0m"
10 | format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)"
11 |
12 | FORMATS = {
13 | logging.DEBUG: grey + format + reset,
14 | logging.INFO: grey + format + reset,
15 | logging.WARNING: yellow + format + reset,
16 | logging.ERROR: red + format + reset,
17 | logging.CRITICAL: bold_red + format + reset
18 | }
19 |
20 | def format(self, record):
21 | log_fmt = self.FORMATS.get(record.levelno)
22 | formatter = logging.Formatter(log_fmt)
23 | return formatter.format(record)
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/timerr.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 |
4 | class Timer(object):
5 | """A simple timer."""
6 | def __init__(self):
7 | self.total_time = 0.
8 | self.calls = 0
9 | self.start_time = 0.
10 | self.diff = 0.
11 | self.average_time = 0.
12 |
13 | self.duration = 0.
14 |
15 | def tic(self):
16 |         # using time.time instead of time.clock because time.clock
17 |         # does not normalize for multithreading
18 | self.start_time = time.time()
19 |
20 | def toc(self, average=True):
21 | self.diff = time.time() - self.start_time
22 | self.total_time += self.diff
23 | self.calls += 1
24 | self.average_time = self.total_time / self.calls
25 | if average:
26 | self.duration = self.average_time
27 | else:
28 | self.duration = self.diff
29 | return self.duration
30 |
31 | def clear(self):
32 | self.total_time = 0.
33 | self.calls = 0
34 | self.start_time = 0.
35 | self.diff = 0.
36 | self.average_time = 0.
37 | self.duration = 0.
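Typical tic/toc usage of the timer above (the sleep stands in for real work):

```python
import time

timer = Timer()              # the class defined in this file
for _ in range(3):
    timer.tic()
    time.sleep(0.01)         # placeholder workload
    avg = timer.toc()        # running average over all calls so far
print(f"average iteration time: {avg:.4f}s over {timer.calls} calls")
```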
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/pylogger.py:
--------------------------------------------------------------------------------
1 | from time import time
2 | import logging
3 | import torch
4 | from colorlog import ColoredFormatter
5 |
6 |
7 | def sync_time():
8 | torch.cuda.synchronize()
9 | return time()
10 |
11 |
12 | Log = logging.getLogger()
13 | Log.time = time
14 | Log.sync_time = sync_time
15 |
16 | # Set default
17 | Log.setLevel(logging.INFO)
18 | ch = logging.StreamHandler()
19 | ch.setLevel(logging.INFO)
20 | # Use colorlog
21 | formatstring = "[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] %(message)s"
22 | datefmt = "%m/%d %H:%M:%S"
23 | ch.setFormatter(ColoredFormatter(formatstring, datefmt=datefmt))
24 |
25 | Log.addHandler(ch)
26 | # Log.info("Init-Logger")
27 |
28 |
29 | def timer(sync_cuda=False, mem=False, loop=1):
30 | """
31 | Args:
32 |         sync_cuda: bool, whether to synchronize cuda before/after timing
33 |         mem: bool, whether to log GPU memory usage
34 |         loop: int, number of times to call the wrapped function (the average time is reported)
35 | """
36 |
37 | def decorator(func):
38 | def wrapper(*args, **kwargs):
39 | if mem:
40 | start_mem = torch.cuda.memory_allocated() / 1024**2
41 | if sync_cuda:
42 | torch.cuda.synchronize()
43 |
44 | start = Log.time()
45 | for _ in range(loop):
46 | result = func(*args, **kwargs)
47 |
48 | if sync_cuda:
49 | torch.cuda.synchronize()
50 | if loop == 1:
51 | message = f"{func.__name__} took {Log.time() - start:.3f} s."
52 | else:
53 | message = f"{func.__name__} took {((Log.time() - start))/loop:.3f} s. (loop={loop})"
54 |
55 | if mem:
56 | end_mem = torch.cuda.memory_allocated() / 1024**2
57 | end_max_mem = torch.cuda.max_memory_allocated() / 1024**2
58 | message += f" Start_Mem {start_mem:.1f} Max {end_max_mem:.1f} MB"
59 | Log.info(message)
60 |
61 | return result
62 |
63 | return wrapper
64 |
65 | return decorator
66 |
67 |
68 | def timed(fn):
69 | """example usage: timed(lambda: model(inp))"""
70 | start = torch.cuda.Event(enable_timing=True)
71 | end = torch.cuda.Event(enable_timing=True)
72 | start.record()
73 | result = fn()
74 | end.record()
75 | torch.cuda.synchronize()
76 | return result, start.elapsed_time(end) / 1000
77 |
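A sketch of the timer decorator defined above applied to a toy function (CPU-only here, so sync_cuda and mem stay off):

```python
from hmr4d.utils.pylogger import timer

@timer(loop=10)   # logs e.g. "add took 0.000 s. (loop=10)"
def add(a, b):
    return a + b

add(1, 2)
```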
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/vis/README.md:
--------------------------------------------------------------------------------
1 | ## Pytorch3D Renderer
2 |
3 | Example:
4 | ```python
5 | from hmr4d.utils.vis.renderer import Renderer
6 | import imageio
7 |
8 | fps = 30
9 | focal_length = data["cam_int"][0][0, 0]
10 | width, height = img_hw
11 | faces = smplh[data["gender"]].bm.faces
12 | renderer = Renderer(width, height, focal_length, "cuda", faces)
13 | writer = imageio.get_writer("tmp_debug.mp4", fps=fps, mode="I", format="FFMPEG", macro_block_size=1)
14 |
15 | for i in tqdm(range(length)):
16 | img = np.zeros((height, width, 3), dtype=np.uint8)
17 | img = renderer.render_mesh(smplh_out.vertices[i].cuda(), img)
18 | writer.append_data(img)
19 | writer.close()
20 | ```
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/vis/renderer_utils.py:
--------------------------------------------------------------------------------
1 | from hmr4d.utils.vis.renderer import Renderer
2 | from tqdm import tqdm
3 | import numpy as np
4 |
5 |
6 | def simple_render_mesh(render_dict):
7 |     """Render a camera-space mesh on a blank background."""
8 | width, height, focal_length = render_dict["whf"]
9 | faces = render_dict["faces"]
10 | verts = render_dict["verts"]
11 |
12 | renderer = Renderer(width, height, focal_length, device="cuda", faces=faces)
13 | outputs = []
14 |     for i in tqdm(range(len(verts)), desc="Rendering"):
15 | img = renderer.render_mesh(verts[i].cuda(), colors=[0.8, 0.8, 0.8])
16 | outputs.append(img)
17 | outputs = np.stack(outputs, axis=0)
18 | return outputs
19 |
20 |
21 | def simple_render_mesh_background(render_dict, VI=50, colors=[0.8, 0.8, 0.8]):
22 |     """Render a camera-space mesh over the provided background image(s)."""
23 | K = render_dict["K"]
24 | faces = render_dict["faces"]
25 | verts = render_dict["verts"]
26 | background = render_dict["background"]
27 | N_frames = len(verts)
28 | if len(background.shape) == 3:
29 | background = [background] * N_frames
30 | height, width = background[0].shape[:2]
31 |
32 | renderer = Renderer(width, height, device="cuda", faces=faces, K=K)
33 | outputs = []
34 |     for i in tqdm(range(len(verts)), desc="Rendering"):
35 | img = renderer.render_mesh(verts[i].cuda(), colors=colors, background=background[i], VI=VI)
36 | outputs.append(img)
37 | outputs = np.stack(outputs, axis=0)
38 | return outputs
39 |
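A sketch of the render_dict consumed by simple_render_mesh above (placeholder geometry; a real call needs CUDA, pytorch3d, and actual body-model vertices and faces):

```python
import numpy as np
import torch
from hmr4d.utils.vis.renderer_utils import simple_render_mesh

render_dict = {
    "whf": (1280, 720, 1000.0),                      # width, height, focal length in pixels
    "faces": np.zeros((13776, 3), dtype=np.int64),   # placeholder triangle indices
    "verts": torch.randn(16, 6890, 3),               # (F, V, 3) camera-space vertices, placeholder
}
imgs = simple_render_mesh(render_dict)               # stacked rendered frames, one per element of verts
```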
--------------------------------------------------------------------------------
/eval/GVHMR/hmr4d/utils/vis/rich_logger.py:
--------------------------------------------------------------------------------
1 | from pytorch_lightning.utilities import rank_zero_only
2 | from omegaconf import DictConfig, OmegaConf
3 | import rich
4 | import rich.tree
5 | import rich.syntax
6 | from hmr4d.utils.pylogger import Log
7 |
8 |
9 | @rank_zero_only
10 | def print_cfg(cfg: DictConfig, use_rich: bool = False):
11 | if use_rich:
12 | print_order = ("data", "model", "callbacks", "logger", "pl_trainer")
13 | style = "dim"
14 | tree = rich.tree.Tree("CONFIG", style=style, guide_style=style)
15 |
16 | # add fields from `print_order` to queue
17 | # add all the other fields to queue (not specified in `print_order`)
18 | queue = []
19 | for field in print_order:
20 | queue.append(field) if field in cfg else Log.warn(f"Field '{field}' not found in config. Skipping.")
21 | for field in cfg:
22 | if field not in queue:
23 | queue.append(field)
24 |
25 | # generate config tree from queue
26 | for field in queue:
27 | branch = tree.add(field, style=style, guide_style=style)
28 | config_group = cfg[field]
29 | if isinstance(config_group, DictConfig):
30 | branch_content = OmegaConf.to_yaml(config_group, resolve=False)
31 | else:
32 | branch_content = str(config_group)
33 | branch.add(rich.syntax.Syntax(branch_content, "yaml"))
34 | rich.print(tree)
35 | else:
36 | Log.info(OmegaConf.to_yaml(cfg, resolve=False))
37 |
--------------------------------------------------------------------------------
/eval/GVHMR/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 120
3 | include = '\.pyi?$'
4 | exclude = '''
5 | /(
6 | \.git
7 | | \.hg
8 | | \.mypy_cache
9 | | \.tox
10 | | \.venv
11 | | _build
12 | | buck-out
13 | | build
14 | | dist
15 | )/
16 | '''
17 |
--------------------------------------------------------------------------------
/eval/GVHMR/pyrightconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "exclude": [
3 | "./inputs",
4 | "./outputs"
5 | ],
6 |     "typeCheckingMode": "off"
7 | }
8 |
--------------------------------------------------------------------------------
/eval/GVHMR/requirements.txt:
--------------------------------------------------------------------------------
1 | # PyTorch
2 | --extra-index-url https://download.pytorch.org/whl/cu121
3 | torch==2.3.0+cu121
4 | torchvision==0.18.0+cu121
5 | timm==0.9.12 # For HMR2.0a feature extraction
6 |
7 | # Lightning + Hydra
8 | lightning==2.3.0
9 | hydra-core==1.3
10 | hydra-zen
11 | hydra_colorlog
12 | rich
13 |
14 | # Common utilities
15 | numpy==1.23.5
16 | jupyter
17 | matplotlib
18 | ipdb
19 | setuptools>=68.0
20 | black
21 | tensorboardX
22 | opencv-python
23 | ffmpeg-python
24 | scikit-image
25 | termcolor
26 | einops
27 | imageio==2.34.1
28 | av # imageio[pyav], improved performance over imageio[ffmpeg]
29 | joblib
30 |
31 | # Diffusion
32 | # diffusers[torch]==0.19.3
33 | # transformers==4.31.0
34 |
35 | # 3D-Vision
36 | pytorch3d @ https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt230/pytorch3d-0.7.6-cp310-cp310-linux_x86_64.whl
37 | trimesh
38 | chumpy
39 | smplx
40 | # open3d==0.17.0
41 | wis3d
42 |
43 | # 2D-Pose
44 | ultralytics==8.2.42 # YOLO
45 | cython_bbox
46 | lapx
--------------------------------------------------------------------------------
/eval/GVHMR/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 |
4 | setup(
5 | name="gvhmr",
6 | version="1.0.0",
7 | packages=find_packages(),
8 | author="Zehong Shen",
9 |     description="GVHMR training and inference",
10 | url="https://github.com/zju3dv/GVHMR",
11 | )
12 |
--------------------------------------------------------------------------------
/eval/GVHMR/tools/demo/demo_folder.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from pathlib import Path
3 | from tqdm import tqdm
4 | from hmr4d.utils.pylogger import Log
5 | import subprocess
6 | import os
7 |
8 |
9 | if __name__ == "__main__":
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument("-f", "--folder", type=str)
12 | parser.add_argument("-d", "--output_root", type=str, default=None)
13 | parser.add_argument("-s", "--static_cam", action="store_true", help="If true, skip DPVO")
14 | args = parser.parse_args()
15 |
16 | output_root = args.output_root
17 |
18 | sub_folders = os.listdir(args.folder)
19 | mp4_paths = []
20 | for sub_folder in sub_folders:
21 | files = os.listdir(os.path.join(args.folder, sub_folder))
22 | for file in files:
23 | if file.endswith('.mp4'):
24 | mp4_path = os.path.join(args.folder, sub_folder, file)
25 | mp4_paths.append(mp4_path)
26 |
27 | # Run demo.py for each .mp4 file
28 | Log.info(f"Found {len(mp4_paths)} .mp4 files in {args.folder}")
29 | for mp4_path in tqdm(mp4_paths):
30 | try:
31 | command = ["python", "tools/demo/demo.py", "--video", str(mp4_path)]
32 | if output_root is not None:
33 | command += ["--output_root", output_root]
34 | if args.static_cam:
35 | command += ["-s"]
36 | Log.info(f"Running: {' '.join(command)}")
37 | subprocess.run(command, env=dict(os.environ), check=True)
38 |         except Exception:  # a failed video should not stop the rest of the batch
39 |             continue
40 |
--------------------------------------------------------------------------------
/eval/GVHMR/tools/train.py:
--------------------------------------------------------------------------------
1 | import hydra
2 | import pytorch_lightning as pl
3 | from omegaconf import DictConfig, OmegaConf
4 | from pytorch_lightning.callbacks.checkpoint import Checkpoint
5 |
6 | from hmr4d.utils.pylogger import Log
7 | from hmr4d.configs import register_store_gvhmr
8 | from hmr4d.utils.vis.rich_logger import print_cfg
9 | from hmr4d.utils.net_utils import load_pretrained_model, get_resume_ckpt_path
10 |
11 |
12 | def get_callbacks(cfg: DictConfig) -> list:
13 | """Parse and instantiate all the callbacks in the config."""
14 | if not hasattr(cfg, "callbacks") or cfg.callbacks is None:
15 | return None
16 | # Handle special callbacks
17 | enable_checkpointing = cfg.pl_trainer.get("enable_checkpointing", True)
18 | # Instantiate all the callbacks
19 | callbacks = []
20 | for callback in cfg.callbacks.values():
21 | if callback is not None:
22 | cb = hydra.utils.instantiate(callback, _recursive_=False)
23 | # skip when disable checkpointing and the callback is Checkpoint
24 | if not enable_checkpointing and isinstance(cb, Checkpoint):
25 | continue
26 | else:
27 | callbacks.append(cb)
28 | return callbacks
29 |
30 |
31 | def train(cfg: DictConfig) -> None:
32 | """Train/Test"""
33 | Log.info(f"[Exp Name]: {cfg.exp_name}")
34 | if cfg.task == "fit":
35 | Log.info(f"[GPU x Batch] = {cfg.pl_trainer.devices} x {cfg.data.loader_opts.train.batch_size}")
36 | pl.seed_everything(cfg.seed)
37 |
38 | # preparation
39 | datamodule: pl.LightningDataModule = hydra.utils.instantiate(cfg.data, _recursive_=False)
40 | model: pl.LightningModule = hydra.utils.instantiate(cfg.model, _recursive_=False)
41 | if cfg.ckpt_path is not None:
42 | load_pretrained_model(model, cfg.ckpt_path)
43 |
44 | # PL callbacks and logger
45 | callbacks = get_callbacks(cfg)
46 | has_ckpt_cb = any([isinstance(cb, Checkpoint) for cb in callbacks])
47 | if not has_ckpt_cb and cfg.pl_trainer.get("enable_checkpointing", True):
48 | Log.warning("No checkpoint-callback found. Disabling PL auto checkpointing.")
49 | cfg.pl_trainer = {**cfg.pl_trainer, "enable_checkpointing": False}
50 | logger = hydra.utils.instantiate(cfg.logger, _recursive_=False)
51 |
52 | # PL-Trainer
53 | if cfg.task == "test":
54 | Log.info("Test mode forces full-precision.")
55 | cfg.pl_trainer = {**cfg.pl_trainer, "precision": 32}
56 | trainer = pl.Trainer(
57 | accelerator="gpu",
58 | logger=logger if logger is not None else False,
59 | callbacks=callbacks,
60 | **cfg.pl_trainer,
61 | )
62 |
63 | if cfg.task == "fit":
64 | resume_path = None
65 | if cfg.resume_mode is not None:
66 | resume_path = get_resume_ckpt_path(cfg.resume_mode, ckpt_dir=cfg.callbacks.model_checkpoint.dirpath)
67 | Log.info(f"Resume training from {resume_path}")
68 |         Log.info("Start Fitting...")
69 | trainer.fit(model, datamodule.train_dataloader(), datamodule.val_dataloader(), ckpt_path=resume_path)
70 | elif cfg.task == "test":
71 | Log.info("Start Testing...")
72 | trainer.test(model, datamodule.test_dataloader())
73 | else:
74 | raise ValueError(f"Unknown task: {cfg.task}")
75 |
76 | Log.info("End of script.")
77 |
78 |
79 | @hydra.main(version_base="1.3", config_path="../hmr4d/configs", config_name="train")
80 | def main(cfg) -> None:
81 | print_cfg(cfg, use_rich=True)
82 | train(cfg)
83 |
84 |
85 | if __name__ == "__main__":
86 | register_store_gvhmr()
87 | main()
88 |
--------------------------------------------------------------------------------
/eval/GVHMR/tools/unitest/make_hydra_cfg.py:
--------------------------------------------------------------------------------
1 | from hmr4d.configs import parse_args_to_cfg, register_store_gvhmr
2 | from hmr4d.utils.vis.rich_logger import print_cfg
3 |
4 | if __name__ == "__main__":
5 | register_store_gvhmr()
6 | cfg = parse_args_to_cfg()
7 | print_cfg(cfg, use_rich=True)
8 |
--------------------------------------------------------------------------------
/eval/GVHMR/tools/unitest/run_dataset.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import DataLoader
3 | from tqdm import tqdm
4 |
5 |
6 | def get_dataset(DATA_TYPE):
7 | if DATA_TYPE == "BEDLAM_V2":
8 | from hmr4d.dataset.bedlam.bedlam import BedlamDatasetV2
9 |
10 | return BedlamDatasetV2()
11 |
12 | if DATA_TYPE == "3DPW_TRAIN":
13 | from hmr4d.dataset.threedpw.threedpw_motion_train import ThreedpwSmplDataset
14 |
15 | return ThreedpwSmplDataset()
16 |
17 | if __name__ == "__main__":
18 | DATA_TYPE = "3DPW_TRAIN"
19 | dataset = get_dataset(DATA_TYPE)
20 | print(len(dataset))
21 |
22 | data = dataset[0]
23 |
24 | from hmr4d.datamodule.mocap_trainX_testY import collate_fn
25 |
26 | loader = DataLoader(
27 | dataset,
28 | shuffle=False,
29 | num_workers=0,
30 | persistent_workers=False,
31 | pin_memory=False,
32 | batch_size=1,
33 | collate_fn=collate_fn,
34 | )
35 | i = 0
36 | for batch in tqdm(loader):
37 | i += 1
38 | # if i == 20:
39 | # raise AssertionError
40 | # time.sleep(0.2)
41 | pass
42 |
--------------------------------------------------------------------------------
/eval/GVHMR/tools/video/merge_folder.py:
--------------------------------------------------------------------------------
1 | """This script will glob two folder, check the mp4 files are one-to-one match precisely, then call merge_horizontal.py to merge them one by one"""
2 |
3 | import os
4 | import argparse
5 | from pathlib import Path
6 |
7 |
8 | def main():
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument("input_dir1", type=str)
11 | parser.add_argument("input_dir2", type=str)
12 | parser.add_argument("output_dir", type=str)
13 |     parser.add_argument("--vertical", action="store_true")  # defaults to horizontal merging
14 | args = parser.parse_args()
15 |
16 | # Check input
17 | input_dir1 = Path(args.input_dir1)
18 | input_dir2 = Path(args.input_dir2)
19 | assert input_dir1.exists()
20 | assert input_dir2.exists()
21 | video_paths1 = sorted(input_dir1.glob("*.mp4"))
22 | video_paths2 = sorted(input_dir2.glob("*.mp4"))
23 | assert len(video_paths1) == len(video_paths2)
24 | for path1, path2 in zip(video_paths1, video_paths2):
25 | assert path1.stem == path2.stem
26 |
27 | # Merge to output
28 | output_dir = Path(args.output_dir)
29 | output_dir.mkdir(parents=True, exist_ok=True)
30 |
31 | for path1, path2 in zip(video_paths1, video_paths2):
32 | out_path = output_dir / f"{path1.stem}.mp4"
33 | in_paths = [str(path1), str(path2)]
34 | print(f"Merging {in_paths} to {out_path}")
35 | if args.vertical:
36 |             os.system(f'python tools/video/merge_vertical.py "{path1}" "{path2}" -o "{out_path}"')
37 |         else:
38 |             os.system(f'python tools/video/merge_horizontal.py "{path1}" "{path2}" -o "{out_path}"')
39 |
40 |
41 | if __name__ == "__main__":
42 | main()
43 |
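44 | # Example invocation (hypothetical folder names):
45 | #   python tools/video/merge_folder.py renders_a renders_b merged_side_by_side
46 | #   python tools/video/merge_folder.py renders_a renders_b merged_stacked --vertical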
--------------------------------------------------------------------------------
/eval/GVHMR/tools/video/merge_horizontal.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from hmr4d.utils.video_io_utils import merge_videos_horizontal
3 |
4 |
5 | def parse_args():
6 | """python tools/video/merge_horizontal.py a.mp4 b.mp4 c.mp4 -o out.mp4"""
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument("input_videos", nargs="+", help="Input video paths")
9 | parser.add_argument("-o", "--output", type=str, required=True, help="Output video path")
10 | return parser.parse_args()
11 |
12 |
13 | if __name__ == "__main__":
14 | args = parse_args()
15 | merge_videos_horizontal(args.input_videos, args.output)
16 |
--------------------------------------------------------------------------------
/eval/GVHMR/tools/video/merge_vertical.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from hmr4d.utils.video_io_utils import merge_videos_vertical
3 |
4 |
5 | def parse_args():
6 | """python tools/video/merge_vertical.py a.mp4 b.mp4 c.mp4 -o out.mp4"""
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument("input_videos", nargs="+", help="Input video paths")
9 | parser.add_argument("-o", "--output", type=str, required=True, help="Output video path")
10 | return parser.parse_args()
11 |
12 |
13 | if __name__ == "__main__":
14 | args = parse_args()
15 | merge_videos_vertical(args.input_videos, args.output)
16 |
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/calculate_clip.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | from PIL import Image
3 | import torch
4 | from transformers import CLIPProcessor, CLIPModel
5 | import json
6 | import os
7 | from tqdm import tqdm
8 | import torch
9 | import clip
10 | from PIL import Image
11 | import cv2
12 | import numpy as np
13 | import os
14 | import argparse
15 |
16 | device = "cuda" if torch.cuda.is_available() else "cpu"
17 |
18 | model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
19 | processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
20 |
21 | def get_video_scores(video_path, prompt):
22 | video = cv2.VideoCapture(video_path)
23 | texts = [prompt]
24 | clip_score_list = []
25 | while True:
26 | ret, frame = video.read()
27 |
28 | if ret:
29 | image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
30 | inputs = processor(text=texts, images=[image], return_tensors="pt", padding=True, truncation=True).to(device)
31 | logits_per_image = model(**inputs).logits_per_image
32 | clip_score = logits_per_image.item()
33 | clip_score_list.append(clip_score)
34 | else:
35 | break
36 |
37 | video.release()
38 | return sum(clip_score_list) / len(clip_score_list)
39 |
40 |
41 | parser = argparse.ArgumentParser()
42 | parser.add_argument("-v_f", "--videos_folder", type=str)
43 | args = parser.parse_args()
44 |
45 | videos_folder_path = args.videos_folder
46 | prompts_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "eval_prompts.json")  # assumes eval_prompts.json sits next to this script
47 | with open(prompts_path, "r", encoding="utf-8") as f: prompts_dict = json.load(f)
48 |
49 | sub_folders = os.listdir(videos_folder_path)
50 | videos_name = []
51 | for sub_folder in sub_folders:
52 | files = os.listdir(os.path.join(videos_folder_path, sub_folder))
53 | for file in files:
54 | if file.endswith('.mp4'):
55 | video_name = os.path.join(sub_folder, file)
56 | videos_name.append(video_name)
57 |
58 | num_videos = len(videos_name)
59 |
60 | prompts = []
61 | video_paths = []
62 | for video_name in videos_name:
63 | prompt = prompts_dict[video_name.split('/')[0]]
64 | video_path = os.path.join(videos_folder_path, video_name)
65 | prompts.append(prompt)
66 | video_paths.append(video_path)
67 |
68 | import csv
69 | CLIP_T = True
70 | if CLIP_T:
71 | scores = []
72 | for i in tqdm(range(num_videos)):
73 |         # get the path of this video
74 | video_path = video_paths[i]
75 |
76 |         # prepare the text prompt
77 | texts = prompts[i]
78 | score = get_video_scores(video_path, texts)
79 | scores.append(score)
80 |
81 | print(f"CLIP-SIM: {sum(scores)/len(scores)/100.}")
82 | #### CLIP-T ####
83 | # basemodel: 33.44
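84 |
85 | # Expected layout sketch (assumption): eval_prompts.json maps each sub-folder name of
86 | # videos_folder to its text prompt, e.g. {"<sub_folder>": "<prompt>"}, and every
87 | # sub-folder holds the .mp4 files generated for that prompt.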
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/calculate_fvd.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from tqdm import tqdm
4 |
5 | def trans(x):
6 |     # if greyscale, repeat the single channel to get 3-channel input
7 | if x.shape[-3] == 1:
8 | x = x.repeat(1, 1, 3, 1, 1)
9 |
10 | # permute BTCHW -> BCTHW
11 | x = x.permute(0, 2, 1, 3, 4)
12 |
13 | return x
14 |
15 | def calculate_fvd(videos1, videos2, device, method='styleganv'):
16 |
17 | if method == 'styleganv':
18 | from fvd.styleganv.fvd import get_fvd_feats, frechet_distance, load_i3d_pretrained
19 | elif method == 'videogpt':
20 | from fvd.videogpt.fvd import load_i3d_pretrained
21 | from fvd.videogpt.fvd import get_fvd_logits as get_fvd_feats
22 | from fvd.videogpt.fvd import frechet_distance
23 |     else: raise ValueError(f"Unknown method: {method}")
24 | print("calculate_fvd...")
25 |
26 | # videos [batch_size, timestamps, channel, h, w]
27 |
28 | assert videos1.shape == videos2.shape
29 |
30 | i3d = load_i3d_pretrained(device=device)
31 | fvd_results = []
32 |
33 | # support grayscale input, if grayscale -> channel*3
34 | # BTCHW -> BCTHW
35 | # videos -> [batch_size, channel, timestamps, h, w]
36 |
37 | videos1 = trans(videos1)
38 | videos2 = trans(videos2)
39 |
40 | fvd_results = {}
41 |
42 |     # to compute FVD, each clip must contain at least 10 timestamps
43 |
44 |     # take the full clips (no temporal cropping)
45 |     # videos_clip [batch_size, channel, timestamps, h, w]
46 | videos_clip1 = videos1[:, :, :]
47 | videos_clip2 = videos2[:, :, :]
48 |
49 | # get FVD features
50 | feats1 = get_fvd_feats(videos_clip1, i3d=i3d, device=device)
51 | feats2 = get_fvd_feats(videos_clip2, i3d=i3d, device=device)
52 |
53 | # calculate FVD when timestamps[:clip]
54 | fvd_results = frechet_distance(feats1, feats2)
55 |
56 | result = {
57 | "value": fvd_results,
58 | "video_setting": videos1.shape,
59 | "video_setting_name": "batch_size, channel, time, heigth, width",
60 | }
61 |
62 | return result
63 |
64 | # test code / using example
65 |
66 | def main():
67 | NUMBER_OF_VIDEOS = 8
68 | VIDEO_LENGTH = 50
69 | CHANNEL = 3
70 | SIZE = 64
71 | videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
72 | videos2 = torch.ones(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
73 | device = torch.device("cuda")
74 | # device = torch.device("cpu")
75 |
76 | import json
77 | result = calculate_fvd(videos1, videos2, device, method='videogpt')
78 | print(json.dumps(result, indent=4))
79 |
80 | result = calculate_fvd(videos1, videos2, device, method='styleganv')
81 | print(json.dumps(result, indent=4))
82 |
83 | if __name__ == "__main__":
84 | main()
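85 |
86 | # Input sketch (assumption): videos1/videos2 are float tensors of identical shape
87 | # [batch_size, timestamps, channel, h, w] with at least 10 timestamps, e.g.
88 | #   calculate_fvd(torch.rand(4, 16, 3, 64, 64), torch.rand(4, 16, 3, 64, 64), torch.device("cuda"))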
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/calculate_lpips.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from tqdm import tqdm
4 | import math
5 |
6 | import torch
7 | import lpips
8 |
9 | spatial = True # Return a spatial map of perceptual distance.
10 |
11 | # Linearly calibrated models (LPIPS)
12 | loss_fn = lpips.LPIPS(net='alex', spatial=spatial) # Can also set net = 'squeeze' or 'vgg'
13 | # loss_fn = lpips.LPIPS(net='alex', spatial=spatial, lpips=False) # Can also set net = 'squeeze' or 'vgg'
14 |
15 | def trans(x):
16 |     # if greyscale, repeat the single channel to get 3-channel input
17 | if x.shape[-3] == 1:
18 | x = x.repeat(1, 1, 3, 1, 1)
19 |
20 | # value range [0, 1] -> [-1, 1]
21 | x = x * 2 - 1
22 |
23 | return x
24 |
25 | def calculate_lpips(videos1, videos2, device):
26 | # image should be RGB, IMPORTANT: normalized to [-1,1]
27 | print("calculate_lpips...")
28 |
29 | assert videos1.shape == videos2.shape
30 |
31 | # videos [batch_size, timestamps, channel, h, w]
32 |
33 | # support grayscale input, if grayscale -> channel*3
34 | # value range [0, 1] -> [-1, 1]
35 | videos1 = trans(videos1)
36 | videos2 = trans(videos2)
37 |
38 | lpips_results = []
39 |
40 | for video_num in tqdm(range(videos1.shape[0])):
41 | # get a video
42 | # video [timestamps, channel, h, w]
43 | video1 = videos1[video_num]
44 | video2 = videos2[video_num]
45 |
46 | lpips_results_of_a_video = []
47 | for clip_timestamp in range(len(video1)):
48 | # get a img
49 | # img [timestamps[x], channel, h, w]
50 | # img [channel, h, w] tensor
51 |
52 | img1 = video1[clip_timestamp].unsqueeze(0).to(device)
53 | img2 = video2[clip_timestamp].unsqueeze(0).to(device)
54 |
55 | loss_fn.to(device)
56 |
57 |             # LPIPS between the two frames at this timestamp
58 | lpips_results_of_a_video.append(loss_fn.forward(img1, img2).mean().detach().cpu().tolist())
59 | lpips_results.append(lpips_results_of_a_video)
60 |
61 | lpips_results = np.array(lpips_results)
62 |
63 | lpips = {}
64 | lpips_std = {}
65 |
66 | for clip_timestamp in range(len(video1)):
67 | lpips[clip_timestamp] = np.mean(lpips_results[:,clip_timestamp])
68 | lpips_std[clip_timestamp] = np.std(lpips_results[:,clip_timestamp])
69 |
70 |
71 | result = {
72 | "value": lpips,
73 | "value_std": lpips_std,
74 | "video_setting": video1.shape,
75 | "video_setting_name": "time, channel, heigth, width",
76 | }
77 |
78 | return result
79 |
80 | # test code / using example
81 |
82 | def main():
83 | NUMBER_OF_VIDEOS = 8
84 | VIDEO_LENGTH = 50
85 | CHANNEL = 3
86 | SIZE = 64
87 | videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
88 | videos2 = torch.ones(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
89 | device = torch.device("cuda")
90 | # device = torch.device("cpu")
91 |
92 | import json
93 | result = calculate_lpips(videos1, videos2, device)
94 | print(json.dumps(result, indent=4))
95 |
96 | if __name__ == "__main__":
97 | main()
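98 |
99 | # Reporting sketch (assumption): result["value"] holds one mean LPIPS per timestamp; a single
100 | # number can be obtained as sum(result["value"].values()) / len(result["value"]).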
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/calculate_psnr.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from tqdm import tqdm
4 | import math
5 |
6 | def img_psnr(img1, img2):
7 | # [0,1]
8 | # compute mse
9 | # mse = np.mean((img1-img2)**2)
10 | mse = np.mean((img1 / 1.0 - img2 / 1.0) ** 2)
11 | # compute psnr
12 | if mse < 1e-10:
13 | return 100
14 | psnr = 20 * math.log10(1 / math.sqrt(mse))
15 | return psnr
16 |
17 | def trans(x):
18 | return x
19 |
20 | def calculate_psnr(videos1, videos2):
21 | print("calculate_psnr...")
22 |
23 | # videos [batch_size, timestamps, channel, h, w]
24 |
25 | assert videos1.shape == videos2.shape
26 |
27 | videos1 = trans(videos1)
28 | videos2 = trans(videos2)
29 |
30 | psnr_results = []
31 |
32 | for video_num in tqdm(range(videos1.shape[0])):
33 | # get a video
34 | # video [timestamps, channel, h, w]
35 | video1 = videos1[video_num]
36 | video2 = videos2[video_num]
37 |
38 | psnr_results_of_a_video = []
39 | for clip_timestamp in range(len(video1)):
40 | # get a img
41 | # img [timestamps[x], channel, h, w]
42 | # img [channel, h, w] numpy
43 |
44 | img1 = video1[clip_timestamp].numpy()
45 | img2 = video2[clip_timestamp].numpy()
46 |
47 |             # PSNR between the two frames at this timestamp
48 | psnr_results_of_a_video.append(img_psnr(img1, img2))
49 |
50 | psnr_results.append(psnr_results_of_a_video)
51 |
52 | psnr_results = np.array(psnr_results)
53 |
54 | psnr = {}
55 | psnr_std = {}
56 |
57 | for clip_timestamp in range(len(video1)):
58 | psnr[clip_timestamp] = np.mean(psnr_results[:,clip_timestamp])
59 | psnr_std[clip_timestamp] = np.std(psnr_results[:,clip_timestamp])
60 |
61 | result = {
62 | "value": psnr,
63 | "value_std": psnr_std,
64 | "video_setting": video1.shape,
65 | "video_setting_name": "time, channel, heigth, width",
66 | }
67 |
68 | return result
69 |
70 | # test code / using example
71 |
72 | def main():
73 | NUMBER_OF_VIDEOS = 8
74 | VIDEO_LENGTH = 50
75 | CHANNEL = 3
76 | SIZE = 64
77 | videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
78 | videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
79 |
80 | import json
81 | result = calculate_psnr(videos1, videos2)
82 | print(json.dumps(result, indent=4))
83 |
84 | if __name__ == "__main__":
85 | main()
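86 |
87 | # Reporting sketch (assumption): inputs are expected in [0, 1] (see img_psnr); a single
88 | # number can be obtained as sum(result["value"].values()) / len(result["value"]).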
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/download_eval_visual.sh:
--------------------------------------------------------------------------------
1 | gdown https://drive.google.com/uc\?id\=1U2hd6qvwKLfp7c8yGgcTqdqrP_lKJElB
2 | gdown https://drive.google.com/uc\?id\=1jMH2-ZC0ZBgtqej5Sp-E5ebBIX7mk3Xz
3 | gdown https://drive.google.com/uc\?id\=1kfdCDA5koYh9g3IkCCHb4XPch2CJAwek
4 |
5 | unzip fvd.zip
6 | unzip eval_sets.zip
7 | unzip base_t2v_eval_sets.zip
8 |
9 | mv eval_sets eval_folder/
10 | mv base_t2v_eval_sets eval_folder/
11 |
12 | rm -rf *.zip
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/eval_visual.sh:
--------------------------------------------------------------------------------
1 | basedir=eval_folder
2 | folder1_path=${basedir}/base_t2v_eval_sets
3 | folder2_path=${basedir}/eval_sets
4 |
5 | # calculate FVD
6 | python calculate_fvd_styleganv.py -v1_f ${folder1_path} -v2_f ${folder2_path}
7 |
8 | # calculate FID
9 | python -m pytorch_fid ${basedir}/eval_1 ${basedir}/eval_2
10 |
11 | # calculate CLIP-SIM
12 | python calculate_clip.py -v_f ${folder2_path}
13 |
14 | rm -rf ${basedir}/eval_1
15 | rm -rf ${basedir}/eval_2
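16 |
17 | # Note (assumption): the ${basedir}/eval_1 and ${basedir}/eval_2 frame folders consumed by
18 | # pytorch_fid are produced earlier in the pipeline (e.g. while computing FVD) and are cleaned up here.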
--------------------------------------------------------------------------------
/imgs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/imgs/logo.png
--------------------------------------------------------------------------------
/imgs/vis_objstraj.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/imgs/vis_objstraj.png
--------------------------------------------------------------------------------