├── CogVideo ├── .github │ ├── ISSUE_TEMPLATE │ │ ├── bug_report.yaml │ │ └── feature-request.yaml │ └── PULL_REQUEST_TEMPLATE │ │ └── pr_template.md ├── .gitignore ├── LICENSE ├── MODEL_LICENSE ├── README.md ├── README_ja.md ├── README_zh.md ├── download.sh ├── finetune │ ├── README.md │ ├── README_ja.md │ ├── README_zh.md │ ├── accelerate_config_machine_single.yaml │ ├── accelerate_config_machine_single_debug.yaml │ ├── finetune_single_rank_injector.sh │ ├── finetune_single_rank_lora.sh │ ├── hostfile.txt │ ├── models │ │ ├── attention.py │ │ ├── attention_processor.py │ │ ├── cogvideox_transformer_3d.py │ │ ├── embeddings.py │ │ ├── pipeline_cogvideox.py │ │ ├── pipeline_output.py │ │ └── utils.py │ ├── train_cogvideox_injector.py │ └── train_cogvideox_lora.py ├── inference │ ├── 3dtrajmaster_inference.py │ ├── entity_zoo.txt │ ├── location_zoo.txt │ ├── output_example │ │ ├── 1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.mp4 │ │ ├── 1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.txt │ │ ├── 1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.mp4 │ │ ├── 1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.txt │ │ ├── 1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.mp4 │ │ ├── 1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.txt │ │ ├── 1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.mp4 │ │ ├── 1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.txt │ │ ├── 1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.mp4 │ │ ├── 1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.txt │ │ ├── 1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.mp4 │ │ ├── 1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.txt │ │ ├── 1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.mp4 │ │ ├── 1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.txt │ │ ├── 1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.mp4 │ │ ├── 1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.txt │ │ ├── 1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.mp4 │ │ ├── 1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.txt │ │ ├── 1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.mp4 │ │ ├── 1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.txt │ │ ├── 2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.mp4 │ │ ├── 2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.txt │ │ ├── 2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.mp4 │ │ ├── 2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.txt │ │ ├── 2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.mp4 │ │ ├── 2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.txt │ │ ├── 2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.mp4 │ │ ├── 
2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.txt │ │ ├── 2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.mp4 │ │ ├── 2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.txt │ │ ├── 2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.mp4 │ │ ├── 2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.txt │ │ ├── 2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.mp4 │ │ ├── 2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.txt │ │ ├── 2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.mp4 │ │ ├── 2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.txt │ │ ├── 2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.mp4 │ │ ├── 2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.txt │ │ ├── 2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.mp4 │ │ ├── 2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.txt │ │ ├── 3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.mp4 │ │ ├── 3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.txt │ │ ├── 3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.mp4 │ │ ├── 3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.txt │ │ ├── 3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.mp4 │ │ ├── 3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.txt │ │ ├── 3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.mp4 │ │ ├── 3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.txt │ │ ├── 3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.mp4 │ │ ├── 3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.txt │ │ ├── 3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.mp4 │ │ ├── 3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.txt │ │ ├── 3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.mp4 │ │ ├── 3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic 
landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.txt │ │ ├── 3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.mp4 │ │ ├── 3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.txt │ │ ├── 3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.mp4 │ │ ├── 3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.txt │ │ ├── 3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.mp4 │ │ └── 3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.txt │ └── test_sets.json ├── pyproject.toml ├── requirements.txt ├── tools │ ├── caption │ │ ├── README.md │ │ ├── README_ja.md │ │ ├── README_zh.md │ │ ├── assests │ │ │ ├── CogVLM2-Caption-example.png │ │ │ └── cogvlm2-video-example.png │ │ ├── requirements.txt │ │ └── video_caption.py │ ├── convert_weight_sat2hf.py │ ├── export_sat_lora_weight.py │ ├── llm_flux_cogvideox │ │ ├── generate.sh │ │ ├── gradio_page.py │ │ └── llm_flux_cogvideox.py │ ├── load_cogvideox_lora.py │ ├── parallel_inference │ │ ├── parallel_inference_xdit.py │ │ └── run.sh │ ├── replicate │ │ ├── cog.yaml │ │ ├── predict_i2v.py │ │ └── predict_t2v.py │ └── venhancer │ │ ├── README.md │ │ ├── README_ja.md │ │ └── README_zh.md └── weights │ └── put weights here.txt ├── README.md ├── dataset ├── load_dataset.py ├── traj_vis │ ├── D_loc1_61_t3n13_003d_Hemi12_1.json │ ├── Hemi12_transforms.json │ └── location_data_desert.json ├── utils.py └── vis_trajectory.py ├── eval ├── GVHMR │ ├── .gitignore │ ├── .gitmodules │ ├── LICENSE │ ├── README.md │ ├── docs │ │ ├── INSTALL.md │ │ └── example_video │ │ │ ├── project_teaser.gif │ │ │ └── tennis.mp4 │ ├── download_eval_pose.sh │ ├── eval.sh │ ├── hmr4d │ │ ├── __init__.py │ │ ├── build_gvhmr.py │ │ ├── configs │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ └── mocap │ │ │ │ │ ├── testY.yaml │ │ │ │ │ └── trainX_testY.yaml │ │ │ ├── demo.yaml │ │ │ ├── exp │ │ │ │ └── gvhmr │ │ │ │ │ └── mixed │ │ │ │ │ └── mixed.yaml │ │ │ ├── global │ │ │ │ ├── debug │ │ │ │ │ ├── debug_train.yaml │ │ │ │ │ └── debug_train_limit_data.yaml │ │ │ │ └── task │ │ │ │ │ └── gvhmr │ │ │ │ │ ├── test_3dpw.yaml │ │ │ │ │ ├── test_3dpw_emdb_rich.yaml │ │ │ │ │ ├── test_emdb.yaml │ │ │ │ │ └── test_rich.yaml │ │ │ ├── hydra │ │ │ │ └── default.yaml │ │ │ ├── siga24_release.yaml │ │ │ ├── store_gvhmr.py │ │ │ └── train.yaml │ │ ├── datamodule │ │ │ └── mocap_trainX_testY.py │ │ ├── dataset │ │ │ ├── bedlam │ │ │ │ ├── bedlam.py │ │ │ │ ├── resource │ │ │ │ │ └── vname2lwh.pt │ │ │ │ └── utils.py │ │ │ ├── emdb │ │ │ │ ├── emdb_motion_test.py │ │ │ │ └── utils.py │ │ │ ├── h36m │ │ │ │ ├── camera-parameters.json │ │ │ │ ├── h36m.py │ │ │ │ └── utils.py │ │ │ ├── imgfeat_motion │ │ │ │ └── base_dataset.py │ │ │ ├── pure_motion │ │ │ │ ├── amass.py │ │ │ │ ├── base_dataset.py │ │ │ │ ├── cam_traj_utils.py │ │ │ │ └── utils.py │ │ │ ├── rich │ │ │ │ ├── resource │ │ │ │ │ ├── cam2params.pt │ │ │ │ │ ├── seqname2imgrange.json │ │ │ │ │ ├── test.txt │ │ │ │ │ ├── train.txt │ │ │ │ │ ├── val.txt │ │ │ │ │ └── w2az_sahmr.json │ │ │ │ ├── 
rich_motion_test.py │ │ │ │ └── rich_utils.py │ │ │ └── threedpw │ │ │ │ ├── threedpw_motion_test.py │ │ │ │ ├── threedpw_motion_train.py │ │ │ │ └── utils.py │ │ ├── model │ │ │ ├── common_utils │ │ │ │ ├── optimizer.py │ │ │ │ ├── scheduler.py │ │ │ │ └── scheduler_cfg.py │ │ │ └── gvhmr │ │ │ │ ├── callbacks │ │ │ │ ├── metric_3dpw.py │ │ │ │ ├── metric_emdb.py │ │ │ │ └── metric_rich.py │ │ │ │ ├── gvhmr_pl.py │ │ │ │ ├── gvhmr_pl_demo.py │ │ │ │ ├── pipeline │ │ │ │ └── gvhmr_pipeline.py │ │ │ │ └── utils │ │ │ │ ├── endecoder.py │ │ │ │ ├── postprocess.py │ │ │ │ └── stats_compose.py │ │ ├── network │ │ │ ├── base_arch │ │ │ │ ├── embeddings │ │ │ │ │ └── rotary_embedding.py │ │ │ │ └── transformer │ │ │ │ │ ├── encoder_rope.py │ │ │ │ │ └── layer.py │ │ │ ├── gvhmr │ │ │ │ └── relative_transformer.py │ │ │ └── hmr2 │ │ │ │ ├── __init__.py │ │ │ │ ├── components │ │ │ │ ├── __init__.py │ │ │ │ ├── pose_transformer.py │ │ │ │ └── t_cond_mlp.py │ │ │ │ ├── configs │ │ │ │ ├── __init__.py │ │ │ │ ├── model_config.yaml │ │ │ │ └── smpl_mean_params.npz │ │ │ │ ├── hmr2.py │ │ │ │ ├── smpl_head.py │ │ │ │ ├── utils │ │ │ │ ├── geometry.py │ │ │ │ ├── preproc.py │ │ │ │ └── smpl_wrapper.py │ │ │ │ └── vit.py │ │ └── utils │ │ │ ├── body_model │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── body_model.py │ │ │ ├── body_model_smplh.py │ │ │ ├── body_model_smplx.py │ │ │ ├── coco_aug_dict.pth │ │ │ ├── min_lbs.py │ │ │ ├── seg_part_info.npy │ │ │ ├── smpl_3dpw14_J_regressor_sparse.pt │ │ │ ├── smpl_coco17_J_regressor.pt │ │ │ ├── smpl_lite.py │ │ │ ├── smpl_neutral_J_regressor.pt │ │ │ ├── smpl_vert_segmentation.json │ │ │ ├── smplx2smpl_sparse.pt │ │ │ ├── smplx_lite.py │ │ │ ├── smplx_verts437.pt │ │ │ └── utils.py │ │ │ ├── callbacks │ │ │ ├── lr_monitor.py │ │ │ ├── prog_bar.py │ │ │ ├── simple_ckpt_saver.py │ │ │ └── train_speed_timer.py │ │ │ ├── comm │ │ │ └── gather.py │ │ │ ├── eval │ │ │ └── eval_utils.py │ │ │ ├── geo │ │ │ ├── augment_noisy_pose.py │ │ │ ├── flip_utils.py │ │ │ ├── hmr_cam.py │ │ │ ├── hmr_global.py │ │ │ ├── quaternion.py │ │ │ └── transforms.py │ │ │ ├── geo_transform.py │ │ │ ├── ik │ │ │ └── ccd_ik.py │ │ │ ├── kpts │ │ │ └── kp2d_utils.py │ │ │ ├── matrix.py │ │ │ ├── net_utils.py │ │ │ ├── preproc │ │ │ ├── __init__.py │ │ │ ├── slam.py │ │ │ ├── tracker.py │ │ │ ├── vitfeat_extractor.py │ │ │ ├── vitpose.py │ │ │ └── vitpose_pytorch │ │ │ │ ├── __init__.py │ │ │ │ └── src │ │ │ │ └── vitpose_infer │ │ │ │ ├── __init__.py │ │ │ │ ├── builder │ │ │ │ ├── __init__.py │ │ │ │ ├── backbones │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── alexnet.py │ │ │ │ │ ├── cpm.py │ │ │ │ │ ├── hourglass.py │ │ │ │ │ ├── hourglass_ae.py │ │ │ │ │ ├── hrformer.py │ │ │ │ │ ├── litehrnet.py │ │ │ │ │ ├── mobilenet_v2.py │ │ │ │ │ ├── mobilenet_v3.py │ │ │ │ │ ├── mspn.py │ │ │ │ │ ├── regnet.py │ │ │ │ │ ├── resnest.py │ │ │ │ │ ├── resnext.py │ │ │ │ │ ├── rsn.py │ │ │ │ │ ├── scnet.py │ │ │ │ │ ├── seresnet.py │ │ │ │ │ ├── seresnext.py │ │ │ │ │ ├── shufflenet_v1.py │ │ │ │ │ ├── shufflenet_v2.py │ │ │ │ │ ├── tcn.py │ │ │ │ │ ├── test_torch.py │ │ │ │ │ ├── utils │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── channel_shuffle.py │ │ │ │ │ │ ├── inverted_residual.py │ │ │ │ │ │ ├── make_divisible.py │ │ │ │ │ │ ├── se_layer.py │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── vgg.py │ │ │ │ │ ├── vipnas_mbv3.py │ │ │ │ │ ├── vipnas_resnet.py │ │ │ │ │ └── vit.py │ │ │ │ ├── configs │ │ │ │ │ └── coco │ │ │ │ │ │ ├── ViTPose_base_coco_256x192.py │ │ │ │ │ │ ├── 
ViTPose_base_simple_coco_256x192.py │ │ │ │ │ │ ├── ViTPose_huge_coco_256x192.py │ │ │ │ │ │ ├── ViTPose_huge_simple_coco_256x192.py │ │ │ │ │ │ ├── ViTPose_large_coco_256x192.py │ │ │ │ │ │ ├── ViTPose_large_simple_coco_256x192.py │ │ │ │ │ │ └── __init__.py │ │ │ │ ├── heads │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── deconv_head.py │ │ │ │ │ ├── deeppose_regression_head.py │ │ │ │ │ ├── hmr_head.py │ │ │ │ │ ├── interhand_3d_head.py │ │ │ │ │ ├── temporal_regression_head.py │ │ │ │ │ ├── topdown_heatmap_base_head.py │ │ │ │ │ ├── topdown_heatmap_multi_stage_head.py │ │ │ │ │ ├── topdown_heatmap_simple_head.py │ │ │ │ │ ├── vipnas_heatmap_simple_head.py │ │ │ │ │ └── voxelpose_head.py │ │ │ │ └── model_builder.py │ │ │ │ ├── model_builder.py │ │ │ │ └── pose_utils │ │ │ │ ├── ViTPose_trt.py │ │ │ │ ├── __init__.py │ │ │ │ ├── convert_to_trt.py │ │ │ │ ├── general_utils.py │ │ │ │ ├── inference_test.py │ │ │ │ ├── logger_helper.py │ │ │ │ ├── pose_utils.py │ │ │ │ ├── pose_viz.py │ │ │ │ ├── timerr.py │ │ │ │ └── visualizer.py │ │ │ ├── pylogger.py │ │ │ ├── seq_utils.py │ │ │ ├── smplx_utils.py │ │ │ ├── video_io_utils.py │ │ │ ├── vis │ │ │ ├── README.md │ │ │ ├── cv2_utils.py │ │ │ ├── renderer.py │ │ │ ├── renderer_tools.py │ │ │ ├── renderer_utils.py │ │ │ └── rich_logger.py │ │ │ └── wis3d_utils.py │ ├── pyproject.toml │ ├── pyrightconfig.json │ ├── requirements.txt │ ├── setup.py │ └── tools │ │ ├── demo │ │ ├── colab_demo.ipynb │ │ ├── demo.py │ │ └── demo_folder.py │ │ ├── eval_pose.py │ │ ├── train.py │ │ ├── unitest │ │ ├── make_hydra_cfg.py │ │ └── run_dataset.py │ │ └── video │ │ ├── merge_folder.py │ │ ├── merge_horizontal.py │ │ └── merge_vertical.py └── common_metrics_on_video_quality │ ├── .gitignore │ ├── README.md │ ├── calculate_clip.py │ ├── calculate_fvd.py │ ├── calculate_fvd_styleganv.py │ ├── calculate_lpips.py │ ├── calculate_psnr.py │ ├── calculate_ssim.py │ ├── download_eval_visual.sh │ ├── eval_prompts.json │ └── eval_visual.sh └── imgs ├── logo.png └── vis_objstraj.png /CogVideo/.github/ISSUE_TEMPLATE/bug_report.yaml: -------------------------------------------------------------------------------- 1 | name: "\U0001F41B Bug Report" 2 | description: Submit a bug report to help us improve CogVideoX / 提交一个 Bug 问题报告来帮助我们改进 CogVideoX 开源模型 3 | body: 4 | - type: textarea 5 | id: system-info 6 | attributes: 7 | label: System Info / 系統信息 8 | description: Your operating environment / 您的运行环境信息 9 | placeholder: Includes Cuda version, Diffusers version, Python version, operating system, hardware information (if you suspect a hardware problem)... / 包括Cuda版本,Diffusers,Python版本,操作系统,硬件信息(如果您怀疑是硬件方面的问题)... 10 | validations: 11 | required: true 12 | 13 | - type: checkboxes 14 | id: information-scripts-examples 15 | attributes: 16 | label: Information / 问题信息 17 | description: 'The problem arises when using: / 问题出现在' 18 | options: 19 | - label: "The official example scripts / 官方的示例脚本" 20 | - label: "My own modified scripts / 我自己修改的脚本和任务" 21 | 22 | - type: textarea 23 | id: reproduction 24 | validations: 25 | required: true 26 | attributes: 27 | label: Reproduction / 复现过程 28 | description: | 29 | Please provide a code example that reproduces the problem you encountered, preferably with a minimal reproduction unit. 30 | If you have code snippets, error messages, stack traces, please provide them here as well. 31 | Please format your code correctly using code tags. 
See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 32 | Do not use screenshots, as they are difficult to read and (more importantly) do not allow others to copy and paste your code. 33 | 34 | 请提供能重现您遇到的问题的代码示例,最好是最小复现单元。 35 | 如果您有代码片段、错误信息、堆栈跟踪,也请在此提供。 36 | 请使用代码标签正确格式化您的代码。请参见 https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 37 | 请勿使用截图,因为截图难以阅读,而且(更重要的是)不允许他人复制粘贴您的代码。 38 | placeholder: | 39 | Steps to reproduce the behavior/复现Bug的步骤: 40 | 41 | 1. 42 | 2. 43 | 3. 44 | 45 | - type: textarea 46 | id: expected-behavior 47 | validations: 48 | required: true 49 | attributes: 50 | label: Expected behavior / 期待表现 51 | description: "A clear and concise description of what you would expect to happen. /简单描述您期望发生的事情。" -------------------------------------------------------------------------------- /CogVideo/.github/ISSUE_TEMPLATE/feature-request.yaml: -------------------------------------------------------------------------------- 1 | name: "\U0001F680 Feature request" 2 | description: Submit a request for a new CogVideoX feature / 提交一个新的 CogVideoX开源模型的功能建议 3 | labels: [ "feature" ] 4 | body: 5 | - type: textarea 6 | id: feature-request 7 | validations: 8 | required: true 9 | attributes: 10 | label: Feature request / 功能建议 11 | description: | 12 | A brief description of the proposed feature. Links to corresponding papers and code are desirable. 13 | 对功能建议的简述。最好提供对应的论文和代码链接。 14 | 15 | - type: textarea 16 | id: motivation 17 | validations: 18 | required: true 19 | attributes: 20 | label: Motivation / 动机 21 | description: | 22 | Your motivation for making the suggestion. If that motivation is related to another GitHub issue, link to it here. 23 | 您提出建议的动机。如果该动机与另一个 GitHub 问题有关,请在此处提供对应的链接。 24 | 25 | - type: textarea 26 | id: contribution 27 | validations: 28 | required: true 29 | attributes: 30 | label: Your contribution / 您的贡献 31 | description: | 32 | 33 | Your PR link or any other link where you can help. 34 | 您的PR链接或者其他您能提供帮助的链接。 -------------------------------------------------------------------------------- /CogVideo/.github/PULL_REQUEST_TEMPLATE/pr_template.md: -------------------------------------------------------------------------------- 1 | # Raise valuable PR / 提出有价值的PR 2 | 3 | ## Caution / 注意事项: 4 | Users should keep the following points in mind when submitting PRs: 5 | 6 | 1. Ensure that your code meets the requirements in the [specification](../../resources/contribute.md). 7 | 2. The proposed PR should be focused; if it contains multiple ideas or optimizations, split them into separate PRs. 8 | 9 | 用户在提交PR时候应该注意以下几点: 10 | 11 | 1. 确保您的代码符合 [规范](../../resources/contribute_zh.md) 中的要求。 12 | 2. 提出的PR应该具有针对性,如果具有多个不同的想法和优化方案,应该分配到不同的PR中。 13 | 14 | ## PRs that should not be proposed / 不应该提出的PR 15 | 16 | If a developer proposes a PR that falls into any of the following categories, it may be closed or rejected. 17 | 18 | 1. PRs that do not describe the proposed improvement. 19 | 2. PRs that combine multiple unrelated issues. 20 | 3. PRs that largely duplicate existing PRs. 21 | 22 | 如果开发者提出关于以下方面的PR,则可能会被直接关闭或拒绝通过。 23 | 24 | 1. 没有说明改进方案的。 25 | 2. 多个不同类型的问题合并在一个PR中的。 26 | 3. 提出的PR与已经存在的PR高度重复的。 27 | 28 | 29 | # Check your PR / 检查您的PR 30 | - [ ] Have you read the Contributor Guidelines, Pull Request section? / 您是否阅读了贡献者指南、Pull Request 部分? 31 | - [ ] Has this been discussed/approved via a Github issue or forum? If so, add a link. 
/ 是否通过 Github 问题或论坛讨论/批准过?如果是,请添加链接。 32 | - [ ] Did you make sure you updated the documentation with your changes? Here are the Documentation Guidelines, and here are the Documentation Formatting Tips. /您是否确保根据您的更改更新了文档?这里是文档指南,这里是文档格式化技巧。 33 | - [ ] Did you write new required tests? / 您是否编写了新的必要测试? 34 | - [ ] Are your PRs for only one issue / 您的PR是否仅针对一个问题 -------------------------------------------------------------------------------- /CogVideo/.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__/ 2 | samples*/ 3 | runs/ 4 | checkpoints/ 5 | master_ip 6 | logs/ 7 | *.DS_Store 8 | .idea 9 | output* 10 | test* -------------------------------------------------------------------------------- /CogVideo/download.sh: -------------------------------------------------------------------------------- 1 | mkdir CogVideoX-2b-sat 2 | cd CogVideoX-2b-sat 3 | wget https://cloud.tsinghua.edu.cn/f/fdba7608a49c463ba754/?dl=1 4 | mv 'index.html?dl=1' vae.zip 5 | unzip vae.zip 6 | wget https://cloud.tsinghua.edu.cn/f/556a3e1329e74f1bac45/?dl=1 7 | mv 'index.html?dl=1' transformer.zip 8 | unzip transformer.zip -------------------------------------------------------------------------------- /CogVideo/finetune/accelerate_config_machine_single.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | gradient_accumulation_steps: 1 5 | gradient_clipping: 1.0 6 | offload_optimizer_device: none 7 | offload_param_device: none 8 | zero3_init_flag: false 9 | zero_stage: 2 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | enable_cpu_affinity: false 13 | machine_rank: 0 14 | main_training_function: main 15 | dynamo_backend: 'no' 16 | mixed_precision: 'no' 17 | num_machines: 1 18 | num_processes: 8 19 | rdzv_backend: static 20 | same_network: true 21 | tpu_env: [] 22 | tpu_use_cluster: false 23 | tpu_use_sudo: false 24 | use_cpu: false -------------------------------------------------------------------------------- /CogVideo/finetune/accelerate_config_machine_single_debug.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | gradient_accumulation_steps: 1 5 | gradient_clipping: 1.0 6 | offload_optimizer_device: none 7 | offload_param_device: none 8 | zero3_init_flag: false 9 | zero_stage: 2 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | enable_cpu_affinity: false 13 | machine_rank: 0 14 | main_training_function: main 15 | dynamo_backend: 'no' 16 | mixed_precision: 'no' 17 | num_machines: 1 18 | num_processes: 1 19 | rdzv_backend: static 20 | same_network: true 21 | tpu_env: [] 22 | tpu_use_cluster: false 23 | tpu_use_sudo: false 24 | use_cpu: false -------------------------------------------------------------------------------- /CogVideo/finetune/finetune_single_rank_injector.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MODEL_PATH="/m2v_intern/fuxiao/CogVideo-release/weights/cogvideox-5b" # Change it to CogVideoX-5B path 4 | export TRANSFORMER_PATH="" # Resume from pretrained injector checkpoint 5 | export LORA_PATH="/m2v_intern/fuxiao/CogVideo-release/weights/lora" # Change it to pretrained lora path 6 | export CACHE_PATH="~/.cache" 7 | export DATASET_PATH="/ytech_m2v2_hdd/fuxiao/360Motion-Dataset" # Change it to 360-Motion Dataset path 8 | 
export OUTPUT_PATH="injector" 9 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True 10 | export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7," 11 | 12 | # if you are not using wth 8 gus, change `accelerate_config_machine_single_debug.yaml` num_processes as your gpu number 13 | accelerate launch --config_file accelerate_config_machine_single.yaml --multi_gpu \ 14 | train_cogvideox_injector.py \ 15 | --gradient_checkpointing \ 16 | --pretrained_model_name_or_path $MODEL_PATH \ 17 | --lora_path $LORA_PATH \ 18 | --cache_dir $CACHE_PATH \ 19 | --enable_tiling \ 20 | --enable_slicing \ 21 | --finetune_init \ 22 | --instance_data_root $DATASET_PATH \ 23 | --validation_prompt "a woman with short black wavy hair, lean figure, a green and yellow plaid shirt, dark brown pants, and black suede shoes and a robotic gazelle with a sturdy aluminum frame, an agile build, articulated legs and curved, metallic horns are moving in the city" \ 24 | --validation_prompt_separator ::: \ 25 | --num_validation_videos 1 \ 26 | --validation_epochs 1 \ 27 | --block_interval 2 \ 28 | --seed 42 \ 29 | --lora_scale 1.0 \ 30 | --mixed_precision bf16 \ 31 | --output_dir $OUTPUT_PATH \ 32 | --height 480 \ 33 | --width 720 \ 34 | --fps 8 \ 35 | --max_num_frames 49 \ 36 | --skip_frames_start 0 \ 37 | --skip_frames_end 0 \ 38 | --train_batch_size 1 \ 39 | --num_train_epochs 1000 \ 40 | --checkpointing_steps 4000 \ 41 | --gradient_accumulation_steps 1 \ 42 | --learning_rate 1e-4 \ 43 | --lr_scheduler cosine_with_restarts \ 44 | --lr_warmup_steps 200 \ 45 | --lr_num_cycles 1 \ 46 | --enable_slicing \ 47 | --enable_tiling \ 48 | --gradient_checkpointing \ 49 | --optimizer AdamW \ 50 | --adam_beta1 0.9 \ 51 | --adam_beta2 0.95 \ 52 | --max_grad_norm 1.0 \ 53 | --allow_tf32 \ 54 | --report_to wandb 55 | 56 | # --resume_from_checkpoint $TRANSFORMER_PATH \ -------------------------------------------------------------------------------- /CogVideo/finetune/finetune_single_rank_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MODEL_PATH="/m2v_intern/fuxiao/CogVideo-release/weights/cogvideox-5b" # Change it to CogVideoX-5B path 4 | export CACHE_PATH="~/.cache" 5 | export DATASET_PATH="/ytech_m2v2_hdd/fuxiao/360Motion-Dataset" # Change it to 360-Motion Dataset path 6 | export OUTPUT_PATH="lora" 7 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True 8 | export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7," 9 | 10 | # if you are not using wth 1 gpu, change `accelerate_config_machine_single_debug.yaml` num_processes as your gpu number 11 | accelerate launch --config_file accelerate_config_machine_single.yaml --multi_gpu \ 12 | train_cogvideox_lora.py \ 13 | --gradient_checkpointing \ 14 | --pretrained_model_name_or_path $MODEL_PATH \ 15 | --cache_dir $CACHE_PATH \ 16 | --enable_tiling \ 17 | --enable_slicing \ 18 | --instance_data_root $DATASET_PATH \ 19 | --validation_prompt "a woman with short black wavy hair, lean figure, a green and yellow plaid shirt, dark brown pants, and black suede shoes and a robotic gazelle with a sturdy aluminum frame, an agile build, articulated legs and curved, metallic horns are moving in the city" \ 20 | --validation_prompt_separator ::: \ 21 | --num_validation_videos 1 \ 22 | --validation_epochs 1 \ 23 | --seed 42 \ 24 | --rank 32 \ 25 | --lora_alpha 32 \ 26 | --mixed_precision bf16 \ 27 | --output_dir $OUTPUT_PATH \ 28 | --height 480 \ 29 | --width 720 \ 30 | --fps 8 \ 31 | --max_num_frames 49 \ 32 | --skip_frames_start 0 \ 33 | 
--skip_frames_end 0 \ 34 | --train_batch_size 2 \ 35 | --num_train_epochs 1000 \ 36 | --checkpointing_steps 1000 \ 37 | --gradient_accumulation_steps 1 \ 38 | --learning_rate 3e-4 \ 39 | --lr_scheduler cosine_with_restarts \ 40 | --lr_warmup_steps 200 \ 41 | --lr_num_cycles 1 \ 42 | --enable_slicing \ 43 | --enable_tiling \ 44 | --gradient_checkpointing \ 45 | --optimizer AdamW \ 46 | --adam_beta1 0.9 \ 47 | --adam_beta2 0.95 \ 48 | --max_grad_norm 1.0 \ 49 | --allow_tf32 \ 50 | --report_to wandb 51 | -------------------------------------------------------------------------------- /CogVideo/finetune/hostfile.txt: -------------------------------------------------------------------------------- 1 | node1 slots=8 2 | node2 slots=8 -------------------------------------------------------------------------------- /CogVideo/finetune/models/pipeline_output.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | 5 | from diffusers.utils import BaseOutput 6 | 7 | 8 | @dataclass 9 | class CogVideoXPipelineOutput(BaseOutput): 10 | r""" 11 | Output class for CogVideo pipelines. 12 | 13 | Args: 14 | frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): 15 | List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing 16 | denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape 17 | `(batch_size, num_frames, channels, height, width)`. 18 | """ 19 | 20 | frames: torch.Tensor 21 | -------------------------------------------------------------------------------- /CogVideo/inference/location_zoo.txt: -------------------------------------------------------------------------------- 1 | [ 2 | 'fjord', 3 | 'sunset beach', 4 | 'cave', 5 | 'snowy tundra', 6 | 'prairie', 7 | 'asian town', 8 | 'rainforest', 9 | 'canyon', 10 | 'savanna', 11 | 'urban rooftop garden', 12 | 'swamp', 13 | 'riverbank', 14 | 'coral reef', 15 | 'volcanic landscape', 16 | 'wind farm', 17 | 'town street', 18 | 'night city square', 19 | 'mall lobby', 20 | 'glacier', 21 | 'seaside street', 22 | 'gymnastics room', 23 | 'abandoned factory', 24 | 'autumn forest', 25 | 'mountain village', 26 | 'coastal harbor', 27 | 'ancient ruins', 28 | 'modern metropolis', 29 | 'desert', 30 | 'forest', 31 | 'city', 32 | 'snowy street', 33 | 'park', 34 | ] -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.txt: -------------------------------------------------------------------------------- 1 | D_loc1_541_t1n37_021d_Hemi12_1 2 | a rabbit with a body covered in soft fur, quick hops, and a playful demeanor, showcasing its energy 3 | urban rooftop garden 4 | -------------------------------------------------------------------------------- 
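The prompt files under `inference/output_example/` (like the one just above) follow a simple plain-text layout: the first line is the trajectory/sample ID, each middle line is one entity description, and the last line is the location. As a minimal illustration only (the helper name and returned structure are my own, not part of the repository), such a file could be parsed like this:

```python
from pathlib import Path


def parse_prompt_file(path: str) -> dict:
    """Parse an output_example prompt .txt: line 1 = sample ID,
    last line = location, lines in between = entity descriptions."""
    lines = [ln.strip() for ln in Path(path).read_text(encoding="utf-8").splitlines() if ln.strip()]
    return {"sample_id": lines[0], "entities": lines[1:-1], "location": lines[-1]}


if __name__ == "__main__":
    info = parse_prompt_file(
        "CogVideo/inference/output_example/"
        "1_D_loc1_541_t1n37_021d_Hemi12_1_urban rooftop garden_a_rabbit_with_a_body_covered_i.txt"
    )
    print(info["sample_id"], "|", info["location"], "|", len(info["entities"]), "entity prompt(s)")
```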
/CogVideo/inference/output_example/1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc1_66_t1n36_0042_Hemi12_1_park_a_fire_spirit_with_long,_twist.txt: -------------------------------------------------------------------------------- 1 | D_loc1_66_t1n36_0042_Hemi12_1 2 | a fire spirit with long, twisting flames resembling flowing red and orange hair, a bright yellow core 3 | park 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc1_81_t1n42_0051_Hemi12_1_wind farm_a_pickup_truck_with_rugged_dar.txt: -------------------------------------------------------------------------------- 1 | D_loc1_81_t1n42_0051_Hemi12_1 2 | a pickup truck with rugged dark green paint, extended cab, raised suspension, and a modest cargo bed cover 3 | wind farm 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc2_17_t1n8_0011_Hemi12_1_sunset beach_a_disaster_rescue_robot_with_r.txt: -------------------------------------------------------------------------------- 1 | D_loc2_17_t1n8_0011_Hemi12_1 2 | a disaster rescue robot with reinforced limbs, advanced AI, and a rugged body designed to navigate 3 | sunset beach 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc2_482_t1n48_01e2_Hemi12_1_riverbank_a_man_with_short_spiky_blonde_.txt: -------------------------------------------------------------------------------- 1 | D_loc2_482_t1n48_01e2_Hemi12_1 2 | a man with short spiky blonde hair, slim build, a black trench coat, blue jeans, and brown hiking 
shoes 3 | riverbank 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc3_323_t1n15_0143_Hemi12_1_coral reef_a_cloud_creature_with_billowin.txt: -------------------------------------------------------------------------------- 1 | D_loc3_323_t1n15_0143_Hemi12_1 2 | a cloud creature with billowing white and gray plumes forming a soft, rounded body, wisps of darker fog 3 | coral reef 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc3_568_t1n3_0238_Hemi12_1_cave_a_woman_with_long_straight_bla.txt: -------------------------------------------------------------------------------- 1 | D_loc3_568_t1n3_0238_Hemi12_1 2 | a woman with long straight black hair, toned build, a blue denim jacket, light gray leggings, and black slip-on shoes 3 | cave 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc4_1174_t1n9_0496_Hemi12_1_mall lobby_a_polar_bear_with_thick_white_.txt: -------------------------------------------------------------------------------- 1 | D_loc4_1174_t1n9_0496_Hemi12_1 2 | a polar bear with thick white fur, strong paws, and a black nose, embodying the essence of the Arctic 3 | mall lobby 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc5_1210_t1n34_04ba_Hemi12_1_rainforest_a_moose_with_a_body_covered_in.txt: -------------------------------------------------------------------------------- 1 | 
D_loc5_1210_t1n34_04ba_Hemi12_1 2 | a moose with a body covered in thick brown fur, massive antlers, and a bulky frame 3 | rainforest 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/1_D_loc5_440_t1n35_01b8_Hemi12_1_sunset beach_a_dolphin_with_sleek_grey_skin.txt: -------------------------------------------------------------------------------- 1 | D_loc5_440_t1n35_01b8_Hemi12_1 2 | a dolphin with sleek grey skin, a curved dorsal fin, and intelligent, playful eyes, reflecting its nature 3 | sunset beach 4 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc1_1276_t2n30_04fc_Hemi12_1_sunset beach_a_man_with_short_curly_red_hai_a_fox_with_sleek_russet_fur,_a.txt: -------------------------------------------------------------------------------- 1 | D_loc1_1276_t2n30_04fc_Hemi12_1 2 | a man with short curly red hair, average build, a black leather jacket, dark blue cargo pants, and white sneakers 3 | a fox with sleek russet fur, a bushy tail tipped with black, and bright green and cunning eyes 4 | sunset beach 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc1_806_t2n2_0326_Hemi12_1_coral reef_a_porcupine_with_a_body_covere_a_woman_with_long_straight_bla.txt: -------------------------------------------------------------------------------- 1 | D_loc1_806_t2n2_0326_Hemi12_1 2 | a porcupine with a body covered in spiky brown quills, a small nose, and curious eyes 3 | a woman with long straight black hair, toned build, a blue denim jacket, light gray leggings, and black slip-on shoes 4 | coral reef 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop 
garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc1_886_t2n25_0376_Hemi12_1_urban rooftop garden_a_man_with_medium-length_strai_a_wolf_with_thick_silver-gray_.txt: -------------------------------------------------------------------------------- 1 | D_loc1_886_t2n25_0376_Hemi12_1 2 | a man with medium-length straight brown hair, tall and slender, a gray crew-neck t-shirt, beige trousers, dark green sneakers 3 | a wolf with thick silver-gray fur, alert golden eyes, and a lean yet strong body, exuding confidence and boldness 4 | urban rooftop garden 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc2_1442_t2n36_05a2_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_surveillance_drone_robot_wit.txt: -------------------------------------------------------------------------------- 1 | D_loc2_1442_t2n36_05a2_Hemi12_1 2 | a storm entity with dark swirling clouds as a body, streaks of electric blue lightning shooting across it 3 | a surveillance drone robot with extendable camera arms, thermal vision, and a stealth black body 4 | swamp 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1010_t2n2_03f2_Hemi12_1_mall lobby_a_man_with_short_curly_red_hai_a_woman_with_long_wavy_blonde_.txt: -------------------------------------------------------------------------------- 1 | D_loc5_1010_t2n2_03f2_Hemi12_1 2 | a man with short curly red hair, average build, a black leather jacket, dark blue cargo pants, and white sneakers 3 | a woman with long wavy blonde hair, petite figure, a red floral dress, white sandals, and a yellow shoulder bag 4 | mall lobby 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1095_t2n37_0447_Hemi12_1_sunset beach_a_companion_robot_with_a_frien_a_man_with_short_straight_blac.txt: -------------------------------------------------------------------------------- 1 | D_loc5_1095_t2n37_0447_Hemi12_1 2 | a companion robot with a friendly digital face, a smooth white exterior, and social interaction algorithms 3 | a man with short straight black hair, tall and lean build, a navy blue sweater, khaki shorts, and brown sandals 4 | sunset beach 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_120_t2n37_0078_Hemi12_1_night city square_a_compact_electric_vehicle_wit_a_fox_with_sleek_russet_fur,_a.txt: -------------------------------------------------------------------------------- 1 | D_loc5_120_t2n37_0078_Hemi12_1 2 | a compact electric vehicle with a silver finish, aerodynamic profile, and efficient battery 3 | a fox with sleek russet fur, a bushy tail tipped with black, and bright green and cunning eyes 4 | night city square 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1290_t2n36_050a_Hemi12_1_swamp_a_firefighting_robot_with_a_wa_a_penguin_with_a_body_covered_.txt: -------------------------------------------------------------------------------- 1 | D_loc5_1290_t2n36_050a_Hemi12_1 2 | a firefighting robot with a water cannon arm, heat sensors, and durable red-and-silver exterior 3 | a penguin with a body covered in smooth black-and-white feathers, short wings, and webbed feet 4 | swamp 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_1440_t2n35_05a0_Hemi12_1_forest_a_fire_spirit_with_long,_twist_a_moose_with_a_body_covered_in.txt: -------------------------------------------------------------------------------- 1 | D_loc5_1440_t2n35_05a0_Hemi12_1 2 | a fire spirit with long, twisting flames resembling flowing red and orange hair, a bright yellow core 3 | a moose with a body covered in thick brown fur, massive antlers, and a bulky frame 4 | forest 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/2_D_loc5_65_t2n23_0041_Hemi12_1_snowy tundra_a_woman_with_shoulder-length_w_a_parrot_with_bright_red,_blue.txt: -------------------------------------------------------------------------------- 1 | D_loc5_65_t2n23_0041_Hemi12_1 2 | a woman with shoulder-length wavy brown hair, slim build, a green parka, black leggings, and gray hiking boots 3 | a parrot with bright red, blue, and yellow feathers, a curved beak, and sharp intelligent eyes 4 | snowy tundra 5 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_1041_t3n22_0411_Hemi12_1_swamp_a_storm_entity_with_dark_swirl_a_regal_lion_with_a_thick,_flo_a_man_with_short_straight_blac.txt: -------------------------------------------------------------------------------- 1 | D_loc1_1041_t3n22_0411_Hemi12_1 2 | a storm entity with dark swirling clouds as a body, streaks of electric blue lightning shooting across it 3 | a regal lion with a thick, flowing golden mane, sharp brown eyes, and a powerful muscular frame 4 | a man with short straight black hair, tall and lean build, a navy blue sweater, khaki shorts, and brown sandals 5 | swamp 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_1226_t3n24_04ca_Hemi12_1_prairie_a_woman_with_short_blonde_hair_a_private_jet_with_a_shiny_sil_a_wolf_with_a_body_covered_in_.txt: -------------------------------------------------------------------------------- 1 | D_loc1_1226_t3n24_04ca_Hemi12_1 2 | a woman with short blonde hair, slim athletic build, a red leather jacket, dark blue jeans, and white sneakers 3 | a private jet with a shiny silver body, elongated wings, a slim nose, and a compact rear stabilizer 4 | a wolf with a body covered in thick silver fur, sharp ears, and piercing yellow eyes, showcasing its alertness 5 | prairie 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_176_t3n26_00b0_Hemi12_1_abandoned factory_a_horse_with_chestnut_brown_fu_a_flamingo_with_a_body_covered_a_wolf_with_thick_silver-gray_.txt: -------------------------------------------------------------------------------- 1 | D_loc1_176_t3n26_00b0_Hemi12_1 2 | a horse with chestnut brown fur, muscular legs, a slim neck, and a flowing mane, exuding strength and grace 3 | a flamingo with a body covered in pink feathers, long slender legs, and a gracefully curved neck 4 | a wolf with thick silver-gray fur, alert golden eyes, and a lean yet strong body, exuding confidence and boldness 5 | abandoned factory 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_196_t3n32_00c4_Hemi12_1_desert_a_man_with_short_spiky_blonde__a_polar_bear_with_thick_white__a_deer_with_sleek_tan_fur,_lon.txt: -------------------------------------------------------------------------------- 1 | D_loc1_196_t3n32_00c4_Hemi12_1 2 | a man with short spiky blonde hair, slim build, a black trench coat, blue jeans, and brown hiking shoes 3 | a polar bear with thick white fur, 
strong paws, and a black nose, embodying the essence of the Arctic 4 | a deer with sleek tan fur, long slender legs, a graceful neck, and tiny antlers atop its head 5 | desert 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc1_536_t3n1_0218_Hemi12_1_snowy street_a_tiger_with_a_pristine_white__a_firefighting_robot_with_a_wa_a_sporty_roadster_with_a_conve.txt: -------------------------------------------------------------------------------- 1 | D_loc1_536_t3n1_0218_Hemi12_1 2 | a tiger with a pristine white coat marked by bold black stripes, bright blue eyes, and a graceful, poised form 3 | a firefighting robot with a water cannon arm, heat sensors, and durable red-and-silver exterior 4 | a sporty roadster with a convertible top, silver trim, and a powerful engine 5 | snowy street 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc2_1287_t3n5_0507_Hemi12_1_urban rooftop garden_a_panda_with_a_body_covered_in_a_man_with_short_straight_blac_an_industrial_welding_robot_wi.txt: -------------------------------------------------------------------------------- 1 | D_loc2_1287_t3n5_0507_Hemi12_1 2 | a panda with a body covered in fluffy black-and-white fur, a round face, and gentle eyes, radiating warmth 3 | a man with short straight black hair, tall and lean build, a navy blue sweater, khaki shorts, and brown sandals 4 | an industrial welding robot with articulated arms, a laser precision welder, and heat-resistant shields 5 | urban rooftop garden 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.mp4 
-------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc2_1392_t3n4_0570_Hemi12_1_volcanic landscape_a_fluttering_butterfly_with_in_a_man_with_buzz-cut_blonde_hai_a_giraffe_with_golden-yellow_f.txt: -------------------------------------------------------------------------------- 1 | D_loc2_1392_t3n4_0570_Hemi12_1 2 | a fluttering butterfly with intricate wing patterns, vivid colors, and graceful flight 3 | a man with buzz-cut blonde hair, stocky build, a gray zip-up sweater, black shorts, and red basketball shoes 4 | a giraffe with golden-yellow fur, long legs, a tall slender neck, and patches of brown spots, exuding elegance and calm 5 | volcanic landscape 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc3_1473_t3n23_05c1_Hemi12_1_coastal harbor_a_firefighting_robot_with_a_wa_a_crocodile_with_a_body_covere_a_rabbit_with_a_body_covered_i.txt: -------------------------------------------------------------------------------- 1 | D_loc3_1473_t3n23_05c1_Hemi12_1 2 | a firefighting robot with a water cannon arm, heat sensors, and durable red-and-silver exterior 3 | a crocodile with a body covered in scaly green skin, a powerful tail, and sharp teeth 4 | a rabbit with a body covered in soft fur, quick hops, and a playful demeanor, showcasing its energy 5 | coastal harbor 6 | -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc4_849_t3n28_0351_Hemi12_1_desert_a_man_with_short_black_wavy_ha_a_sedan_with_a_sleek_metallic__a_gazelle_with_a_body_covered_.txt: -------------------------------------------------------------------------------- 1 | D_loc4_849_t3n28_0351_Hemi12_1 2 | a man with short black wavy hair, lean figure, a green and yellow plaid shirt, dark brown pants, and black suede shoes 3 | a sedan with a sleek metallic silver body, long wheelbase, a low-profile hood, and a small rear spoiler 4 | a gazelle with a body covered in sleek tan fur, long legs, and elegant curved horns, showcasing its grace 5 | desert 6 | -------------------------------------------------------------------------------- 
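Each output_example .mp4 above is paired with a .txt prompt of the same name: line 1 repeats the sequence identifier from the file name, each following line describes one entity, and the last line names the location. A minimal parsing sketch under that assumption (the helper `load_example_prompt` is illustrative and not part of the repo):

```python
from pathlib import Path


def load_example_prompt(txt_path: str) -> dict:
    """Illustrative helper (not part of the repo): parse an output_example .txt."""
    lines = [ln.strip() for ln in Path(txt_path).read_text(encoding="utf-8").splitlines() if ln.strip()]
    seq_id, *entities, location = lines  # first line: sequence id, last line: location
    return {"seq_id": seq_id, "entities": entities, "location": location}


# A 3-entity sample such as the desert example above yields
# len(info["entities"]) == 3 and info["location"] == "desert".
```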
/CogVideo/inference/output_example/3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/inference/output_example/3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.mp4 -------------------------------------------------------------------------------- /CogVideo/inference/output_example/3_D_loc5_865_t3n34_0361_Hemi12_1_fjord_a_man_with_a_shaved_head,_broa_a_foggy_apparition_with_pale_g_a_jaguar_with_a_golden-yellow_.txt: -------------------------------------------------------------------------------- 1 | D_loc5_865_t3n34_0361_Hemi12_1 2 | a man with a shaved head, broad shoulders, a gray graphic t-shirt, dark jeans, and brown leather boots 3 | a foggy apparition with pale gray wisps drifting together in a soft, undefined form, tiny white sparkles 4 | a jaguar with a golden-yellow coat dotted with intricate black rosettes, deep green eyes, and a muscular build 5 | fjord 6 | -------------------------------------------------------------------------------- /CogVideo/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 119 3 | 4 | [tool.ruff.lint] 5 | # Never enforce `E501` (line length violations). 6 | ignore = ["C901", "E501", "E741", "F402", "F823"] 7 | select = ["C", "E", "F", "I", "W"] 8 | 9 | # Ignore import violations in all `__init__.py` files. 10 | [tool.ruff.lint.per-file-ignores] 11 | "__init__.py" = ["E402", "F401", "F403", "F811"] 12 | 13 | [tool.ruff.lint.isort] 14 | lines-after-imports = 2 15 | 16 | [tool.ruff.format] 17 | # Like Black, use double quotes for strings. 18 | quote-style = "double" 19 | 20 | # Like Black, indent with spaces, rather than tabs. 21 | indent-style = "space" 22 | 23 | # Like Black, respect magic trailing commas. 24 | skip-magic-trailing-comma = false 25 | 26 | # Like Black, automatically detect the appropriate line ending. 27 | line-ending = "auto" 28 | -------------------------------------------------------------------------------- /CogVideo/requirements.txt: -------------------------------------------------------------------------------- 1 | diffusers==0.31.0 2 | accelerate==1.1.1 3 | transformers==4.46.2 4 | numpy==1.26.0 5 | # torch==2.5.0 6 | # torchvision==0.20.0 7 | sentencepiece==0.2.0 8 | SwissArmyTransformer==0.4.12 9 | gradio==5.5.0 10 | imageio==2.35.1 11 | imageio-ffmpeg==0.5.1 12 | openai==1.54.0 13 | moviepy==1.0.3 14 | scikit-video==1.1.11 15 | opencv-python 16 | peft==0.12.0 17 | decord 18 | wandb -------------------------------------------------------------------------------- /CogVideo/tools/caption/README.md: -------------------------------------------------------------------------------- 1 | # Video Caption 2 | 3 | Typically, most video data does not come with corresponding descriptive text, so it is necessary to convert the video 4 | data into textual descriptions to provide the essential training data for text-to-video models. 5 | 6 | ## Update and News 7 | - 🔥🔥 **News**: ```2024/9/19```: The caption model used in the CogVideoX training process to convert video data into text 8 | descriptions, [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption), is now open-source. 
Feel 9 | free to download and use it. 10 | 11 | 12 | ## Video Caption via CogVLM2-Caption 13 | 14 | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-llama3-caption) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-caption/) 15 | 16 | CogVLM2-Caption is a video captioning model used to generate training data for the CogVideoX model. 17 | 18 | ### Install 19 | ```shell 20 | pip install -r requirements.txt 21 | ``` 22 | 23 | ### Usage 24 | 25 | ```shell 26 | python video_caption.py 27 | ``` 28 | 29 | Example: 30 |
<div align="center"> 31 | <img src="./assests/CogVLM2-Caption-example.png"> 32 | </div>
33 | 34 | ## Video Caption via CogVLM2-Video 35 | 36 | [Code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-video-llama3-chat) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat) | 📑 [Blog](https://cogvlm2-video.github.io/) | [💬 Online Demo](http://cogvlm2-online.cogviewai.cn:7868/) 37 | 38 | CogVLM2-Video is a versatile video understanding model equipped with timestamp-based question answering capabilities. 39 | Users can input prompts such as `Please describe this video in detail.` to the model to obtain a detailed video caption: 40 |
<div align="center"> 41 | <img src="./assests/cogvlm2-video-example.png"> 42 | </div>
43 | 44 | Users can use the provided [code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) to load the model or configure a RESTful API to generate video captions. 45 | 46 | ## Citation 47 | 48 | 🌟 If you find our work helpful, please leave us a star and cite our paper. 49 | 50 | CogVLM2-Caption: 51 | ``` 52 | @article{yang2024cogvideox, 53 | title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer}, 54 | author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others}, 55 | journal={arXiv preprint arXiv:2408.06072}, 56 | year={2024} 57 | } 58 | ``` 59 | CogVLM2-Video: 60 | ``` 61 | @article{hong2024cogvlm2, 62 | title={CogVLM2: Visual Language Models for Image and Video Understanding}, 63 | author={Hong, Wenyi and Wang, Weihan and Ding, Ming and Yu, Wenmeng and Lv, Qingsong and Wang, Yan and Cheng, Yean and Huang, Shiyu and Ji, Junhui and Xue, Zhao and others}, 64 | journal={arXiv preprint arXiv:2408.16500}, 65 | year={2024} 66 | } 67 | ``` -------------------------------------------------------------------------------- /CogVideo/tools/caption/README_ja.md: -------------------------------------------------------------------------------- 1 | # ビデオキャプション 2 | 3 | 通常、ほとんどのビデオデータには対応する説明文が付いていないため、ビデオデータをテキストの説明に変換して、テキストからビデオへのモデルに必要なトレーニングデータを提供する必要があります。 4 | 5 | ## 更新とニュース 6 | - 🔥🔥 **ニュース**: ```2024/9/19```:CogVideoX 7 | のトレーニングプロセスで、ビデオデータをテキストに変換するためのキャプションモデル [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption) 8 | がオープンソース化されました。ぜひダウンロードしてご利用ください。 9 | ## CogVLM2-Captionによるビデオキャプション 10 | 11 | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-llama3-caption) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-caption/) 12 | 13 | CogVLM2-Captionは、CogVideoXモデルのトレーニングデータを生成するために使用されるビデオキャプションモデルです。 14 | 15 | ### インストール 16 | ```shell 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ### 使用方法 21 | ```shell 22 | python video_caption.py 23 | ``` 24 | 25 | 例: 26 |
<div align="center"> 27 | <img src="./assests/CogVLM2-Caption-example.png"> 28 | </div>
29 | 30 | 31 | 32 | ## CogVLM2-Video を使用したビデオキャプション 33 | 34 | [Code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-video-llama3-chat) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat) | 📑 [Blog](https://cogvlm2-video.github.io/) | [💬 Online Demo](http://cogvlm2-online.cogviewai.cn:7868/) 35 | 36 | 37 | CogVLM2-Video は、タイムスタンプベースの質問応答機能を備えた多機能なビデオ理解モデルです。ユーザーは `このビデオを詳細に説明してください。` などのプロンプトをモデルに入力して、詳細なビデオキャプションを取得できます: 38 |
<div align="center"> 39 | <img src="./assests/cogvlm2-video-example.png"> 40 | </div>
41 | 42 | ユーザーは提供された[コード](https://github.com/THUDM/CogVLM2/tree/main/video_demo)を使用してモデルをロードするか、RESTful API を構成してビデオキャプションを生成できます。 43 | 44 | ## Citation 45 | 46 | 🌟 If you find our work helpful, please leave us a star and cite our paper. 47 | 48 | CogVLM2-Caption: 49 | ``` 50 | @article{yang2024cogvideox, 51 | title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer}, 52 | author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others}, 53 | journal={arXiv preprint arXiv:2408.06072}, 54 | year={2024} 55 | } 56 | ``` 57 | CogVLM2-Video: 58 | ``` 59 | @article{hong2024cogvlm2, 60 | title={CogVLM2: Visual Language Models for Image and Video Understanding}, 61 | author={Hong, Wenyi and Wang, Weihan and Ding, Ming and Yu, Wenmeng and Lv, Qingsong and Wang, Yan and Cheng, Yean and Huang, Shiyu and Ji, Junhui and Xue, Zhao and others}, 62 | journal={arXiv preprint arXiv:2408.16500}, 63 | year={2024} 64 | } 65 | ``` 66 | -------------------------------------------------------------------------------- /CogVideo/tools/caption/README_zh.md: -------------------------------------------------------------------------------- 1 | # 视频Caption 2 | 3 | 通常,大多数视频数据不带有相应的描述性文本,因此需要将视频数据转换为文本描述,以提供必要的训练数据用于文本到视频模型。 4 | 5 | ## 项目更新 6 | - 🔥🔥 **News**: ```2024/9/19```: CogVideoX 训练过程中用于将视频数据转换为文本描述的 Caption 7 | 模型 [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption) 8 | 已经开源。欢迎前往下载并使用。 9 | 10 | ## 通过 CogVLM2-Caption 模型生成视频Caption 11 | 12 | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-llama3-caption) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-caption/) 13 | 14 | CogVLM2-Caption是用于生成CogVideoX模型训练数据的视频caption模型。 15 | 16 | ### 安装依赖 17 | ```shell 18 | pip install -r requirements.txt 19 | ``` 20 | 21 | ### 运行caption模型 22 | 23 | ```shell 24 | python video_caption.py 25 | ``` 26 | 27 | 示例: 28 |
<div align="center"> 29 | <img src="./assests/CogVLM2-Caption-example.png"> 30 | </div>
31 | 32 | ## 通过 CogVLM2-Video 模型生成视频Caption 33 | 34 | [Code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-video-llama3-chat) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat) | 📑 [Blog](https://cogvlm2-video.github.io/) | [💬 Online Demo](http://cogvlm2-online.cogviewai.cn:7868/) 35 | 36 | CogVLM2-Video 是一个多功能的视频理解模型,具备基于时间戳的问题回答能力。用户可以输入诸如 `Describe this video in detail.` 的提示语给模型,以获得详细的视频Caption: 37 | 38 | 39 |
<div align="center"> 40 | <img src="./assests/cogvlm2-video-example.png"> 41 | </div>
42 | 43 | 用户可以使用提供的[代码](https://github.com/THUDM/CogVLM2/tree/main/video_demo)加载模型或配置 RESTful API 来生成视频Caption。 44 | 45 | 46 | ## Citation 47 | 48 | 🌟 If you find our work helpful, please leave us a star and cite our paper. 49 | 50 | CogVLM2-Caption: 51 | ``` 52 | @article{yang2024cogvideox, 53 | title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer}, 54 | author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others}, 55 | journal={arXiv preprint arXiv:2408.06072}, 56 | year={2024} 57 | } 58 | ``` 59 | CogVLM2-Video: 60 | ``` 61 | @article{hong2024cogvlm2, 62 | title={CogVLM2: Visual Language Models for Image and Video Understanding}, 63 | author={Hong, Wenyi and Wang, Weihan and Ding, Ming and Yu, Wenmeng and Lv, Qingsong and Wang, Yan and Cheng, Yean and Huang, Shiyu and Ji, Junhui and Xue, Zhao and others}, 64 | journal={arXiv preprint arXiv:2408.16500}, 65 | year={2024} 66 | } 67 | ``` -------------------------------------------------------------------------------- /CogVideo/tools/caption/assests/CogVLM2-Caption-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/tools/caption/assests/CogVLM2-Caption-example.png -------------------------------------------------------------------------------- /CogVideo/tools/caption/assests/cogvlm2-video-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/tools/caption/assests/cogvlm2-video-example.png -------------------------------------------------------------------------------- /CogVideo/tools/caption/requirements.txt: -------------------------------------------------------------------------------- 1 | decord>=0.6.0 2 | #根据https://download.pytorch.org/whl/torch/,python版本为[3.8,3.11] 3 | torch==2.1.0 4 | torchvision== 0.16.0 5 | pytorchvideo==0.1.5 6 | xformers 7 | transformers==4.42.4 8 | #git+https://github.com/huggingface/transformers.git 9 | huggingface-hub>=0.23.0 10 | pillow 11 | chainlit>=1.0 12 | pydantic>=2.7.1 13 | timm>=0.9.16 14 | openai>=1.30.1 15 | loguru>=0.7.2 16 | pydantic>=2.7.1 17 | einops 18 | sse-starlette>=2.1.0 19 | flask 20 | gunicorn 21 | gevent 22 | requests 23 | gradio -------------------------------------------------------------------------------- /CogVideo/tools/export_sat_lora_weight.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | import torch 3 | import argparse 4 | from diffusers.loaders.lora_base import LoraBaseMixin 5 | from diffusers.models.modeling_utils import load_state_dict 6 | 7 | 8 | def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]: 9 | state_dict = saved_dict 10 | if "model" in saved_dict.keys(): 11 | state_dict = state_dict["model"] 12 | if "module" in saved_dict.keys(): 13 | state_dict = state_dict["module"] 14 | if "state_dict" in saved_dict.keys(): 15 | state_dict = state_dict["state_dict"] 16 | return state_dict 17 | 18 | LORA_KEYS_RENAME = { 19 | 20 | 'attention.query_key_value.matrix_A.0': 'attn1.to_q.lora_A.weight', 21 | 'attention.query_key_value.matrix_A.1': 'attn1.to_k.lora_A.weight', 22 | 'attention.query_key_value.matrix_A.2': 'attn1.to_v.lora_A.weight', 23 | 
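    # matrix_A.* / matrix_B.* are the two low-rank factors of SAT's fused query_key_value LoRA;
    # the entries above map them to diffusers' attn1.to_{q,k,v} lora_A keys and the entries below to the matching lora_B keys.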
'attention.query_key_value.matrix_B.0': 'attn1.to_q.lora_B.weight', 24 | 'attention.query_key_value.matrix_B.1': 'attn1.to_k.lora_B.weight', 25 | 'attention.query_key_value.matrix_B.2': 'attn1.to_v.lora_B.weight', 26 | 'attention.dense.matrix_A.0': 'attn1.to_out.0.lora_A.weight', 27 | 'attention.dense.matrix_B.0': 'attn1.to_out.0.lora_B.weight' 28 | } 29 | 30 | 31 | 32 | PREFIX_KEY = "model.diffusion_model." 33 | SAT_UNIT_KEY = "layers" 34 | LORA_PREFIX_KEY = "transformer_blocks" 35 | 36 | 37 | 38 | def export_lora_weight(ckpt_path, lora_save_directory): 39 | 40 | merge_original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True)) 41 | 42 | 43 | lora_state_dict = {} 44 | for key in list(merge_original_state_dict.keys()): 45 | new_key = key[len(PREFIX_KEY) :] 46 | for special_key, lora_keys in LORA_KEYS_RENAME.items(): 47 | if new_key.endswith(special_key): 48 | new_key = new_key.replace(special_key, lora_keys) 49 | new_key = new_key.replace(SAT_UNIT_KEY, LORA_PREFIX_KEY) 50 | 51 | lora_state_dict[new_key] = merge_original_state_dict[key] 52 | 53 | 54 | 55 | # final length should be 240 56 | if len(lora_state_dict) != 240: 57 | raise ValueError("lora_state_dict length is not 240") 58 | 59 | 60 | 61 | LoraBaseMixin.write_lora_layers( 62 | state_dict=lora_state_dict, 63 | save_directory=lora_save_directory, 64 | is_main_process=True, 65 | weight_name=None, 66 | save_function=None, 67 | safe_serialization=True 68 | ) 69 | 70 | 71 | def get_args(): 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument( 74 | "--sat_pt_path", type=str, required=True, help="Path to original sat transformer checkpoint" 75 | ) 76 | parser.add_argument("--lora_save_directory", type=str, required=True, help="Path where converted lora should be saved") 77 | return parser.parse_args() 78 | 79 | 80 | if __name__ == "__main__": 81 | args = get_args() 82 | 83 | export_lora_weight(args.sat_pt_path, args.lora_save_directory) 84 | -------------------------------------------------------------------------------- /CogVideo/tools/llm_flux_cogvideox/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NUM_VIDEOS=10 4 | INFERENCE_STEPS=50 5 | GUIDANCE_SCALE=7.0 6 | OUTPUT_DIR_PREFIX="outputs/gpu_" 7 | LOG_DIR_PREFIX="logs/gpu_" 8 | 9 | VIDEO_MODEL_PATH="/share/official_pretrains/hf_home/CogVideoX-5b-I2V" 10 | LLM_MODEL_PATH="/share/home/zyx/Models/Meta-Llama-3.1-8B-Instruct" 11 | IMAGE_MODEL_PATH="/share/home/zyx/Models/FLUX.1-dev" 12 | 13 | #VIDEO_MODEL_PATH="THUDM/CogVideoX-5B-I2V" 14 | #LLM_MODEL_PATH="THUDM/glm-4-9b-chat" 15 | #IMAGE_MODEL_PATH="black-forest-labs/FLUX.1-dev" 16 | 17 | CUDA_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"} 18 | 19 | IFS=',' read -r -a GPU_ARRAY <<< "$CUDA_DEVICES" 20 | 21 | for i in "${!GPU_ARRAY[@]}" 22 | do 23 | GPU=${GPU_ARRAY[$i]} 24 | echo "Starting task on GPU $GPU..."
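  # One background job per visible GPU: output goes to ${OUTPUT_DIR_PREFIX}${GPU} and the log to ${LOG_DIR_PREFIX}${GPU}.log.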
25 | CUDA_VISIBLE_DEVICES=$GPU nohup python3 llm_flux_cogvideox.py \ 26 | --caption_generator_model_id $LLM_MODEL_PATH \ 27 | --image_generator_model_id $IMAGE_MODEL_PATH \ 28 | --model_path $VIDEO_MODEL_PATH \ 29 | --num_videos $NUM_VIDEOS \ 30 | --image_generator_num_inference_steps $INFERENCE_STEPS \ 31 | --guidance_scale $GUIDANCE_SCALE \ 32 | --use_dynamic_cfg \ 33 | --output_dir ${OUTPUT_DIR_PREFIX}${GPU} \ 34 | > ${LOG_DIR_PREFIX}${GPU}.log 2>&1 & 35 | done -------------------------------------------------------------------------------- /CogVideo/tools/parallel_inference/run.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | export PYTHONPATH=$PWD:$PYTHONPATH 4 | 5 | # Select the model type 6 | # The model is downloaded to a specified location on disk, 7 | # or you can simply use the model's ID on Hugging Face, 8 | # which will then be downloaded to the default cache path on Hugging Face. 9 | 10 | export MODEL_TYPE="CogVideoX" 11 | # Configuration for different model types 12 | # script, model_id, inference_step 13 | declare -A MODEL_CONFIGS=( 14 | ["CogVideoX"]="parallel_inference_xdit.py /cfs/dit/CogVideoX-2b 20" 15 | ) 16 | 17 | if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then 18 | IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}" 19 | export SCRIPT MODEL_ID INFERENCE_STEP 20 | else 21 | echo "Invalid MODEL_TYPE: $MODEL_TYPE" 22 | exit 1 23 | fi 24 | 25 | mkdir -p ./results 26 | 27 | # task args 28 | if [ "$MODEL_TYPE" = "CogVideoX" ]; then 29 | TASK_ARGS="--height 480 --width 720 --num_frames 9" 30 | fi 31 | 32 | # CogVideoX asserts sp_degree == ulysses_degree*ring_degree <= 2. Also, do not set the pipefusion degree. 33 | if [ "$MODEL_TYPE" = "CogVideoX" ]; then 34 | N_GPUS=4 35 | PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 1" 36 | CFG_ARGS="--use_cfg_parallel" 37 | fi 38 | 39 | 40 | torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \ 41 | --model $MODEL_ID \ 42 | $PARALLEL_ARGS \ 43 | $TASK_ARGS \ 44 | $PIPEFUSION_ARGS \ 45 | $OUTPUT_ARGS \ 46 | --num_inference_steps $INFERENCE_STEP \ 47 | --warmup_steps 0 \ 48 | --prompt "A small dog." 
\ 49 | $CFG_ARGS \ 50 | $PARALLLEL_VAE \ 51 | $COMPILE_FLAG 52 | -------------------------------------------------------------------------------- /CogVideo/tools/replicate/cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://cog.run/yaml 3 | 4 | build: 5 | # set to true if your model requires a GPU 6 | gpu: true 7 | 8 | # a list of ubuntu apt packages to install 9 | system_packages: 10 | - "libgl1-mesa-glx" 11 | - "libglib2.0-0" 12 | 13 | # python version in the form '3.11' or '3.11.4' 14 | python_version: "3.11" 15 | 16 | # a list of packages in the format == 17 | python_packages: 18 | - diffusers>=0.30.3 19 | - accelerate>=0.34.2 20 | - transformers>=4.44.2 21 | - numpy==1.26.0 22 | - torch>=2.4.0 23 | - torchvision>=0.19.0 24 | - sentencepiece>=0.2.0 25 | - SwissArmyTransformer>=0.4.12 26 | - imageio>=2.35.1 27 | - imageio-ffmpeg>=0.5.1 28 | - openai>=1.45.0 29 | - moviepy>=1.0.3 30 | - pillow==9.5.0 31 | - pydantic==1.10.7 32 | run: 33 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget 34 | 35 | # predict.py defines how predictions are run on your model 36 | predict: "predict_t2v.py:Predictor" 37 | # predict: "predict_i2v.py:Predictor" 38 | -------------------------------------------------------------------------------- /CogVideo/tools/replicate/predict_i2v.py: -------------------------------------------------------------------------------- 1 | # Prediction interface for Cog ⚙️ 2 | # https://cog.run/python 3 | 4 | import os 5 | import subprocess 6 | import time 7 | import torch 8 | from diffusers import CogVideoXImageToVideoPipeline 9 | from diffusers.utils import export_to_video, load_image 10 | from cog import BasePredictor, Input, Path 11 | 12 | 13 | MODEL_CACHE = "model_cache_i2v" 14 | MODEL_URL = ( 15 | f"https://weights.replicate.delivery/default/THUDM/CogVideo/{MODEL_CACHE}.tar" 16 | ) 17 | os.environ["HF_DATASETS_OFFLINE"] = "1" 18 | os.environ["TRANSFORMERS_OFFLINE"] = "1" 19 | os.environ["HF_HOME"] = MODEL_CACHE 20 | os.environ["TORCH_HOME"] = MODEL_CACHE 21 | os.environ["HF_DATASETS_CACHE"] = MODEL_CACHE 22 | os.environ["TRANSFORMERS_CACHE"] = MODEL_CACHE 23 | os.environ["HUGGINGFACE_HUB_CACHE"] = MODEL_CACHE 24 | 25 | 26 | def download_weights(url, dest): 27 | start = time.time() 28 | print("downloading url: ", url) 29 | print("downloading to: ", dest) 30 | subprocess.check_call(["pget", "-x", url, dest], close_fds=False) 31 | print("downloading took: ", time.time() - start) 32 | 33 | 34 | class Predictor(BasePredictor): 35 | def setup(self) -> None: 36 | """Load the model into memory to make running multiple predictions efficient""" 37 | 38 | if not os.path.exists(MODEL_CACHE): 39 | download_weights(MODEL_URL, MODEL_CACHE) 40 | 41 | # model_id: THUDM/CogVideoX-5b-I2V 42 | self.pipe = CogVideoXImageToVideoPipeline.from_pretrained( 43 | MODEL_CACHE, torch_dtype=torch.bfloat16 44 | ).to("cuda") 45 | 46 | self.pipe.enable_model_cpu_offload() 47 | self.pipe.vae.enable_tiling() 48 | 49 | def predict( 50 | self, 51 | prompt: str = Input( 52 | description="Input prompt", default="Starry sky slowly rotating." 
53 | ), 54 | image: Path = Input(description="Input image"), 55 | num_inference_steps: int = Input( 56 | description="Number of denoising steps", ge=1, le=500, default=50 57 | ), 58 | guidance_scale: float = Input( 59 | description="Scale for classifier-free guidance", ge=1, le=20, default=6 60 | ), 61 | num_frames: int = Input( 62 | description="Number of frames for the output video", default=49 63 | ), 64 | seed: int = Input( 65 | description="Random seed. Leave blank to randomize the seed", default=None 66 | ), 67 | ) -> Path: 68 | """Run a single prediction on the model""" 69 | 70 | if seed is None: 71 | seed = int.from_bytes(os.urandom(2), "big") 72 | print(f"Using seed: {seed}") 73 | 74 | img = load_image(image=str(image)) 75 | 76 | video = self.pipe( 77 | prompt=prompt, 78 | image=img, 79 | num_videos_per_prompt=1, 80 | num_inference_steps=num_inference_steps, 81 | num_frames=num_frames, 82 | guidance_scale=guidance_scale, 83 | generator=torch.Generator(device="cuda").manual_seed(seed), 84 | ).frames[0] 85 | 86 | out_path = "/tmp/out.mp4" 87 | 88 | export_to_video(video, out_path, fps=8) 89 | return Path(out_path) 90 | -------------------------------------------------------------------------------- /CogVideo/tools/replicate/predict_t2v.py: -------------------------------------------------------------------------------- 1 | # Prediction interface for Cog ⚙️ 2 | # https://cog.run/python 3 | 4 | import os 5 | import subprocess 6 | import time 7 | import torch 8 | from diffusers import CogVideoXPipeline 9 | from diffusers.utils import export_to_video 10 | from cog import BasePredictor, Input, Path 11 | 12 | 13 | MODEL_CACHE = "model_cache" 14 | MODEL_URL = ( 15 | f"https://weights.replicate.delivery/default/THUDM/CogVideo/{MODEL_CACHE}.tar" 16 | ) 17 | os.environ["HF_DATASETS_OFFLINE"] = "1" 18 | os.environ["TRANSFORMERS_OFFLINE"] = "1" 19 | os.environ["HF_HOME"] = MODEL_CACHE 20 | os.environ["TORCH_HOME"] = MODEL_CACHE 21 | os.environ["HF_DATASETS_CACHE"] = MODEL_CACHE 22 | os.environ["TRANSFORMERS_CACHE"] = MODEL_CACHE 23 | os.environ["HUGGINGFACE_HUB_CACHE"] = MODEL_CACHE 24 | 25 | 26 | def download_weights(url, dest): 27 | start = time.time() 28 | print("downloading url: ", url) 29 | print("downloading to: ", dest) 30 | subprocess.check_call(["pget", "-x", url, dest], close_fds=False) 31 | print("downloading took: ", time.time() - start) 32 | 33 | 34 | class Predictor(BasePredictor): 35 | def setup(self) -> None: 36 | """Load the model into memory to make running multiple predictions efficient""" 37 | 38 | if not os.path.exists(MODEL_CACHE): 39 | download_weights(MODEL_URL, MODEL_CACHE) 40 | 41 | # model_id: THUDM/CogVideoX-5b 42 | self.pipe = CogVideoXPipeline.from_pretrained( 43 | MODEL_CACHE, 44 | torch_dtype=torch.bfloat16, 45 | ).to("cuda") 46 | 47 | self.pipe.enable_model_cpu_offload() 48 | self.pipe.vae.enable_tiling() 49 | 50 | def predict( 51 | self, 52 | prompt: str = Input( 53 | description="Input prompt", 54 | default="A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. 
The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance.", 55 | ), 56 | num_inference_steps: int = Input( 57 | description="Number of denoising steps", ge=1, le=500, default=50 58 | ), 59 | guidance_scale: float = Input( 60 | description="Scale for classifier-free guidance", ge=1, le=20, default=6 61 | ), 62 | num_frames: int = Input( 63 | description="Number of frames for the output video", default=49 64 | ), 65 | seed: int = Input( 66 | description="Random seed. Leave blank to randomize the seed", default=None 67 | ), 68 | ) -> Path: 69 | """Run a single prediction on the model""" 70 | 71 | if seed is None: 72 | seed = int.from_bytes(os.urandom(2), "big") 73 | print(f"Using seed: {seed}") 74 | 75 | video = self.pipe( 76 | prompt=prompt, 77 | num_videos_per_prompt=1, 78 | num_inference_steps=num_inference_steps, 79 | num_frames=num_frames, 80 | guidance_scale=guidance_scale, 81 | generator=torch.Generator(device="cuda").manual_seed(seed), 82 | ).frames[0] 83 | 84 | out_path = "/tmp/out.mp4" 85 | 86 | export_to_video(video, out_path, fps=8) 87 | return Path(out_path) 88 | -------------------------------------------------------------------------------- /CogVideo/weights/put weights here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/CogVideo/weights/put weights here.txt -------------------------------------------------------------------------------- /dataset/traj_vis/Hemi12_transforms.json: -------------------------------------------------------------------------------- 1 | { 2 | "C_01_35mm": "[-0.8622445326446021 -0.497817113029644 -0.09334070869305826 0] [0.49999999999999994 -0.8660254037844387 0.0 0] [-0.08083542493543144 -0.04667035434652912 0.9956342260592881 0] [692.820323027551 399.99999999999994 0.0 1]", 3 | "C_02_35mm": "[-0.49781711302964426 -0.862244532644602 -0.09334070869305827 0] [0.8660254037844386 -0.5000000000000002 0.0 0] [-0.04667035434652916 -0.08083542493543144 0.9956342260592881 0] [400.0000000000001 692.8203230275509 0.0 1]", 4 | "C_03_35mm": "[-1.6011019497192098e-16 -0.9956342260592881 -0.09334070869305827 0] [1.0 -1.6081226496766366e-16 0.0 0] [-1.5010330778617594e-17 -0.09334070869305827 0.9956342260592881 0] [4.898587196589413e-14 800.0 0.0 1]", 5 | "C_04_35mm": "[0.49781711302964377 -0.8622445326446022 -0.09334070869305827 0] [0.8660254037844388 0.4999999999999997 0.0 0] [0.04667035434652911 -0.08083542493543147 0.9956342260592881 0] [-399.99999999999983 692.820323027551 0.0 1]", 6 | "C_05_35mm": "[0.8622445326446021 -0.4978171130296439 -0.09334070869305826 0] [0.49999999999999983 0.8660254037844387 0.0 0] [0.08083542493543144 -0.046670354346529115 0.9956342260592881 0] [-692.820323027551 399.99999999999994 0.0 1]", 7 | "C_06_35mm": "[0.9956342260592881 -1.2193002680650596e-16 -0.09334070869305827 0] [1.2246467991473532e-16 1.0 0.0 0] [0.09334070869305827 -1.1430940013109933e-17 0.9956342260592881 0] [-800.0 9.797174393178826e-14 0.0 1]", 8 | "C_07_35mm": "[0.862244532644602 0.49781711302964415 -0.09334070869305827 0] [-0.5000000000000001 0.8660254037844386 0.0 0] [0.08083542493543144 0.04667035434652914 0.9956342260592881 0] [-692.8203230275509 -400.0000000000001 0.0 1]", 9 | "C_08_35mm": "[0.4978171130296444 0.8622445326446019 -0.09334070869305827 0] [-0.8660254037844385 0.5000000000000003 0.0 0] [0.046670354346529164 
0.08083542493543144 0.9956342260592881 0] [-400.00000000000034 -692.8203230275508 0.0 1]", 10 | "C_09_35mm": "[2.820402217784269e-16 0.9956342260592881 -0.09334070869305827 0] [-1.0 2.83276944882399e-16 0.0 0] [2.6441270791727528e-17 0.09334070869305827 0.9956342260592881 0] [-1.4695761589768238e-13 -800.0 0.0 1]", 11 | "C_10_35mm": "[-0.49781711302964426 0.862244532644602 -0.09334070869305827 0] [-0.8660254037844386 -0.5000000000000002 0.0 0] [-0.04667035434652916 0.08083542493543144 0.9956342260592881 0] [400.0000000000001 -692.8203230275509 0.0 1]", 12 | "C_11_35mm": "[-0.8622445326446019 0.4978171130296444 -0.09334070869305827 0] [-0.5000000000000003 -0.8660254037844385 0.0 0] [-0.08083542493543144 0.046670354346529164 0.9956342260592881 0] [692.8203230275507 -400.00000000000034 0.0 1]", 13 | "C_12_35mm": "[-0.9956342260592881 1.2193002680650596e-16 -0.09334070869305827 0] [-1.2246467991473532e-16 -1.0 0.0 0] [-0.09334070869305827 1.1430940013109933e-17 0.9956342260592881 0] [800.0 -1.9594348786357651e-13 0.0 1]" 14 | } -------------------------------------------------------------------------------- /eval/GVHMR/.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third-party/DPVO"] 2 | path = third-party/DPVO 3 | url = https://github.com/princeton-vl/DPVO.git 4 | -------------------------------------------------------------------------------- /eval/GVHMR/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022-2023 3D Vision Group at the State Key Lab of CAD&CG, 2 | Zhejiang University. All Rights Reserved. 3 | 4 | For more information see 5 | If you use this software, please cite the corresponding publications 6 | listed on the above website. 7 | 8 | Permission to use, copy, modify and distribute this software and its 9 | documentation for educational, research and non-profit purposes only. 10 | Any modification based on this work must be open-source and prohibited 11 | for commercial use. 12 | You must retain, in the source form of any derivative works that you 13 | distribute, all copyright, patent, trademark, and attribution notices 14 | from the source form of this work. 15 | 16 | For commercial uses of this software, please send email to xwzhou@zju.edu.cn -------------------------------------------------------------------------------- /eval/GVHMR/README.md: -------------------------------------------------------------------------------- 1 | # GVHMR: World-Grounded Human Motion Recovery via Gravity-View Coordinates 2 | ### [Project Page](https://zju3dv.github.io/gvhmr) | [Paper](https://arxiv.org/abs/2409.06662) 3 | 4 | > World-Grounded Human Motion Recovery via Gravity-View Coordinates 5 | > [Zehong Shen](https://zehongs.github.io/)\*, 6 | [Huaijin Pi](https://phj128.github.io/)\*, 7 | [Yan Xia](https://isshikihugh.github.io/scholar), 8 | [Zhi Cen](https://scholar.google.com/citations?user=Xyy-uFMAAAAJ), 9 | [Sida Peng](https://pengsida.net/), 10 | [Zechen Hu](https://zju3dv.github.io/gvhmr), 11 | [Hujun Bao](http://www.cad.zju.edu.cn/home/bao/), 12 | [Ruizhen Hu](https://csse.szu.edu.cn/staff/ruizhenhu/), 13 | [Xiaowei Zhou](https://xzhou.me/) 14 | > SIGGRAPH Asia 2024 15 | 16 |
<p align="center">
17 |   <img src="docs/example_video/project_teaser.gif" alt="animated" />
18 | </p>
19 | 20 | ## Setup 21 | 22 | Please see [installation](docs/INSTALL.md) for details. 23 | 24 | ## Quick Start 25 | 26 | ### [ Google Colab demo for GVHMR](https://colab.research.google.com/drive/1N9WSchizHv2bfQqkE9Wuiegw_OT7mtGj?usp=sharing) 27 | 28 | ### [ HuggingFace demo for GVHMR](https://huggingface.co/spaces/LittleFrog/GVHMR) 29 | 30 | ### Demo 31 | Demo entries are provided in `tools/demo`. Use `-s` to skip visual odometry if you know the camera is static, otherwise the camera will be estimated by DPVO. 32 | We also provide a script `demo_folder.py` to inference a entire folder. 33 | ```shell 34 | python tools/demo/demo.py --video=docs/example_video/tennis.mp4 -s 35 | python tools/demo/demo_folder.py -f inputs/demo/folder_in -d outputs/demo/folder_out -s 36 | ``` 37 | 38 | ### Reproduce 39 | 1. **Test**: 40 | To reproduce the 3DPW, RICH, and EMDB results in a single run, use the following command: 41 | ```shell 42 | python tools/train.py global/task=gvhmr/test_3dpw_emdb_rich exp=gvhmr/mixed/mixed ckpt_path=inputs/checkpoints/gvhmr/gvhmr_siga24_release.ckpt 43 | ``` 44 | To test individual datasets, change `global/task` to `gvhmr/test_3dpw`, `gvhmr/test_rich`, or `gvhmr/test_emdb`. 45 | 46 | 2. **Train**: 47 | To train the model, use the following command: 48 | ```shell 49 | # The gvhmr_siga24_release.ckpt is trained with 2x4090 for 420 epochs, note that different GPU settings may lead to different results. 50 | python tools/train.py exp=gvhmr/mixed/mixed 51 | ``` 52 | During training, note that we do not employ post-processing as in the test script, so the global metrics results will differ (but should still be good for comparison with baseline methods). 53 | 54 | # Citation 55 | 56 | If you find this code useful for your research, please use the following BibTeX entry. 57 | 58 | ``` 59 | @inproceedings{shen2024gvhmr, 60 | title={World-Grounded Human Motion Recovery via Gravity-View Coordinates}, 61 | author={Shen, Zehong and Pi, Huaijin and Xia, Yan and Cen, Zhi and Peng, Sida and Hu, Zechen and Bao, Hujun and Hu, Ruizhen and Zhou, Xiaowei}, 62 | booktitle={SIGGRAPH Asia Conference Proceedings}, 63 | year={2024} 64 | } 65 | ``` 66 | 67 | # Acknowledgement 68 | 69 | We thank the authors of 70 | [WHAM](https://github.com/yohanshin/WHAM), 71 | [4D-Humans](https://github.com/shubham-goel/4D-Humans), 72 | and [ViTPose-Pytorch](https://github.com/gpastal24/ViTPose-Pytorch) for their great works, without which our project/code would not be possible. 73 | -------------------------------------------------------------------------------- /eval/GVHMR/docs/INSTALL.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | ## Environment 4 | 5 | ```bash 6 | git clone https://github.com/zju3dv/GVHMR --recursive 7 | cd GVHMR 8 | 9 | conda create -y -n gvhmr python=3.10 10 | conda activate gvhmr 11 | pip install -r requirements.txt 12 | pip install -e . 13 | # to install gvhmr in other repo as editable, try adding "python.analysis.extraPaths": ["path/to/your/package"] to settings.json 14 | 15 | # DPVO 16 | cd third-party/DPVO 17 | wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.zip 18 | unzip eigen-3.4.0.zip -d thirdparty && rm -rf eigen-3.4.0.zip 19 | pip install torch-scatter -f "https://data.pyg.org/whl/torch-2.3.0+cu121.html" 20 | pip install numba pypose 21 | export CUDA_HOME=/usr/local/cuda-12.1/ 22 | export PATH=$PATH:/usr/local/cuda-12.1/bin/ 23 | pip install -e . 
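# Note: the CUDA_HOME/PATH exports above assume a CUDA 12.1 toolkit; point them at the locally installed CUDA version if it differs.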
24 | ``` 25 | 26 | ## Inputs & Outputs 27 | 28 | ```bash 29 | mkdir inputs 30 | mkdir outputs 31 | ``` 32 | 33 | **Weights** 34 | 35 | ```bash 36 | mkdir -p inputs/checkpoints 37 | 38 | # 1. You need to sign up for downloading [SMPL](https://smpl.is.tue.mpg.de/) and [SMPLX](https://smpl-x.is.tue.mpg.de/). And the checkpoints should be placed in the following structure: 39 | 40 | inputs/checkpoints/ 41 | ├── body_models/smplx/ 42 | │ └── SMPLX_{GENDER}.npz # SMPLX (We predict SMPLX params + evaluation) 43 | └── body_models/smpl/ 44 | └── SMPL_{GENDER}.pkl # SMPL (rendering and evaluation) 45 | 46 | # 2. Download other pretrained models from Google-Drive (By downloading, you agree to the corresponding licences): https://drive.google.com/drive/folders/1eebJ13FUEXrKBawHpJroW0sNSxLjh9xD?usp=drive_link 47 | 48 | inputs/checkpoints/ 49 | ├── dpvo/ 50 | │ └── dpvo.pth 51 | ├── gvhmr/ 52 | │ └── gvhmr_siga24_release.ckpt 53 | ├── hmr2/ 54 | │ └── epoch=10-step=25000.ckpt 55 | ├── vitpose/ 56 | │ └── vitpose-h-multi-coco.pth 57 | └── yolo/ 58 | └── yolov8x.pt 59 | ``` 60 | 61 | **Data** 62 | 63 | We provide preprocessed data for training and evaluation. 64 | Note that we do not intend to distribute the original datasets, and you need to download them (annotation, videos, etc.) from the original websites. 65 | *We're unable to provide the original data due to the license restrictions.* 66 | By downloading the preprocessed data, you agree to the original dataset's terms of use and use the data for research purposes only. 67 | 68 | You can download them from [Google-Drive](https://drive.google.com/drive/folders/10sEef1V_tULzddFxzCmDUpsIqfv7eP-P?usp=drive_link). Please place them in the "inputs" folder and execute the following commands: 69 | 70 | ```bash 71 | cd inputs 72 | # Train 73 | tar -xzvf AMASS_hmr4d_support.tar.gz 74 | tar -xzvf BEDLAM_hmr4d_support.tar.gz 75 | tar -xzvf H36M_hmr4d_support.tar.gz 76 | # Test 77 | tar -xzvf 3DPW_hmr4d_support.tar.gz 78 | tar -xzvf EMDB_hmr4d_support.tar.gz 79 | tar -xzvf RICH_hmr4d_support.tar.gz 80 | 81 | # The folder structure should be like this: 82 | inputs/ 83 | ├── AMASS/hmr4d_support/ 84 | ├── BEDLAM/hmr4d_support/ 85 | ├── H36M/hmr4d_support/ 86 | ├── 3DPW/hmr4d_support/ 87 | ├── EMDB/hmr4d_support/ 88 | └── RICH/hmr4d_support/ 89 | ``` 90 | -------------------------------------------------------------------------------- /eval/GVHMR/docs/example_video/project_teaser.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/docs/example_video/project_teaser.gif -------------------------------------------------------------------------------- /eval/GVHMR/docs/example_video/tennis.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/docs/example_video/tennis.mp4 -------------------------------------------------------------------------------- /eval/GVHMR/download_eval_pose.sh: -------------------------------------------------------------------------------- 1 | gdown https://drive.google.com/uc\?id\=1jMH2-ZC0ZBgtqej5Sp-E5ebBIX7mk3Xz 2 | gdown https://drive.google.com/uc\?id\=1iFcPSlcKb_rDNJ85UPoThdl22BqR2Xgh 3 | 4 | unzip eval_sets.zip 5 | rm -rf eval_sets.zip -------------------------------------------------------------------------------- /eval/GVHMR/eval.sh: 
-------------------------------------------------------------------------------- 1 | python tools/demo/demo_folder.py -f eval_sets -d outputs/eval_sets_gvhmr -s 2 | python tools/eval_pose.py -f outputs/eval_sets_gvhmr_v2 3 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | PROJ_ROOT = Path(__file__).resolve().parents[1] 5 | 6 | 7 | def os_chdir_to_proj_root(): 8 | """useful for running notebooks in different directories.""" 9 | os.chdir(PROJ_ROOT) 10 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/build_gvhmr.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | from hmr4d import PROJ_ROOT 3 | from hydra.utils import instantiate 4 | from hmr4d.model.gvhmr.gvhmr_pl_demo import DemoPL 5 | 6 | 7 | def build_gvhmr_demo(): 8 | cfg = OmegaConf.load(PROJ_ROOT / "hmr4d/configs/demo_gvhmr_model/siga24_release.yaml") 9 | gvhmr_demo_pl: DemoPL = instantiate(cfg.model, _recursive_=False) 10 | gvhmr_demo_pl.load_pretrained_model(PROJ_ROOT / "inputs/checkpoints/gvhmr/gvhmr_siga24_release.ckpt") 11 | return gvhmr_demo_pl.eval() 12 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from hydra.core.config_store import ConfigStore 3 | from hydra_zen import builds 4 | 5 | import argparse 6 | from hydra import compose, initialize_config_module 7 | import os 8 | 9 | os.environ["HYDRA_FULL_ERROR"] = "1" 10 | 11 | MainStore = ConfigStore.instance() 12 | 13 | 14 | def register_store_gvhmr(): 15 | """Register group options to MainStore""" 16 | from . import store_gvhmr 17 | 18 | 19 | def parse_args_to_cfg(): 20 | """ 21 | Use minimal Hydra API to parse args and return cfg. 22 | This function don't do _run_hydra which create log file hierarchy. 
23 | """ 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--config-name", "-cn", default="train") 26 | parser.add_argument( 27 | "overrides", 28 | nargs="*", 29 | help="Any key=value arguments to override config values (use dots for.nested=overrides)", 30 | ) 31 | args = parser.parse_args() 32 | 33 | # Cfg 34 | with initialize_config_module(version_base="1.3", config_module=f"hmr4d.configs"): 35 | cfg = compose(config_name=args.config_name, overrides=args.overrides) 36 | 37 | return cfg 38 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/data/mocap/testY.yaml: -------------------------------------------------------------------------------- 1 | # definition of lightning datamodule (dataset + dataloader) 2 | _target_: hmr4d.datamodule.mocap_trainX_testY.DataModule 3 | 4 | dataset_opts: 5 | test: ${test_datasets} 6 | 7 | loader_opts: 8 | test: 9 | batch_size: 1 10 | num_workers: 0 11 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/data/mocap/trainX_testY.yaml: -------------------------------------------------------------------------------- 1 | # definition of lightning datamodule (dataset + dataloader) 2 | _target_: hmr4d.datamodule.mocap_trainX_testY.DataModule 3 | 4 | dataset_opts: 5 | train: ${train_datasets} 6 | val: ${test_datasets} 7 | 8 | loader_opts: 9 | train: 10 | batch_size: 32 11 | num_workers: 8 12 | val: 13 | batch_size: 1 14 | num_workers: 1 15 | 16 | limit_each_trainset: null -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/demo.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - model: gvhmr/gvhmr_pl_demo 4 | - network: gvhmr/relative_transformer 5 | - endecoder: gvhmr/v1_amass_local_bedlam_cam 6 | 7 | pipeline: 8 | _target_: hmr4d.model.gvhmr.pipeline.gvhmr_pipeline.Pipeline 9 | args_denoiser3d: ${network} 10 | args: 11 | endecoder_opt: ${endecoder} 12 | normalize_cam_angvel: True 13 | weights: null 14 | static_conf: null 15 | 16 | ckpt_path: inputs/checkpoints/gvhmr/gvhmr_siga24_release.ckpt 17 | 18 | # ================================ # 19 | # global setting # 20 | # ================================ # 21 | 22 | video_name: ??? 
23 | output_root: outputs/demo 24 | output_dir: "${output_root}/${video_name}" 25 | preprocess_dir: ${output_dir}/preprocess 26 | video_path: "${output_dir}/0_input_video.mp4" 27 | 28 | # Options 29 | static_cam: False 30 | verbose: False 31 | 32 | paths: 33 | bbx: ${preprocess_dir}/bbx.pt 34 | bbx_xyxy_video_overlay: ${preprocess_dir}/bbx_xyxy_video_overlay.mp4 35 | vit_features: ${preprocess_dir}/vit_features.pt 36 | vitpose: ${preprocess_dir}/vitpose.pt 37 | vitpose_video_overlay: ${preprocess_dir}/vitpose_video_overlay.mp4 38 | hmr4d_results: ${output_dir}/hmr4d_results.pt 39 | incam_video: ${output_dir}/1_incam.mp4 40 | global_video: ${output_dir}/2_global.mp4 41 | incam_global_horiz_video: ${output_dir}/${video_name}_3_incam_global_horiz.mp4 42 | slam: ${preprocess_dir}/slam_results.pt 43 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/exp/gvhmr/mixed/mixed.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /data: mocap/trainX_testY 4 | - override /model: gvhmr/gvhmr_pl 5 | - override /endecoder: gvhmr/v1_amass_local_bedlam_cam 6 | - override /optimizer: adamw_2e-4 7 | - override /scheduler_cfg: epoch_half_200_350 8 | - override /train_datasets: 9 | - pure_motion_amass/v11 10 | - imgfeat_bedlam/v2 11 | - imgfeat_h36m/v1 12 | - imgfeat_3dpw/v1 13 | - override /test_datasets: 14 | - emdb1/v1_fliptest 15 | - emdb2/v1_fliptest 16 | - rich/all 17 | - 3dpw/fliptest 18 | - override /callbacks: 19 | - simple_ckpt_saver/every10e_top100 20 | - prog_bar/prog_reporter_every0.1 21 | - train_speed_timer/base 22 | - lr_monitor/pl 23 | - metric_emdb1 24 | - metric_emdb2 25 | - metric_rich 26 | - metric_3dpw 27 | - override /network: gvhmr/relative_transformer 28 | 29 | exp_name_base: mixed 30 | exp_name_var: "" 31 | exp_name: ${exp_name_base}${exp_name_var} 32 | data_name: mocap_mixed_v1 33 | 34 | pipeline: 35 | _target_: hmr4d.model.gvhmr.pipeline.gvhmr_pipeline.Pipeline 36 | args_denoiser3d: ${network} 37 | args: 38 | endecoder_opt: ${endecoder} 39 | normalize_cam_angvel: True 40 | weights: 41 | cr_j3d: 500. 42 | transl_c: 1. 43 | cr_verts: 500. 44 | j2d: 1000. 45 | verts2d: 1000. 46 | 47 | transl_w: 1. 48 | static_conf_bce: 1. 
49 | 50 | static_conf: 51 | vel_thr: 0.15 52 | 53 | data: 54 | loader_opts: 55 | train: 56 | batch_size: 128 57 | num_workers: 12 58 | 59 | pl_trainer: 60 | precision: 16-mixed 61 | log_every_n_steps: 50 62 | gradient_clip_val: 0.5 63 | max_epochs: 500 64 | check_val_every_n_epoch: 10 65 | devices: 2 66 | 67 | logger: 68 | _target_: pytorch_lightning.loggers.TensorBoardLogger 69 | save_dir: ${output_dir} # /save_dir/name/version/sub_dir 70 | name: "" 71 | version: "tb" # merge name and version 72 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/global/debug/debug_train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | data_name: debug 4 | exp_name: debug 5 | 6 | # data: 7 | # limit_each_trainset: 40 8 | # loader_opts: 9 | # train: 10 | # batch_size: 4 11 | # num_workers: 0 12 | # val: 13 | # batch_size: 1 14 | # num_workers: 0 15 | 16 | pl_trainer: 17 | limit_train_batches: 32 18 | limit_val_batches: 2 19 | check_val_every_n_epoch: 3 20 | enable_checkpointing: False 21 | devices: 1 22 | 23 | callbacks: 24 | model_checkpoint: null 25 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/global/debug/debug_train_limit_data.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | data_name: debug 4 | exp_name: debug 5 | 6 | data: 7 | limit_each_trainset: 40 8 | loader_opts: 9 | train: 10 | batch_size: 4 11 | num_workers: 0 12 | val: 13 | batch_size: 1 14 | num_workers: 0 15 | 16 | pl_trainer: 17 | limit_val_batches: 2 18 | check_val_every_n_epoch: 3 19 | enable_checkpointing: False 20 | devices: 1 21 | 22 | callbacks: 23 | model_checkpoint: null 24 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_3dpw.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /data: mocap/testY 4 | - override /test_datasets: 5 | - 3dpw/fliptest 6 | - override /callbacks: 7 | - metric_3dpw 8 | - _self_ 9 | 10 | task: test 11 | data_name: test_mocap 12 | ckpt_path: ??? # will not override previous setting if already set 13 | 14 | # lightning utilities 15 | pl_trainer: 16 | devices: 1 17 | logger: null 18 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_3dpw_emdb_rich.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /data: mocap/testY 4 | - override /test_datasets: 5 | - rich/all 6 | - emdb1/v1_fliptest 7 | - emdb2/v1_fliptest 8 | - 3dpw/fliptest 9 | - override /callbacks: 10 | - metric_rich 11 | - metric_emdb1 12 | - metric_emdb2 13 | - metric_3dpw 14 | - _self_ 15 | 16 | task: test 17 | data_name: test_mocap 18 | ckpt_path: ??? 
# will not override previous setting if already set 19 | 20 | # lightning utilities 21 | pl_trainer: 22 | devices: 1 23 | logger: null 24 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_emdb.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /data: mocap/testY 4 | - override /test_datasets: 5 | - emdb1/v1_fliptest 6 | - emdb2/v1_fliptest 7 | - override /callbacks: 8 | - metric_emdb1 9 | - metric_emdb2 10 | - _self_ 11 | 12 | task: test 13 | data_name: test_mocap 14 | ckpt_path: ??? # will not override previous setting if already set 15 | 16 | # lightning utilities 17 | pl_trainer: 18 | devices: 1 19 | logger: null 20 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/global/task/gvhmr/test_rich.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /data: mocap/testY 4 | - override /test_datasets: 5 | - rich/all 6 | - override /callbacks: 7 | - metric_rich 8 | - _self_ 9 | 10 | task: test 11 | data_name: test_mocap 12 | ckpt_path: ??? # will not override previous setting if already set 13 | 14 | # lightning utilities 15 | pl_trainer: 16 | devices: 1 17 | logger: null 18 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # enable color logging 2 | defaults: 3 | - override hydra_logging: colorlog 4 | - override job_logging: colorlog 5 | 6 | job_logging: 7 | formatters: 8 | simple: 9 | datefmt: '%m/%d %H:%M:%S' 10 | format: '[%(asctime)s][%(levelname)s] %(message)s' 11 | colorlog: 12 | datefmt: '%m/%d %H:%M:%S' 13 | format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] %(message)s' 14 | handlers: 15 | file: 16 | filename: ${output_dir}/${hydra.job.name}.log 17 | 18 | run: 19 | dir: ${output_dir} -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/siga24_release.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | _target_: hmr4d.model.gvhmr.pipeline.gvhmr_pipeline.Pipeline 3 | args_denoiser3d: ${network} 4 | args: 5 | endecoder_opt: ${endecoder} 6 | normalize_cam_angvel: true 7 | weights: null 8 | static_conf: null 9 | model: 10 | _target_: hmr4d.model.gvhmr.gvhmr_pl_demo.DemoPL 11 | pipeline: ${pipeline} 12 | network: 13 | _target_: hmr4d.network.gvhmr.relative_transformer.NetworkEncoderRoPEV2 14 | output_dim: 151 15 | max_len: 120 16 | kp2d_mapping: linear_v2 17 | cliffcam_dim: 3 18 | cam_angvel_dim: 6 19 | imgseq_dim: 1024 20 | f_imgseq_filter: null 21 | cond_ver: v1 22 | latent_dim: 512 23 | num_layers: 12 24 | num_heads: 8 25 | mlp_ratio: 4.0 26 | pred_cam_ver: v2 27 | pred_cam_dim: 3 28 | static_conf_dim: 6 29 | pred_coco17_dim: 0 30 | dropout: 0.1 31 | avgbeta: true 32 | endecoder: 33 | _target_: hmr4d.model.gvhmr.utils.endecoder.EnDecoder 34 | stats_name: MM_V1_AMASS_LOCAL_BEDLAM_CAM 35 | noise_pose_k: 10 36 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/store_gvhmr.py: -------------------------------------------------------------------------------- 1 | # Dataset 2 | import hmr4d.dataset.pure_motion.amass 3 | import 
hmr4d.dataset.emdb.emdb_motion_test 4 | import hmr4d.dataset.rich.rich_motion_test 5 | import hmr4d.dataset.threedpw.threedpw_motion_test 6 | import hmr4d.dataset.threedpw.threedpw_motion_train 7 | import hmr4d.dataset.bedlam.bedlam 8 | import hmr4d.dataset.h36m.h36m 9 | 10 | # Trainer: Model Optimizer Loss 11 | import hmr4d.model.gvhmr.gvhmr_pl 12 | import hmr4d.model.gvhmr.utils.endecoder 13 | import hmr4d.model.common_utils.optimizer 14 | import hmr4d.model.common_utils.scheduler_cfg 15 | 16 | # Metric 17 | import hmr4d.model.gvhmr.callbacks.metric_emdb 18 | import hmr4d.model.gvhmr.callbacks.metric_rich 19 | import hmr4d.model.gvhmr.callbacks.metric_3dpw 20 | 21 | 22 | # PL Callbacks 23 | import hmr4d.utils.callbacks.simple_ckpt_saver 24 | import hmr4d.utils.callbacks.train_speed_timer 25 | import hmr4d.utils.callbacks.prog_bar 26 | import hmr4d.utils.callbacks.lr_monitor 27 | 28 | # Networks 29 | import hmr4d.network.gvhmr.relative_transformer 30 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/configs/train.yaml: -------------------------------------------------------------------------------- 1 | # ================================ # 2 | # override # 3 | # ================================ # 4 | # specify default configuration; the order determines the override order 5 | defaults: 6 | - _self_ 7 | # pytorch-lightning 8 | - data: ??? 9 | - model: ??? 10 | - callbacks: null 11 | 12 | # system 13 | - hydra: default 14 | 15 | # utility groups that changes a lot 16 | - pipeline: null 17 | - network: null 18 | - optimizer: null 19 | - scheduler_cfg: default 20 | - train_datasets: null 21 | - test_datasets: null 22 | - endecoder: null # normalize/unnormalize data 23 | - refiner: null 24 | 25 | # global-override 26 | - exp: ??? # set "data, model and callbacks" in yaml 27 | - global/task: null # dump/test 28 | - global/hsearch: null # hyper-param search 29 | - global/debug: null # debug mode 30 | 31 | # ================================ # 32 | # global setting # 33 | # ================================ # 34 | # expirement information 35 | task: fit # [fit, predict] 36 | exp_name: ??? 37 | data_name: ??? 
38 | 39 | # utilities in the entry file 40 | output_dir: "outputs/${data_name}/${exp_name}" 41 | ckpt_path: null 42 | resume_mode: null 43 | seed: 42 44 | 45 | # lightning default settings 46 | pl_trainer: 47 | devices: 1 48 | num_sanity_val_steps: 0 # disable sanity check 49 | precision: 32 50 | inference_mode: False 51 | 52 | logger: null 53 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/bedlam/resource/vname2lwh.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/dataset/bedlam/resource/vname2lwh.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/bedlam/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from pathlib import Path 4 | 5 | resource_dir = Path(__file__).parent / "resource" 6 | 7 | 8 | def mid2vname(mid): 9 | """vname = {scene}/{seq}, Note that it ends with .mp4""" 10 | # mid example: "inputs/bedlam/bedlam_download/20221011_1_250_batch01hand_closeup_suburb_a/mp4/seq_000001.mp4-rp_emma_posed_008" 11 | # -> vname: 20221011_1_250_batch01hand_closeup_suburb_a/seq_000001.mp4 12 | scene = mid.split("/")[-3] 13 | seq = mid.split("/")[-1].split("-")[0] 14 | vname = f"{scene}/{seq}" 15 | return vname 16 | 17 | 18 | def mid2featname(mid): 19 | """featname = {scene}/{seqsubj}, Note that it ends with .pt (extra)""" 20 | # mid example: "inputs/bedlam/bedlam_download/20221011_1_250_batch01hand_closeup_suburb_a/mp4/seq_000001.mp4-rp_emma_posed_008" 21 | # -> featname: 20221011_1_250_batch01hand_closeup_suburb_a/seq_000001.mp4-rp_emma_posed_008.pt 22 | scene = mid.split("/")[-3] 23 | seqsubj = mid.split("/")[-1] 24 | featname = f"{scene}/{seqsubj}.pt" 25 | return featname 26 | 27 | 28 | def featname2mid(featname): 29 | """reverse func of mid2featname, Note that it removes .pt (extra)""" 30 | # featname example: 20221011_1_250_batch01hand_closeup_suburb_a/seq_000001.mp4-rp_emma_posed_008.pt 31 | # -> mid: inputs/bedlam/bedlam_download/20221011_1_250_batch01hand_closeup_suburb_a/mp4/seq_000001.mp4-rp_emma_posed_008 32 | scene = featname.split("/")[0] 33 | seqsubj = featname.split("/")[1].strip(".pt") 34 | mid = f"inputs/bedlam/bedlam_download/{scene}/mp4/{seqsubj}" 35 | return mid 36 | 37 | 38 | def load_vname2lwh(): 39 | return torch.load(resource_dir / "vname2lwh.pt") 40 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/h36m/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | from pathlib import Path 4 | from collections import defaultdict 5 | import pickle 6 | import torch 7 | 8 | RESOURCE_FOLDER = Path(__file__).resolve().parent / "resource" 9 | 10 | camera_idx_to_name = {0: "54138969", 1: "55011271", 2: "58860488", 3: "60457274"} 11 | 12 | 13 | def get_vid(pkl_path, cam_id): 14 | """.../S6/Posing 1.pkl, 54138969 -> S6@Posing_1@54138969""" 15 | sub_id, fn = pkl_path.split("/")[-2:] 16 | vid = f"{sub_id}@{fn.split('.')[0].replace(' ', '_')}@{cam_id}" 17 | return vid 18 | 19 | 20 | def get_raw_pkl_paths(h36m_raw_root): 21 | smpl_param_dir = h36m_raw_root / "neutrSMPL_H3.6" 22 | pkl_paths = [] 23 | for train_sub in ["S1", "S5", "S6", "S7", "S8"]: 24 | for pth in (smpl_param_dir / train_sub).glob("*.pkl"): 25 | 
if "aligned" not in str(pth): # Use world sequence only 26 | pkl_paths.append(str(pth)) 27 | 28 | return pkl_paths 29 | 30 | 31 | def get_cam_KRts(): 32 | """ 33 | Returns: 34 | Ks (torch.Tensor): {cam_id: 3x3} 35 | Rts (torch.Tensor): {subj_id: {cam_id: 4x4}} 36 | """ 37 | # this file is copied from https://github.com/karfly/human36m-camera-parameters 38 | cameras_path = RESOURCE_FOLDER / "camera-parameters.json" 39 | with open(cameras_path, "r") as f: 40 | cameras = json.load(f) 41 | 42 | # 4 camera ids: '54138969', '55011271', '58860488', '60457274' 43 | Ks = {} 44 | for cam in cameras["intrinsics"]: 45 | Ks[cam] = torch.tensor(cameras["intrinsics"][cam]["calibration_matrix"]).float() 46 | 47 | # extrinsics 48 | extrinsics = cameras["extrinsics"] 49 | Rts = defaultdict(dict) 50 | for subj in extrinsics: 51 | for cam in extrinsics[subj]: 52 | Rt = torch.eye(4) 53 | Rt[:3, :3] = torch.tensor(extrinsics[subj][cam]["R"]) 54 | Rt[:3, [3]] = torch.tensor(extrinsics[subj][cam]["t"]) / 1000 55 | Rts[subj][cam] = Rt.float() 56 | 57 | return Ks, Rts 58 | 59 | 60 | def parse_raw_pkl(pkl_path, to_50hz=True): 61 | """ 62 | raw_pkl @ 200Hz, where video @ 50Hz. 63 | the frames should be divided by 4, and mannually align with the video. 64 | """ 65 | with open(str(pkl_path), "rb") as f: 66 | data = pickle.load(f, encoding="bytes") 67 | poses = torch.from_numpy(data[b"poses"]).float() 68 | betas = torch.from_numpy(data[b"betas"]).float() 69 | trans = torch.from_numpy(data[b"trans"]).float() 70 | assert poses.shape[0] == trans.shape[0] 71 | if to_50hz: 72 | poses = poses[::4] 73 | trans = trans[::4] 74 | 75 | seq_length = poses.shape[0] # 50FPS 76 | smpl_params = { 77 | "body_pose": poses[:, 3:], 78 | "betas": betas[None].expand(seq_length, -1), 79 | "global_orient": poses[:, :3], 80 | "transl": trans, 81 | } 82 | return smpl_params 83 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/imgfeat_motion/base_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils import data 3 | import numpy as np 4 | from pathlib import Path 5 | from hmr4d.utils.pylogger import Log 6 | 7 | 8 | class ImgfeatMotionDatasetBase(data.Dataset): 9 | def __init__(self): 10 | super().__init__() 11 | self._load_dataset() 12 | self._get_idx2meta() # -> Set self.idx2meta 13 | 14 | def __len__(self): 15 | return len(self.idx2meta) 16 | 17 | def _load_dataset(self): 18 | raise NotImplemented 19 | 20 | def _get_idx2meta(self): 21 | raise NotImplemented 22 | 23 | def _load_data(self, idx): 24 | raise NotImplemented 25 | 26 | def _process_data(self, data, idx): 27 | raise NotImplemented 28 | 29 | def __getitem__(self, idx): 30 | data = self._load_data(idx) 31 | data = self._process_data(data, idx) 32 | return data 33 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/pure_motion/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from pytorch3d.transforms import ( 4 | axis_angle_to_matrix, 5 | matrix_to_axis_angle, 6 | matrix_to_rotation_6d, 7 | rotation_6d_to_matrix, 8 | ) 9 | from einops import rearrange 10 | 11 | 12 | def aa_to_r6d(x): 13 | return matrix_to_rotation_6d(axis_angle_to_matrix(x)) 14 | 15 | 16 | def r6d_to_aa(x): 17 | return matrix_to_axis_angle(rotation_6d_to_matrix(x)) 18 | 19 | 20 | def interpolate_smpl_params(smpl_params, tgt_len): 21 | 
""" 22 | smpl_params['body_pose'] (L, 63) 23 | tgt_len: L->L' 24 | """ 25 | betas = smpl_params["betas"] 26 | body_pose = smpl_params["body_pose"] 27 | global_orient = smpl_params["global_orient"] # (L, 3) 28 | transl = smpl_params["transl"] # (L, 3) 29 | 30 | # Interpolate 31 | body_pose = rearrange(aa_to_r6d(body_pose.reshape(-1, 21, 3)), "l j c -> c j l") 32 | body_pose = F.interpolate(body_pose, tgt_len, mode="linear", align_corners=True) 33 | body_pose = r6d_to_aa(rearrange(body_pose, "c j l -> l j c")).reshape(-1, 63) 34 | 35 | # although this should be the same as above, we do it for consistency 36 | betas = rearrange(betas, "l c -> c 1 l") 37 | betas = F.interpolate(betas, tgt_len, mode="linear", align_corners=True) 38 | betas = rearrange(betas, "c 1 l -> l c") 39 | 40 | global_orient = rearrange(aa_to_r6d(global_orient.reshape(-1, 1, 3)), "l j c -> c j l") 41 | global_orient = F.interpolate(global_orient, tgt_len, mode="linear", align_corners=True) 42 | global_orient = r6d_to_aa(rearrange(global_orient, "c j l -> l j c")).reshape(-1, 3) 43 | 44 | transl = rearrange(transl, "l c -> c 1 l") 45 | transl = F.interpolate(transl, tgt_len, mode="linear", align_corners=True) 46 | transl = rearrange(transl, "c 1 l -> l c") 47 | 48 | return {"body_pose": body_pose, "betas": betas, "global_orient": global_orient, "transl": transl} 49 | 50 | 51 | def rotate_around_axis(global_orient, transl, axis="y"): 52 | """Global coordinate augmentation. Random rotation around y-axis""" 53 | angle = torch.rand(1) * 2 * torch.pi 54 | if axis == "y": 55 | aa = torch.tensor([0.0, angle, 0.0]).float().unsqueeze(0) 56 | rmat = axis_angle_to_matrix(aa) 57 | 58 | global_orient = matrix_to_axis_angle(rmat @ axis_angle_to_matrix(global_orient)) 59 | transl = (rmat.squeeze(0) @ transl.T).T 60 | return global_orient, transl 61 | 62 | 63 | def augment_betas(betas, std=0.1): 64 | noise = torch.normal(mean=torch.zeros(10), std=torch.ones(10) * std) 65 | betas_aug = betas + noise[None] 66 | return betas_aug 67 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/rich/resource/cam2params.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/dataset/rich/resource/cam2params.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/rich/resource/val.txt: -------------------------------------------------------------------------------- 1 | sequence_name capture_name scan_name id moving_cam gender scene action/scene-interaction subjects view_id 2 | ParkingLot1_002_stretching2 ParkingLot1 scan_camcoord 002 X male V V V 0,1,2,3,4,5,6,7 3 | ParkingLot1_002_burpee1 ParkingLot1 scan_camcoord 002 X male V V V 0,1,2,3,4,5,6,7 4 | ParkingLot1_002_burpee2 ParkingLot1 scan_camcoord 002 X male V V V 0,1,2,3,4,5,6,7 5 | ParkingLot1_004_pushup1 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7 6 | ParkingLot1_004_eating2 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7 7 | ParkingLot1_004_phonetalk2 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7 8 | ParkingLot1_004_takingphotos2 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7 9 | ParkingLot1_004_stretching2 ParkingLot1 scan_camcoord 004 X male V V V 0,1,2,3,4,5,6,7 10 | ParkingLot1_005_overfence2 ParkingLot1 scan_camcoord 005 X male V V V 0,1,2,3,4,5,6,7 11 | ParkingLot1_005_pushup1 ParkingLot1 
scan_camcoord 005 X male V V V 0,1,2,3,4,5,6,7 12 | ParkingLot1_005_burpeejump1 ParkingLot1 scan_camcoord 005 X male V V V 0,1,2,3,4,5,6,7 13 | ParkingLot1_007_burpee2 ParkingLot1 scan_camcoord 007 X male V V V 0,1,2,3,4,5,6,7 14 | ParkingLot2_008_eating2 ParkingLot2 scan_camcoord 008 V male V V V 0,1,2,3,4,5 15 | ParkingLot2_008_burpeejump2 ParkingLot2 scan_camcoord 008 V male V V V 0,1,2,3,4,5 16 | ParkingLot2_014_overfence1 ParkingLot2 scan_camcoord 014 X male V V V 0,1,2,3,4,5 17 | ParkingLot2_014_eating2 ParkingLot2 scan_camcoord 014 X male V V V 0,1,2,3,4,5 18 | ParkingLot2_016_phonetalk5 ParkingLot2 scan_camcoord 016 V female V V V 0,1,2,3,4,5 19 | Pavallion_002_sidebalancerun Pavallion scan_camcoord 002 V male V V V 0,1,2,3,4,5,6 20 | Pavallion_013_sidebalancerun Pavallion scan_camcoord 013 X female V V V 0,1,2,3,4,5,6 21 | Pavallion_018_sidebalancerun Pavallion scan_camcoord 018 V female V V V 0,1,2,3,4,5,6 22 | LectureHall_018_wipingtable1 LectureHall scan_chair_scene_camcoord 018 X female V V V 0,2,4,5,6 23 | LectureHall_020_wipingchairs1 LectureHall scan_chair_scene_camcoord 020 X male V V V 0,1,2,3,4,5,6 24 | LectureHall_003_wipingchairs1 LectureHall scan_chair_scene_camcoord 003 X male V V V 0,1,2,3,4,5,6 25 | Pavallion_000_yoga1 Pavallion scan_camcoord 000 X male V X V 0,1,2,3,4,5,6 26 | Pavallion_002_yoga1 Pavallion scan_camcoord 002 V male V X V 0,1,2,3,4,5,6 27 | Pavallion_003_yoga1 Pavallion scan_camcoord 003 V male V X V 0,1,2,3,4,5,6 28 | Pavallion_006_yoga1 Pavallion scan_camcoord 006 V male V X V 0,1,2,3,4,5,6 29 | Pavallion_018_yoga1 Pavallion scan_camcoord 018 V female V X V 0,1,2,3,4,5,6 -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/rich/resource/w2az_sahmr.json: -------------------------------------------------------------------------------- 1 | {"BBQ_scan_camcoord": [[0.9989829107564298, 0.03367618890797693, -0.029984301180211045, 0.0008183751635392625], [0.03414262169451401, -0.1305975871406019, 0.9908473906797644, -0.005059823133706893], [0.02945208652127451, -0.9908633531086326, -0.13161455111748036, 1.4054905296083466], [0.0, 0.0, 0.0, 1.0]], "Gym_scan_camcoord": [[0.9932599733260449, -0.07628732032461205, 0.0872632233306122, -0.047601130084306706], [-0.10233962102690007, -0.22374853741942266, 0.9692590953768503, -0.04091804681182174], [-0.05441716049582774, -0.9716567484252654, -0.23004768176013274, 1.537911791136788], [0.0, 0.0, 0.0, 1.0]], "Gym_scan_table_camcoord": [[0.9974451989415423, -0.06250743213795668, 0.03458172980064169, 0.02231858470834599], [-0.04804912583358893, -0.22882402250236075, 0.972281259838159, 0.039081886755815726], [-0.05286167435026744, -0.9714588965331274, -0.2312428501197992, 1.5421821446346522], [0.0, 0.0, 0.0, 1.0]], "LectureHall_scan_chair_scene_camcoord": [[0.9992930513998263, 0.030087515976743376, -0.0225419343977731, 0.001998908749589632], [0.030705594681969043, -0.30721111058653017, 0.9511458878570781, -0.025811963513866963], [0.021692484396004613, -0.9511656401040444, -0.307917783192506, 2.060346184503773], [0.0, 0.0, 0.0, 1.0]], "LectureHall_scan_yoga_scene_camcoord": [[0.9993358324246812, 0.03030060260429296, -0.020242715082476024, -0.003510046042036605], [0.028600729415016745, -0.3079667078507395, 0.9509671419836329, -0.01748548118379142], [0.022580795137075255, -0.9509144968594153, -0.3086287856852993, 2.0424701474796567], [0.0, 0.0, 0.0, 1.0]], "ParkingLot1_scan_camcoord": [[0.9989627324729327, -0.03724260727951709, 0.02620013994738054, 
0.0070941466745699025], [-0.03091587075252664, -0.13228243926883107, 0.9907298144280939, -0.0274920377236923], [-0.03343154297742938, -0.9905121627037764, -0.13329661462331338, 1.3859200914120975], [0.0, 0.0, 0.0, 1.0]], "ParkingLot2_scan_camcoord": [[0.9989532636786039, -0.04044665659892979, 0.021364572447267097, 0.01646827411554571], [-0.026687287930043047, -0.13600581518076985, 0.9903485279940424, 0.030197722289598695], [-0.03715058073335097, -0.9898820567153364, -0.13694286452455984, 1.4372015171546513], [0.0, 0.0, 0.0, 1.0]], "Pavallion_scan_camcoord": [[0.9971864096076799, 0.05693557331723671, -0.048760690979605295, 0.0012478238054067193], [0.05746407703876882, -0.16289761936471214, 0.9849681443861059, -0.006002953831755452], [0.04813672552068054, -0.9849988355812122, -0.16571104235928033, 1.7638454838942128], [0.0, 0.0, 0.0, 1.0]]} -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/dataset/threedpw/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | from pathlib import Path 4 | from collections import defaultdict 5 | import pickle 6 | import torch 7 | import joblib 8 | 9 | RESOURCE_FOLDER = Path(__file__).resolve().parent / "resource" 10 | 11 | 12 | def read_raw_pkl(pkl_path): 13 | with open(pkl_path, "rb") as f: 14 | data = pickle.load(f, encoding="bytes") 15 | 16 | num_subjects = len(data[b"poses"]) 17 | F = data[b"poses"][0].shape[0] 18 | smpl_params = [] 19 | for i in range(num_subjects): 20 | smpl_params.append( 21 | { 22 | "body_pose": torch.from_numpy(data[b"poses"][i][:, 3:72]).float(), # (F, 69) 23 | "betas": torch.from_numpy(data[b"betas"][i][:10]).repeat(F, 1).float(), # (F, 10) 24 | "global_orient": torch.from_numpy(data[b"poses"][i][:, :3]).float(), # (F, 3) 25 | "transl": torch.from_numpy(data[b"trans"][i]).float(), # (F, 3) 26 | } 27 | ) 28 | genders = ["male" if g == "m" else "female" for g in data[b"genders"]] 29 | campose_valid = [torch.from_numpy(v).bool() for v in data[b"campose_valid"]] 30 | 31 | seq_name = data[b"sequence"] 32 | K_fullimg = torch.from_numpy(data[b"cam_intrinsics"]).float() 33 | T_w2c = torch.from_numpy(data[b"cam_poses"]).float() 34 | 35 | return_data = { 36 | "sequence": seq_name, # 'courtyard_bodyScannerMotions_00' 37 | "K_fullimg": K_fullimg, # (3, 3), not 55FoV 38 | "T_w2c": T_w2c, # (F, 4, 4) 39 | "smpl_params": smpl_params, # list of dict 40 | "genders": genders, # list of str 41 | "campose_valid": campose_valid, # list of bool-array 42 | # "jointPositions": data[b'jointPositions'], # SMPL, 24x3 43 | # "poses2d": data[b"poses2d"], # COCO, 3x18(?) 
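        # T_w2c holds per-frame world-to-camera extrinsics; together with K_fullimg it lets
        # the world-frame SMPL parameters above be projected into each frame of the moving camera.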
44 | } 45 | return return_data 46 | 47 | 48 | def load_and_convert_wham_pth(pth): 49 | """ 50 | Convert to {vid: DataDict} style, Add smpl_params_incam 51 | """ 52 | # load 53 | wham_labels_raw = joblib.load(pth) 54 | # convert it to {vid: DataDict} style 55 | wham_labels = {} 56 | for i, vid in enumerate(wham_labels_raw["vid"]): 57 | wham_labels[vid] = {k: wham_labels_raw[k][i] for k in wham_labels_raw} 58 | 59 | # convert pose and betas as smpl_params_incam (without transl) 60 | for vid in wham_labels: 61 | pose = wham_labels[vid]["pose"] 62 | global_orient = pose[:, :3] # (F, 3) 63 | body_pose = pose[:, 3:] # (F, 69) 64 | betas = wham_labels[vid]["betas"] # (F, 10), all frames are the same 65 | wham_labels[vid]["smpl_params_incam"] = { 66 | "body_pose": body_pose.float(), # (F, 69) 67 | "betas": betas.float(), # (F, 10) 68 | "global_orient": global_orient.float(), # (F, 3) 69 | } 70 | 71 | return wham_labels 72 | 73 | 74 | # Neural-Annot utils 75 | 76 | 77 | def na_cam_param_to_K_fullimg(cam_param): 78 | K = torch.eye(3) 79 | K[[0, 1], [0, 1]] = torch.tensor(cam_param["focal"]) 80 | K[[0, 1], [2, 2]] = torch.tensor(cam_param["princpt"]) 81 | return K 82 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/model/common_utils/optimizer.py: -------------------------------------------------------------------------------- 1 | from torch.optim import AdamW, Adam 2 | from hmr4d.configs import MainStore, builds 3 | 4 | 5 | optimizer_cfgs = { 6 | "adam_1e-3": builds(Adam, lr=1e-3, zen_partial=True), 7 | "adam_2e-4": builds(Adam, lr=2e-4, zen_partial=True), 8 | "adamw_2e-4": builds(AdamW, lr=2e-4, zen_partial=True), 9 | "adamw_1e-4": builds(AdamW, lr=1e-4, zen_partial=True), 10 | "adamw_5e-5": builds(AdamW, lr=5e-5, zen_partial=True), 11 | "adamw_1e-5": builds(AdamW, lr=1e-5, zen_partial=True), 12 | # zero-shot text-to-image generation 13 | "adamw_1e-3_dalle": builds(AdamW, lr=1e-3, weight_decay=1e-4, zen_partial=True), 14 | } 15 | 16 | for name, cfg in optimizer_cfgs.items(): 17 | MainStore.store(name=name, node=cfg, group=f"optimizer") 18 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/model/common_utils/scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from bisect import bisect_right 3 | 4 | 5 | class WarmupMultiStepLR(torch.optim.lr_scheduler.LRScheduler): 6 | def __init__(self, optimizer, milestones, warmup=0, gamma=0.1, last_epoch=-1, verbose="deprecated"): 7 | """Assume optimizer does not change lr; Scheduler is called epoch-based""" 8 | self.milestones = milestones 9 | self.warmup = warmup 10 | assert warmup < milestones[0] 11 | self.gamma = gamma 12 | super().__init__(optimizer, last_epoch, verbose) 13 | 14 | def get_lr(self): 15 | base_lrs = self.base_lrs # base lr for each groups 16 | n_groups = len(base_lrs) 17 | comming_epoch = self.last_epoch # the lr will be set for the comming epoch, starts from 0 18 | 19 | # add extra warmup 20 | if comming_epoch < self.warmup: 21 | # e.g. comming_epoch [0, 1, 2] for warmup == 3 22 | # lr should be base_lr * (last_epoch+1) / (warmup + 1), e.g. 
[0.25, 0.5, 0.75] * base_lr 23 | lr_factor = (self.last_epoch + 1) / (self.warmup + 1) 24 | return [base_lrs[i] * lr_factor for i in range(n_groups)] 25 | else: 26 | # bisect_right([3,5,7], 0) -> 0; bisect_right([3,5,7], 5) -> 2 27 | p = bisect_right(self.milestones, comming_epoch) 28 | lr_factor = self.gamma**p 29 | return [base_lrs[i] * lr_factor for i in range(n_groups)] 30 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/model/common_utils/scheduler_cfg.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig, ListConfig 2 | from hmr4d.configs import MainStore, builds 3 | 4 | # do not perform scheduling 5 | default = DictConfig({"scheduler": None}) 6 | MainStore.store(name="default", node=default, group=f"scheduler_cfg") 7 | 8 | 9 | # epoch-based 10 | def epoch_half_by(milestones=[100, 200, 300]): 11 | return DictConfig( 12 | { 13 | "scheduler": { 14 | "_target_": "torch.optim.lr_scheduler.MultiStepLR", 15 | "milestones": milestones, 16 | "gamma": 0.5, 17 | }, 18 | "interval": "epoch", 19 | "frequency": 1, 20 | } 21 | ) 22 | 23 | 24 | MainStore.store(name="epoch_half_100_200_300", node=epoch_half_by([100, 200, 300]), group=f"scheduler_cfg") 25 | MainStore.store(name="epoch_half_100_200", node=epoch_half_by([100, 200]), group=f"scheduler_cfg") 26 | MainStore.store(name="epoch_half_200_350", node=epoch_half_by([200, 350]), group=f"scheduler_cfg") 27 | MainStore.store(name="epoch_half_300", node=epoch_half_by([300]), group=f"scheduler_cfg") 28 | 29 | 30 | # epoch-based 31 | def warmup_epoch_half_by(warmup=10, milestones=[100, 200, 300]): 32 | return DictConfig( 33 | { 34 | "scheduler": { 35 | "_target_": "hmr4d.model.common_utils.scheduler.WarmupMultiStepLR", 36 | "milestones": milestones, 37 | "warmup": warmup, 38 | "gamma": 0.5, 39 | }, 40 | "interval": "epoch", 41 | "frequency": 1, 42 | } 43 | ) 44 | 45 | 46 | MainStore.store(name="warmup_5_epoch_half_200_350", node=warmup_epoch_half_by(5, [200, 350]), group=f"scheduler_cfg") 47 | MainStore.store(name="warmup_10_epoch_half_200_350", node=warmup_epoch_half_by(10, [200, 350]), group=f"scheduler_cfg") 48 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/model/gvhmr/gvhmr_pl_demo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from hydra.utils import instantiate 4 | from hmr4d.utils.pylogger import Log 5 | from hmr4d.configs import MainStore, builds 6 | 7 | from hmr4d.utils.geo.hmr_cam import normalize_kp2d 8 | 9 | 10 | class DemoPL(pl.LightningModule): 11 | def __init__(self, pipeline): 12 | super().__init__() 13 | self.pipeline = instantiate(pipeline, _recursive_=False) 14 | 15 | @torch.no_grad() 16 | def predict(self, data, static_cam=False): 17 | """auto add batch dim 18 | data: { 19 | "length": int, or Torch.Tensor, 20 | "kp2d": (F, 3) 21 | "bbx_xys": (F, 3) 22 | "K_fullimg": (F, 3, 3) 23 | "cam_angvel": (F, 3) 24 | "f_imgseq": (F, 3, 256, 256) 25 | } 26 | 27 | """ 28 | # ROPE inference 29 | batch = { 30 | "length": data["length"][None], 31 | "obs": normalize_kp2d(data["kp2d"], data["bbx_xys"])[None], 32 | "bbx_xys": data["bbx_xys"][None], 33 | "K_fullimg": data["K_fullimg"][None], 34 | "cam_angvel": data["cam_angvel"][None], 35 | "f_imgseq": data["f_imgseq"][None], 36 | } 37 | batch = {k: v.cuda() for k, v in batch.items()} 38 | outputs = self.pipeline.forward(batch, 
train=False, postproc=True, static_cam=static_cam) 39 | 40 | pred = { 41 | "smpl_params_global": {k: v[0] for k, v in outputs["pred_smpl_params_global"].items()}, 42 | "smpl_params_incam": {k: v[0] for k, v in outputs["pred_smpl_params_incam"].items()}, 43 | "K_fullimg": data["K_fullimg"], 44 | "net_outputs": outputs, # intermediate outputs 45 | } 46 | return pred 47 | 48 | def load_pretrained_model(self, ckpt_path): 49 | """Load pretrained checkpoint, and assign each weight to the corresponding part.""" 50 | Log.info(f"[PL-Trainer] Loading ckpt type: {ckpt_path}") 51 | 52 | state_dict = torch.load(ckpt_path, "cpu")["state_dict"] 53 | missing, unexpected = self.load_state_dict(state_dict, strict=False) 54 | if len(missing) > 0: 55 | Log.warn(f"Missing keys: {missing}") 56 | if len(unexpected) > 0: 57 | Log.warn(f"Unexpected keys: {unexpected}") 58 | 59 | 60 | MainStore.store(name="gvhmr_pl_demo", node=builds(DemoPL, pipeline="${pipeline}"), group="model/gvhmr") 61 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/base_arch/embeddings/rotary_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from einops import repeat, rearrange 4 | from torch.cuda.amp import autocast 5 | 6 | 7 | def rotate_half(x): 8 | x = rearrange(x, "... (d r) -> ... d r", r=2) 9 | x1, x2 = x.unbind(dim=-1) 10 | x = torch.stack((-x2, x1), dim=-1) 11 | return rearrange(x, "... d r -> ... (d r)") 12 | 13 | 14 | @autocast(enabled=False) 15 | def apply_rotary_emb(freqs, t, start_index=0, scale=1.0, seq_dim=-2): 16 | if t.ndim == 3: 17 | seq_len = t.shape[seq_dim] 18 | freqs = freqs[-seq_len:].to(t) 19 | 20 | rot_dim = freqs.shape[-1] 21 | end_index = start_index + rot_dim 22 | 23 | assert ( 24 | rot_dim <= t.shape[-1] 25 | ), f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}" 26 | 27 | t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:] 28 | t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale) 29 | return torch.cat((t_left, t, t_right), dim=-1) 30 | 31 | 32 | def get_encoding(d_model, max_seq_len=4096): 33 | """Return: (L, D)""" 34 | t = torch.arange(max_seq_len).float() 35 | freqs = 1.0 / (10000 ** (torch.arange(0, d_model, 2).float() / d_model)) 36 | freqs = torch.einsum("i, j -> i j", t, freqs) 37 | freqs = repeat(freqs, "i j -> i (j r)", r=2) 38 | return freqs 39 | 40 | 41 | class ROPE(nn.Module): 42 | """Minimal impl of a lang-style positional encoding.""" 43 | 44 | def __init__(self, d_model, max_seq_len=4096): 45 | super().__init__() 46 | self.d_model = d_model 47 | self.max_seq_len = max_seq_len 48 | 49 | # Pre-cache a freqs tensor 50 | encoding = get_encoding(d_model, max_seq_len) 51 | self.register_buffer("encoding", encoding, False) 52 | 53 | def rotate_queries_or_keys(self, x): 54 | """ 55 | Args: 56 | x : (B, H, L, D) 57 | Returns: 58 | rotated_x: (B, H, L, D) 59 | """ 60 | 61 | seq_len, d_model = x.shape[-2:] 62 | assert d_model == self.d_model 63 | 64 | # encoding: (L, D)s 65 | if seq_len > self.max_seq_len: 66 | encoding = get_encoding(d_model, seq_len).to(x) 67 | else: 68 | encoding = self.encoding[:seq_len] 69 | 70 | # encoding: (L, D) 71 | # x: (B, H, L, D) 72 | rotated_x = apply_rotary_emb(encoding, x, seq_dim=-2) 73 | 74 | return rotated_x 75 | -------------------------------------------------------------------------------- 
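A minimal sketch of how this rotary embedding is consumed downstream (the import path follows the file location above; tensor sizes are illustrative):

import torch
from hmr4d.network.base_arch.embeddings.rotary_embedding import ROPE

rope = ROPE(d_model=64, max_seq_len=128)
q = torch.randn(2, 8, 16, 64)  # (B, H, L, head_dim)
k = torch.randn(2, 8, 16, 64)
q_rot = rope.rotate_queries_or_keys(q)
k_rot = rope.rotate_queries_or_keys(k)
# After rotation, the attention logits depend on the relative offset (i - j) rather than on
# absolute positions, which is the property RoPEAttention in the next file relies on.
logits = torch.einsum("bhic,bhjc->bhij", q_rot, k_rot) / 8.0  # sqrt(head_dim) = 8

--------------------------------------------------------------------------------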
/eval/GVHMR/hmr4d/network/base_arch/transformer/encoder_rope.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | from timm.models.vision_transformer import Mlp 6 | from typing import Optional, Tuple 7 | from einops import einsum, rearrange, repeat 8 | from hmr4d.network.base_arch.embeddings.rotary_embedding import ROPE 9 | 10 | 11 | class RoPEAttention(nn.Module): 12 | def __init__(self, embed_dim, num_heads, dropout=0.1): 13 | super().__init__() 14 | self.embed_dim = embed_dim 15 | self.num_heads = num_heads 16 | self.head_dim = embed_dim // num_heads 17 | 18 | self.rope = ROPE(self.head_dim, max_seq_len=4096) 19 | 20 | self.query = nn.Linear(embed_dim, embed_dim) 21 | self.key = nn.Linear(embed_dim, embed_dim) 22 | self.value = nn.Linear(embed_dim, embed_dim) 23 | self.dropout = nn.Dropout(dropout) 24 | self.proj = nn.Linear(embed_dim, embed_dim) 25 | 26 | def forward(self, x, attn_mask=None, key_padding_mask=None): 27 | # x: (B, L, C) 28 | # attn_mask: (L, L) 29 | # key_padding_mask: (B, L) 30 | B, L, _ = x.shape 31 | xq, xk, xv = self.query(x), self.key(x), self.value(x) 32 | 33 | xq = xq.reshape(B, L, self.num_heads, -1).transpose(1, 2) 34 | xk = xk.reshape(B, L, self.num_heads, -1).transpose(1, 2) 35 | xv = xv.reshape(B, L, self.num_heads, -1).transpose(1, 2) 36 | 37 | xq = self.rope.rotate_queries_or_keys(xq) # B, N, L, C 38 | xk = self.rope.rotate_queries_or_keys(xk) # B, N, L, C 39 | 40 | attn_score = einsum(xq, xk, "b n i c, b n j c -> b n i j") / math.sqrt(self.head_dim) 41 | if attn_mask is not None: 42 | attn_mask = attn_mask.reshape(1, 1, L, L).expand(B, self.num_heads, -1, -1) 43 | attn_score = attn_score.masked_fill(attn_mask, float("-inf")) 44 | if key_padding_mask is not None: 45 | key_padding_mask = key_padding_mask.reshape(B, 1, 1, L).expand(-1, self.num_heads, L, -1) 46 | attn_score = attn_score.masked_fill(key_padding_mask, float("-inf")) 47 | 48 | attn_score = torch.softmax(attn_score, dim=-1) 49 | attn_score = self.dropout(attn_score) 50 | output = einsum(attn_score, xv, "b n i j, b n j c -> b n i c") # B, N, L, C 51 | output = output.transpose(1, 2).reshape(B, L, -1) # B, L, C 52 | output = self.proj(output) # B, L, C 53 | return output 54 | 55 | 56 | class EncoderRoPEBlock(nn.Module): 57 | def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, dropout=0.1, **block_kwargs): 58 | super().__init__() 59 | self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6) 60 | self.attn = RoPEAttention(hidden_size, num_heads, dropout) 61 | self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6) 62 | mlp_hidden_dim = int(hidden_size * mlp_ratio) 63 | approx_gelu = lambda: nn.GELU(approximate="tanh") 64 | self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=dropout) 65 | 66 | self.gate_msa = nn.Parameter(torch.zeros(1, 1, hidden_size)) 67 | self.gate_mlp = nn.Parameter(torch.zeros(1, 1, hidden_size)) 68 | 69 | # Zero-out adaLN modulation layers 70 | nn.init.constant_(self.gate_msa, 0) 71 | nn.init.constant_(self.gate_mlp, 0) 72 | 73 | def forward(self, x, attn_mask=None, tgt_key_padding_mask=None): 74 | x = x + self.gate_msa * self._sa_block( 75 | self.norm1(x), attn_mask=attn_mask, key_padding_mask=tgt_key_padding_mask 76 | ) 77 | x = x + self.gate_mlp * self.mlp(self.norm2(x)) 78 | return x 79 | 80 | def _sa_block(self, x, attn_mask=None, key_padding_mask=None): 81 
| # x: (B, L, C) 82 | x = self.attn(x, attn_mask=attn_mask, key_padding_mask=key_padding_mask) 83 | return x 84 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/base_arch/transformer/layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def zero_module(module): 7 | """ 8 | Zero out the parameters of a module and return it. 9 | """ 10 | for p in module.parameters(): 11 | p.detach().zero_() 12 | return module 13 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .hmr2 import HMR2 3 | from pathlib import Path 4 | from .configs import get_config 5 | from hmr4d import PROJ_ROOT 6 | 7 | HMR2A_CKPT = PROJ_ROOT / f"inputs/checkpoints/hmr2/epoch=10-step=25000.ckpt" # this is HMR2.0a, follow WHAM 8 | 9 | 10 | def load_hmr2(checkpoint_path=HMR2A_CKPT): 11 | model_cfg = str((Path(__file__).parent / "configs/model_config.yaml").resolve()) 12 | model_cfg = get_config(model_cfg) 13 | 14 | # Override some config values, to crop bbox correctly 15 | if (model_cfg.MODEL.BACKBONE.TYPE == "vit") and ("BBOX_SHAPE" not in model_cfg.MODEL): 16 | model_cfg.defrost() 17 | assert ( 18 | model_cfg.MODEL.IMAGE_SIZE == 256 19 | ), f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone" 20 | model_cfg.MODEL.BBOX_SHAPE = [192, 256] # (W, H) 21 | model_cfg.freeze() 22 | 23 | # Setup model and Load weights. 24 | # model = HMR2.load_from_checkpoint(checkpoint_path, strict=False, cfg=model_cfg) 25 | model = HMR2(model_cfg) 26 | 27 | state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"] 28 | keys = [k for k in state_dict.keys() if k.split(".")[0] in ["backbone", "smpl_head"]] 29 | state_dict = {k: v for k, v in state_dict.items() if k in keys} 30 | model.load_state_dict(state_dict, strict=True) 31 | 32 | return model 33 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/network/hmr2/components/__init__.py -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/configs/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | from yacs.config import CfgNode as CN 4 | from pathlib import Path 5 | 6 | # CACHE_DIR = os.path.join(os.environ.get("HOME"), "Code/4D-Humans/cache") 7 | # CACHE_DIR_4DHUMANS = os.path.join(CACHE_DIR, "4DHumans") 8 | 9 | 10 | def to_lower(x: Dict) -> Dict: 11 | """ 12 | Convert all dictionary keys to lowercase 13 | Args: 14 | x (dict): Input dictionary 15 | Returns: 16 | dict: Output dictionary with all keys converted to lowercase 17 | """ 18 | return {k.lower(): v for k, v in x.items()} 19 | 20 | 21 | _C = CN(new_allowed=True) 22 | 23 | _C.GENERAL = CN(new_allowed=True) 24 | _C.GENERAL.RESUME = True 25 | _C.GENERAL.TIME_TO_RUN = 3300 26 | _C.GENERAL.VAL_STEPS = 100 27 | _C.GENERAL.LOG_STEPS = 100 28 | _C.GENERAL.CHECKPOINT_STEPS = 20000 29 | _C.GENERAL.CHECKPOINT_DIR = "checkpoints" 
30 | _C.GENERAL.SUMMARY_DIR = "tensorboard" 31 | _C.GENERAL.NUM_GPUS = 1 32 | _C.GENERAL.NUM_WORKERS = 4 33 | _C.GENERAL.MIXED_PRECISION = True 34 | _C.GENERAL.ALLOW_CUDA = True 35 | _C.GENERAL.PIN_MEMORY = False 36 | _C.GENERAL.DISTRIBUTED = False 37 | _C.GENERAL.LOCAL_RANK = 0 38 | _C.GENERAL.USE_SYNCBN = False 39 | _C.GENERAL.WORLD_SIZE = 1 40 | 41 | _C.TRAIN = CN(new_allowed=True) 42 | _C.TRAIN.NUM_EPOCHS = 100 43 | _C.TRAIN.BATCH_SIZE = 32 44 | _C.TRAIN.SHUFFLE = True 45 | _C.TRAIN.WARMUP = False 46 | _C.TRAIN.NORMALIZE_PER_IMAGE = False 47 | _C.TRAIN.CLIP_GRAD = False 48 | _C.TRAIN.CLIP_GRAD_VALUE = 1.0 49 | _C.LOSS_WEIGHTS = CN(new_allowed=True) 50 | 51 | _C.DATASETS = CN(new_allowed=True) 52 | 53 | _C.MODEL = CN(new_allowed=True) 54 | _C.MODEL.IMAGE_SIZE = 224 55 | 56 | _C.EXTRA = CN(new_allowed=True) 57 | _C.EXTRA.FOCAL_LENGTH = 5000 58 | 59 | _C.DATASETS.CONFIG = CN(new_allowed=True) 60 | _C.DATASETS.CONFIG.SCALE_FACTOR = 0.3 61 | _C.DATASETS.CONFIG.ROT_FACTOR = 30 62 | _C.DATASETS.CONFIG.TRANS_FACTOR = 0.02 63 | _C.DATASETS.CONFIG.COLOR_SCALE = 0.2 64 | _C.DATASETS.CONFIG.ROT_AUG_RATE = 0.6 65 | _C.DATASETS.CONFIG.TRANS_AUG_RATE = 0.5 66 | _C.DATASETS.CONFIG.DO_FLIP = True 67 | _C.DATASETS.CONFIG.FLIP_AUG_RATE = 0.5 68 | _C.DATASETS.CONFIG.EXTREME_CROP_AUG_RATE = 0.10 69 | 70 | 71 | def default_config() -> CN: 72 | """ 73 | Get a yacs CfgNode object with the default config values. 74 | """ 75 | # Return a clone so that the defaults will not be altered 76 | # This is for the "local variable" use pattern 77 | return _C.clone() 78 | 79 | 80 | def dataset_config(name="datasets_tar.yaml") -> CN: 81 | """ 82 | Get dataset config file 83 | Returns: 84 | CfgNode: Dataset config as a yacs CfgNode object. 85 | """ 86 | cfg = CN(new_allowed=True) 87 | config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), name) 88 | cfg.merge_from_file(config_file) 89 | cfg.freeze() 90 | return cfg 91 | 92 | 93 | def dataset_eval_config() -> CN: 94 | return dataset_config("datasets_eval.yaml") 95 | 96 | 97 | def get_config(config_file: str, merge: bool = True) -> CN: 98 | """ 99 | Read a config file and optionally merge it with the default config file. 100 | Args: 101 | config_file (str): Path to config file. 102 | merge (bool): Whether to merge with the default config or not. 103 | Returns: 104 | CfgNode: Config as a yacs CfgNode object. 
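    Example (mirrors how load_hmr2 builds its config; the path is relative to this package):
        cfg = get_config(str(Path(__file__).parent / "model_config.yaml"))
        cfg.MODEL.IMAGE_SIZE  # -> 256, overriding the default of 224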
105 | """ 106 | if merge: 107 | cfg = default_config() 108 | else: 109 | cfg = CN(new_allowed=True) 110 | cfg.merge_from_file(config_file) 111 | 112 | # ---- Update ---- # 113 | cfg.SMPL.MODEL_PATH = cfg.SMPL.MODEL_PATH # Not used 114 | cfg.SMPL.JOINT_REGRESSOR_EXTRA = cfg.SMPL.JOINT_REGRESSOR_EXTRA # Not Used 115 | cfg.SMPL.MEAN_PARAMS = str(Path(__file__).parent / "smpl_mean_params.npz") 116 | # ---------------- # 117 | 118 | cfg.freeze() 119 | return cfg 120 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/configs/model_config.yaml: -------------------------------------------------------------------------------- 1 | task_name: train 2 | tags: 3 | - dev 4 | train: true 5 | test: false 6 | ckpt_path: null 7 | seed: null 8 | DATASETS: 9 | TRAIN: 10 | H36M-TRAIN: 11 | WEIGHT: 0.3 12 | MPII-TRAIN: 13 | WEIGHT: 0.1 14 | COCO-TRAIN-2014: 15 | WEIGHT: 0.4 16 | MPI-INF-TRAIN: 17 | WEIGHT: 0.2 18 | VAL: 19 | COCO-VAL: 20 | WEIGHT: 1.0 21 | MOCAP: CMU-MOCAP 22 | CONFIG: 23 | SCALE_FACTOR: 0.3 24 | ROT_FACTOR: 30 25 | TRANS_FACTOR: 0.02 26 | COLOR_SCALE: 0.2 27 | ROT_AUG_RATE: 0.6 28 | TRANS_AUG_RATE: 0.5 29 | DO_FLIP: true 30 | FLIP_AUG_RATE: 0.5 31 | EXTREME_CROP_AUG_RATE: 0.1 32 | trainer: 33 | _target_: pytorch_lightning.Trainer 34 | default_root_dir: ${paths.output_dir} 35 | accelerator: gpu 36 | devices: 8 37 | deterministic: false 38 | num_sanity_val_steps: 0 39 | log_every_n_steps: ${GENERAL.LOG_STEPS} 40 | val_check_interval: ${GENERAL.VAL_STEPS} 41 | precision: 16 42 | max_steps: ${GENERAL.TOTAL_STEPS} 43 | move_metrics_to_cpu: true 44 | limit_val_batches: 1 45 | track_grad_norm: 2 46 | strategy: ddp 47 | num_nodes: 1 48 | sync_batchnorm: true 49 | paths: 50 | root_dir: ${oc.env:PROJECT_ROOT} 51 | data_dir: ${paths.root_dir}/data/ 52 | log_dir: /fsx/shubham/code/hmr2023/logs_hydra/ 53 | output_dir: ${hydra:runtime.output_dir} 54 | work_dir: ${hydra:runtime.cwd} 55 | extras: 56 | ignore_warnings: false 57 | enforce_tags: true 58 | print_config: true 59 | exp_name: 3001d 60 | SMPL: 61 | MODEL_PATH: data/smpl 62 | GENDER: neutral 63 | NUM_BODY_JOINTS: 23 64 | JOINT_REGRESSOR_EXTRA: data/SMPL_to_J19.pkl 65 | MEAN_PARAMS: data/smpl_mean_params.npz 66 | EXTRA: 67 | FOCAL_LENGTH: 5000 68 | NUM_LOG_IMAGES: 4 69 | NUM_LOG_SAMPLES_PER_IMAGE: 8 70 | PELVIS_IND: 39 71 | MODEL: 72 | IMAGE_SIZE: 256 73 | IMAGE_MEAN: 74 | - 0.485 75 | - 0.456 76 | - 0.406 77 | IMAGE_STD: 78 | - 0.229 79 | - 0.224 80 | - 0.225 81 | BACKBONE: 82 | TYPE: vit 83 | FREEZE: true 84 | NUM_LAYERS: 50 85 | OUT_CHANNELS: 2048 86 | ADD_NECK: false 87 | FLOW: 88 | DIM: 144 89 | NUM_LAYERS: 4 90 | CONTEXT_FEATURES: 2048 91 | LAYER_HIDDEN_FEATURES: 1024 92 | LAYER_DEPTH: 2 93 | FC_HEAD: 94 | NUM_FEATURES: 1024 95 | SMPL_HEAD: 96 | TYPE: transformer_decoder 97 | IN_CHANNELS: 2048 98 | TRANSFORMER_DECODER: 99 | depth: 6 100 | heads: 8 101 | mlp_dim: 1024 102 | dim_head: 64 103 | dropout: 0.0 104 | emb_dropout: 0.0 105 | norm: layer 106 | context_dim: 1280 107 | GENERAL: 108 | TOTAL_STEPS: 100000 109 | LOG_STEPS: 100 110 | VAL_STEPS: 100 111 | CHECKPOINT_STEPS: 1000 112 | CHECKPOINT_SAVE_TOP_K: -1 113 | NUM_WORKERS: 6 114 | PREFETCH_FACTOR: 2 115 | TRAIN: 116 | LR: 0.0001 117 | WEIGHT_DECAY: 0.0001 118 | BATCH_SIZE: 512 119 | LOSS_REDUCTION: mean 120 | NUM_TRAIN_SAMPLES: 2 121 | NUM_TEST_SAMPLES: 64 122 | POSE_2D_NOISE_RATIO: 0.01 123 | SMPL_PARAM_NOISE_RATIO: 0.005 124 | LOSS_WEIGHTS: 125 | KEYPOINTS_3D: 0.05 126 | KEYPOINTS_2D: 0.01 127 | GLOBAL_ORIENT: 0.001 128 | 
BODY_POSE: 0.001 129 | BETAS: 0.0005 130 | ADVERSARIAL: 0.0005 131 | local: {} 132 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/configs/smpl_mean_params.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/network/hmr2/configs/smpl_mean_params.npz -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/hmr2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from yacs.config import CfgNode 4 | from .vit import ViT 5 | from .smpl_head import SMPLTransformerDecoderHead 6 | 7 | from pytorch3d.transforms import matrix_to_axis_angle 8 | from hmr4d.utils.geo.hmr_cam import compute_transl_full_cam 9 | 10 | 11 | class HMR2(pl.LightningModule): 12 | def __init__(self, cfg: CfgNode): 13 | super().__init__() 14 | self.cfg = cfg 15 | self.backbone = ViT( 16 | img_size=(256, 192), 17 | patch_size=16, 18 | embed_dim=1280, 19 | depth=32, 20 | num_heads=16, 21 | ratio=1, 22 | use_checkpoint=False, 23 | mlp_ratio=4, 24 | qkv_bias=True, 25 | drop_path_rate=0.55, 26 | ) 27 | self.smpl_head = SMPLTransformerDecoderHead(cfg) 28 | 29 | def forward(self, batch, feat_mode=True): 30 | """this file has been modified 31 | Args: 32 | feat_mode: default True, as we only need the feature token output for the HMR4D project; 33 | when False, the full process of HMR2 will be executed. 34 | """ 35 | # Backbone 36 | x = batch["img"][:, :, :, 32:-32] 37 | vit_feats = self.backbone(x) 38 | 39 | # Output head 40 | if feat_mode: 41 | token_out = self.smpl_head(vit_feats, only_return_token_out=True) # (B, 1024) 42 | return token_out 43 | 44 | # return full process 45 | pred_smpl_params, pred_cam, _, token_out = self.smpl_head(vit_feats, only_return_token_out=False) 46 | output = {} 47 | output["token_out"] = token_out 48 | output["smpl_params"] = { 49 | "body_pose": matrix_to_axis_angle(pred_smpl_params["body_pose"]).flatten(-2), # (B, 23, 3) 50 | "betas": pred_smpl_params["betas"], # (B, 10) 51 | "global_orient": matrix_to_axis_angle(pred_smpl_params["global_orient"])[:, 0], # (B, 3) 52 | "transl": compute_transl_full_cam(pred_cam, batch["bbx_xys"], batch["K_fullimg"]), # (B, 3) 53 | } 54 | 55 | return output 56 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/utils/preproc.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | from pathlib import Path 5 | 6 | IMAGE_MEAN = torch.tensor([0.485, 0.456, 0.406]) 7 | IMAGE_STD = torch.tensor([0.229, 0.224, 0.225]) 8 | 9 | 10 | def expand_to_aspect_ratio(input_shape, target_aspect_ratio=[192, 256]): 11 | """Increase the size of the bounding box to match the target shape.""" 12 | if target_aspect_ratio is None: 13 | return input_shape 14 | 15 | try: 16 | w, h = input_shape 17 | except (ValueError, TypeError): 18 | return input_shape 19 | 20 | w_t, h_t = target_aspect_ratio 21 | if h / w < h_t / w_t: 22 | h_new = max(w * h_t / w_t, h) 23 | w_new = w 24 | else: 25 | h_new = h 26 | w_new = max(h * w_t / h_t, w) 27 | if h_new < h or w_new < w: 28 | breakpoint() 29 | return np.array([w_new, h_new]) 30 | 31 | 32 | def crop_and_resize(img, bbx_xy, bbx_s, dst_size=256, enlarge_ratio=1.2): 
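    # The warp is defined by three point correspondences: the top-left corner, the top-right
    # corner, and the center of the enlarged square box are mapped to the matching points of a
    # dst_size x dst_size patch, so a single cv2.warpAffine does the crop and the resize
    # (regions outside the image are zero-padded by the default border mode).
    # Illustrative call (values are hypothetical):
    #   img_crop, bbx_xys = crop_and_resize(img, bbx_xy=np.array([320.0, 240.0]), bbx_s=180.0)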
33 | """ 34 | Args: 35 | img: (H, W, 3) 36 | bbx_xy: (2,) 37 | bbx_s: scalar 38 | """ 39 | hs = bbx_s * enlarge_ratio / 2 40 | src = np.stack( 41 | [ 42 | bbx_xy - hs, # left-up corner 43 | bbx_xy + np.array([hs, -hs]), # right-up corner 44 | bbx_xy, # center 45 | ] 46 | ).astype(np.float32) 47 | dst = np.array([[0, 0], [dst_size - 1, 0], [dst_size / 2 - 0.5, dst_size / 2 - 0.5]], dtype=np.float32) 48 | A = cv2.getAffineTransform(src, dst) 49 | 50 | img_crop = cv2.warpAffine(img, A, (dst_size, dst_size), flags=cv2.INTER_LINEAR) 51 | bbx_xys_final = np.array([*bbx_xy, bbx_s * enlarge_ratio]) 52 | return img_crop, bbx_xys_final 53 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/network/hmr2/utils/smpl_wrapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pickle 4 | from typing import Optional 5 | import smplx 6 | from smplx.lbs import vertices2joints 7 | from smplx.utils import SMPLOutput 8 | 9 | 10 | class SMPL(smplx.SMPLLayer): 11 | def __init__(self, *args, joint_regressor_extra: Optional[str] = None, update_hips: bool = False, **kwargs): 12 | """ 13 | Extension of the official SMPL implementation to support more joints. 14 | Args: 15 | Same as SMPLLayer. 16 | joint_regressor_extra (str): Path to extra joint regressor. 17 | """ 18 | super(SMPL, self).__init__(*args, **kwargs) 19 | smpl_to_openpose = [24, 12, 17, 19, 21, 16, 18, 20, 0, 2, 5, 8, 1, 4, 7, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34] 20 | 21 | if joint_regressor_extra is not None: 22 | self.register_buffer( 23 | "joint_regressor_extra", 24 | torch.tensor(pickle.load(open(joint_regressor_extra, "rb"), encoding="latin1"), dtype=torch.float32), 25 | ) 26 | self.register_buffer("joint_map", torch.tensor(smpl_to_openpose, dtype=torch.long)) 27 | self.update_hips = update_hips 28 | 29 | def forward(self, *args, **kwargs) -> SMPLOutput: 30 | """ 31 | Run forward pass. Same as SMPL and also append an extra set of joints if joint_regressor_extra is specified. 32 | """ 33 | smpl_output = super(SMPL, self).forward(*args, **kwargs) 34 | joints = smpl_output.joints[:, self.joint_map, :] 35 | if self.update_hips: 36 | joints[:, [9, 12]] = ( 37 | joints[:, [9, 12]] 38 | + 0.25 * (joints[:, [9, 12]] - joints[:, [12, 9]]) 39 | + 0.5 * (joints[:, [8]] - 0.5 * (joints[:, [9, 12]] + joints[:, [12, 9]])) 40 | ) 41 | if hasattr(self, "joint_regressor_extra"): 42 | extra_joints = vertices2joints(self.joint_regressor_extra, smpl_output.vertices) 43 | joints = torch.cat([joints, extra_joints], dim=1) 44 | smpl_output.joints = joints 45 | return smpl_output 46 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | Contents of this folder are modified from HuMoR repository. 
-------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .body_model import BodyModel 2 | from .body_model_smplh import BodyModelSMPLH 3 | from .body_model_smplx import BodyModelSMPLX 4 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/body_model_smplh.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import smplx 4 | 5 | kwargs_disable_member_var = { 6 | "create_body_pose": False, 7 | "create_betas": False, 8 | "create_global_orient": False, 9 | "create_transl": False, 10 | "create_left_hand_pose": False, 11 | "create_right_hand_pose": False, 12 | } 13 | 14 | 15 | class BodyModelSMPLH(nn.Module): 16 | """Support Batch inference""" 17 | 18 | def __init__(self, model_path, **kwargs): 19 | super().__init__() 20 | # enable flexible batchsize, handle missing variable at forward() 21 | kwargs.update(kwargs_disable_member_var) 22 | self.bm = smplx.create(model_path=model_path, **kwargs) 23 | self.faces = self.bm.faces 24 | self.is_smpl = kwargs.get("model_type", "smpl") == "smpl" 25 | if not self.is_smpl: 26 | self.hand_pose_dim = self.bm.num_pca_comps if self.bm.use_pca else 3 * self.bm.NUM_HAND_JOINTS 27 | 28 | # For fast computing of skeleton under beta 29 | shapedirs = self.bm.shapedirs # (V, 3, 10) 30 | J_regressor = self.bm.J_regressor[:22, :] # (22, V) 31 | v_template = self.bm.v_template # (V, 3) 32 | J_template = J_regressor @ v_template # (22, 3) 33 | J_shapedirs = torch.einsum("jv, vcd -> jcd", J_regressor, shapedirs) # (22, 3, 10) 34 | self.register_buffer("J_template", J_template, False) 35 | self.register_buffer("J_shapedirs", J_shapedirs, False) 36 | 37 | def forward( 38 | self, 39 | betas=None, 40 | global_orient=None, 41 | transl=None, 42 | body_pose=None, 43 | left_hand_pose=None, 44 | right_hand_pose=None, 45 | **kwargs 46 | ): 47 | 48 | device, dtype = self.bm.shapedirs.device, self.bm.shapedirs.dtype 49 | 50 | model_vars = [betas, global_orient, body_pose, transl, left_hand_pose, right_hand_pose] 51 | batch_size = 1 52 | for var in model_vars: 53 | if var is None: 54 | continue 55 | batch_size = max(batch_size, len(var)) 56 | 57 | if global_orient is None: 58 | global_orient = torch.zeros([batch_size, 3], dtype=dtype, device=device) 59 | if body_pose is None: 60 | body_pose = ( 61 | torch.zeros(3 * self.bm.NUM_BODY_JOINTS, device=device, dtype=dtype)[None] 62 | .expand(batch_size, -1) 63 | .contiguous() 64 | ) 65 | if not self.is_smpl: 66 | if left_hand_pose is None: 67 | left_hand_pose = ( 68 | torch.zeros(self.hand_pose_dim, device=device, dtype=dtype)[None] 69 | .expand(batch_size, -1) 70 | .contiguous() 71 | ) 72 | if right_hand_pose is None: 73 | right_hand_pose = ( 74 | torch.zeros(self.hand_pose_dim, device=device, dtype=dtype)[None] 75 | .expand(batch_size, -1) 76 | .contiguous() 77 | ) 78 | if betas is None: 79 | betas = torch.zeros([batch_size, self.bm.num_betas], dtype=dtype, device=device) 80 | if transl is None: 81 | transl = torch.zeros([batch_size, 3], dtype=dtype, device=device) 82 | 83 | bm_out = self.bm( 84 | betas=betas, 85 | global_orient=global_orient, 86 | body_pose=body_pose, 87 | left_hand_pose=left_hand_pose, 88 | right_hand_pose=right_hand_pose, 89 | transl=transl, 90 | **kwargs 91 | ) 92 | 93 | return bm_out 94 | 95 | def get_skeleton(self, 
betas): 96 | """betas: (*, 10) -> skeleton_beta: (*, 22, 3)""" 97 | skeleton_beta = self.J_template + torch.einsum("...d, jcd -> ...jc", betas, self.J_shapedirs) # (22, 3) 98 | return skeleton_beta 99 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/coco_aug_dict.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/coco_aug_dict.pth -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/seg_part_info.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/seg_part_info.npy -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/smpl_3dpw14_J_regressor_sparse.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smpl_3dpw14_J_regressor_sparse.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/smpl_coco17_J_regressor.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smpl_coco17_J_regressor.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/smpl_neutral_J_regressor.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smpl_neutral_J_regressor.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/smplx2smpl_sparse.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smplx2smpl_sparse.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/body_model/smplx_verts437.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/body_model/smplx_verts437.pt -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/callbacks/lr_monitor.py: -------------------------------------------------------------------------------- 1 | from pytorch_lightning.callbacks import LearningRateMonitor 2 | from hmr4d.configs import builds, MainStore 3 | 4 | 5 | MainStore.store(name="pl", node=builds(LearningRateMonitor), group="callbacks/lr_monitor") 6 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/callbacks/train_speed_timer.py: -------------------------------------------------------------------------------- 1 | import pytorch_lightning as pl 2 | from pytorch_lightning.utilities import 
rank_zero_only 3 | from time import time 4 | from collections import deque 5 | 6 | from hmr4d.configs import MainStore, builds 7 | 8 | 9 | class TrainSpeedTimer(pl.Callback): 10 | def __init__(self, N_avg=5): 11 | """ 12 | This callback times the training speed (averge over recent 5 iterations) 13 | 1. Data waiting time: this should be small, otherwise the data loading should be improved 14 | 2. Single batch time: this is the time for one batch of training (excluding data waiting) 15 | """ 16 | super().__init__() 17 | self.last_batch_end = None 18 | self.this_batch_start = None 19 | 20 | # time queues for averaging 21 | self.data_waiting_time_queue = deque(maxlen=N_avg) 22 | self.single_batch_time_queue = deque(maxlen=N_avg) 23 | 24 | @rank_zero_only 25 | def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): 26 | """Count the time of data waiting""" 27 | if self.last_batch_end is not None: 28 | # This should be small, otherwise the data loading should be improved 29 | data_waiting = time() - self.last_batch_end 30 | 31 | # Average the time 32 | self.data_waiting_time_queue.append(data_waiting) 33 | average_time = sum(self.data_waiting_time_queue) / len(self.data_waiting_time_queue) 34 | 35 | # Log to prog-bar 36 | pl_module.log( 37 | "train_timer/data_waiting", average_time, on_step=True, on_epoch=False, prog_bar=True, logger=True 38 | ) 39 | 40 | self.this_batch_start = time() 41 | 42 | @rank_zero_only 43 | def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): 44 | # Effective training time elapsed (excluding data waiting) 45 | single_batch = time() - self.this_batch_start 46 | 47 | # Average the time 48 | self.single_batch_time_queue.append(single_batch) 49 | average_time = sum(self.single_batch_time_queue) / len(self.single_batch_time_queue) 50 | 51 | # Log iter time 52 | pl_module.log( 53 | "train_timer/single_batch", average_time, on_step=True, on_epoch=False, prog_bar=False, logger=True 54 | ) 55 | 56 | # Set timer for counting data waiting 57 | self.last_batch_end = time() 58 | 59 | @rank_zero_only 60 | def on_train_epoch_end(self, trainer, pl_module): 61 | # Reset the timer 62 | self.last_batch_end = None 63 | self.this_batch_start = None 64 | # Clear the queue 65 | self.data_waiting_time_queue.clear() 66 | self.single_batch_time_queue.clear() 67 | 68 | 69 | group_name = "callbacks/train_speed_timer" 70 | base = builds(TrainSpeedTimer, populate_full_signature=True) 71 | MainStore.store(name="base", node=base, group=group_name) 72 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/geo/flip_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pytorch3d.transforms import axis_angle_to_matrix, matrix_to_axis_angle 3 | 4 | 5 | def flip_heatmap_coco17(output_flipped): 6 | assert output_flipped.ndim == 4, "output_flipped should be [B, J, H, W]" 7 | shape_ori = output_flipped.shape 8 | channels = 1 9 | output_flipped = output_flipped.reshape(shape_ori[0], -1, channels, shape_ori[2], shape_ori[3]) 10 | output_flipped_back = output_flipped.clone() 11 | 12 | # Swap left-right parts 13 | for left, right in [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]: 14 | output_flipped_back[:, left, ...] = output_flipped[:, right, ...] 15 | output_flipped_back[:, right, ...] = output_flipped[:, left, ...] 
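    # With the left/right joint channels swapped, reshape back and mirror along the width
    # axis below to undo the horizontal test-time flip.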
16 | output_flipped_back = output_flipped_back.reshape(shape_ori) 17 | # Flip horizontally 18 | output_flipped_back = output_flipped_back.flip(3) 19 | return output_flipped_back 20 | 21 | 22 | def flip_bbx_xys(bbx_xys, w): 23 | """ 24 | bbx_xys: (F, 3) 25 | """ 26 | bbx_xys_flip = bbx_xys.clone() 27 | bbx_xys_flip[:, 0] = w - bbx_xys_flip[:, 0] 28 | return bbx_xys_flip 29 | 30 | 31 | def flip_kp2d_coco17(kp2d, w): 32 | """Flip keypoints.""" 33 | kp2d = kp2d.clone() 34 | flipped_parts = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] 35 | kp2d = kp2d[..., flipped_parts, :] 36 | kp2d[..., 0] = w - kp2d[..., 0] 37 | return kp2d 38 | 39 | 40 | def flip_smplx_params(smplx_params): 41 | """Flip pose. 42 | The flipping is based on SMPLX parameters. 43 | """ 44 | rotation = torch.cat([smplx_params["global_orient"], smplx_params["body_pose"]], dim=1) 45 | 46 | BN = rotation.shape[0] 47 | pose = rotation.reshape(BN, -1).transpose(0, 1) 48 | 49 | SMPL_JOINTS_FLIP_PERM = [0, 2, 1, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12, 14, 13, 15, 17, 16, 19, 18, 21, 20] # , 23, 22] 50 | SMPL_POSE_FLIP_PERM = [] 51 | for i in SMPL_JOINTS_FLIP_PERM: 52 | SMPL_POSE_FLIP_PERM.append(3 * i) 53 | SMPL_POSE_FLIP_PERM.append(3 * i + 1) 54 | SMPL_POSE_FLIP_PERM.append(3 * i + 2) 55 | 56 | pose = pose[SMPL_POSE_FLIP_PERM] 57 | 58 | # we also negate the second and the third dimension of the axis-angle 59 | pose[1::3] = -pose[1::3] 60 | pose[2::3] = -pose[2::3] 61 | pose = pose.transpose(0, 1).reshape(BN, -1, 3) 62 | 63 | smplx_params_flipped = smplx_params.copy() 64 | smplx_params_flipped["global_orient"] = pose[:, :1] 65 | smplx_params_flipped["body_pose"] = pose[:, 1:] 66 | return smplx_params_flipped 67 | 68 | 69 | def avg_smplx_aa(aa1, aa2): 70 | def avg_rot(rot): 71 | # input [B,...,3,3] --> output [...,3,3] 72 | rot = rot.mean(dim=0) 73 | U, _, V = torch.svd(rot) 74 | rot = U @ V.transpose(-1, -2) 75 | return rot 76 | 77 | B, J3 = aa1.shape 78 | aa1 = aa1.reshape(B, -1, 3) 79 | aa2 = aa2.reshape(B, -1, 3) 80 | 81 | R1 = axis_angle_to_matrix(aa1) 82 | R2 = axis_angle_to_matrix(aa2) 83 | R_avg = avg_rot(torch.stack([R1, R2])) 84 | aa_avg = matrix_to_axis_angle(R_avg).reshape(B, -1) 85 | 86 | return aa_avg 87 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/geo/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def axis_rotate_to_matrix(angle, axis="x"): 5 | """Get rotation matrix for rotating around one axis 6 | Args: 7 | angle: (N, 1) 8 | Returns: 9 | R: (N, 3, 3) 10 | """ 11 | if isinstance(angle, float): 12 | angle = torch.tensor([angle], dtype=torch.float) 13 | 14 | c = torch.cos(angle) 15 | s = torch.sin(angle) 16 | z = torch.zeros_like(angle) 17 | o = torch.ones_like(angle) 18 | if axis == "x": 19 | R = torch.stack([o, z, z, z, c, -s, z, s, c], dim=1).view(-1, 3, 3) 20 | elif axis == "y": 21 | R = torch.stack([c, z, s, z, o, z, -s, z, c], dim=1).view(-1, 3, 3) 22 | else: 23 | assert axis == "z" 24 | R = torch.stack([c, -s, z, s, c, z, z, z, o], dim=1).view(-1, 3, 3) 25 | return R 26 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from hmr4d.utils.preproc.tracker import Tracker 3 | from hmr4d.utils.preproc.vitfeat_extractor import Extractor 4 | from hmr4d.utils.preproc.vitpose import VitPoseExtractor 5 | from 
hmr4d.utils.preproc.slam import SLAMModel 6 | except: 7 | pass 8 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/slam.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import time 3 | import torch 4 | from multiprocessing import Process, Queue 5 | 6 | try: 7 | from dpvo.utils import Timer 8 | from dpvo.dpvo import DPVO 9 | from dpvo.config import cfg 10 | except: 11 | pass 12 | 13 | 14 | from hmr4d import PROJ_ROOT 15 | from hmr4d.utils.geo.hmr_cam import estimate_focal_length 16 | 17 | 18 | class SLAMModel(object): 19 | def __init__(self, video_path, width, height, intrinsics=None, stride=1, skip=0, buffer=2048, resize=0.5): 20 | """ 21 | Args: 22 | intrinsics: [fx, fy, cx, cy] 23 | """ 24 | if intrinsics is None: 25 | print("Estimating focal length") 26 | focal_length = estimate_focal_length(width, height) 27 | intrinsics = torch.tensor([focal_length, focal_length, width / 2.0, height / 2.0]) 28 | else: 29 | intrinsics = intrinsics.clone() 30 | 31 | self.dpvo_cfg = str(PROJ_ROOT / "third-party/DPVO/config/default.yaml") 32 | self.dpvo_ckpt = "inputs/checkpoints/dpvo/dpvo.pth" 33 | 34 | self.buffer = buffer 35 | self.times = [] 36 | self.slam = None 37 | self.queue = Queue(maxsize=8) 38 | self.reader = Process(target=video_stream, args=(self.queue, video_path, intrinsics, stride, skip, resize)) 39 | self.reader.start() 40 | 41 | def track(self): 42 | (t, image, intrinsics) = self.queue.get() 43 | 44 | if t < 0: 45 | return False 46 | 47 | image = torch.from_numpy(image).permute(2, 0, 1).cuda() 48 | intrinsics = intrinsics.cuda() # [fx, fy, cx, cy] 49 | 50 | if self.slam is None: 51 | cfg.merge_from_file(self.dpvo_cfg) 52 | cfg.BUFFER_SIZE = self.buffer 53 | self.slam = DPVO(cfg, self.dpvo_ckpt, ht=image.shape[1], wd=image.shape[2], viz=False) 54 | 55 | with Timer("SLAM", enabled=False): 56 | t = time.time() 57 | self.slam(t, image, intrinsics) 58 | self.times.append(time.time() - t) 59 | 60 | return True 61 | 62 | def process(self): 63 | for _ in range(12): 64 | self.slam.update() 65 | 66 | self.reader.join() 67 | return self.slam.terminate()[0] 68 | 69 | 70 | def video_stream(queue, imagedir, intrinsics, stride, skip=0, resize=0.5): 71 | """video generator""" 72 | assert len(intrinsics) == 4, "intrinsics should be [fx, fy, cx, cy]" 73 | 74 | cap = cv2.VideoCapture(imagedir) 75 | t = 0 76 | for _ in range(skip): 77 | ret, image = cap.read() 78 | 79 | while True: 80 | # Capture frame-by-frame 81 | for _ in range(stride): 82 | ret, image = cap.read() 83 | # if frame is read correctly ret is True 84 | if not ret: 85 | break 86 | 87 | if not ret: 88 | break 89 | 90 | image = cv2.resize(image, None, fx=resize, fy=resize, interpolation=cv2.INTER_AREA) 91 | h, w, _ = image.shape 92 | image = image[: h - h % 16, : w - w % 16] 93 | 94 | intrinsics_ = intrinsics.clone() * resize 95 | queue.put((t, image, intrinsics_)) 96 | 97 | t += 1 98 | 99 | queue.put((-1, image, intrinsics)) # -1 will terminate the process 100 | cap.release() 101 | 102 | # wait for the queue to be empty, otherwise the process will end immediately 103 | while not queue.empty(): 104 | time.sleep(1) 105 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitfeat_extractor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from hmr4d.network.hmr2 import load_hmr2, HMR2 3 | 4 | 5 | from 
hmr4d.utils.video_io_utils import read_video_np 6 | import cv2 7 | import numpy as np 8 | 9 | from hmr4d.network.hmr2.utils.preproc import crop_and_resize, IMAGE_MEAN, IMAGE_STD 10 | from tqdm import tqdm 11 | 12 | 13 | def get_batch(input_path, bbx_xys, img_ds=0.5, img_dst_size=256, path_type="video"): 14 | if path_type == "video": 15 | imgs = read_video_np(input_path, scale=img_ds) 16 | elif path_type == "image": 17 | imgs = cv2.imread(str(input_path))[..., ::-1] 18 | imgs = cv2.resize(imgs, (0, 0), fx=img_ds, fy=img_ds) 19 | imgs = imgs[None] 20 | elif path_type == "np": 21 | assert isinstance(input_path, np.ndarray) 22 | assert img_ds == 1.0 # this is safe 23 | imgs = input_path 24 | 25 | gt_center = bbx_xys[:, :2] 26 | gt_bbx_size = bbx_xys[:, 2] 27 | 28 | # Blur image to avoid aliasing artifacts 29 | if True: 30 | gt_bbx_size_ds = gt_bbx_size * img_ds 31 | ds_factors = ((gt_bbx_size_ds * 1.0) / img_dst_size / 2.0).numpy() 32 | imgs = np.stack( 33 | [ 34 | # gaussian(v, sigma=(d - 1) / 2, channel_axis=2, preserve_range=True) if d > 1.1 else v 35 | cv2.GaussianBlur(v, (5, 5), (d - 1) / 2) if d > 1.1 else v 36 | for v, d in zip(imgs, ds_factors) 37 | ] 38 | ) 39 | 40 | # Output 41 | imgs_list = [] 42 | bbx_xys_ds_list = [] 43 | for i in range(len(imgs)): 44 | img, bbx_xys_ds = crop_and_resize( 45 | imgs[i], 46 | gt_center[i] * img_ds, 47 | gt_bbx_size[i] * img_ds, 48 | img_dst_size, 49 | enlarge_ratio=1.0, 50 | ) 51 | imgs_list.append(img) 52 | bbx_xys_ds_list.append(bbx_xys_ds) 53 | imgs = torch.from_numpy(np.stack(imgs_list)) # (F, 256, 256, 3), RGB 54 | bbx_xys = torch.from_numpy(np.stack(bbx_xys_ds_list)) / img_ds # (F, 3) 55 | 56 | imgs = ((imgs / 255.0 - IMAGE_MEAN) / IMAGE_STD).permute(0, 3, 1, 2) # (F, 3, 256, 256 57 | return imgs, bbx_xys 58 | 59 | 60 | class Extractor: 61 | def __init__(self, tqdm_leave=True): 62 | self.extractor: HMR2 = load_hmr2().cuda().eval() 63 | self.tqdm_leave = tqdm_leave 64 | 65 | def extract_video_features(self, video_path, bbx_xys, img_ds=0.5): 66 | """ 67 | img_ds makes the image smaller, which is useful for faster processing 68 | """ 69 | # Get the batch 70 | if isinstance(video_path, str): 71 | imgs, bbx_xys = get_batch(video_path, bbx_xys, img_ds=img_ds) 72 | else: 73 | assert isinstance(video_path, torch.Tensor) 74 | imgs = video_path 75 | 76 | # Inference 77 | F, _, H, W = imgs.shape # (F, 3, H, W) 78 | imgs = imgs.cuda() 79 | batch_size = 16 # 5GB GPU memory, occupies all CUDA cores of 3090 80 | features = [] 81 | for j in tqdm(range(0, F, batch_size), desc="HMR2 Feature", leave=self.tqdm_leave): 82 | imgs_batch = imgs[j : j + batch_size] 83 | 84 | with torch.no_grad(): 85 | feature = self.extractor({"img": imgs_batch}) 86 | features.append(feature.detach().cpu()) 87 | 88 | features = torch.cat(features, dim=0).clone() # (F, 1024) 89 | return features 90 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .src.vitpose_infer.model_builder import build_model 2 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/__init__.py -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/__init__.py -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # from .alexnet import AlexNet 3 | # from .cpm import CPM 4 | # from .hourglass import HourglassNet 5 | # from .hourglass_ae import HourglassAENet 6 | # from .hrformer import HRFormer 7 | # from .hrnet import HRNet 8 | # from .litehrnet import LiteHRNet 9 | # from .mobilenet_v2 import MobileNetV2 10 | # from .mobilenet_v3 import MobileNetV3 11 | # from .mspn import MSPN 12 | # from .regnet import RegNet 13 | # from .resnest import ResNeSt 14 | # from .resnet import ResNet, ResNetV1d 15 | # from .resnext import ResNeXt 16 | # from .rsn import RSN 17 | # from .scnet import SCNet 18 | # from .seresnet import SEResNet 19 | # from .seresnext import SEResNeXt 20 | # from .shufflenet_v1 import ShuffleNetV1 21 | # from .shufflenet_v2 import ShuffleNetV2 22 | # from .tcn import TCN 23 | # from .v2v_net import V2VNet 24 | # from .vgg import VGG 25 | # from .vipnas_mbv3 import ViPNAS_MobileNetV3 26 | # from .vipnas_resnet import ViPNAS_ResNet 27 | from .vit import ViT 28 | 29 | # __all__ = [ 30 | # 'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2', 31 | # 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet', 32 | # 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', 33 | # 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', 34 | # 'LiteHRNet', 'V2VNet', 'HRFormer', 'ViT' 35 | # ] 36 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/alexnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | 4 | from ..builder import BACKBONES 5 | from .base_backbone import BaseBackbone 6 | 7 | 8 | @BACKBONES.register_module() 9 | class AlexNet(BaseBackbone): 10 | """`AlexNet `__ backbone. 11 | 12 | The input for AlexNet is a 224x224 RGB image. 13 | 14 | Args: 15 | num_classes (int): number of classes for classification. 16 | The default value is -1, which uses the backbone as 17 | a feature extractor without the top classifier. 
18 | """ 19 | 20 | def __init__(self, num_classes=-1): 21 | super().__init__() 22 | self.num_classes = num_classes 23 | self.features = nn.Sequential( 24 | nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), 25 | nn.ReLU(inplace=True), 26 | nn.MaxPool2d(kernel_size=3, stride=2), 27 | nn.Conv2d(64, 192, kernel_size=5, padding=2), 28 | nn.ReLU(inplace=True), 29 | nn.MaxPool2d(kernel_size=3, stride=2), 30 | nn.Conv2d(192, 384, kernel_size=3, padding=1), 31 | nn.ReLU(inplace=True), 32 | nn.Conv2d(384, 256, kernel_size=3, padding=1), 33 | nn.ReLU(inplace=True), 34 | nn.Conv2d(256, 256, kernel_size=3, padding=1), 35 | nn.ReLU(inplace=True), 36 | nn.MaxPool2d(kernel_size=3, stride=2), 37 | ) 38 | if self.num_classes > 0: 39 | self.classifier = nn.Sequential( 40 | nn.Dropout(), 41 | nn.Linear(256 * 6 * 6, 4096), 42 | nn.ReLU(inplace=True), 43 | nn.Dropout(), 44 | nn.Linear(4096, 4096), 45 | nn.ReLU(inplace=True), 46 | nn.Linear(4096, num_classes), 47 | ) 48 | 49 | def forward(self, x): 50 | 51 | x = self.features(x) 52 | if self.num_classes > 0: 53 | x = x.view(x.size(0), 256 * 6 * 6) 54 | x = self.classifier(x) 55 | 56 | return x 57 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/test_torch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Net(nn.Module): 7 | 8 | def __init__(self): 9 | super(Net, self).__init__() 10 | # 1 input image channel, 6 output channels, 5x5 square convolution 11 | # kernel 12 | self.conv1 = nn.Conv2d(1, 6, 5) 13 | self.conv2 = nn.Conv2d(6, 16, 5) 14 | # an affine operation: y = Wx + b 15 | self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension 16 | self.fc2 = nn.Linear(120, 84) 17 | self.fc3 = nn.Linear(84, 10) 18 | 19 | def forward(self, x): 20 | # Max pooling over a (2, 2) window 21 | x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) 22 | # If the size is a square, you can specify with a single number 23 | x = F.max_pool2d(F.relu(self.conv2(x)), 2) 24 | x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension 25 | x = F.relu(self.fc1(x)) 26 | x = F.relu(self.fc2(x)) 27 | x = self.fc3(x) 28 | return x 29 | 30 | 31 | net = Net() 32 | # print(net) 33 | 34 | net.train() 35 | 36 | input = torch.randn(1, 1, 32, 32) 37 | # out = net(input) 38 | # print(out) 39 | output = net(input) 40 | target = torch.randn(10) # a dummy target, for example 41 | target = target.view(1, -1) # make it the same shape as output 42 | criterion = nn.MSELoss() 43 | 44 | # loss = criterion(output.cuda(), target.cuda()) 45 | 46 | import torch.optim as optim 47 | 48 | # create your optimizer 49 | optimizer = optim.SGD(net.parameters(), lr=0.01) 50 | 51 | # in your training loop: 52 | optimizer.zero_grad() # zero the gradient buffers 53 | output = net(input) 54 | loss = criterion(output, target) 55 | 56 | loss.backward() 57 | 58 | optimizer.step() 59 | 60 | # print(loss) -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .channel_shuffle import channel_shuffle 3 | from .inverted_residual import InvertedResidual 4 | from .make_divisible import make_divisible 5 | from .se_layer import SELayer 6 | from .utils import load_checkpoint 7 | 8 | __all__ = [ 9 | 'channel_shuffle', 'make_divisible', 'InvertedResidual', 'SELayer', 10 | 'load_checkpoint' 11 | ] 12 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/channel_shuffle.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | 5 | def channel_shuffle(x, groups): 6 | """Channel Shuffle operation. 7 | 8 | This function enables cross-group information flow for multiple groups 9 | convolution layers. 10 | 11 | Args: 12 | x (Tensor): The input tensor. 13 | groups (int): The number of groups to divide the input tensor 14 | in the channel dimension. 15 | 16 | Returns: 17 | Tensor: The output tensor after channel shuffle operation. 18 | """ 19 | 20 | batch_size, num_channels, height, width = x.size() 21 | assert (num_channels % groups == 0), ('num_channels should be ' 22 | 'divisible by groups') 23 | channels_per_group = num_channels // groups 24 | 25 | x = x.view(batch_size, groups, channels_per_group, height, width) 26 | x = torch.transpose(x, 1, 2).contiguous() 27 | x = x.view(batch_size, -1, height, width) 28 | 29 | return x 30 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/make_divisible.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def make_divisible(value, divisor, min_value=None, min_ratio=0.9): 3 | """Make divisible function. 4 | 5 | This function rounds the channel number down to the nearest value that can 6 | be divisible by the divisor. 7 | 8 | Args: 9 | value (int): The original channel number. 10 | divisor (int): The divisor to fully divide the channel number. 11 | min_value (int, optional): The minimum value of the output channel. 12 | Default: None, means that the minimum value equal to the divisor. 13 | min_ratio (float, optional): The minimum ratio of the rounded channel 14 | number to the original channel number. Default: 0.9. 15 | Returns: 16 | int: The modified output channel number 17 | """ 18 | 19 | if min_value is None: 20 | min_value = divisor 21 | new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) 22 | # Make sure that round down does not go down by more than (1-min_ratio). 23 | if new_value < min_ratio * value: 24 | new_value += divisor 25 | return new_value 26 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/se_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import mmcv 3 | import torch.nn as nn 4 | from mmcv.cnn import ConvModule 5 | 6 | 7 | class SELayer(nn.Module): 8 | """Squeeze-and-Excitation Module. 9 | 10 | Args: 11 | channels (int): The input (and output) channels of the SE layer. 12 | ratio (int): Squeeze ratio in SELayer, the intermediate channel will be 13 | ``int(channels/ratio)``. Default: 16. 
14 | conv_cfg (None or dict): Config dict for convolution layer. 15 | Default: None, which means using conv2d. 16 | act_cfg (dict or Sequence[dict]): Config dict for activation layer. 17 | If act_cfg is a dict, two activation layers will be configurated 18 | by this dict. If act_cfg is a sequence of dicts, the first 19 | activation layer will be configurated by the first dict and the 20 | second activation layer will be configurated by the second dict. 21 | Default: (dict(type='ReLU'), dict(type='Sigmoid')) 22 | """ 23 | 24 | def __init__(self, 25 | channels, 26 | ratio=16, 27 | conv_cfg=None, 28 | act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): 29 | super().__init__() 30 | if isinstance(act_cfg, dict): 31 | act_cfg = (act_cfg, act_cfg) 32 | assert len(act_cfg) == 2 33 | assert mmcv.is_tuple_of(act_cfg, dict) 34 | self.global_avgpool = nn.AdaptiveAvgPool2d(1) 35 | self.conv1 = ConvModule( 36 | in_channels=channels, 37 | out_channels=int(channels / ratio), 38 | kernel_size=1, 39 | stride=1, 40 | conv_cfg=conv_cfg, 41 | act_cfg=act_cfg[0]) 42 | self.conv2 = ConvModule( 43 | in_channels=int(channels / ratio), 44 | out_channels=channels, 45 | kernel_size=1, 46 | stride=1, 47 | conv_cfg=conv_cfg, 48 | act_cfg=act_cfg[1]) 49 | 50 | def forward(self, x): 51 | out = self.global_avgpool(x) 52 | out = self.conv1(out) 53 | out = self.conv2(out) 54 | return x * out 55 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/backbones/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from collections import OrderedDict 3 | 4 | from mmcv.runner.checkpoint import _load_checkpoint, load_state_dict 5 | 6 | 7 | def load_checkpoint(model, 8 | filename, 9 | map_location='cpu', 10 | strict=False, 11 | logger=None): 12 | """Load checkpoint from a file or URI. 13 | 14 | Args: 15 | model (Module): Module to load checkpoint. 16 | filename (str): Accept local filepath, URL, ``torchvision://xxx``, 17 | ``open-mmlab://xxx``. 18 | map_location (str): Same as :func:`torch.load`. 19 | strict (bool): Whether to allow different params for the model and 20 | checkpoint. 21 | logger (:mod:`logging.Logger` or None): The logger for error message. 22 | 23 | Returns: 24 | dict or OrderedDict: The loaded checkpoint. 25 | """ 26 | checkpoint = _load_checkpoint(filename, map_location) 27 | # OrderedDict is a subclass of dict 28 | if not isinstance(checkpoint, dict): 29 | raise RuntimeError( 30 | f'No state_dict found in checkpoint file {filename}') 31 | # get state_dict from checkpoint 32 | if 'state_dict' in checkpoint: 33 | state_dict_tmp = checkpoint['state_dict'] 34 | else: 35 | state_dict_tmp = checkpoint 36 | 37 | state_dict = OrderedDict() 38 | # strip prefix of state_dict 39 | for k, v in state_dict_tmp.items(): 40 | if k.startswith('module.backbone.'): 41 | state_dict[k[16:]] = v 42 | elif k.startswith('module.'): 43 | state_dict[k[7:]] = v 44 | elif k.startswith('backbone.'): 45 | state_dict[k[9:]] = v 46 | else: 47 | state_dict[k] = v 48 | # load state_dict 49 | load_state_dict(model, state_dict, strict, logger) 50 | return checkpoint 51 | 52 | 53 | def get_state_dict(filename, map_location='cpu'): 54 | """Get state_dict from a file or URI. 55 | 56 | Args: 57 | filename (str): Accept local filepath, URL, ``torchvision://xxx``, 58 | ``open-mmlab://xxx``. 59 | map_location (str): Same as :func:`torch.load`. 
60 | 61 | Returns: 62 | OrderedDict: The state_dict. 63 | """ 64 | checkpoint = _load_checkpoint(filename, map_location) 65 | # OrderedDict is a subclass of dict 66 | if not isinstance(checkpoint, dict): 67 | raise RuntimeError( 68 | f'No state_dict found in checkpoint file {filename}') 69 | # get state_dict from checkpoint 70 | if 'state_dict' in checkpoint: 71 | state_dict_tmp = checkpoint['state_dict'] 72 | else: 73 | state_dict_tmp = checkpoint 74 | 75 | state_dict = OrderedDict() 76 | # strip prefix of state_dict 77 | for k, v in state_dict_tmp.items(): 78 | if k.startswith('module.backbone.'): 79 | state_dict[k[16:]] = v 80 | elif k.startswith('module.'): 81 | state_dict[k[7:]] = v 82 | elif k.startswith('backbone.'): 83 | state_dict[k[9:]] = v 84 | else: 85 | state_dict[k] = v 86 | 87 | return state_dict 88 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/configs/coco/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/configs/coco/__init__.py -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # from .ae_higher_resolution_head import AEHigherResolutionHead 3 | # from .ae_multi_stage_head import AEMultiStageHead 4 | # from .ae_simple_head import AESimpleHead 5 | # from .deconv_head import DeconvHead 6 | # from .deeppose_regression_head import DeepposeRegressionHead 7 | # from .hmr_head import HMRMeshHead 8 | # from .interhand_3d_head import Interhand3DHead 9 | # from .temporal_regression_head import TemporalRegressionHead 10 | from .topdown_heatmap_base_head import TopdownHeatmapBaseHead 11 | # from .topdown_heatmap_multi_stage_head import (TopdownHeatmapMSMUHead, 12 | # TopdownHeatmapMultiStageHead) 13 | from .topdown_heatmap_simple_head import TopdownHeatmapSimpleHead 14 | # from .vipnas_heatmap_simple_head import ViPNASHeatmapSimpleHead 15 | # from .voxelpose_head import CuboidCenterHead, CuboidPoseHead 16 | 17 | # __all__ = [ 18 | # 'TopdownHeatmapSimpleHead', 'TopdownHeatmapMultiStageHead', 19 | # 'TopdownHeatmapMSMUHead', 'TopdownHeatmapBaseHead', 20 | # 'AEHigherResolutionHead', 'AESimpleHead', 'AEMultiStageHead', 21 | # 'DeepposeRegressionHead', 'TemporalRegressionHead', 'Interhand3DHead', 22 | # 'HMRMeshHead', 'DeconvHead', 'ViPNASHeatmapSimpleHead', 'CuboidCenterHead', 23 | # 'CuboidPoseHead' 24 | # ] 25 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/heads/hmr_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from mmcv.cnn import xavier_init 6 | 7 | from ..builder import HEADS 8 | from ..utils.geometry import rot6d_to_rotmat 9 | 10 | 11 | @HEADS.register_module() 12 | class HMRMeshHead(nn.Module): 13 | """SMPL parameters regressor head of simple baseline. "End-to-end Recovery 14 | of Human Shape and Pose", CVPR'2018. 
15 | 16 | Args: 17 | in_channels (int): Number of input channels 18 | smpl_mean_params (str): The file name of the mean SMPL parameters 19 | n_iter (int): The iterations of estimating delta parameters 20 | """ 21 | 22 | def __init__(self, in_channels, smpl_mean_params=None, n_iter=3): 23 | super().__init__() 24 | 25 | self.in_channels = in_channels 26 | self.n_iter = n_iter 27 | 28 | npose = 24 * 6 29 | nbeta = 10 30 | ncam = 3 31 | hidden_dim = 1024 32 | 33 | self.fc1 = nn.Linear(in_channels + npose + nbeta + ncam, hidden_dim) 34 | self.drop1 = nn.Dropout() 35 | self.fc2 = nn.Linear(hidden_dim, hidden_dim) 36 | self.drop2 = nn.Dropout() 37 | self.decpose = nn.Linear(hidden_dim, npose) 38 | self.decshape = nn.Linear(hidden_dim, nbeta) 39 | self.deccam = nn.Linear(hidden_dim, ncam) 40 | 41 | # Load mean SMPL parameters 42 | if smpl_mean_params is None: 43 | init_pose = torch.zeros([1, npose]) 44 | init_shape = torch.zeros([1, nbeta]) 45 | init_cam = torch.FloatTensor([[1, 0, 0]]) 46 | else: 47 | mean_params = np.load(smpl_mean_params) 48 | init_pose = torch.from_numpy( 49 | mean_params['pose'][:]).unsqueeze(0).float() 50 | init_shape = torch.from_numpy( 51 | mean_params['shape'][:]).unsqueeze(0).float() 52 | init_cam = torch.from_numpy( 53 | mean_params['cam']).unsqueeze(0).float() 54 | self.register_buffer('init_pose', init_pose) 55 | self.register_buffer('init_shape', init_shape) 56 | self.register_buffer('init_cam', init_cam) 57 | 58 | def forward(self, x): 59 | """Forward function. 60 | 61 | x is the image feature map and is expected to be in shape (batch size x 62 | channel number x height x width) 63 | """ 64 | batch_size = x.shape[0] 65 | # extract the global feature vector by average along 66 | # spatial dimension. 67 | x = x.mean(dim=-1).mean(dim=-1) 68 | 69 | init_pose = self.init_pose.expand(batch_size, -1) 70 | init_shape = self.init_shape.expand(batch_size, -1) 71 | init_cam = self.init_cam.expand(batch_size, -1) 72 | 73 | pred_pose = init_pose 74 | pred_shape = init_shape 75 | pred_cam = init_cam 76 | for _ in range(self.n_iter): 77 | xc = torch.cat([x, pred_pose, pred_shape, pred_cam], 1) 78 | xc = self.fc1(xc) 79 | xc = self.drop1(xc) 80 | xc = self.fc2(xc) 81 | xc = self.drop2(xc) 82 | pred_pose = self.decpose(xc) + pred_pose 83 | pred_shape = self.decshape(xc) + pred_shape 84 | pred_cam = self.deccam(xc) + pred_cam 85 | 86 | pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3) 87 | out = (pred_rotmat, pred_shape, pred_cam) 88 | return out 89 | 90 | def init_weights(self): 91 | """Initialize model weights.""" 92 | xavier_init(self.decpose, gain=0.01) 93 | xavier_init(self.decshape, gain=0.01) 94 | xavier_init(self.deccam, gain=0.01) 95 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/builder/model_builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # from configs.coco.ViTPose_base_coco_256x192 import model 4 | from .heads.topdown_heatmap_simple_head import TopdownHeatmapSimpleHead 5 | 6 | # import TopdownHeatmapSimpleHead 7 | from .backbones import ViT 8 | 9 | # print(model) 10 | import torch 11 | from functools import partial 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from importlib import import_module 15 | 16 | 17 | def build_model(model_name, checkpoint=None): 18 | try: 19 | path = ".configs.coco." 
+ model_name 20 | mod = import_module(path, package="src.vitpose_infer") 21 | 22 | model = getattr(mod, "model") 23 | # from path import model 24 | except: 25 | raise ValueError("not a correct config") 26 | 27 | head = TopdownHeatmapSimpleHead( 28 | in_channels=model["keypoint_head"]["in_channels"], 29 | out_channels=model["keypoint_head"]["out_channels"], 30 | num_deconv_filters=model["keypoint_head"]["num_deconv_filters"], 31 | num_deconv_kernels=model["keypoint_head"]["num_deconv_kernels"], 32 | num_deconv_layers=model["keypoint_head"]["num_deconv_layers"], 33 | extra=model["keypoint_head"]["extra"], 34 | ) 35 | # print(head) 36 | backbone = ViT( 37 | img_size=model["backbone"]["img_size"], 38 | patch_size=model["backbone"]["patch_size"], 39 | embed_dim=model["backbone"]["embed_dim"], 40 | depth=model["backbone"]["depth"], 41 | num_heads=model["backbone"]["num_heads"], 42 | ratio=model["backbone"]["ratio"], 43 | mlp_ratio=model["backbone"]["mlp_ratio"], 44 | qkv_bias=model["backbone"]["qkv_bias"], 45 | drop_path_rate=model["backbone"]["drop_path_rate"], 46 | ) 47 | 48 | class VitPoseModel(nn.Module): 49 | def __init__(self, backbone, keypoint_head): 50 | super(VitPoseModel, self).__init__() 51 | self.backbone = backbone 52 | self.keypoint_head = keypoint_head 53 | 54 | def forward(self, x): 55 | x = self.backbone(x) 56 | x = self.keypoint_head(x) 57 | return x 58 | 59 | pose = VitPoseModel(backbone, head) 60 | if checkpoint is not None: 61 | check = torch.load(checkpoint) 62 | 63 | pose.load_state_dict(check["state_dict"]) 64 | return pose 65 | 66 | 67 | # pose = build_model('ViTPose_base_coco_256x192','./models/vitpose-b-multi-coco.pth') 68 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/__init__.py -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/convert_to_trt.py: -------------------------------------------------------------------------------- 1 | from torch2trt import TRTModule,torch2trt 2 | from builder import build_model 3 | import torch 4 | pose = build_model('ViTPose_base_coco_256x192','./models/vitpose-b.pth') 5 | pose.cuda().eval() 6 | 7 | x = torch.ones(1,3,256,192).cuda() 8 | net_trt = torch2trt(pose, [x],max_batch_size=10, fp16_mode=True) 9 | torch.save(net_trt.state_dict(), 'vitpose_trt.pth') -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/inference_test.py: -------------------------------------------------------------------------------- 1 | from builder import build_model 2 | import torch 3 | from ViTPose_trt import TRTModule_ViTPose 4 | # pose = TRTModule_ViTPose(path='pose_higher_hrnet_w32_512.engine',device='cuda:0') 5 | pose = build_model('ViTPose_base_coco_256x192','./models/vitpose-b.pth') 6 | pose.cuda().eval() 7 | if pose.training: 8 | print('train') 9 | else: 10 | print('eval') 11 | device = torch.device("cuda") 12 | # pose.to(device) 13 | dummy_input = torch.randn(10, 3,256,192, dtype=torch.float).to(device) 14 | repetitions=100 15 | total_time = 0 16 | starter, ender = 
torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) 17 | with torch.no_grad(): 18 | for rep in range(repetitions): 19 | # starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) 20 | starter.record() 21 | # for k in range(10): 22 | _ = pose(dummy_input) 23 | ender.record() 24 | torch.cuda.synchronize() 25 | curr_time = starter.elapsed_time(ender)/1000 26 | total_time += curr_time 27 | Throughput = repetitions*10/total_time 28 | print('Final Throughput:',Throughput) 29 | print('Total time',total_time) -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/logger_helper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | class CustomFormatter(logging.Formatter): 4 | 5 | grey = "\x1b[38;20m" 6 | yellow = "\x1b[33;20m" 7 | red = "\x1b[31;20m" 8 | bold_red = "\x1b[31;1m" 9 | reset = "\x1b[0m" 10 | format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)" 11 | 12 | FORMATS = { 13 | logging.DEBUG: grey + format + reset, 14 | logging.INFO: grey + format + reset, 15 | logging.WARNING: yellow + format + reset, 16 | logging.ERROR: red + format + reset, 17 | logging.CRITICAL: bold_red + format + reset 18 | } 19 | 20 | def format(self, record): 21 | log_fmt = self.FORMATS.get(record.levelno) 22 | formatter = logging.Formatter(log_fmt) 23 | return formatter.format(record) -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/preproc/vitpose_pytorch/src/vitpose_infer/pose_utils/timerr.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class Timer(object): 5 | """A simple timer.""" 6 | def __init__(self): 7 | self.total_time = 0. 8 | self.calls = 0 9 | self.start_time = 0. 10 | self.diff = 0. 11 | self.average_time = 0. 12 | 13 | self.duration = 0. 14 | 15 | def tic(self): 16 | # using time.time instead of time.clock because time time.clock 17 | # does not normalize for multithreading 18 | self.start_time = time.time() 19 | 20 | def toc(self, average=True): 21 | self.diff = time.time() - self.start_time 22 | self.total_time += self.diff 23 | self.calls += 1 24 | self.average_time = self.total_time / self.calls 25 | if average: 26 | self.duration = self.average_time 27 | else: 28 | self.duration = self.diff 29 | return self.duration 30 | 31 | def clear(self): 32 | self.total_time = 0. 33 | self.calls = 0 34 | self.start_time = 0. 35 | self.diff = 0. 36 | self.average_time = 0. 37 | self.duration = 0. 
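# Usage sketch (illustrative only; `do_work` is a hypothetical placeholder, not part of this module):
#   timer = Timer()
#   for _ in range(10):
#       timer.tic()
#       do_work()
#       timer.toc()                  # average=True (default): returns the running average in seconds
#   print(timer.average_time)        # mean seconds per tic/toc pair across the 10 iterations
#   # pass average=False to toc() to get the duration of the current interval instead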
-------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/pylogger.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | import logging 3 | import torch 4 | from colorlog import ColoredFormatter 5 | 6 | 7 | def sync_time(): 8 | torch.cuda.synchronize() 9 | return time() 10 | 11 | 12 | Log = logging.getLogger() 13 | Log.time = time 14 | Log.sync_time = sync_time 15 | 16 | # Set default 17 | Log.setLevel(logging.INFO) 18 | ch = logging.StreamHandler() 19 | ch.setLevel(logging.INFO) 20 | # Use colorlog 21 | formatstring = "[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] %(message)s" 22 | datefmt = "%m/%d %H:%M:%S" 23 | ch.setFormatter(ColoredFormatter(formatstring, datefmt=datefmt)) 24 | 25 | Log.addHandler(ch) 26 | # Log.info("Init-Logger") 27 | 28 | 29 | def timer(sync_cuda=False, mem=False, loop=1): 30 | """ 31 | Args: 32 | func: function 33 | sync_cuda: bool, whether to synchronize cuda 34 | mem: bool, whether to log memory 35 | """ 36 | 37 | def decorator(func): 38 | def wrapper(*args, **kwargs): 39 | if mem: 40 | start_mem = torch.cuda.memory_allocated() / 1024**2 41 | if sync_cuda: 42 | torch.cuda.synchronize() 43 | 44 | start = Log.time() 45 | for _ in range(loop): 46 | result = func(*args, **kwargs) 47 | 48 | if sync_cuda: 49 | torch.cuda.synchronize() 50 | if loop == 1: 51 | message = f"{func.__name__} took {Log.time() - start:.3f} s." 52 | else: 53 | message = f"{func.__name__} took {((Log.time() - start))/loop:.3f} s. (loop={loop})" 54 | 55 | if mem: 56 | end_mem = torch.cuda.memory_allocated() / 1024**2 57 | end_max_mem = torch.cuda.max_memory_allocated() / 1024**2 58 | message += f" Start_Mem {start_mem:.1f} Max {end_max_mem:.1f} MB" 59 | Log.info(message) 60 | 61 | return result 62 | 63 | return wrapper 64 | 65 | return decorator 66 | 67 | 68 | def timed(fn): 69 | """example usage: timed(lambda: model(inp))""" 70 | start = torch.cuda.Event(enable_timing=True) 71 | end = torch.cuda.Event(enable_timing=True) 72 | start.record() 73 | result = fn() 74 | end.record() 75 | torch.cuda.synchronize() 76 | return result, start.elapsed_time(end) / 1000 77 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/vis/README.md: -------------------------------------------------------------------------------- 1 | ## Pytorch3D Renderer 2 | 3 | Example: 4 | ```python 5 | from hmr4d.utils.vis.renderer import Renderer 6 | import imageio 7 | 8 | fps = 30 9 | focal_length = data["cam_int"][0][0, 0] 10 | width, height = img_hw 11 | faces = smplh[data["gender"]].bm.faces 12 | renderer = Renderer(width, height, focal_length, "cuda", faces) 13 | writer = imageio.get_writer("tmp_debug.mp4", fps=fps, mode="I", format="FFMPEG", macro_block_size=1) 14 | 15 | for i in tqdm(range(length)): 16 | img = np.zeros((height, width, 3), dtype=np.uint8) 17 | img = renderer.render_mesh(smplh_out.vertices[i].cuda(), img) 18 | writer.append_data(img) 19 | writer.close() 20 | ``` -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/vis/renderer_utils.py: -------------------------------------------------------------------------------- 1 | from hmr4d.utils.vis.renderer import Renderer 2 | from tqdm import tqdm 3 | import numpy as np 4 | 5 | 6 | def simple_render_mesh(render_dict): 7 | """Render an camera-space mesh, blank background""" 8 | width, height, focal_length = render_dict["whf"] 9 | faces 
= render_dict["faces"] 10 | verts = render_dict["verts"] 11 | 12 | renderer = Renderer(width, height, focal_length, device="cuda", faces=faces) 13 | outputs = [] 14 | for i in tqdm(range(len(verts)), desc=f"Rendering"): 15 | img = renderer.render_mesh(verts[i].cuda(), colors=[0.8, 0.8, 0.8]) 16 | outputs.append(img) 17 | outputs = np.stack(outputs, axis=0) 18 | return outputs 19 | 20 | 21 | def simple_render_mesh_background(render_dict, VI=50, colors=[0.8, 0.8, 0.8]): 22 | """Render a camera-space mesh composited over the provided background frames""" 23 | K = render_dict["K"] 24 | faces = render_dict["faces"] 25 | verts = render_dict["verts"] 26 | background = render_dict["background"] 27 | N_frames = len(verts) 28 | if len(background.shape) == 3: 29 | background = [background] * N_frames 30 | height, width = background[0].shape[:2] 31 | 32 | renderer = Renderer(width, height, device="cuda", faces=faces, K=K) 33 | outputs = [] 34 | for i in tqdm(range(len(verts)), desc=f"Rendering"): 35 | img = renderer.render_mesh(verts[i].cuda(), colors=colors, background=background[i], VI=VI) 36 | outputs.append(img) 37 | outputs = np.stack(outputs, axis=0) 38 | return outputs 39 | -------------------------------------------------------------------------------- /eval/GVHMR/hmr4d/utils/vis/rich_logger.py: -------------------------------------------------------------------------------- 1 | from pytorch_lightning.utilities import rank_zero_only 2 | from omegaconf import DictConfig, OmegaConf 3 | import rich 4 | import rich.tree 5 | import rich.syntax 6 | from hmr4d.utils.pylogger import Log 7 | 8 | 9 | @rank_zero_only 10 | def print_cfg(cfg: DictConfig, use_rich: bool = False): 11 | if use_rich: 12 | print_order = ("data", "model", "callbacks", "logger", "pl_trainer") 13 | style = "dim" 14 | tree = rich.tree.Tree("CONFIG", style=style, guide_style=style) 15 | 16 | # add fields from `print_order` to queue 17 | # add all the other fields to queue (not specified in `print_order`) 18 | queue = [] 19 | for field in print_order: 20 | queue.append(field) if field in cfg else Log.warn(f"Field '{field}' not found in config. 
Skipping.") 21 | for field in cfg: 22 | if field not in queue: 23 | queue.append(field) 24 | 25 | # generate config tree from queue 26 | for field in queue: 27 | branch = tree.add(field, style=style, guide_style=style) 28 | config_group = cfg[field] 29 | if isinstance(config_group, DictConfig): 30 | branch_content = OmegaConf.to_yaml(config_group, resolve=False) 31 | else: 32 | branch_content = str(config_group) 33 | branch.add(rich.syntax.Syntax(branch_content, "yaml")) 34 | rich.print(tree) 35 | else: 36 | Log.info(OmegaConf.to_yaml(cfg, resolve=False)) 37 | -------------------------------------------------------------------------------- /eval/GVHMR/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | include = '\.pyi?$' 4 | exclude = ''' 5 | /( 6 | \.git 7 | | \.hg 8 | | \.mypy_cache 9 | | \.tox 10 | | \.venv 11 | | _build 12 | | buck-out 13 | | build 14 | | dist 15 | )/ 16 | ''' 17 | -------------------------------------------------------------------------------- /eval/GVHMR/pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "exclude": [ 3 | "./inputs", 4 | "./outputs" 5 | ], 6 | "typeCheckingMode": "off", 7 | } 8 | -------------------------------------------------------------------------------- /eval/GVHMR/requirements.txt: -------------------------------------------------------------------------------- 1 | # PyTorch 2 | --extra-index-url https://download.pytorch.org/whl/cu121 3 | torch==2.3.0+cu121 4 | torchvision==0.18.0+cu121 5 | timm==0.9.12 # For HMR2.0a feature extraction 6 | 7 | # Lightning + Hydra 8 | lightning==2.3.0 9 | hydra-core==1.3 10 | hydra-zen 11 | hydra_colorlog 12 | rich 13 | 14 | # Common utilities 15 | numpy==1.23.5 16 | jupyter 17 | matplotlib 18 | ipdb 19 | setuptools>=68.0 20 | black 21 | tensorboardX 22 | opencv-python 23 | ffmpeg-python 24 | scikit-image 25 | termcolor 26 | einops 27 | imageio==2.34.1 28 | av # imageio[pyav], improved performance over imageio[ffmpeg] 29 | joblib 30 | 31 | # Diffusion 32 | # diffusers[torch]==0.19.3 33 | # transformers==4.31.0 34 | 35 | # 3D-Vision 36 | pytorch3d @ https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt230/pytorch3d-0.7.6-cp310-cp310-linux_x86_64.whl 37 | trimesh 38 | chumpy 39 | smplx 40 | # open3d==0.17.0 41 | wis3d 42 | 43 | # 2D-Pose 44 | ultralytics==8.2.42 # YOLO 45 | cython_bbox 46 | lapx -------------------------------------------------------------------------------- /eval/GVHMR/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name="gvhmr", 6 | version="1.0.0", 7 | packages=find_packages(), 8 | author="Zehong Shen", 9 | description=["GVHMR training and inference"], 10 | url="https://github.com/zju3dv/GVHMR", 11 | ) 12 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/demo/demo_folder.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | from tqdm import tqdm 4 | from hmr4d.utils.pylogger import Log 5 | import subprocess 6 | import os 7 | 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("-f", "--folder", type=str) 12 | parser.add_argument("-d", "--output_root", type=str, default=None) 13 | parser.add_argument("-s", "--static_cam", 
action="store_true", help="If true, skip DPVO") 14 | args = parser.parse_args() 15 | 16 | output_root = args.output_root 17 | 18 | sub_folders = os.listdir(args.folder) 19 | mp4_paths = [] 20 | for sub_folder in sub_folders: 21 | files = os.listdir(os.path.join(args.folder, sub_folder)) 22 | for file in files: 23 | if file.endswith('.mp4'): 24 | mp4_path = os.path.join(args.folder, sub_folder, file) 25 | mp4_paths.append(mp4_path) 26 | 27 | # Run demo.py for each .mp4 file 28 | Log.info(f"Found {len(mp4_paths)} .mp4 files in {args.folder}") 29 | for mp4_path in tqdm(mp4_paths): 30 | try: 31 | command = ["python", "tools/demo/demo.py", "--video", str(mp4_path)] 32 | if output_root is not None: 33 | command += ["--output_root", output_root] 34 | if args.static_cam: 35 | command += ["-s"] 36 | Log.info(f"Running: {' '.join(command)}") 37 | subprocess.run(command, env=dict(os.environ), check=True) 38 | except: 39 | continue 40 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/train.py: -------------------------------------------------------------------------------- 1 | import hydra 2 | import pytorch_lightning as pl 3 | from omegaconf import DictConfig, OmegaConf 4 | from pytorch_lightning.callbacks.checkpoint import Checkpoint 5 | 6 | from hmr4d.utils.pylogger import Log 7 | from hmr4d.configs import register_store_gvhmr 8 | from hmr4d.utils.vis.rich_logger import print_cfg 9 | from hmr4d.utils.net_utils import load_pretrained_model, get_resume_ckpt_path 10 | 11 | 12 | def get_callbacks(cfg: DictConfig) -> list: 13 | """Parse and instantiate all the callbacks in the config.""" 14 | if not hasattr(cfg, "callbacks") or cfg.callbacks is None: 15 | return None 16 | # Handle special callbacks 17 | enable_checkpointing = cfg.pl_trainer.get("enable_checkpointing", True) 18 | # Instantiate all the callbacks 19 | callbacks = [] 20 | for callback in cfg.callbacks.values(): 21 | if callback is not None: 22 | cb = hydra.utils.instantiate(callback, _recursive_=False) 23 | # skip when disable checkpointing and the callback is Checkpoint 24 | if not enable_checkpointing and isinstance(cb, Checkpoint): 25 | continue 26 | else: 27 | callbacks.append(cb) 28 | return callbacks 29 | 30 | 31 | def train(cfg: DictConfig) -> None: 32 | """Train/Test""" 33 | Log.info(f"[Exp Name]: {cfg.exp_name}") 34 | if cfg.task == "fit": 35 | Log.info(f"[GPU x Batch] = {cfg.pl_trainer.devices} x {cfg.data.loader_opts.train.batch_size}") 36 | pl.seed_everything(cfg.seed) 37 | 38 | # preparation 39 | datamodule: pl.LightningDataModule = hydra.utils.instantiate(cfg.data, _recursive_=False) 40 | model: pl.LightningModule = hydra.utils.instantiate(cfg.model, _recursive_=False) 41 | if cfg.ckpt_path is not None: 42 | load_pretrained_model(model, cfg.ckpt_path) 43 | 44 | # PL callbacks and logger 45 | callbacks = get_callbacks(cfg) 46 | has_ckpt_cb = any([isinstance(cb, Checkpoint) for cb in callbacks]) 47 | if not has_ckpt_cb and cfg.pl_trainer.get("enable_checkpointing", True): 48 | Log.warning("No checkpoint-callback found. 
Disabling PL auto checkpointing.") 49 | cfg.pl_trainer = {**cfg.pl_trainer, "enable_checkpointing": False} 50 | logger = hydra.utils.instantiate(cfg.logger, _recursive_=False) 51 | 52 | # PL-Trainer 53 | if cfg.task == "test": 54 | Log.info("Test mode forces full-precision.") 55 | cfg.pl_trainer = {**cfg.pl_trainer, "precision": 32} 56 | trainer = pl.Trainer( 57 | accelerator="gpu", 58 | logger=logger if logger is not None else False, 59 | callbacks=callbacks, 60 | **cfg.pl_trainer, 61 | ) 62 | 63 | if cfg.task == "fit": 64 | resume_path = None 65 | if cfg.resume_mode is not None: 66 | resume_path = get_resume_ckpt_path(cfg.resume_mode, ckpt_dir=cfg.callbacks.model_checkpoint.dirpath) 67 | Log.info(f"Resume training from {resume_path}") 68 | Log.info("Start Fitting...") 69 | trainer.fit(model, datamodule.train_dataloader(), datamodule.val_dataloader(), ckpt_path=resume_path) 70 | elif cfg.task == "test": 71 | Log.info("Start Testing...") 72 | trainer.test(model, datamodule.test_dataloader()) 73 | else: 74 | raise ValueError(f"Unknown task: {cfg.task}") 75 | 76 | Log.info("End of script.") 77 | 78 | 79 | @hydra.main(version_base="1.3", config_path="../hmr4d/configs", config_name="train") 80 | def main(cfg) -> None: 81 | print_cfg(cfg, use_rich=True) 82 | train(cfg) 83 | 84 | 85 | if __name__ == "__main__": 86 | register_store_gvhmr() 87 | main() 88 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/unitest/make_hydra_cfg.py: -------------------------------------------------------------------------------- 1 | from hmr4d.configs import parse_args_to_cfg, register_store_gvhmr 2 | from hmr4d.utils.vis.rich_logger import print_cfg 3 | 4 | if __name__ == "__main__": 5 | register_store_gvhmr() 6 | cfg = parse_args_to_cfg() 7 | print_cfg(cfg, use_rich=True) 8 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/unitest/run_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | from tqdm import tqdm 4 | 5 | 6 | def get_dataset(DATA_TYPE): 7 | if DATA_TYPE == "BEDLAM_V2": 8 | from hmr4d.dataset.bedlam.bedlam import BedlamDatasetV2 9 | 10 | return BedlamDatasetV2() 11 | 12 | if DATA_TYPE == "3DPW_TRAIN": 13 | from hmr4d.dataset.threedpw.threedpw_motion_train import ThreedpwSmplDataset 14 | 15 | return ThreedpwSmplDataset() 16 | 17 | if __name__ == "__main__": 18 | DATA_TYPE = "3DPW_TRAIN" 19 | dataset = get_dataset(DATA_TYPE) 20 | print(len(dataset)) 21 | 22 | data = dataset[0] 23 | 24 | from hmr4d.datamodule.mocap_trainX_testY import collate_fn 25 | 26 | loader = DataLoader( 27 | dataset, 28 | shuffle=False, 29 | num_workers=0, 30 | persistent_workers=False, 31 | pin_memory=False, 32 | batch_size=1, 33 | collate_fn=collate_fn, 34 | ) 35 | i = 0 36 | for batch in tqdm(loader): 37 | i += 1 38 | # if i == 20: 39 | # raise AssertionError 40 | # time.sleep(0.2) 41 | pass 42 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/video/merge_folder.py: -------------------------------------------------------------------------------- 1 | """This script globs two folders, checks that their mp4 files match one-to-one, then calls merge_horizontal.py (or merge_vertical.py with --vertical) to merge them pair by pair""" 2 | 3 | import os 4 | import argparse 5 | from pathlib import Path 6 | 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("input_dir1", type=str) 11 | 
parser.add_argument("input_dir2", type=str) 12 | parser.add_argument("output_dir", type=str) 13 | parser.add_argument("--vertical", action="store_true") # By default use horizontal 14 | args = parser.parse_args() 15 | 16 | # Check input 17 | input_dir1 = Path(args.input_dir1) 18 | input_dir2 = Path(args.input_dir2) 19 | assert input_dir1.exists() 20 | assert input_dir2.exists() 21 | video_paths1 = sorted(input_dir1.glob("*.mp4")) 22 | video_paths2 = sorted(input_dir2.glob("*.mp4")) 23 | assert len(video_paths1) == len(video_paths2) 24 | for path1, path2 in zip(video_paths1, video_paths2): 25 | assert path1.stem == path2.stem 26 | 27 | # Merge to output 28 | output_dir = Path(args.output_dir) 29 | output_dir.mkdir(parents=True, exist_ok=True) 30 | 31 | for path1, path2 in zip(video_paths1, video_paths2): 32 | out_path = output_dir / f"{path1.stem}.mp4" 33 | in_paths = [str(path1), str(path2)] 34 | print(f"Merging {in_paths} to {out_path}") 35 | if args.vertical: 36 | os.system(f"python tools/video/merge_vertical.py {' '.join(in_paths)} -o {out_path}") 37 | else: 38 | os.system(f"python tools/video/merge_horizontal.py {' '.join(in_paths)} -o {out_path}") 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/video/merge_horizontal.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from hmr4d.utils.video_io_utils import merge_videos_horizontal 3 | 4 | 5 | def parse_args(): 6 | """python tools/video/merge_horizontal.py a.mp4 b.mp4 c.mp4 -o out.mp4""" 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("input_videos", nargs="+", help="Input video paths") 9 | parser.add_argument("-o", "--output", type=str, required=True, help="Output video path") 10 | return parser.parse_args() 11 | 12 | 13 | if __name__ == "__main__": 14 | args = parse_args() 15 | merge_videos_horizontal(args.input_videos, args.output) 16 | -------------------------------------------------------------------------------- /eval/GVHMR/tools/video/merge_vertical.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from hmr4d.utils.video_io_utils import merge_videos_vertical 3 | 4 | 5 | def parse_args(): 6 | """python tools/video/merge_vertical.py a.mp4 b.mp4 c.mp4 -o out.mp4""" 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("input_videos", nargs="+", help="Input video paths") 9 | parser.add_argument("-o", "--output", type=str, required=True, help="Output video path") 10 | return parser.parse_args() 11 | 12 | 13 | if __name__ == "__main__": 14 | args = parse_args() 15 | merge_videos_vertical(args.input_videos, args.output) 16 | -------------------------------------------------------------------------------- /eval/common_metrics_on_video_quality/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ -------------------------------------------------------------------------------- /eval/common_metrics_on_video_quality/calculate_clip.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from PIL import Image 3 | import torch 4 | from transformers import CLIPProcessor, CLIPModel 5 | import json 6 | import os 7 | from tqdm import tqdm 8 | import torch 9 | import clip 10 | from PIL import Image 11 | import cv2 12 | import numpy as np 13 | import os 14 | import argparse 15 | 16 | device = "cuda" if 
torch.cuda.is_available() else "cpu" 17 | 18 | model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device) 19 | processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") 20 | 21 | def get_video_scores(video_path, prompt): 22 | video = cv2.VideoCapture(video_path) 23 | texts = [prompt] 24 | clip_score_list = [] 25 | while True: 26 | ret, frame = video.read() 27 | 28 | if ret: 29 | image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 30 | inputs = processor(text=texts, images=[image], return_tensors="pt", padding=True, truncation=True).to(device) 31 | logits_per_image = model(**inputs).logits_per_image 32 | clip_score = logits_per_image.item() 33 | clip_score_list.append(clip_score) 34 | else: 35 | break 36 | 37 | video.release() 38 | return sum(clip_score_list) / len(clip_score_list) 39 | 40 | 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("-v_f", "--videos_folder", type=str) 43 | args = parser.parse_args() 44 | 45 | videos_folder_path = args.videos_folder 46 | prompts_path = '/ytech_m2v2_hdd/fuxiao/scenectrl/common_metrics_on_video_quality/eval_prompts.json' 47 | with open(prompts_path, "r", encoding="utf-8") as f: prompts_dict = json.load(f) 48 | 49 | sub_folders = os.listdir(videos_folder_path) 50 | videos_name = [] 51 | for sub_folder in sub_folders: 52 | files = os.listdir(os.path.join(videos_folder_path, sub_folder)) 53 | for file in files: 54 | if file.endswith('.mp4'): 55 | video_name = os.path.join(sub_folder, file) 56 | videos_name.append(video_name) 57 | 58 | num_videos = len(videos_name) 59 | 60 | prompts = [] 61 | video_paths = [] 62 | for video_name in videos_name: 63 | prompt = prompts_dict[video_name.split('/')[0]] 64 | video_path = os.path.join(videos_folder_path, video_name) 65 | prompts.append(prompt) 66 | video_paths.append(video_path) 67 | 68 | import csv 69 | CLIP_T = True 70 | if CLIP_T: 71 | scores = [] 72 | for i in tqdm(range(num_videos)): 73 | # Load the video 74 | video_path = video_paths[i] 75 | 76 | # Prepare the text prompt 77 | texts = prompts[i] 78 | score = get_video_scores(video_path, texts) 79 | scores.append(score) 80 | 81 | print(f"CLIP-SIM: {sum(scores)/len(scores)/100.}") 82 | #### CLIP-T #### 83 | # basemodel: 33.44 -------------------------------------------------------------------------------- /eval/common_metrics_on_video_quality/calculate_fvd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from tqdm import tqdm 4 | 5 | def trans(x): 6 | # if greyscale images add channel 7 | if x.shape[-3] == 1: 8 | x = x.repeat(1, 1, 3, 1, 1) 9 | 10 | # permute BTCHW -> BCTHW 11 | x = x.permute(0, 2, 1, 3, 4) 12 | 13 | return x 14 | 15 | def calculate_fvd(videos1, videos2, device, method='styleganv'): 16 | 17 | if method == 'styleganv': 18 | from fvd.styleganv.fvd import get_fvd_feats, frechet_distance, load_i3d_pretrained 19 | elif method == 'videogpt': 20 | from fvd.videogpt.fvd import load_i3d_pretrained 21 | from fvd.videogpt.fvd import get_fvd_logits as get_fvd_feats 22 | from fvd.videogpt.fvd import frechet_distance 23 | 24 | print("calculate_fvd...") 25 | 26 | # videos [batch_size, timestamps, channel, h, w] 27 | 28 | assert videos1.shape == videos2.shape 29 | 30 | i3d = load_i3d_pretrained(device=device) 31 | fvd_results = [] 32 | 33 | # support grayscale input, if grayscale -> channel*3 34 | # BTCHW -> BCTHW 35 | # videos -> [batch_size, channel, timestamps, h, w] 36 | 37 | videos1 = trans(videos1) 38 | videos2 = trans(videos2) 39 | 40 | 
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/calculate_lpips.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
from tqdm import tqdm

import lpips

spatial = True  # Return a spatial map of perceptual distance.

# Linearly calibrated models (LPIPS)
loss_fn = lpips.LPIPS(net='alex', spatial=spatial)  # Can also set net = 'squeeze' or 'vgg'
# loss_fn = lpips.LPIPS(net='alex', spatial=spatial, lpips=False)  # Can also set net = 'squeeze' or 'vgg'


def trans(x):
    # if greyscale images, add a channel dimension
    if x.shape[-3] == 1:
        x = x.repeat(1, 1, 3, 1, 1)

    # value range [0, 1] -> [-1, 1]
    x = x * 2 - 1

    return x


def calculate_lpips(videos1, videos2, device):
    # images should be RGB and, IMPORTANT, normalized to [-1, 1]
    print("calculate_lpips...")

    assert videos1.shape == videos2.shape

    # videos [batch_size, timestamps, channel, h, w]

    # support grayscale input: if grayscale -> channel*3
    # value range [0, 1] -> [-1, 1]
    videos1 = trans(videos1)
    videos2 = trans(videos2)

    lpips_results = []

    for video_num in tqdm(range(videos1.shape[0])):
        # get a video
        # video [timestamps, channel, h, w]
        video1 = videos1[video_num]
        video2 = videos2[video_num]

        lpips_results_of_a_video = []
        for clip_timestamp in range(len(video1)):
            # get an image
            # img [channel, h, w] tensor
            img1 = video1[clip_timestamp].unsqueeze(0).to(device)
            img2 = video2[clip_timestamp].unsqueeze(0).to(device)

            loss_fn.to(device)

            # LPIPS between the two frames at this timestamp
            lpips_results_of_a_video.append(loss_fn.forward(img1, img2).mean().detach().cpu().tolist())
        lpips_results.append(lpips_results_of_a_video)

    lpips_results = np.array(lpips_results)

    lpips_mean = {}
    lpips_std = {}

    # per-timestamp mean/std over the batch
    for clip_timestamp in range(len(video1)):
        lpips_mean[clip_timestamp] = np.mean(lpips_results[:, clip_timestamp])
        lpips_std[clip_timestamp] = np.std(lpips_results[:, clip_timestamp])

    result = {
        "value": lpips_mean,
        "value_std": lpips_std,
        "video_setting": video1.shape,
        "video_setting_name": "time, channel, height, width",
    }

    return result


# test code / usage example

def main():
    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.ones(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    device = torch.device("cuda")
    # device = torch.device("cpu")

    import json
    result = calculate_lpips(videos1, videos2, device)
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()
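The result above is per-timestamp: result["value"][t] is the mean LPIPS over the batch at frame t. To report a single LPIPS number, average over frames as well. A small sketch, assuming calculate_lpips from the file above is in scope and the inputs are BTCHW tensors already scaled to [0, 1]:

import numpy as np
import torch

# placeholder batch: 2 videos, 8 frames each, 3x64x64, values in [0, 1]
videos1 = torch.rand(2, 8, 3, 64, 64)
videos2 = torch.rand(2, 8, 3, 64, 64)

result = calculate_lpips(videos1, videos2, torch.device("cpu"))
overall = float(np.mean(list(result["value"].values())))
print(f"LPIPS averaged over batch and frames: {overall:.4f}")
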
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/calculate_psnr.py:
--------------------------------------------------------------------------------
import math

import numpy as np
import torch
from tqdm import tqdm


def img_psnr(img1, img2):
    # images are expected in [0, 1], so the peak signal is 1
    # compute mse
    # mse = np.mean((img1-img2)**2)
    mse = np.mean((img1 / 1.0 - img2 / 1.0) ** 2)
    # compute psnr
    if mse < 1e-10:
        return 100
    psnr = 20 * math.log10(1 / math.sqrt(mse))
    return psnr


def trans(x):
    return x


def calculate_psnr(videos1, videos2):
    print("calculate_psnr...")

    # videos [batch_size, timestamps, channel, h, w]
    assert videos1.shape == videos2.shape

    videos1 = trans(videos1)
    videos2 = trans(videos2)

    psnr_results = []

    for video_num in tqdm(range(videos1.shape[0])):
        # get a video
        # video [timestamps, channel, h, w]
        video1 = videos1[video_num]
        video2 = videos2[video_num]

        psnr_results_of_a_video = []
        for clip_timestamp in range(len(video1)):
            # get an image
            # img [channel, h, w] numpy
            img1 = video1[clip_timestamp].numpy()
            img2 = video2[clip_timestamp].numpy()

            # PSNR between the two frames at this timestamp
            psnr_results_of_a_video.append(img_psnr(img1, img2))

        psnr_results.append(psnr_results_of_a_video)

    psnr_results = np.array(psnr_results)

    psnr = {}
    psnr_std = {}

    # per-timestamp mean/std over the batch
    for clip_timestamp in range(len(video1)):
        psnr[clip_timestamp] = np.mean(psnr_results[:, clip_timestamp])
        psnr_std[clip_timestamp] = np.std(psnr_results[:, clip_timestamp])

    result = {
        "value": psnr,
        "value_std": psnr_std,
        "video_setting": video1.shape,
        "video_setting_name": "time, channel, height, width",
    }

    return result


# test code / usage example

def main():
    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)

    import json
    result = calculate_psnr(videos1, videos2)
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()
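img_psnr above takes the peak signal to be 1, so frames must be in [0, 1]; uint8 frames passed in directly would shift the result by 20*log10(255), roughly 48 dB. A small sketch of preparing a uint8 frame pair (placeholder random arrays) before calling img_psnr from the file above:

import numpy as np

frame1_u8 = np.random.randint(0, 256, (3, 64, 64), dtype=np.uint8)  # placeholder frame
frame2_u8 = np.random.randint(0, 256, (3, 64, 64), dtype=np.uint8)  # placeholder frame

# rescale to [0, 1] before computing PSNR
frame1 = frame1_u8.astype(np.float64) / 255.0
frame2 = frame2_u8.astype(np.float64) / 255.0
print(img_psnr(frame1, frame2))
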
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/download_eval_visual.sh:
--------------------------------------------------------------------------------
gdown https://drive.google.com/uc\?id\=1U2hd6qvwKLfp7c8yGgcTqdqrP_lKJElB
gdown https://drive.google.com/uc\?id\=1jMH2-ZC0ZBgtqej5Sp-E5ebBIX7mk3Xz
gdown https://drive.google.com/uc\?id\=1kfdCDA5koYh9g3IkCCHb4XPch2CJAwek

unzip fvd.zip
unzip eval_sets.zip
unzip base_t2v_eval_sets.zip

mv eval_sets eval_folder/
mv base_t2v_eval_sets eval_folder/

rm -rf *.zip
--------------------------------------------------------------------------------
/eval/common_metrics_on_video_quality/eval_visual.sh:
--------------------------------------------------------------------------------
basedir=eval_folder
folder1_path=${basedir}/base_t2v_eval_sets
folder2_path=${basedir}/eval_sets

# calculate FVD
python calculate_fvd_styleganv.py -v1_f ${folder1_path} -v2_f ${folder2_path}

# calculate FID
python -m pytorch_fid ${basedir}/eval_1 ${basedir}/eval_2

# calculate CLIP-SIM
python calculate_clip.py -v_f ${folder2_path}

rm -rf ${basedir}/eval_1
rm -rf ${basedir}/eval_2
--------------------------------------------------------------------------------
/imgs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/imgs/logo.png
--------------------------------------------------------------------------------
/imgs/vis_objstraj.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KwaiVGI/3DTrajMaster/7c0100391431c8d526fe586da63c373b13b87337/imgs/vis_objstraj.png
--------------------------------------------------------------------------------