├── .gitignore ├── LICENSE ├── README-zh.md ├── README.md ├── assets ├── mimictalk.png └── real3dportrait.png ├── checkpoints └── .gitkeep ├── data └── raw │ └── examples │ ├── 80_vs_60_10s.wav │ ├── German_20s.mp4 │ └── bg.png ├── data_gen ├── eg3d │ └── convert_to_eg3d_convention.py ├── runs │ ├── binarizer_nerf.py │ ├── binarizer_th1kh.py │ └── nerf │ │ ├── process_guide.md │ │ └── run.sh └── utils │ ├── mp_feature_extractors │ ├── face_landmarker.py │ ├── face_landmarker.task │ ├── mp_segmenter.py │ └── selfie_multiclass_256x256.tflite │ ├── path_converter.py │ ├── process_audio │ ├── extract_hubert.py │ ├── extract_mel_f0.py │ └── resample_audio_to_16k.py │ ├── process_image │ ├── extract_lm2d.py │ ├── extract_segment_imgs.py │ └── fit_3dmm_landmark.py │ └── process_video │ ├── euler2quaterion.py │ ├── extract_blink.py │ ├── extract_lm2d.py │ ├── extract_segment_imgs.py │ ├── fit_3dmm_landmark.py │ ├── inpaint_torso_imgs.py │ ├── resample_video_to_25fps_resize_to_512.py │ └── split_video_to_imgs.py ├── data_util └── face3d_helper.py ├── deep_3drecon ├── BFM │ ├── .gitkeep │ ├── basel_53201.txt │ ├── index_mp468_from_mesh35709_v1.npy │ ├── index_mp468_from_mesh35709_v2.npy │ ├── index_mp468_from_mesh35709_v3.1.npy │ ├── index_mp468_from_mesh35709_v3.npy │ ├── select_vertex_id.mat │ └── similarity_Lm3D_all.mat ├── __init__.py ├── bfm_left_eye_faces.npy ├── bfm_right_eye_faces.npy ├── data_preparation.py ├── deep_3drecon_models │ ├── __init__.py │ ├── arcface_torch │ │ ├── README.md │ │ ├── backbones │ │ │ ├── __init__.py │ │ │ ├── iresnet.py │ │ │ ├── iresnet2060.py │ │ │ ├── mobilefacenet.py │ │ │ └── vit.py │ │ ├── configs │ │ │ ├── 3millions.py │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── glint360k_mbf.py │ │ │ ├── glint360k_r100.py │ │ │ ├── glint360k_r50.py │ │ │ ├── ms1mv2_mbf.py │ │ │ ├── ms1mv2_r100.py │ │ │ ├── ms1mv2_r50.py │ │ │ ├── ms1mv3_mbf.py │ │ │ ├── ms1mv3_r100.py │ │ │ ├── ms1mv3_r50.py │ │ │ ├── ms1mv3_r50_onegpu.py │ │ │ ├── wf12m_conflict_r50.py │ │ │ ├── wf12m_conflict_r50_pfc03_filter04.py │ │ │ ├── wf12m_flip_pfc01_filter04_r50.py │ │ │ ├── wf12m_flip_r50.py │ │ │ ├── wf12m_mbf.py │ │ │ ├── wf12m_pfc02_r100.py │ │ │ ├── wf12m_r100.py │ │ │ ├── wf12m_r50.py │ │ │ ├── wf42m_pfc0008_32gpu_r100.py │ │ │ ├── wf42m_pfc02_16gpus_mbf_bs8k.py │ │ │ ├── wf42m_pfc02_16gpus_r100.py │ │ │ ├── wf42m_pfc02_16gpus_r50_bs8k.py │ │ │ ├── wf42m_pfc02_32gpus_r50_bs4k.py │ │ │ ├── wf42m_pfc02_8gpus_r50_bs4k.py │ │ │ ├── wf42m_pfc02_r100.py │ │ │ ├── wf42m_pfc02_r100_16gpus.py │ │ │ ├── wf42m_pfc02_r100_32gpus.py │ │ │ ├── wf42m_pfc03_32gpu_r100.py │ │ │ ├── wf42m_pfc03_32gpu_r18.py │ │ │ ├── wf42m_pfc03_32gpu_r200.py │ │ │ ├── wf42m_pfc03_32gpu_r50.py │ │ │ ├── wf42m_pfc03_40epoch_64gpu_vit_b.py │ │ │ ├── wf42m_pfc03_40epoch_64gpu_vit_l.py │ │ │ ├── wf42m_pfc03_40epoch_64gpu_vit_s.py │ │ │ ├── wf42m_pfc03_40epoch_64gpu_vit_t.py │ │ │ ├── wf42m_pfc03_40epoch_8gpu_vit_b.py │ │ │ ├── wf42m_pfc03_40epoch_8gpu_vit_t.py │ │ │ ├── wf4m_mbf.py │ │ │ ├── wf4m_r100.py │ │ │ └── wf4m_r50.py │ │ ├── dataset.py │ │ ├── dist.sh │ │ ├── docs │ │ │ ├── eval.md │ │ │ ├── install.md │ │ │ ├── install_dali.md │ │ │ ├── modelzoo.md │ │ │ ├── prepare_custom_dataset.md │ │ │ ├── prepare_webface42m.md │ │ │ └── speed_benchmark.md │ │ ├── eval │ │ │ ├── __init__.py │ │ │ └── verification.py │ │ ├── eval_ijbc.py │ │ ├── flops.py │ │ ├── inference.py │ │ ├── losses.py │ │ ├── lr_scheduler.py │ │ ├── onnx_helper.py │ │ ├── onnx_ijbc.py │ │ ├── partial_fc.py │ │ ├── partial_fc_v2.py │ │ ├── 
requirement.txt │ │ ├── run.sh │ │ ├── scripts │ │ │ └── shuffle_rec.py │ │ ├── torch2onnx.py │ │ ├── train.py │ │ ├── train_v2.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── plot.py │ │ │ ├── utils_callbacks.py │ │ │ ├── utils_config.py │ │ │ ├── utils_distributed_sampler.py │ │ │ └── utils_logging.py │ ├── base_model.py │ ├── bfm.py │ ├── facerecon_model.py │ ├── losses.py │ ├── networks.py │ └── template_model.py ├── generate_reconstructor_opt_for_geneface.py ├── ncc_code.npy ├── options │ ├── __init__.py │ ├── base_options.py │ ├── test_options.py │ └── train_options.py ├── reconstructor.py ├── reconstructor_opt.pkl ├── secc_renderer.py ├── test.py ├── train.py └── util │ ├── BBRegressorParam_r.mat │ ├── __init__.py │ ├── detect_lm68.py │ ├── generate_list.py │ ├── html.py │ ├── load_mats.py │ ├── mesh_renderer.py │ ├── preprocess.py │ ├── skin_mask.py │ ├── test_mean_face.txt │ ├── util.py │ └── visualizer.py ├── docs ├── prepare_env │ ├── install_guide-zh.md │ ├── install_guide.md │ └── requirements.txt ├── process_data │ └── process_th1kh.md └── train_models │ ├── train_audio2motion.md │ └── train_motion2video.md ├── egs ├── egs_bases │ ├── audio2motion │ │ ├── base.yaml │ │ ├── vae.yaml │ │ └── vae_sync.yaml │ ├── audio2pose │ │ └── base.yaml │ ├── eg3d │ │ ├── base.yaml │ │ └── base_mse.yaml │ ├── nerf │ │ ├── adnerf.yaml │ │ ├── adnerf_torso.yaml │ │ ├── base.yaml │ │ ├── lm3d_nerf.yaml │ │ └── lm3d_nerf_torso.yaml │ ├── os_facev2v │ │ └── base.yaml │ ├── postnet │ │ └── base.yaml │ ├── radnerf │ │ ├── base.yaml │ │ ├── lm3d_radnerf.yaml │ │ └── radnerf.yaml │ └── syncnet │ │ └── base.yaml ├── os_avatar │ ├── audio2motion_vae.yaml │ ├── audio_lm3d_syncnet.yaml │ ├── img2plane.yaml │ ├── real3d_orig │ │ ├── img2plane_orig.yaml │ │ ├── secc_img2plane_orig.yaml │ │ └── secc_img2plane_torso_orig.yaml │ ├── secc_img2plane.yaml │ └── secc_img2plane_torso.yaml ├── th1kh_512 │ ├── base.yaml │ ├── secc_img2plane.yaml │ └── secc_img2plane_torso.yaml └── th1kh_512_audio2motion │ ├── base.yaml │ ├── lm3d_syncnet.yaml │ ├── lm3d_vae.yaml │ ├── lm3d_vae_pitch.yaml │ ├── lm3d_vae_sync.yaml │ └── lm3d_vae_sync_pitch.yaml ├── inference ├── app_mimictalk.py ├── app_real3dportrait.py ├── edit_secc.py ├── infer_utils.py ├── mimictalk_infer.py ├── real3d_infer.py ├── real3dportrait_demo.ipynb └── train_mimictalk_on_a_video.py ├── modules ├── audio2motion │ ├── cfm │ │ ├── attend.py │ │ ├── cfm_wrapper.py │ │ ├── icl_audio2motion_model.py │ │ ├── icl_audio2motion_pose_model.py │ │ ├── icl_transformer.py │ │ ├── module.py │ │ └── utils.py │ ├── cnn_models.py │ ├── flow_base.py │ ├── multi_length_disc.py │ ├── transformer_base.py │ ├── transformer_models.py │ ├── utils.py │ ├── vae.py │ └── vqvae.py ├── commons │ ├── attention │ │ ├── attentions.py │ │ └── simple_attention.py │ ├── conformer │ │ ├── conformer.py │ │ ├── espnet_positional_embedding.py │ │ ├── espnet_transformer_attn.py │ │ └── layers.py │ ├── conv.py │ ├── gpt.py │ ├── improved_diffusion │ │ ├── __init__.py │ │ ├── dist_util.py │ │ ├── fp16_util.py │ │ ├── gaussian_diffusion.py │ │ ├── image_datasets.py │ │ ├── logger.py │ │ ├── losses.py │ │ ├── nn.py │ │ ├── resample.py │ │ ├── respace.py │ │ └── train_util.py │ ├── layers.py │ ├── loralib │ │ ├── __init__.py │ │ ├── layers.py │ │ └── utils.py │ ├── normalizing_flow │ │ ├── glow_modules.py │ │ ├── res_flow.py │ │ └── utils.py │ ├── rel_transformer.py │ ├── rnn.py │ ├── rot_transformer.py │ ├── taming_tfm_modules.py │ ├── transformer.py │ ├── unet1d.py │ ├── vqvae.py │ ├── 
vqvae_cvq.py │ ├── vqvae_fsq.py │ ├── vqvae_lfq.py │ ├── vqvae_lfq_y.py │ ├── vqvae_taming.py │ └── wavenet.py ├── eg3ds │ ├── camera_utils │ │ └── pose_sampler.py │ ├── dnnlib │ │ ├── __init__.py │ │ └── util.py │ ├── metrics │ │ ├── __init__.py │ │ ├── equivariance.py │ │ ├── frechet_inception_distance.py │ │ ├── inception_score.py │ │ ├── kernel_inception_distance.py │ │ ├── metric_main.py │ │ ├── metric_utils.py │ │ ├── perceptual_path_length.py │ │ └── precision_recall.py │ ├── models │ │ ├── dual_discriminator.py │ │ ├── dual_discriminator_cond.py │ │ ├── networks_stylegan2.py │ │ ├── networks_stylegan3.py │ │ ├── superresolution.py │ │ └── triplane.py │ ├── torch_utils │ │ ├── __init__.py │ │ ├── custom_ops.py │ │ ├── misc.py │ │ ├── ops │ │ │ ├── __init__.py │ │ │ ├── bias_act.cpp │ │ │ ├── bias_act.cu │ │ │ ├── bias_act.h │ │ │ ├── bias_act.py │ │ │ ├── conv2d_gradfix.py │ │ │ ├── conv2d_resample.py │ │ │ ├── filtered_lrelu.cpp │ │ │ ├── filtered_lrelu.cu │ │ │ ├── filtered_lrelu.h │ │ │ ├── filtered_lrelu.py │ │ │ ├── filtered_lrelu_ns.cu │ │ │ ├── filtered_lrelu_rd.cu │ │ │ ├── filtered_lrelu_wr.cu │ │ │ ├── fma.py │ │ │ ├── grid_sample_gradfix.py │ │ │ ├── upfirdn2d.cpp │ │ │ ├── upfirdn2d.cu │ │ │ ├── upfirdn2d.h │ │ │ └── upfirdn2d.py │ │ ├── persistence.py │ │ └── training_stats.py │ └── volumetric_rendering │ │ ├── __init__.py │ │ ├── math_utils.py │ │ ├── ray_marcher.py │ │ ├── ray_sampler.py │ │ └── renderer.py ├── img2plane │ ├── deeplabv3 │ │ ├── __init__.py │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── heads.py │ │ │ ├── initialization.py │ │ │ ├── model.py │ │ │ └── modules.py │ │ ├── decoders │ │ │ ├── decoder.py │ │ │ ├── model.py │ │ │ ├── my_decoder.py │ │ │ └── my_model.py │ │ └── encoders │ │ │ ├── __init__.py │ │ │ ├── _base.py │ │ │ ├── _utils.py │ │ │ └── resnet.py │ ├── img2plane_model.py │ ├── segformer │ │ ├── __init__.py │ │ ├── base.py │ │ └── models.py │ ├── simple_encoders │ │ └── high_resolution_encoder.py │ ├── triplane.py │ └── unit_test.ipynb ├── real3d │ ├── facev2v_warp │ │ ├── func_utils.py │ │ ├── layers.py │ │ ├── losses.py │ │ ├── model.py │ │ ├── model2.py │ │ ├── network.py │ │ └── network2.py │ ├── img2plane_baseline.py │ ├── secc_img2plane.py │ ├── secc_img2plane_torso.py │ ├── segformer.py │ └── super_resolution │ │ └── sr_with_ref.py └── syncnet │ ├── models.py │ └── syncnet_v2.py ├── tasks ├── os_avatar │ ├── audio2motion_task.py │ ├── audio_lm3d_syncnet.py │ ├── dataset_utils │ │ ├── audio2motion_dataset.py │ │ ├── motion2video_dataset.py │ │ └── syncnet_dataset.py │ ├── img2plane_task.py │ ├── loss_utils │ │ └── vgg19_loss.py │ ├── secc_img2plane_task.py │ └── secc_img2plane_torso_task.py └── run.py └── utils ├── audio ├── __init__.py ├── align.py ├── dct.py ├── griffin_lim.py ├── io.py ├── pitch │ ├── bin │ │ ├── ExtractF0ByStraight │ │ ├── InterpF0 │ │ └── ReaperF0 │ ├── crepe_utils.py │ ├── extractor_utils.py │ ├── utils.py │ └── uv_utils.py ├── pitch_extractors.py └── vad.py ├── commons ├── base_task.py ├── ckpt_utils.py ├── crop_head.py ├── dataset_utils.py ├── ddp_utils.py ├── euler2rot.py ├── face_alignment_utils.py ├── hparams.py ├── image_utils.py ├── indexed_datasets.py ├── mesh_utils.py ├── meters.py ├── multiprocess_utils.py ├── os_utils.py ├── pitch_utils.py ├── tensor_utils.py └── trainer.py ├── nn ├── grad.py ├── model_utils.py ├── schedulers.py └── seq_utils.py ├── useful_cmd_lines └── clean_gpu.py └── visualization ├── auto_plot_image.py ├── draw_3d_landmark.py ├── ffmpeg_utils.py ├── lm_visualizer.py ├── 
plot_attention.py │ ├── plot_spec.py │ ├── t-sne.py │ ├── t-sne_0423.py │ └── vis_cam3d │ ├── camera_parameter_loader.py │ └── camera_pose_visualizer.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 ZhenhuiYe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /assets/mimictalk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/assets/mimictalk.png -------------------------------------------------------------------------------- /assets/real3dportrait.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/assets/real3dportrait.png -------------------------------------------------------------------------------- /checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/checkpoints/.gitkeep -------------------------------------------------------------------------------- /data/raw/examples/80_vs_60_10s.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/data/raw/examples/80_vs_60_10s.wav -------------------------------------------------------------------------------- /data/raw/examples/German_20s.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/data/raw/examples/German_20s.mp4 -------------------------------------------------------------------------------- /data/raw/examples/bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/data/raw/examples/bg.png -------------------------------------------------------------------------------- /data_gen/runs/nerf/process_guide.md: -------------------------------------------------------------------------------- 1 | # Tip: the first time you run this, go through the command lines below step by step to make sure the environment works; after that, you can simply run run.sh in the same directory to complete all the steps below in one go. 2 | 3 | # Step0. 
Crop the video to 512x512 resolution and 25 FPS, and make sure every frame contains the target face 4 | ``` 5 | ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 data/raw/videos/${VIDEO_ID}_512.mp4 6 | mv data/raw/videos/${VIDEO_ID}.mp4 data/raw/videos/${VIDEO_ID}_to_rm.mp4 7 | mv data/raw/videos/${VIDEO_ID}_512.mp4 data/raw/videos/${VIDEO_ID}.mp4 8 | ``` 9 | # Step1. Extract audio features such as mel, f0, hubert, esperanto 10 | ``` 11 | export CUDA_VISIBLE_DEVICES=0 12 | export VIDEO_ID=May 13 | mkdir -p data/processed/videos/${VIDEO_ID} 14 | ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -f wav -ar 16000 data/processed/videos/${VIDEO_ID}/aud.wav 15 | python data_gen/utils/process_audio/extract_hubert.py --video_id=${VIDEO_ID} 16 | python data_gen/utils/process_audio/extract_mel_f0.py --video_id=${VIDEO_ID} 17 | ``` 18 | 19 | # Step2. Extract images 20 | ``` 21 | export VIDEO_ID=May 22 | export CUDA_VISIBLE_DEVICES=0 23 | mkdir -p data/processed/videos/${VIDEO_ID}/gt_imgs 24 | ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 data/processed/videos/${VIDEO_ID}/gt_imgs/%08d.jpg 25 | python data_gen/utils/process_video/extract_segment_imgs.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 # extract image, segmap, and background 26 | ``` 27 | 28 | # Step3. Extract lm2d_mediapipe 29 | ### Extract 2D landmarks for the later 3DMM fitting 30 | ### num_workers is the number of CPU workers on this machine; total_process is the number of machines used; process_id is the index of this machine (a sketch of this sharding convention is given after run.sh below) 31 | 32 | ``` 33 | export VIDEO_ID=May 34 | python data_gen/utils/process_video/extract_lm2d.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 35 | ``` 36 | 37 | # Step4. Fit 3DMM 38 | ``` 39 | export VIDEO_ID=May 40 | export CUDA_VISIBLE_DEVICES=0 41 | python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 --reset --debug --id_mode=global 42 | ``` 43 | 44 | # Step5. 
Binarize 45 | ``` 46 | export VIDEO_ID=May 47 | python data_gen/runs/binarizer_nerf.py --video_id=${VIDEO_ID} 48 | ``` 49 | You can now find the binarized dataset under the `data/binary/videos/May` directory. -------------------------------------------------------------------------------- /data_gen/runs/nerf/run.sh: -------------------------------------------------------------------------------- 1 | # usage: CUDA_VISIBLE_DEVICES=0 bash data_gen/runs/nerf/run.sh ${VIDEO_ID} 2 | # please place the video at data/raw/videos/${VIDEO_ID}.mp4 3 | VIDEO_ID=$1 4 | echo Processing $VIDEO_ID 5 | 6 | echo Resizing the video to 512x512 7 | ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -y data/raw/videos/${VIDEO_ID}_512.mp4 8 | mv data/raw/videos/${VIDEO_ID}.mp4 data/raw/videos/${VIDEO_ID}_to_rm.mp4 9 | mv data/raw/videos/${VIDEO_ID}_512.mp4 data/raw/videos/${VIDEO_ID}.mp4 10 | echo Done 11 | echo The original video has been moved to data/raw/videos/${VIDEO_ID}_to_rm.mp4 and the resized video is now data/raw/videos/${VIDEO_ID}.mp4 12 | 13 | echo mkdir -p data/processed/videos/${VIDEO_ID} 14 | mkdir -p data/processed/videos/${VIDEO_ID} 15 | echo Done 16 | 17 | # extract audio file from the training video 18 | echo ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -f wav -ar 16000 -v quiet -y data/processed/videos/${VIDEO_ID}/aud.wav 19 | ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -f wav -ar 16000 -v quiet -y data/processed/videos/${VIDEO_ID}/aud.wav 20 | echo Done 21 | 22 | # extract hubert_mel_f0 from audio 23 | echo python data_gen/utils/process_audio/extract_hubert.py --video_id=${VIDEO_ID} 24 | python data_gen/utils/process_audio/extract_hubert.py --video_id=${VIDEO_ID} 25 | echo python data_gen/utils/process_audio/extract_mel_f0.py --video_id=${VIDEO_ID} 26 | python data_gen/utils/process_audio/extract_mel_f0.py --video_id=${VIDEO_ID} 27 | echo Done 28 | 29 | # extract segment images 30 | echo mkdir -p data/processed/videos/${VIDEO_ID}/gt_imgs 31 | mkdir -p data/processed/videos/${VIDEO_ID}/gt_imgs 32 | echo ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 -v quiet data/processed/videos/${VIDEO_ID}/gt_imgs/%08d.jpg 33 | ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 -v quiet data/processed/videos/${VIDEO_ID}/gt_imgs/%08d.jpg 34 | echo Done 35 | 36 | echo python data_gen/utils/process_video/extract_segment_imgs.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 # extract image, segmap, and background 37 | python data_gen/utils/process_video/extract_segment_imgs.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 # extract image, segmap, and background 38 | echo Done 39 | 40 | echo python data_gen/utils/process_video/extract_lm2d.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 41 | python data_gen/utils/process_video/extract_lm2d.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 42 | echo Done 43 | 44 | pkill -f void* 45 | echo python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 --reset --debug --id_mode=global 46 | python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 --reset --debug --id_mode=global 47 | echo Done 48 | 49 | echo python data_gen/runs/binarizer_nerf.py --video_id=${VIDEO_ID} 50 | python data_gen/runs/binarizer_nerf.py --video_id=${VIDEO_ID} 51 | echo Done --------------------------------------------------------------------------------
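A note on the `--num_workers` / `--total_process` / `--process_id` flags mentioned in Step3 of the guide above: the `process_*` scripts in this repo (e.g. `resample_audio_to_16k.py` and `split_video_to_imgs.py` below) shard the sorted file list into `total_process` contiguous chunks, with each machine handling the chunk selected by its `process_id`, while `num_workers` only controls local multiprocessing. Below is a minimal, self-contained sketch of that slicing convention; the function name is illustrative and not part of the repo. Note that the repo scripts test `process_id == total_process` for the last shard, which the preceding assert makes unreachable, so the sketch uses `total_process - 1`, which appears to be the intended behavior.

```python
def shard_file_list(names, process_id, total_process):
    """Illustrative sketch of the sharding used by the data_gen/utils/process_* scripts:
    split the sorted list into `total_process` contiguous chunks and return the chunk
    assigned to this `process_id`; the last shard also keeps the remainder."""
    assert 0 <= process_id < total_process
    if total_process == 1:
        return names
    num_per_process = len(names) // total_process
    if process_id == total_process - 1:  # last machine also takes the leftover files
        return names[process_id * num_per_process:]
    return names[process_id * num_per_process: (process_id + 1) * num_per_process]


if __name__ == '__main__':
    fake_names = [f"{i:03d}.mp4" for i in range(10)]
    print(shard_file_list(fake_names, 0, 3))  # first 3 files
    print(shard_file_list(fake_names, 2, 3))  # last 4 files (remainder included)
```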
/data_gen/utils/mp_feature_extractors/face_landmarker.task: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/data_gen/utils/mp_feature_extractors/face_landmarker.task -------------------------------------------------------------------------------- /data_gen/utils/mp_feature_extractors/selfie_multiclass_256x256.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/data_gen/utils/mp_feature_extractors/selfie_multiclass_256x256.tflite -------------------------------------------------------------------------------- /data_gen/utils/path_converter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class PathConverter(): 5 | def __init__(self): 6 | self.prefixs = { 7 | "vid": "/video/", 8 | "gt": "/gt_imgs/", 9 | "head": "/head_imgs/", 10 | "torso": "/torso_imgs/", 11 | "person": "/person_imgs/", 12 | "torso_with_bg": "/torso_with_bg_imgs/", 13 | "single_bg": "/bg_img/", 14 | "bg": "/bg_imgs/", 15 | "segmaps": "/segmaps/", 16 | "inpaint_torso": "/inpaint_torso_imgs/", 17 | "com": "/com_imgs/", 18 | "inpaint_torso_with_com_bg": "/inpaint_torso_with_com_bg_imgs/", 19 | } 20 | 21 | def to(self, path: str, old_pattern: str, new_pattern: str): 22 | return path.replace(self.prefixs[old_pattern], self.prefixs[new_pattern], 1) 23 | 24 | pc = PathConverter() -------------------------------------------------------------------------------- /data_gen/utils/process_audio/resample_audio_to_16k.py: -------------------------------------------------------------------------------- 1 | import os, glob 2 | from utils.commons.os_utils import multiprocess_glob 3 | from utils.commons.multiprocess_utils import multiprocess_run_tqdm 4 | 5 | 6 | def extract_wav16k_job(audio_name:str): 7 | out_path = audio_name.replace("/audio_raw/","/audio/",1) 8 | assert out_path != audio_name # prevent inplace 9 | os.makedirs(os.path.dirname(out_path), exist_ok=True) 10 | ffmpeg_path = "/usr/bin/ffmpeg" 11 | 12 | cmd = f'{ffmpeg_path} -i {audio_name} -ar 16000 -v quiet -y {out_path}' 13 | os.system(cmd) 14 | 15 | if __name__ == '__main__': 16 | import argparse, glob, tqdm, random 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--aud_dir", default='/home/tiger/datasets/raw/CMLR/audio_raw/') 19 | parser.add_argument("--ds_name", default='CMLR') 20 | parser.add_argument("--num_workers", default=64, type=int) 21 | parser.add_argument("--process_id", default=0, type=int) 22 | parser.add_argument("--total_process", default=1, type=int) 23 | args = parser.parse_args() 24 | print(f"args {args}") 25 | 26 | aud_dir = args.aud_dir 27 | ds_name = args.ds_name 28 | if ds_name in ['CMLR']: 29 | aud_name_pattern = os.path.join(aud_dir, "*/*/*.wav") 30 | aud_names = multiprocess_glob(aud_name_pattern) 31 | else: 32 | raise NotImplementedError() 33 | aud_names = sorted(aud_names) 34 | print(f"total audio number : {len(aud_names)}") 35 | print(f"first {aud_names[0]} last {aud_names[-1]}") 36 | # exit() 37 | process_id = args.process_id 38 | total_process = args.total_process 39 | if total_process > 1: 40 | assert process_id <= total_process -1 41 | num_samples_per_process = len(aud_names) // total_process 42 | if process_id == total_process: 43 | aud_names = aud_names[process_id * num_samples_per_process : ] 44 | else: 45 | 
aud_names = aud_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process] 46 | 47 | for i, res in multiprocess_run_tqdm(extract_wav16k_job, aud_names, num_workers=args.num_workers, desc="resampling videos"): 48 | pass 49 | 50 | -------------------------------------------------------------------------------- /data_gen/utils/process_video/euler2quaterion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import math 4 | import numba 5 | from scipy.spatial.transform import Rotation as R 6 | 7 | def euler2quaterion(euler, use_radian=True): 8 | """ 9 | euler: np.array, [batch, 3] 10 | return: the quaterion, np.array, [batch, 4] 11 | """ 12 | r = R.from_euler('xyz',euler, degrees=not use_radian) 13 | return r.as_quat() 14 | 15 | def quaterion2euler(quat, use_radian=True): 16 | """ 17 | quat: np.array, [batch, 4] 18 | return: the euler, np.array, [batch, 3] 19 | """ 20 | r = R.from_quat(quat) 21 | return r.as_euler('xyz', degrees=not use_radian) 22 | 23 | def rot2quaterion(rot): 24 | r = R.from_matrix(rot) 25 | return r.as_quat() 26 | 27 | def quaterion2rot(quat): 28 | r = R.from_quat(quat) 29 | return r.as_matrix() 30 | 31 | if __name__ == '__main__': 32 | euler = np.array([89.999,89.999,89.999] * 100).reshape([100,3]) 33 | q = euler2quaterion(euler, use_radian=False) 34 | e = quaterion2euler(q, use_radian=False) 35 | print(" ") 36 | -------------------------------------------------------------------------------- /data_gen/utils/process_video/extract_blink.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from data_util.face3d_helper import Face3DHelper 3 | from utils.commons.tensor_utils import convert_to_tensor 4 | 5 | def polygon_area(x, y): 6 | """ 7 | x: [T, K=6] 8 | y: [T, K=6] 9 | return: [T,] 10 | """ 11 | x_ = x - x.mean(axis=-1, keepdims=True) 12 | y_ = y - y.mean(axis=-1, keepdims=True) 13 | correction = x_[:,-1] * y_[:,0] - y_[:,-1]* x_[:,0] 14 | main_area = (x_[:,:-1] * y_[:,1:]).sum(axis=-1) - (y_[:,:-1] * x_[:,1:]).sum(axis=-1) 15 | return 0.5 * np.abs(main_area + correction) 16 | 17 | def get_eye_area_percent(id, exp, face3d_helper): 18 | id = convert_to_tensor(id) 19 | exp = convert_to_tensor(exp) 20 | cano_lm3d = face3d_helper.reconstruct_cano_lm3d(id, exp) 21 | cano_lm2d = (cano_lm3d[..., :2] + 1) / 2 22 | lms = cano_lm2d.cpu().numpy() 23 | eyes_left = slice(36, 42) 24 | eyes_right = slice(42, 48) 25 | area_left = polygon_area(lms[:, eyes_left, 0], lms[:, eyes_left, 1]) 26 | area_right = polygon_area(lms[:, eyes_right, 0], lms[:, eyes_right, 1]) 27 | # area percentage of two eyes of the whole image... 
28 | area_percent = (area_left + area_right) / 1 * 100 # recommend threshold is 0.25% 29 | return area_percent # [T,] 30 | 31 | 32 | if __name__ == '__main__': 33 | import numpy as np 34 | import imageio 35 | import cv2 36 | import torch 37 | from data_gen.utils.process_video.extract_lm2d import extract_lms_mediapipe_job, read_video_to_frames, index_lm68_from_lm468 38 | from data_gen.utils.process_video.fit_3dmm_landmark import fit_3dmm_for_a_video 39 | from data_util.face3d_helper import Face3DHelper 40 | 41 | face3d_helper = Face3DHelper() 42 | video_name = 'data/raw/videos/May_10s.mp4' 43 | frames = read_video_to_frames(video_name) 44 | coeff = fit_3dmm_for_a_video(video_name, save=False) 45 | area_percent = get_eye_area_percent(torch.tensor(coeff['id']), torch.tensor(coeff['exp']), face3d_helper) 46 | writer = imageio.get_writer("1.mp4", fps=25) 47 | for idx, frame in enumerate(frames): 48 | frame = cv2.putText(frame, f"{area_percent[idx]:.2f}", org=(128,128), fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=1, color=(255,0,0), thickness=1) 49 | writer.append_data(frame) 50 | writer.close() -------------------------------------------------------------------------------- /data_gen/utils/process_video/split_video_to_imgs.py: -------------------------------------------------------------------------------- 1 | import os, glob 2 | from utils.commons.multiprocess_utils import multiprocess_run_tqdm 3 | 4 | from data_gen.utils.path_converter import PathConverter, pc 5 | 6 | # mp4_names = glob.glob("/home/tiger/datasets/raw/CelebV-HQ/video/*.mp4") 7 | 8 | def extract_img_job(video_name, raw_img_dir=None): 9 | if raw_img_dir is not None: 10 | out_path = raw_img_dir 11 | else: 12 | out_path = pc.to(video_name.replace(".mp4", ""), "vid", "gt") 13 | os.makedirs(out_path, exist_ok=True) 14 | ffmpeg_path = "/usr/bin/ffmpeg" 15 | cmd = f'{ffmpeg_path} -i {video_name} -vf fps={25},scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 -v quiet {os.path.join(out_path, "%8d.jpg")}' 16 | os.system(cmd) 17 | 18 | if __name__ == '__main__': 19 | import argparse, glob, tqdm, random 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--vid_dir", default='/home/tiger/datasets/raw/CelebV-HQ/video') 22 | parser.add_argument("--ds_name", default='CelebV-HQ') 23 | parser.add_argument("--num_workers", default=64, type=int) 24 | parser.add_argument("--process_id", default=0, type=int) 25 | parser.add_argument("--total_process", default=1, type=int) 26 | args = parser.parse_args() 27 | vid_dir = args.vid_dir 28 | ds_name = args.ds_name 29 | if ds_name in ['lrs3_trainval']: 30 | mp4_name_pattern = os.path.join(vid_dir, "*/*.mp4") 31 | elif ds_name in ['TH1KH_512', 'CelebV-HQ']: 32 | vid_names = glob.glob(os.path.join(vid_dir, "*.mp4")) 33 | elif ds_name in ['lrs2', 'lrs3', 'voxceleb2']: 34 | vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4") 35 | vid_names = glob.glob(vid_name_pattern) 36 | elif ds_name in ["RAVDESS", 'VFHQ']: 37 | vid_name_pattern = os.path.join(vid_dir, "*/*/*/*.mp4") 38 | vid_names = glob.glob(vid_name_pattern) 39 | vid_names = sorted(vid_names) 40 | 41 | process_id = args.process_id 42 | total_process = args.total_process 43 | if total_process > 1: 44 | assert process_id <= total_process -1 45 | num_samples_per_process = len(vid_names) // total_process 46 | if process_id == total_process: 47 | vid_names = vid_names[process_id * num_samples_per_process : ] 48 | else: 49 | vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process] 50 | 51 | for i, 
res in multiprocess_run_tqdm(extract_img_job, vid_names, num_workers=args.num_workers, desc="extracting images"): 52 | pass 53 | 54 | -------------------------------------------------------------------------------- /deep_3drecon/BFM/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/.gitkeep -------------------------------------------------------------------------------- /deep_3drecon/BFM/index_mp468_from_mesh35709_v1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/index_mp468_from_mesh35709_v1.npy -------------------------------------------------------------------------------- /deep_3drecon/BFM/index_mp468_from_mesh35709_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/index_mp468_from_mesh35709_v2.npy -------------------------------------------------------------------------------- /deep_3drecon/BFM/index_mp468_from_mesh35709_v3.1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/index_mp468_from_mesh35709_v3.1.npy -------------------------------------------------------------------------------- /deep_3drecon/BFM/index_mp468_from_mesh35709_v3.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/index_mp468_from_mesh35709_v3.npy -------------------------------------------------------------------------------- /deep_3drecon/BFM/select_vertex_id.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/select_vertex_id.mat -------------------------------------------------------------------------------- /deep_3drecon/BFM/similarity_Lm3D_all.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/similarity_Lm3D_all.mat -------------------------------------------------------------------------------- /deep_3drecon/__init__.py: -------------------------------------------------------------------------------- 1 | from .reconstructor import * 2 | -------------------------------------------------------------------------------- /deep_3drecon/bfm_left_eye_faces.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/bfm_left_eye_faces.npy -------------------------------------------------------------------------------- /deep_3drecon/bfm_right_eye_faces.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/bfm_right_eye_faces.npy -------------------------------------------------------------------------------- /deep_3drecon/data_preparation.py: 
-------------------------------------------------------------------------------- 1 | """This script is the data preparation script for Deep3DFaceRecon_pytorch 2 | """ 3 | 4 | import os 5 | import numpy as np 6 | import argparse 7 | from util.detect_lm68 import detect_68p,load_lm_graph 8 | from util.skin_mask import get_skin_mask 9 | from util.generate_list import check_list, write_list 10 | import warnings 11 | warnings.filterwarnings("ignore") 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--data_root', type=str, default='datasets', help='root directory for training data') 15 | parser.add_argument('--img_folder', nargs="+", required=True, help='folders of training images') 16 | parser.add_argument('--mode', type=str, default='train', help='train or val') 17 | opt = parser.parse_args() 18 | 19 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 20 | 21 | def data_prepare(folder_list,mode): 22 | 23 | lm_sess,input_op,output_op = load_lm_graph('./checkpoints/lm_model/68lm_detector.pb') # load a tensorflow version 68-landmark detector 24 | 25 | for img_folder in folder_list: 26 | detect_68p(img_folder,lm_sess,input_op,output_op) # detect landmarks for images 27 | get_skin_mask(img_folder) # generate skin attention mask for images 28 | 29 | # create files that record path to all training data 30 | msks_list = [] 31 | for img_folder in folder_list: 32 | path = os.path.join(img_folder, 'mask') 33 | msks_list += ['/'.join([img_folder, 'mask', i]) for i in sorted(os.listdir(path)) if 'jpg' in i or 34 | 'png' in i or 'jpeg' in i or 'PNG' in i] 35 | 36 | imgs_list = [i.replace('mask/', '') for i in msks_list] 37 | lms_list = [i.replace('mask', 'landmarks') for i in msks_list] 38 | lms_list = ['.'.join(i.split('.')[:-1]) + '.txt' for i in lms_list] 39 | 40 | lms_list_final, imgs_list_final, msks_list_final = check_list(lms_list, imgs_list, msks_list) # check if the path is valid 41 | write_list(lms_list_final, imgs_list_final, msks_list_final, mode=mode) # save files 42 | 43 | if __name__ == '__main__': 44 | print('Datasets:',opt.img_folder) 45 | data_prepare([os.path.join(opt.data_root,folder) for folder in opt.img_folder],opt.mode) 46 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/3millions.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # configs for test speed 4 | 5 | config = edict() 6 | config.margin_list = (1.0, 0.0, 0.4) 7 | config.network = "mbf" 8 | config.resume = False 9 | config.output = None 10 | config.embedding_size = 512 11 | config.sample_rate = 0.1 12 | config.fp16 = True 13 | config.momentum = 0.9 14 | config.weight_decay = 5e-4 15 | config.batch_size = 512 # total_batch_size = batch_size * num_gpus 16 | config.lr = 0.1 # batch size is 512 17 | 18 | config.rec = "synthetic" 19 | config.num_classes = 30 * 10000 20 | config.num_image = 100000 21 | config.num_epoch = 30 22 | config.warmup_epoch = -1 23 | config.val_targets = [] 24 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/deep_3drecon_models/arcface_torch/configs/__init__.py 
-------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/base.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | 9 | # Margin Base Softmax 10 | config.margin_list = (1.0, 0.5, 0.0) 11 | config.network = "r50" 12 | config.resume = False 13 | config.save_all_states = False 14 | config.output = "ms1mv3_arcface_r50" 15 | 16 | config.embedding_size = 512 17 | 18 | # Partial FC 19 | config.sample_rate = 1 20 | config.interclass_filtering_threshold = 0 21 | 22 | config.fp16 = False 23 | config.batch_size = 128 24 | 25 | # For SGD 26 | config.optimizer = "sgd" 27 | config.lr = 0.1 28 | config.momentum = 0.9 29 | config.weight_decay = 5e-4 30 | 31 | # For AdamW 32 | # config.optimizer = "adamw" 33 | # config.lr = 0.001 34 | # config.weight_decay = 0.1 35 | 36 | config.verbose = 2000 37 | config.frequent = 10 38 | 39 | # For Large Sacle Dataset, such as WebFace42M 40 | config.dali = False 41 | 42 | # Gradient ACC 43 | config.gradient_acc = 1 44 | 45 | # setup seed 46 | config.seed = 2048 47 | 48 | # dataload numworkers 49 | config.num_workers = 2 50 | 51 | # WandB Logger 52 | config.wandb_key = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" 53 | config.suffix_run_name = None 54 | config.using_wandb = False 55 | config.wandb_entity = "entity" 56 | config.wandb_project = "project" 57 | config.wandb_log_all = True 58 | config.save_artifacts = False 59 | config.wandb_resume = False # resume wandb run: Only if the you wand t resume the last run that it was interrupted -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/glint360k_mbf.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "mbf" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/glint360k" 23 | config.num_classes = 360232 24 | config.num_image = 17091657 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/glint360k_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 
22 | config.rec = "/train_tmp/glint360k" 23 | config.num_classes = 360232 24 | config.num_image = 17091657 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/glint360k_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/glint360k" 23 | config.num_classes = 360232 24 | config.num_image = 17091657 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv2_mbf.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "mbf" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/faces_emore" 23 | config.num_classes = 85742 24 | config.num_image = 5822653 25 | config.num_epoch = 40 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv2_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/faces_emore" 23 | config.num_classes = 85742 24 | config.num_image = 5822653 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv2_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our 
RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/faces_emore" 23 | config.num_classes = 85742 24 | config.num_image = 5822653 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv3_mbf.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "mbf" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/ms1m-retinaface-t1" 23 | config.num_classes = 93431 24 | config.num_image = 5179510 25 | config.num_epoch = 40 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv3_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/ms1m-retinaface-t1" 23 | config.num_classes = 93431 24 | config.num_image = 5179510 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv3_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/ms1m-retinaface-t1" 23 | 
config.num_classes = 93431 24 | config.num_image = 5179510 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv3_r50_onegpu.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.02 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/ms1m-retinaface-t1" 23 | config.num_classes = 93431 24 | config.num_image = 5179510 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_conflict_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.interclass_filtering_threshold = 0 15 | config.fp16 = True 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.optimizer = "sgd" 19 | config.lr = 0.1 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace12M_Conflict" 24 | config.num_classes = 1017970 25 | config.num_image = 12720066 26 | config.num_epoch = 20 27 | config.warmup_epoch = config.num_epoch // 10 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_conflict_r50_pfc03_filter04.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.interclass_filtering_threshold = 0.4 15 | config.fp16 = True 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.optimizer = "sgd" 19 | config.lr = 0.1 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace12M_Conflict" 24 | config.num_classes = 1017970 25 | config.num_image = 12720066 26 | config.num_epoch = 20 27 | config.warmup_epoch = config.num_epoch // 10 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_flip_pfc01_filter04_r50.py: 
-------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.1 14 | config.interclass_filtering_threshold = 0.4 15 | config.fp16 = True 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.optimizer = "sgd" 19 | config.lr = 0.1 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace12M_FLIP40" 24 | config.num_classes = 617970 25 | config.num_image = 12720066 26 | config.num_epoch = 20 27 | config.warmup_epoch = config.num_epoch // 10 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_flip_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.interclass_filtering_threshold = 0 15 | config.fp16 = True 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.optimizer = "sgd" 19 | config.lr = 0.1 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace12M_FLIP40" 24 | config.num_classes = 617970 25 | config.num_image = 12720066 26 | config.num_epoch = 20 27 | config.warmup_epoch = config.num_epoch // 10 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_mbf.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "mbf" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.interclass_filtering_threshold = 0 15 | config.fp16 = True 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.optimizer = "sgd" 19 | config.lr = 0.1 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace12M" 24 | config.num_classes = 617970 25 | config.num_image = 12720066 26 | config.num_epoch = 20 27 | config.warmup_epoch = 0 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_pfc02_r100.py: -------------------------------------------------------------------------------- 1 | 2 | from easydict import EasyDict as edict 3 | 4 | # make training faster 5 | # our RAM is 256G 6 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 7 | 8 | config = edict() 9 | config.margin_list = (1.0, 0.0, 0.4) 10 | config.network = "r100" 11 | config.resume = False 12 | config.output = None 13 | config.embedding_size 
= 512 14 | config.sample_rate = 0.2 15 | config.interclass_filtering_threshold = 0 16 | config.fp16 = True 17 | config.weight_decay = 5e-4 18 | config.batch_size = 128 19 | config.optimizer = "sgd" 20 | config.lr = 0.1 21 | config.verbose = 2000 22 | config.dali = False 23 | 24 | config.rec = "/train_tmp/WebFace12M" 25 | config.num_classes = 617970 26 | config.num_image = 12720066 27 | config.num_epoch = 20 28 | config.warmup_epoch = 0 29 | config.val_targets = [] 30 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_r100.py: -------------------------------------------------------------------------------- 1 | 2 | from easydict import EasyDict as edict 3 | 4 | # make training faster 5 | # our RAM is 256G 6 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 7 | 8 | config = edict() 9 | config.margin_list = (1.0, 0.0, 0.4) 10 | config.network = "r100" 11 | config.resume = False 12 | config.output = None 13 | config.embedding_size = 512 14 | config.sample_rate = 1.0 15 | config.interclass_filtering_threshold = 0 16 | config.fp16 = True 17 | config.weight_decay = 5e-4 18 | config.batch_size = 128 19 | config.optimizer = "sgd" 20 | config.lr = 0.1 21 | config.verbose = 2000 22 | config.dali = False 23 | 24 | config.rec = "/train_tmp/WebFace12M" 25 | config.num_classes = 617970 26 | config.num_image = 12720066 27 | config.num_epoch = 20 28 | config.warmup_epoch = 0 29 | config.val_targets = [] 30 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.interclass_filtering_threshold = 0 15 | config.fp16 = True 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.optimizer = "sgd" 19 | config.lr = 0.1 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace12M" 24 | config.num_classes = 617970 25 | config.num_image = 12720066 26 | config.num_epoch = 20 27 | config.warmup_epoch = 0 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc0008_32gpu_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 512 18 | config.lr = 0.4 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = ["lfw", "cfp_fp", 
"agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_16gpus_mbf_bs8k.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "mbf" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 512 18 | config.lr = 0.4 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = 2 27 | config.val_targets = [] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_16gpus_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 256 18 | config.lr = 0.3 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = 1 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_16gpus_r50_bs8k.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 512 18 | config.lr = 0.6 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = 4 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_32gpus_r50_bs4k.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | 
config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.4 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = 2 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_8gpus_r50_bs4k.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 512 18 | config.lr = 0.4 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = 2 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_r100_16gpus.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.2 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | 
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_r100_32gpus.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.4 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_32gpu_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.4 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_32gpu_r18.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r18" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.4 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_32gpu_r200.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs 
/train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r200" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.4 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_32gpu_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.4 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_b.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "vit_b_dp005_mask_005" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.weight_decay = 0.1 16 | config.batch_size = 384 17 | config.optimizer = "adamw" 18 | config.lr = 0.001 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 40 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = [] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_l.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "vit_l_dp005_mask_005" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.weight_decay = 0.1 16 | config.batch_size = 384 17 | config.optimizer = "adamw" 18 | config.lr = 0.001 19 | config.verbose = 2000 20 | config.dali = 
False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 40 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = [] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_s.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "vit_s_dp005_mask_0" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.weight_decay = 0.1 16 | config.batch_size = 384 17 | config.optimizer = "adamw" 18 | config.lr = 0.001 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 40 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = [] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_t.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "vit_t_dp005_mask0" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.weight_decay = 0.1 16 | config.batch_size = 384 17 | config.optimizer = "adamw" 18 | config.lr = 0.001 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 40 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = [] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_40epoch_8gpu_vit_b.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "vit_b_dp005_mask_005" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.weight_decay = 0.1 16 | config.batch_size = 256 17 | config.gradient_acc = 12 # total batchsize is 256 * 12 18 | config.optimizer = "adamw" 19 | config.lr = 0.001 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace42M" 24 | config.num_classes = 2059906 25 | config.num_image = 42474557 26 | config.num_epoch = 40 27 | config.warmup_epoch = config.num_epoch // 10 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- 
/deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_40epoch_8gpu_vit_t.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "vit_t_dp005_mask0" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.weight_decay = 0.1 16 | config.batch_size = 512 17 | config.optimizer = "adamw" 18 | config.lr = 0.001 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 40 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = [] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf4m_mbf.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "mbf" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace4M" 23 | config.num_classes = 205990 24 | config.num_image = 4235242 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf4m_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace4M" 23 | config.num_classes = 205990 24 | config.num_image = 4235242 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf4m_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | 
config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace4M" 23 | config.num_classes = 205990 24 | config.num_image = 4235242 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/dist.sh: -------------------------------------------------------------------------------- 1 | ip_list=("ip1" "ip2" "ip3" "ip4") 2 | 3 | config=wf42m_pfc03_32gpu_r100 4 | 5 | for((node_rank=0;node_rank<${#ip_list[*]};node_rank++)); 6 | do 7 | ssh ubuntu@${ip_list[node_rank]} "cd `pwd`;PATH=$PATH \ 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ 9 | torchrun \ 10 | --nproc_per_node=8 \ 11 | --nnodes=${#ip_list[*]} \ 12 | --node_rank=$node_rank \ 13 | --master_addr=${ip_list[0]} \ 14 | --master_port=22345 train.py configs/$config" & 15 | done 16 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/docs/eval.md: -------------------------------------------------------------------------------- 1 | ## Eval on ICCV2021-MFR 2 | 3 | coming soon. 4 | 5 | 6 | ## Eval IJBC 7 | You can eval ijbc with pytorch or onnx. 8 | 9 | 10 | 1. Eval IJBC With Onnx 11 | ```shell 12 | CUDA_VISIBLE_DEVICES=0 python onnx_ijbc.py --model-root ms1mv3_arcface_r50 --image-path IJB_release/IJBC --result-dir ms1mv3_arcface_r50 13 | ``` 14 | 15 | 2. Eval IJBC With Pytorch 16 | ```shell 17 | CUDA_VISIBLE_DEVICES=0,1 python eval_ijbc.py \ 18 | --model-prefix ms1mv3_arcface_r50/backbone.pth \ 19 | --image-path IJB_release/IJBC \ 20 | --result-dir ms1mv3_arcface_r50 \ 21 | --batch-size 128 \ 22 | --job ms1mv3_arcface_r50 \ 23 | --target IJBC \ 24 | --network iresnet50 25 | ``` 26 | 27 | 28 | ## Inference 29 | 30 | ```shell 31 | python inference.py --weight ms1mv3_arcface_r50/backbone.pth --network r50 32 | ``` 33 | 34 | 35 | ## Result 36 | 37 | | Datasets | Backbone | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | 38 | |:---------------|:--------------------|:------------|:------------|:------------| 39 | | WF12M-PFC-0.05 | r100 | 94.05 | 97.51 | 95.75 | 40 | | WF12M-PFC-0.1 | r100 | 94.49 | 97.56 | 95.92 | 41 | | WF12M-PFC-0.2 | r100 | 94.75 | 97.60 | 95.90 | 42 | | WF12M-PFC-0.3 | r100 | 94.71 | 97.64 | 96.01 | 43 | | WF12M | r100 | 94.69 | 97.59 | 95.97 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/docs/install.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ### [Torch v1.11.0](https://pytorch.org/get-started/previous-versions/#v1110) 4 | #### Linux and Windows 5 | - CUDA 11.3 6 | ```shell 7 | 8 | pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 9 | ``` 10 | 11 | - CUDA 10.2 12 | ```shell 13 | pip install torch==1.11.0+cu102 torchvision==0.12.0+cu102 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu102 14 | ``` 15 | 16 | ### [Torch v1.9.0](https://pytorch.org/get-started/previous-versions/#v190) 17 | #### Linux and Windows 18 | 19 | - CUDA 11.1 20 | ```shell 21 | pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f 
https://download.pytorch.org/whl/torch_stable.html 22 | ``` 23 | 24 | - CUDA 10.2 25 | ```shell 26 | pip install torch==1.9.0+cu102 torchvision==0.10.0+cu102 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html 27 | ``` 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/docs/modelzoo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/deep_3drecon_models/arcface_torch/docs/modelzoo.md -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/docs/prepare_custom_dataset.md: -------------------------------------------------------------------------------- 1 | First, your face images must be detected and aligned so they are ready for processing. Then place each individual's face images (those sharing the same id) into a separate folder. 2 | 3 | 4 | ```shell 5 | # directories and files for your dataset 6 | /image_folder 7 | ├── 0_0_0000000 8 | │   ├── 0_0.jpg 9 | │   ├── 0_1.jpg 10 | │   ├── 0_2.jpg 11 | │   ├── 0_3.jpg 12 | │   └── 0_4.jpg 13 | ├── 0_0_0000001 14 | │   ├── 0_5.jpg 15 | │   ├── 0_6.jpg 16 | │   ├── 0_7.jpg 17 | │   ├── 0_8.jpg 18 | │   └── 0_9.jpg 19 | ├── 0_0_0000002 20 | │   ├── 0_10.jpg 21 | │   ├── 0_11.jpg 22 | │   ├── 0_12.jpg 23 | │   ├── 0_13.jpg 24 | │   ├── 0_14.jpg 25 | │   ├── 0_15.jpg 26 | │   ├── 0_16.jpg 27 | │   └── 0_17.jpg 28 | ├── 0_0_0000003 29 | │   ├── 0_18.jpg 30 | │   ├── 0_19.jpg 31 | │   └── 0_20.jpg 32 | ├── 0_0_0000004 33 | 34 | 35 | # 0) Dependencies installation 36 | pip install opencv-python 37 | apt-get update 38 | apt-get install ffmpeg libsm6 libxext6 -y 39 | 40 | 41 | # 1) create train.lst using the following command 42 | python -m mxnet.tools.im2rec --list --recursive train image_folder 43 | 44 | # 2) create train.rec and train.idx from train.lst using the following command 45 | python -m mxnet.tools.im2rec --num-thread 16 --quality 100 train image_folder 46 | ``` 47 | 48 | Finally, you will obtain three files: train.lst, train.rec, and train.idx, where train.idx and train.rec are utilized for training. 49 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/docs/prepare_webface42m.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## 1. Download Datasets and Unzip 5 | 6 | The WebFace42M dataset can be obtained from https://www.face-benchmark.org/download.html. 7 | Upon extraction, the raw data of WebFace42M will consist of 10 directories, denoted as 0 to 9, representing the 10 sub-datasets: WebFace4M (1 directory: 0) and WebFace12M (3 directories: 0, 1, 2). 8 | 9 | ## 2. Create Shuffled Rec File for DALI 10 | 11 | Shuffled .rec files are crucial for DALI: training on an unshuffled .rec file can degrade performance. Original .rec files generated in the InsightFace style are not compatible with NVIDIA DALI, so you must use the [mxnet.tools.im2rec](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py) tool to generate a shuffled .rec file. 
12 | 13 | 14 | ```shell 15 | # directories and files for your dataset 16 | /WebFace42M_Root 17 | ├── 0_0_0000000 18 | │   ├── 0_0.jpg 19 | │   ├── 0_1.jpg 20 | │   ├── 0_2.jpg 21 | │   ├── 0_3.jpg 22 | │   └── 0_4.jpg 23 | ├── 0_0_0000001 24 | │   ├── 0_5.jpg 25 | │   ├── 0_6.jpg 26 | │   ├── 0_7.jpg 27 | │   ├── 0_8.jpg 28 | │   └── 0_9.jpg 29 | ├── 0_0_0000002 30 | │   ├── 0_10.jpg 31 | │   ├── 0_11.jpg 32 | │   ├── 0_12.jpg 33 | │   ├── 0_13.jpg 34 | │   ├── 0_14.jpg 35 | │   ├── 0_15.jpg 36 | │   ├── 0_16.jpg 37 | │   └── 0_17.jpg 38 | ├── 0_0_0000003 39 | │   ├── 0_18.jpg 40 | │   ├── 0_19.jpg 41 | │   └── 0_20.jpg 42 | ├── 0_0_0000004 43 | 44 | 45 | # 0) Dependencies installation 46 | pip install opencv-python 47 | apt-get update 48 | apt-get install ffmpeg libsm6 libxext6 -y 49 | 50 | 51 | # 1) create train.lst using the following command 52 | python -m mxnet.tools.im2rec --list --recursive train WebFace42M_Root 53 | 54 | # 2) create train.rec and train.idx from train.lst using the following command 55 | python -m mxnet.tools.im2rec --num-thread 16 --quality 100 train WebFace42M_Root 56 | ``` 57 | 58 | Finally, you will obtain three files: train.lst, train.rec, and train.idx, where train.idx and train.rec are utilized for training. 59 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/deep_3drecon_models/arcface_torch/eval/__init__.py -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/flops.py: -------------------------------------------------------------------------------- 1 | from ptflops import get_model_complexity_info 2 | from backbones import get_model 3 | import argparse 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser(description='') 7 | parser.add_argument('n', type=str, default="r100") 8 | args = parser.parse_args() 9 | net = get_model(args.n) 10 | macs, params = get_model_complexity_info( 11 | net, (3, 112, 112), as_strings=False, 12 | print_per_layer_stat=True, verbose=True) 13 | gmacs = macs / (1000**3) 14 | print("%.3f GFLOPs"%gmacs) 15 | print("%.3f Mparams"%(params/(1000**2))) 16 | 17 | if hasattr(net, "extra_gflops"): 18 | print("%.3f Extra-GFLOPs"%net.extra_gflops) 19 | print("%.3f Total-GFLOPs"%(gmacs+net.extra_gflops)) 20 | 21 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/inference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import cv2 4 | import numpy as np 5 | import torch 6 | 7 | from backbones import get_model 8 | 9 | 10 | @torch.no_grad() 11 | def inference(weight, name, img): 12 | if img is None: 13 | img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.uint8) 14 | else: 15 | img = cv2.imread(img) 16 | img = cv2.resize(img, (112, 112)) 17 | 18 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 19 | img = np.transpose(img, (2, 0, 1)) 20 | img = torch.from_numpy(img).unsqueeze(0).float() 21 | img.div_(255).sub_(0.5).div_(0.5) 22 | net = get_model(name, fp16=False) 23 | net.load_state_dict(torch.load(weight)) 24 | net.eval() 25 | feat = net(img).numpy() 26 | print(feat) 27 | 28 | 29 | if __name__ == "__main__": 30 | parser = 
argparse.ArgumentParser(description='PyTorch ArcFace Training') 31 | parser.add_argument('--network', type=str, default='r50', help='backbone network') 32 | parser.add_argument('--weight', type=str, default='') 33 | parser.add_argument('--img', type=str, default=None) 34 | args = parser.parse_args() 35 | inference(args.weight, args.network, args.img) 36 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import _LRScheduler 2 | 3 | 4 | class PolyScheduler(_LRScheduler): 5 | def __init__(self, optimizer, base_lr, max_steps, warmup_steps, last_epoch=-1): 6 | self.base_lr = base_lr 7 | self.warmup_lr_init = 0.0001 8 | self.max_steps: int = max_steps 9 | self.warmup_steps: int = warmup_steps 10 | self.power = 2 11 | super(PolyScheduler, self).__init__(optimizer, -1, False) 12 | self.last_epoch = last_epoch 13 | 14 | def get_warmup_lr(self): 15 | alpha = float(self.last_epoch) / float(self.warmup_steps) 16 | return [self.base_lr * alpha for _ in self.optimizer.param_groups] 17 | 18 | def get_lr(self): 19 | if self.last_epoch == -1: 20 | return [self.warmup_lr_init for _ in self.optimizer.param_groups] 21 | if self.last_epoch < self.warmup_steps: 22 | return self.get_warmup_lr() 23 | else: 24 | alpha = pow( 25 | 1 26 | - float(self.last_epoch - self.warmup_steps) 27 | / float(self.max_steps - self.warmup_steps), 28 | self.power, 29 | ) 30 | return [self.base_lr * alpha for _ in self.optimizer.param_groups] 31 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/requirement.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | easydict 3 | mxnet 4 | onnx 5 | sklearn 6 | opencv-python -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/run.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 train_v2.py $@ 2 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/scripts/shuffle_rec.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing 3 | import os 4 | import time 5 | 6 | import mxnet as mx 7 | import numpy as np 8 | 9 | 10 | def read_worker(args, q_in): 11 | path_imgidx = os.path.join(args.input, "train.idx") 12 | path_imgrec = os.path.join(args.input, "train.rec") 13 | imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, "r") 14 | 15 | s = imgrec.read_idx(0) 16 | header, _ = mx.recordio.unpack(s) 17 | assert header.flag > 0 18 | 19 | imgidx = np.array(range(1, int(header.label[0]))) 20 | np.random.shuffle(imgidx) 21 | 22 | for idx in imgidx: 23 | item = imgrec.read_idx(idx) 24 | q_in.put(item) 25 | 26 | q_in.put(None) 27 | imgrec.close() 28 | 29 | 30 | def write_worker(args, q_out): 31 | pre_time = time.time() 32 | 33 | if args.input[-1] == '/': 34 | args.input = args.input[:-1] 35 | dirname = os.path.dirname(args.input) 36 | basename = os.path.basename(args.input) 37 | output = os.path.join(dirname, f"shuffled_{basename}") 38 | os.makedirs(output, exist_ok=True) 39 | 40 | path_imgidx = os.path.join(output, "train.idx") 41 
| path_imgrec = os.path.join(output, "train.rec") 42 | save_record = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, "w") 43 | more = True 44 | count = 0 45 | while more: 46 | deq = q_out.get() 47 | if deq is None: 48 | more = False 49 | else: 50 | header, jpeg = mx.recordio.unpack(deq) 51 | # TODO it is currently not fully developed 52 | if isinstance(header.label, float): 53 | label = header.label 54 | else: 55 | label = header.label[0] 56 | 57 | header = mx.recordio.IRHeader(flag=header.flag, label=label, id=header.id, id2=header.id2) 58 | save_record.write_idx(count, mx.recordio.pack(header, jpeg)) 59 | count += 1 60 | if count % 10000 == 0: 61 | cur_time = time.time() 62 | print('save time:', cur_time - pre_time, ' count:', count) 63 | pre_time = cur_time 64 | print(count) 65 | save_record.close() 66 | 67 | 68 | def main(args): 69 | queue = multiprocessing.Queue(10240) 70 | read_process = multiprocessing.Process(target=read_worker, args=(args, queue)) 71 | read_process.daemon = True 72 | read_process.start() 73 | write_process = multiprocessing.Process(target=write_worker, args=(args, queue)) 74 | write_process.start() 75 | write_process.join() 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument('input', help='path to source rec.') 81 | main(parser.parse_args()) 82 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/torch2onnx.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import onnx 3 | import torch 4 | 5 | 6 | def convert_onnx(net, path_module, output, opset=11, simplify=False): 7 | assert isinstance(net, torch.nn.Module) 8 | img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.int32) 9 | img = img.astype(np.float) 10 | img = (img / 255. 
- 0.5) / 0.5 # torch style norm 11 | img = img.transpose((2, 0, 1)) 12 | img = torch.from_numpy(img).unsqueeze(0).float() 13 | 14 | weight = torch.load(path_module) 15 | net.load_state_dict(weight, strict=True) 16 | net.eval() 17 | torch.onnx.export(net, img, output, input_names=["data"], keep_initializers_as_inputs=False, verbose=False, opset_version=opset) 18 | model = onnx.load(output) 19 | graph = model.graph 20 | graph.input[0].type.tensor_type.shape.dim[0].dim_param = 'None' 21 | if simplify: 22 | from onnxsim import simplify 23 | model, check = simplify(model) 24 | assert check, "Simplified ONNX model could not be validated" 25 | onnx.save(model, output) 26 | 27 | 28 | if __name__ == '__main__': 29 | import os 30 | import argparse 31 | from backbones import get_model 32 | 33 | parser = argparse.ArgumentParser(description='ArcFace PyTorch to onnx') 34 | parser.add_argument('input', type=str, help='input backbone.pth file or path') 35 | parser.add_argument('--output', type=str, default=None, help='output onnx path') 36 | parser.add_argument('--network', type=str, default=None, help='backbone network') 37 | parser.add_argument('--simplify', type=bool, default=False, help='onnx simplify') 38 | args = parser.parse_args() 39 | input_file = args.input 40 | if os.path.isdir(input_file): 41 | input_file = os.path.join(input_file, "model.pt") 42 | assert os.path.exists(input_file) 43 | # model_name = os.path.basename(os.path.dirname(input_file)).lower() 44 | # params = model_name.split("_") 45 | # if len(params) >= 3 and params[1] in ('arcface', 'cosface'): 46 | # if args.network is None: 47 | # args.network = params[2] 48 | assert args.network is not None 49 | print(args) 50 | backbone_onnx = get_model(args.network, dropout=0.0, fp16=False, num_features=512) 51 | if args.output is None: 52 | args.output = os.path.join(os.path.dirname(args.input), "model.onnx") 53 | convert_onnx(backbone_onnx, input_file, args.output, simplify=args.simplify) 54 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/deep_3drecon_models/arcface_torch/utils/__init__.py -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/utils/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | from menpo.visualize.viewmatplotlib import sample_colours_from_colourmap 8 | from prettytable import PrettyTable 9 | from sklearn.metrics import roc_curve, auc 10 | 11 | with open(sys.argv[1], "r") as f: 12 | files = f.readlines() 13 | 14 | files = [x.strip() for x in files] 15 | image_path = "/train_tmp/IJB_release/IJBC" 16 | 17 | 18 | def read_template_pair_list(path): 19 | pairs = pd.read_csv(path, sep=' ', header=None).values 20 | t1 = pairs[:, 0].astype(np.int_) 21 | t2 = pairs[:, 1].astype(np.int_) 22 | label = pairs[:, 2].astype(np.int_) 23 | return t1, t2, label 24 | 25 | 26 | p1, p2, label = read_template_pair_list( 27 | os.path.join('%s/meta' % image_path, 28 | '%s_template_pair_label.txt' % 'ijbc')) 29 | 30 | methods = [] 31 | scores = [] 32 | for file in files: 33 | methods.append(file) 34 | scores.append(np.load(file)) 
35 | 36 | methods = np.array(methods) 37 | scores = dict(zip(methods, scores)) 38 | colours = dict( 39 | zip(methods, sample_colours_from_colourmap(methods.shape[0], 'Set2'))) 40 | x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] 41 | tpr_fpr_table = PrettyTable(['Methods'] + [str(x) for x in x_labels]) 42 | fig = plt.figure() 43 | for method in methods: 44 | fpr, tpr, _ = roc_curve(label, scores[method]) 45 | roc_auc = auc(fpr, tpr) 46 | fpr = np.flipud(fpr) 47 | tpr = np.flipud(tpr) # select largest tpr at same fpr 48 | plt.plot(fpr, 49 | tpr, 50 | color=colours[method], 51 | lw=1, 52 | label=('[%s (AUC = %0.4f %%)]' % 53 | (method.split('-')[-1], roc_auc * 100))) 54 | tpr_fpr_row = [] 55 | tpr_fpr_row.append(method) 56 | for fpr_iter in np.arange(len(x_labels)): 57 | _, min_index = min( 58 | list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) 59 | tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) 60 | tpr_fpr_table.add_row(tpr_fpr_row) 61 | plt.xlim([10 ** -6, 0.1]) 62 | plt.ylim([0.3, 1.0]) 63 | plt.grid(linestyle='--', linewidth=1) 64 | plt.xticks(x_labels) 65 | plt.yticks(np.linspace(0.3, 1.0, 8, endpoint=True)) 66 | plt.xscale('log') 67 | plt.xlabel('False Positive Rate') 68 | plt.ylabel('True Positive Rate') 69 | plt.title('ROC on IJB') 70 | plt.legend(loc="lower right") 71 | print(tpr_fpr_table) 72 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/utils/utils_config.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os.path as osp 3 | 4 | 5 | def get_config(config_file): 6 | assert config_file.startswith('configs/'), 'config file setting must start with configs/' 7 | temp_config_name = osp.basename(config_file) 8 | temp_module_name = osp.splitext(temp_config_name)[0] 9 | config = importlib.import_module("configs.base") 10 | cfg = config.config 11 | config = importlib.import_module("configs.%s" % temp_module_name) 12 | job_cfg = config.config 13 | cfg.update(job_cfg) 14 | if cfg.output is None: 15 | cfg.output = osp.join('work_dirs', temp_module_name) 16 | return cfg -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/utils/utils_logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 5 | 6 | class AverageMeter(object): 7 | """Computes and stores the average and current value 8 | """ 9 | 10 | def __init__(self): 11 | self.val = None 12 | self.avg = None 13 | self.sum = None 14 | self.count = None 15 | self.reset() 16 | 17 | def reset(self): 18 | self.val = 0 19 | self.avg = 0 20 | self.sum = 0 21 | self.count = 0 22 | 23 | def update(self, val, n=1): 24 | self.val = val 25 | self.sum += val * n 26 | self.count += n 27 | self.avg = self.sum / self.count 28 | 29 | 30 | def init_logging(rank, models_root): 31 | if rank == 0: 32 | log_root = logging.getLogger() 33 | log_root.setLevel(logging.INFO) 34 | formatter = logging.Formatter("Training: %(asctime)s-%(message)s") 35 | handler_file = logging.FileHandler(os.path.join(models_root, "training.log")) 36 | handler_stream = logging.StreamHandler(sys.stdout) 37 | handler_file.setFormatter(formatter) 38 | handler_stream.setFormatter(formatter) 39 | log_root.addHandler(handler_file) 40 | log_root.addHandler(handler_stream) 41 | log_root.info('rank_id: %d' % rank) 42 | 
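A minimal usage sketch for the two helpers above (`AverageMeter` and `init_logging`), added here for illustration. It assumes a single-process run (rank 0), that the script is launched from the `arcface_torch` root so `utils.utils_logging` is importable, and that `work_dirs/demo` is just a placeholder output directory:

```python
import logging
import os

from utils.utils_logging import AverageMeter, init_logging  # the helpers defined above

os.makedirs("work_dirs/demo", exist_ok=True)    # init_logging writes training.log into this directory
init_logging(rank=0, models_root="work_dirs/demo")

loss_meter = AverageMeter()
for step in range(100):
    loss = 1.0 / (step + 1)         # placeholder loss value
    loss_meter.update(loss, n=1)    # accumulate running sum and count
    if step % 20 == 0:
        logging.info("step %d, running avg loss %.4f", step, loss_meter.avg)
```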
-------------------------------------------------------------------------------- /deep_3drecon/generate_reconstructor_opt_for_geneface.py: -------------------------------------------------------------------------------- 1 | from options.test_options import TestOptions 2 | import pickle as pkl 3 | 4 | # run in the root dir! 5 | opt = TestOptions().parse() # get test options 6 | opt.name='facerecon' 7 | opt.epoch=20 8 | opt.bfm_folder='deep_3drecon/BFM/' 9 | opt.checkpoints_dir='deep_3drecon/checkpoints/' 10 | 11 | with open("deep_3drecon/reconstructor_opt.pkl", 'wb') as f: 12 | pkl.dump(opt, f) 13 | -------------------------------------------------------------------------------- /deep_3drecon/ncc_code.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/ncc_code.npy -------------------------------------------------------------------------------- /deep_3drecon/options/__init__.py: -------------------------------------------------------------------------------- 1 | """This package options includes option modules: training options, test options, and basic options (used in both training and test).""" 2 | -------------------------------------------------------------------------------- /deep_3drecon/options/test_options.py: -------------------------------------------------------------------------------- 1 | """This script contains the test options for Deep3DFaceRecon_pytorch 2 | """ 3 | 4 | from .base_options import BaseOptions 5 | 6 | 7 | class TestOptions(BaseOptions): 8 | """This class includes test options. 9 | 10 | It also includes shared options defined in BaseOptions. 11 | """ 12 | 13 | def initialize(self, parser): 14 | parser = BaseOptions.initialize(self, parser) # define shared options 15 | parser.add_argument('--phase', type=str, default='test', help='train, val, test, etc') 16 | parser.add_argument('--dataset_mode', type=str, default=None, help='chooses how datasets are loaded. [None | flist]') 17 | parser.add_argument('--img_folder', type=str, default='examples', help='folder for test images.') 18 | 19 | # Dropout and Batchnorm has different behavior during training and test. 
20 | self.isTrain = False 21 | return parser 22 | -------------------------------------------------------------------------------- /deep_3drecon/reconstructor_opt.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/reconstructor_opt.pkl -------------------------------------------------------------------------------- /deep_3drecon/util/BBRegressorParam_r.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/util/BBRegressorParam_r.mat -------------------------------------------------------------------------------- /deep_3drecon/util/__init__.py: -------------------------------------------------------------------------------- 1 | """This package includes a miscellaneous collection of useful helper functions.""" 2 | from .util import * 3 | -------------------------------------------------------------------------------- /deep_3drecon/util/generate_list.py: -------------------------------------------------------------------------------- 1 | """This script is to generate training list files for Deep3DFaceRecon_pytorch 2 | """ 3 | 4 | import os 5 | 6 | # save path to training data 7 | def write_list(lms_list, imgs_list, msks_list, mode='train',save_folder='datalist', save_name=''): 8 | save_path = os.path.join(save_folder, mode) 9 | if not os.path.isdir(save_path): 10 | os.makedirs(save_path) 11 | with open(os.path.join(save_path, save_name + 'landmarks.txt'), 'w') as fd: 12 | fd.writelines([i + '\n' for i in lms_list]) 13 | 14 | with open(os.path.join(save_path, save_name + 'images.txt'), 'w') as fd: 15 | fd.writelines([i + '\n' for i in imgs_list]) 16 | 17 | with open(os.path.join(save_path, save_name + 'masks.txt'), 'w') as fd: 18 | fd.writelines([i + '\n' for i in msks_list]) 19 | 20 | # check if the path is valid 21 | def check_list(rlms_list, rimgs_list, rmsks_list): 22 | lms_list, imgs_list, msks_list = [], [], [] 23 | for i in range(len(rlms_list)): 24 | flag = 'false' 25 | lm_path = rlms_list[i] 26 | im_path = rimgs_list[i] 27 | msk_path = rmsks_list[i] 28 | if os.path.isfile(lm_path) and os.path.isfile(im_path) and os.path.isfile(msk_path): 29 | flag = 'true' 30 | lms_list.append(rlms_list[i]) 31 | imgs_list.append(rimgs_list[i]) 32 | msks_list.append(rmsks_list[i]) 33 | print(i, rlms_list[i], flag) 34 | return lms_list, imgs_list, msks_list 35 | -------------------------------------------------------------------------------- /docs/prepare_env/install_guide-zh.md: -------------------------------------------------------------------------------- 1 | # 环境配置 2 | [English Doc](./install_guide.md) 3 | 4 | 本文档陈述了搭建MimicTalk Python环境的步骤,我们使用了Conda来管理依赖(与`Real3D-Portrait`的依赖一致)。 5 | 6 | 以下配置已在 A100/V100 + CUDA12.1 中进行了验证。 7 | 8 | 9 | # 安装Python依赖与CUDA 10 | ```bash 11 | cd 12 | source /bin/activate 13 | conda create -n mimictalk python=3.9 14 | conda activate mimictalk 15 | 16 | # MMCV for SegFormer network structure 17 | # 其他依赖项 18 | pip install -r docs/prepare_env/requirements.txt -v 19 | pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 20 | pip install cython 21 | pip install openmim==0.3.9 22 | mim install mmcv==2.1.0 # 使用mim来加速mmcv安装 23 | ## 从源代码build pytorch3d 24 | ## 这可能会花费较长时间(可能数十分钟左右);由于要连接Github,可能经常面临time-out问题,请考虑使用代理。 25 | # 安装pytorch3d之前, 
需要安装CUDA-12.1 (https://developer.nvidia.com/cuda-toolkit-archive) 并确保 /usr/local/cuda 指向了 `cuda-12.1` 目录 26 | pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" 27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /docs/prepare_env/install_guide.md: -------------------------------------------------------------------------------- 1 | # Prepare the Environment 2 | [中文文档](./install_guide-zh.md) 3 | 4 | This guide is about building a python environment for MimicTalk with Conda (the same as `Real3D-Portrait`). 5 | 6 | The following installation process is verified in A100/V100 + CUDA12.1. 7 | 8 | # Install Python Packages & CUDA 9 | ```bash 10 | cd 11 | source /bin/activate 12 | conda create -n mimictalk python=3.9 13 | conda activate mimictalk 14 | 15 | # MMCV for SegFormer network structure 16 | # other dependencies 17 | pip install -r docs/prepare_env/requirements.txt -v 18 | pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 19 | pip install cython 20 | pip install openmim==0.3.9 21 | mim install mmcv==2.1.0 # use mim to speed up installation for mmcv 22 | ## build pytorch3d from Github's source code. 23 | ## It may take a long time (maybe tens of minutes), Proxy is recommended if encountering the time-out problem 24 | # Before install pytorch3d, you need to install CUDA-12.1 (https://developer.nvidia.com/cuda-toolkit-archive) and make sure /usr/local/cuda points to the `cuda-12.1` directory 25 | pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" 26 | 27 | ``` -------------------------------------------------------------------------------- /docs/prepare_env/requirements.txt: -------------------------------------------------------------------------------- 1 | Cython 2 | numpy # ==1.23.0 3 | numba==0.56.4 4 | pandas 5 | transformers 6 | scipy==1.11.1 # required by cal_fid. 
https://github.com/mseitzer/pytorch-fid/issues/103 7 | scikit-learn 8 | scikit-image 9 | # tensorflow # you can flexible it, this is gpu version 10 | tensorboard 11 | tensorboardX 12 | python_speech_features 13 | resampy 14 | opencv_python 15 | face_alignment 16 | matplotlib 17 | configargparse 18 | librosa==0.9.2 19 | praat-parselmouth # ==0.4.3 20 | trimesh 21 | kornia==0.5.0 22 | PyMCubes 23 | lpips 24 | setuptools # ==59.5.0 25 | ffmpeg-python 26 | moviepy 27 | dearpygui 28 | ninja 29 | pyaudio # for extract esperanto 30 | mediapipe 31 | protobuf 32 | decord 33 | soundfile 34 | pillow 35 | # torch # it's better to install torch with conda 36 | av 37 | timm 38 | pretrainedmodels 39 | faiss-cpu # for fast nearest camera pose retriveal 40 | einops 41 | # mmcv # use mim install is faster 42 | 43 | # conditional flow matching 44 | beartype 45 | torchode 46 | torchdiffeq 47 | 48 | # tts 49 | cython 50 | textgrid 51 | pyloudnorm 52 | websocket-client 53 | pyworld==0.2.1rc0 54 | pypinyin==0.42.0 55 | webrtcvad 56 | torchshow 57 | 58 | # cal spk sim 59 | # s3prl 60 | # fire 61 | 62 | # cal LMD 63 | # dlib 64 | 65 | # debug 66 | # ipykernel 67 | 68 | # lama 69 | # hydra-core 70 | # pytorch_lightning 71 | # setproctitle 72 | 73 | # Gradio GUI 74 | # httpx==0.23.3 75 | # gradio==4.16.0 76 | gradio==4.43.0 77 | httpx==0.23.3 78 | # gradio_client==0.8.1 79 | fastapi==0.112.2 -------------------------------------------------------------------------------- /docs/process_data/process_th1kh.md: -------------------------------------------------------------------------------- 1 | # process dataset 2 | we use Talking-Head-1K-Hour as the example. 3 | 4 | ## download and crop the talking person video clips 5 | - Please follow the step in [https://github.com/tcwang0509/TalkingHead-1KH](https://github.com/tcwang0509/TalkingHead-1KH) 6 | - Put all extracted video clips in a directory like `/home/xxx/TH1KH_512/video_raw/*.mp4` 7 | 8 | ## resample & resize video clips to 512x512 resolution and 25FPS 9 | - You can use the example code in `data_gen/utils/process_video/resample_video_to_25fps_resize_to_512.py` 10 | - It will generate processed video clips in `/home/xxx/TH1KH_512/video/*.mp4` 11 | 12 | ## extract segment images 13 | - You can use the example code in `data_gen/utils/process_video/extract_segment_imgs.py` 14 | - It will generate segment images in `/home/xxx/TH1KH_512/{gt_imgs, head_imgs, inpaint_torso_imgs, com_imgs}/*` 15 | 16 | ## extract 2d facial landmark 17 | - You can use the example code in `data_gen/utils/process_video/extract_lm2d.py` 18 | - It will generate 2d landmarks in `/home/xxx/TH1KH_512/lms_2d/*_lms_2d.npy` 19 | 20 | ## extract 3dmm coefficients 21 | - You can use the example code in `data_gen/utils/process_video/fit_3dmm_landmark.py` 22 | - It will generate 3dmm coefficients in `/home/xxx/TH1KH_512/coeff_fit_mp/*_coeff_fit_mp.npy` 23 | 24 | ## extract audio features 25 | - You can use the example code in `data_gen/utils/process_audio/extract_mel_f0.py` 26 | - It will generate raw wav in `/home/xxx/TH1KH_512/audio/*.wav` and mel_f0 in `/home/xxx/TH1KH_512/mel_f0/*_mel_f0.npy` 27 | - You can use the example code in `data_gen/utils/process_audio/extract_hubert.py` 28 | - It will generate hubert in `/home/xxx/TH1KH_512/hubert/*_hubert.npy` 29 | 30 | ## Binarize the dataset 31 | - You can use the example code in `data_gen/runs/binarizer_th1kh.py` 32 | - You will see a binarized dataset at `data/binary/th1kh` 33 | 
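As a quick sanity check after running the steps above, you can count the generated files and confirm the binarized output directory exists. This is only a sketch: `/home/xxx/TH1KH_512` is the placeholder dataset root used throughout this guide, and the glob patterns come from the output names listed in each step.

```python
import glob
import os

root = "/home/xxx/TH1KH_512"  # replace with your own dataset root
expected = {
    "video": "*.mp4",
    "lms_2d": "*_lms_2d.npy",
    "coeff_fit_mp": "*_coeff_fit_mp.npy",
    "audio": "*.wav",
    "mel_f0": "*_mel_f0.npy",
    "hubert": "*_hubert.npy",
}
for sub_dir, pattern in expected.items():
    num_files = len(glob.glob(os.path.join(root, sub_dir, pattern)))
    print(f"{sub_dir}: {num_files} files")
print("binarized dataset exists:", os.path.exists("data/binary/th1kh"))
```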
-------------------------------------------------------------------------------- /docs/train_models/train_audio2motion.md: -------------------------------------------------------------------------------- 1 | # 0. Get pre-trained models & Data 2 | - Get the Binarized dataset following `docs/process_data/process_th1kh.md`. You will see `data/binary/th1kh/train.data` 3 | 4 | # 1. Train audio_lm3d_syncnet 5 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python tasks/run.py --config=egs/os_avatar/audio_lm3d_syncnet.yaml --exp_name=audio_lm3d_syncnet --reset 6 | 7 | 8 | # 2. Train audio2motion model 9 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python tasks/run.py --config=egs/os_avatar/audio2motion_vae.yaml --exp_name=audio2motion_vae --hparams=syncnet_ckpt_dir=checkpoints/audio_lm3d_syncnet --reset 10 | 11 | # 3. Inference 12 | - See `README.md`, and change the checkpoint name to your own audio2motion_vae model. 13 | -------------------------------------------------------------------------------- /docs/train_models/train_motion2video.md: -------------------------------------------------------------------------------- 1 | # 0. Get pre-trained models & Data 2 | - Get the Binarized dataset following `docs/process_data/process_th1kh.md`. You will see `data/binary/th1kh/train.data` 3 | - Download `pretrained_ckpts.zip` from this [Google Drive](https://drive.google.com/drive/folders/1MAveJf7RvJ-Opg1f5qhLdoRoC_Gc6nD9?usp=sharing), unzip it, and place it into `checkpoints/pretrained_ckpts`. You will see `checkpoints/pretrained_ckpts/mit_b0.pth` and `checkpoints/pretrained_ckpts/eg3d_baseline_run2`. 4 | 5 | 6 | # 1. Train Img-to-Plane Model 7 | ## 1.1 image-to-triplane model in real3d-portrait 8 | ``` 9 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python tasks/run.py --config=egs/os_avatar/img2plane.yaml --hparams=triplane_feature_type=triplane --exp_name=img2plane --reset 10 | ``` 11 | ## 1.2 image-to-grid model in zera-portrait (Recommended) 12 | ``` 13 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python tasks/run.py --config=egs/os_avatar/img2plane.yaml --exp_name=img2grid --reset 14 | ``` 15 | 16 | # 2. Train Motion-to-Video Model 17 | ``` 18 | # secc2plane_head 19 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python tasks/run.py --config=egs/os_avatar/secc_img2plane.yaml --exp_name=secc2plane --hparams=init_from_ckpt=checkpoints/img2grid --reset 20 | 21 | # secc2plane_torso 22 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python tasks/run.py --config=egs/os_avatar/secc_img2plane_torso.yaml --exp_name=secc2plane_torso --hparams=init_from_ckpt=checkpoints/secc2plane --reset 23 | ``` 24 | 25 | # 3. Inference 26 | - See `README.md`, and change the checkpoint name to your own secc2plane_torso model.
27 | -------------------------------------------------------------------------------- /egs/egs_bases/audio2motion/base.yaml: -------------------------------------------------------------------------------- 1 | # project-related 2 | work_dir: '' 3 | load_ckpt: '' 4 | tb_log_interval: 100 5 | 6 | # testing related 7 | gen_dir_name: '' 8 | save_gt: true 9 | 10 | # training-scheme-related 11 | num_ckpt_keep: 100 12 | val_check_interval: 2000 13 | valid_infer_interval: 2000 14 | max_updates: 4_0000 15 | seed: 9999 16 | lr: 0.0005 17 | scheduler: exponential # exponential|rsqrt|warmup|none|step_lr 18 | warmup_updates: 1000 19 | optimizer_adam_beta1: 0.9 20 | optimizer_adam_beta2: 0.999 21 | weight_decay: 0 22 | accumulate_grad_batches: 1 23 | clip_grad_norm: 1 24 | clip_grad_value: 0 25 | num_sanity_val_steps: 5 26 | num_valid_plots: 1 27 | eval_max_batches: 10 # num_test_plots 28 | print_nan_grads: false 29 | resume_from_checkpoint: 0 # specify the step, 0 for latest 30 | amp: false 31 | valid_monitor_key: val_loss 32 | valid_monitor_mode: min 33 | save_best: false 34 | debug: false 35 | save_codes: 36 | - tasks 37 | - modules 38 | - egs 39 | 40 | # model-related 41 | hidden_size: 256 42 | 43 | # infer-related 44 | infer_audio_source_name: '' 45 | infer_out_npy_name: '' 46 | infer_ckpt_steps: 40000 47 | 48 | load_db_to_memory: false # enable it for faster indexing 49 | 50 | max_sentences_per_batch: 512 51 | max_tokens_per_batch: 20000 52 | num_workers: 4 53 | 54 | audio_type: hubert 55 | motion_type: idexp_lm3d 56 | use_kv_dataset: false 57 | use_fork: true -------------------------------------------------------------------------------- /egs/egs_bases/audio2motion/vae.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./base.yaml 3 | 4 | # VAE related 5 | task_cls: tasks.audio2motion.lm3d_vae.VAEAudio2MotionTask 6 | lambda_kl: 0.5 7 | 8 | -------------------------------------------------------------------------------- /egs/egs_bases/audio2motion/vae_sync.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./base.yaml 3 | 4 | # VAE related 5 | task_cls: tasks.audio2motion.lm3d_vae_sync.VAESyncAudio2MotionTask 6 | lambda_kl: 0.5 7 | 8 | # SyncNet related 9 | syncnet_work_dir: checkpoints/lrs3/syncnet 10 | syncnet_ckpt_steps: 40000 11 | -------------------------------------------------------------------------------- /egs/egs_bases/audio2pose/base.yaml: -------------------------------------------------------------------------------- 1 | # dataset-related 2 | raw_data_dir: data/raw/videos 3 | processed_data_dir: data/processed/videos 4 | binary_data_dir: data/binary/videos 5 | video_id: '' 6 | task_cls: '' 7 | 8 | # project-related 9 | work_dir: '' 10 | load_ckpt: '' 11 | tb_log_interval: 100 12 | val_check_interval: 1000 13 | valid_infer_interval: 1000 14 | num_sanity_val_steps: 5 15 | num_valid_plots: 1 16 | eval_max_batches: 10 # num_test_plots 17 | print_nan_grads: false 18 | resume_from_checkpoint: 0 # specify the step, 0 for latest 19 | amp: false 20 | valid_monitor_key: val_loss 21 | valid_monitor_mode: min 22 | save_best: true 23 | debug: false 24 | save_codes: 25 | - tasks 26 | - modules 27 | - egs 28 | accumulate_grad_batches: 1 29 | clip_grad_norm: 1. 
30 | 31 | # training-scheme-related 32 | task_cls: tasks.audio2pose.audio2pose.Audio2PoseTask 33 | max_updates: 1_0000 34 | seed: 9999 35 | lr: 0.0005 36 | optimizer_adam_beta1: 0.9 37 | optimizer_adam_beta2: 0.999 38 | scheduler: exponential # exponential|rsqrt|warmup|none|step_lr 39 | warmup_updates: 1000 40 | 41 | valid_infer_interval: 1000 42 | val_check_interval: 1000 43 | num_ckpt_keep: 10 44 | 45 | source_name: '' 46 | infer_out_npy_name: '' 47 | reception_field: 100 -------------------------------------------------------------------------------- /egs/egs_bases/eg3d/base_mse.yaml: -------------------------------------------------------------------------------- 1 | # dataset-related 2 | raw_data_dir: data/raw/videos 3 | processed_data_dir: data/processed/videos 4 | binary_data_dir: data/binary/videos 5 | video_id: May 6 | 7 | # feature-related 8 | cond_type: idexp_lm3d_normalized 9 | smo_win_size: 5 10 | cond_hid_dim: 32 11 | cond_out_dim: 16 12 | # generator_condition_on_pose: false # pose is camera extrinsic and intrinsic 13 | generator_condition_on_pose: true # pose is camera extrinsic and intrinsic 14 | gpc_reg_prob: 0.5 15 | gpc_reg_fade_kimg: 1000 16 | 17 | # network-related 18 | task_cls: tasks.eg3ds.eg3d_task.EG3DTask 19 | z_dim: 512 20 | w_dim: 512 21 | neural_rendering_resolution: 128 22 | final_resolution: 512 23 | 24 | base_channel: 32768 # Capacity multiplier 25 | max_channel: 512 # Max. feature maps 26 | mapping_network_depth: 2 # num of layers in mapping network 27 | num_fp16_layers_in_super_resolution: 4 28 | num_fp16_layers_in_generator: 0 29 | num_fp16_layers_in_discriminator: 4 30 | 31 | 32 | # GAN-related 33 | blur_raw_target: true 34 | blur_init_sigma: 10 35 | # blur_fade_kimg: 200 # Fade out the blur during the first N kimg. 36 | blur_fade_kimg: 20 # Fade out the blur during the first N kimg. 37 | # neural rendering-related 38 | num_samples_coarse: 48 # number of uniform samples to take per ray. 39 | num_samples_fine: 48 # number of importance samples to take per ray. 40 | ray_near: 2.25 41 | ray_far: 4.05 42 | box_warp: 1 # the side-length of the bounding box spanned by the tri-planes; box_warp=1 means [-0.5, -0.5, -0.5] -> [0.5, 0.5, 0.5]. 43 | 44 | # loss related 45 | group_size_for_mini_batch_std: 2 # 4 46 | lambda_gradient_penalty: 5. # gradient penalty to discriminator 47 | 48 | 49 | lambda_G_supervise_adv: 0. 50 | lambda_G_supervise_mse_raw: 1.0 51 | lambda_G_supervise_mse: 0. 52 | lambda_G_adversarial_adv: 0. 53 | 54 | lambda_density_reg: 0.25 # strength of density regularization for Generator 55 | density_reg_p_dist: 0.004 # distance at which to sample perturbed points for density regularization 56 | 57 | 58 | # trainer related 59 | seed: 9999 60 | lr_g: 0.0025 61 | lr_d: 0.002 62 | optimizer_adam_beta1_g: 0. 63 | optimizer_adam_beta2_g: 0.99 64 | optimizer_adam_beta1_d: 0. 
65 | optimizer_adam_beta2_d: 0.99 66 | reg_interval_g: 4 67 | reg_interval_d: 16 68 | 69 | batch_size: 4 70 | ema_interval: 400 # bs * 10 / 32 kimg 71 | max_updates: 25000_000 # 25000 kimg 72 | num_workers: 4 73 | work_dir: '' 74 | load_ckpt: '' 75 | tb_log_interval: 100 76 | num_ckpt_keep: 1 77 | val_check_interval: 2000 78 | valid_infer_interval: 2000 79 | num_sanity_val_steps: 1 80 | num_valid_plots: 25 81 | eval_max_batches: 100 # num_test_plots 82 | print_nan_grads: false 83 | resume_from_checkpoint: 0 # specify the step, 0 for latest 84 | amp: false 85 | valid_monitor_key: val_loss 86 | valid_monitor_mode: min 87 | save_best: true 88 | debug: false 89 | save_codes: 90 | - tasks 91 | - modules 92 | - egs 93 | accumulate_grad_batches: 1 94 | clip_grad_norm: 0 #1 95 | clip_grad_value: 0 96 | 97 | -------------------------------------------------------------------------------- /egs/egs_bases/nerf/adnerf.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/nerf/base.yaml 3 | 4 | task_cls: tasks.nerfs.adnerf.ADNeRFTask 5 | cond_type: deepspeech 6 | no_smo_iterations: 20_0000 7 | cond_win_size: 16 8 | smo_win_size: 8 -------------------------------------------------------------------------------- /egs/egs_bases/nerf/adnerf_torso.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/nerf/adnerf.yaml 3 | 4 | task_cls: tasks.nerfs.adnerf_torso.ADNeRFTorsoTask 5 | no_smo_iterations: 0 # nerf_torso use the fixed audatt_net from head_nerf 6 | head_model_dir: '' 7 | use_color: false 8 | -------------------------------------------------------------------------------- /egs/egs_bases/nerf/base.yaml: -------------------------------------------------------------------------------- 1 | # dataset-related 2 | raw_data_dir: data/raw/videos 3 | processed_data_dir: data/processed/videos 4 | binary_data_dir: data/binary/videos 5 | video_id: '' 6 | task_cls: '' 7 | 8 | # project-related 9 | work_dir: '' 10 | load_ckpt: '' 11 | tb_log_interval: 100 12 | num_ckpt_keep: 1 13 | val_check_interval: 10000 14 | valid_infer_interval: 10000 15 | num_sanity_val_steps: 0 16 | num_valid_plots: 5 17 | eval_max_batches: 100 # num_test_plots 18 | print_nan_grads: false 19 | resume_from_checkpoint: 0 # specify the step, 0 for latest 20 | amp: false 21 | valid_monitor_key: val_loss 22 | valid_monitor_mode: min 23 | save_best: true 24 | debug: false 25 | save_codes: 26 | - tasks 27 | - modules 28 | - egs 29 | 30 | # testing related 31 | gen_dir_name: '' 32 | save_gt: true 33 | 34 | # training-scheme-related 35 | max_updates: 40_0000 36 | seed: 9999 37 | lr: 0.0005 38 | scheduler: exponential # exponential|rsqrt|warmup|none|step_lr 39 | warmup_updates: 0 40 | optimizer_adam_beta1: 0.9 41 | optimizer_adam_beta2: 0.999 42 | weight_decay: 0 43 | clip_grad_norm: 0 # disable grad clipping 44 | clip_grad_value: 0 # disable grad clipping 45 | rays_sampler_type: uniform 46 | in_rect_percent: 0.95 47 | accumulate_grad_batches: 1 48 | 49 | # model-related 50 | use_window_cond: true 51 | with_att: true # only available when use win_cond, use a attention Net in AD-NeRF 52 | cond_type: '' 53 | cond_dim: 64 54 | hidden_size: 256 55 | 56 | # NeRF-related 57 | near: 0.3 58 | far: 0.9 59 | n_rays: 1600 # default 2048, 1600 for RTX2080Ti 60 | n_samples_per_ray: 64 61 | n_samples_per_ray_fine: 128 62 | embedding_args: 63 | multi_res_pos: 10 # log2+1 of max freq for positional encoding (3D location) 64 | 
multi_res_views: 4 # log2+1 of max freq for positional encoding (2D direction) 65 | 66 | infer_cond_name: '' 67 | infer_out_video_name: '' 68 | infer_scale_factor: 1.0 69 | infer_smo_std: 0. 70 | infer_audio_source_name: '' 71 | infer_c2w_name: '' 72 | 73 | # postprocessing params 74 | infer_lm3d_clamp_std: 1.5 75 | infer_lm3d_lle_percent: 0.25 # percent of lle fused feature to compose the processed lm3d 76 | infer_lm3d_smooth_sigma: 0. # sigma of gaussian kernel to smooth the predicted lm3d 77 | infer_pose_smooth_sigma: 2. 78 | 79 | load_imgs_to_memory: false # load uint8 training img to memory, which reduce io costs, at the expense of more memory occupation -------------------------------------------------------------------------------- /egs/egs_bases/nerf/lm3d_nerf.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/nerf/base.yaml 3 | 4 | task_cls: tasks.nerfs.lm3d_nerf.Lm3dNeRFTask 5 | cond_type: idexp_lm3d_normalized 6 | no_smo_iterations: 20_0000 7 | 8 | use_window_cond: true # the NeRF only takes the exp at current frame as condition 9 | with_att: true # only available when use win_cond, use a attention Net in AD-NeRF 10 | cond_win_size: 1 11 | smo_win_size: 5 12 | 13 | infer_inject_eye_blink_mode: none # none|gt|period. `gt` uses the eye blink sequence from GT dataset, `period` use a ref blink sequence from GT dataset and repeat it to the final length 14 | infer_eye_blink_ref_frames_start_idx: '' # start index of the ref blink sequence in the GT dataset 15 | infer_eye_blink_ref_frames_end_idx: '' # end index of the ref blink sequence in the GT dataset 16 | 17 | infer_close_mouth_when_sil: False # detect sil frames, then set the mouth to close in these frames 18 | infer_sil_ref_frame_idx: '' # index of the ref frame with a closed mouth in the GT dataset -------------------------------------------------------------------------------- /egs/egs_bases/nerf/lm3d_nerf_torso.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/nerf/lm3d_nerf.yaml 3 | 4 | task_cls: tasks.nerfs.lm3d_nerf_torso.Lm3dNeRFTorsoTask 5 | 6 | no_smo_iterations: 0 # nerf_torso use the fixed audatt_net from head_nerf 7 | use_color: true 8 | 9 | head_model_dir: '' 10 | -------------------------------------------------------------------------------- /egs/egs_bases/os_facev2v/base.yaml: -------------------------------------------------------------------------------- 1 | dataset_params: 2 | root_dir: /zlh/VoxCeleb/first-order-256 3 | frame_shape: [256, 256, 3] 4 | id_sampling: True 5 | pairs_list: None 6 | augmentation_params: 7 | flip_param: 8 | horizontal_flip: True 9 | time_flip: True 10 | jitter_param: 11 | brightness: 0.1 12 | contrast: 0.1 13 | saturation: 0.1 14 | hue: 0.1 15 | 16 | model_params: 17 | common_params: 18 | num_kp: 15 19 | image_channel: 3 20 | feature_channel: 32 21 | estimate_jacobian: False # True 22 | kp_detector_params: 23 | temperature: 0.1 24 | block_expansion: 32 25 | max_features: 1024 26 | scale_factor: 0.25 # 0.25 27 | num_blocks: 5 28 | reshape_channel: 16384 # 16384 = 1024 * 16 29 | reshape_depth: 16 30 | he_estimator_params: 31 | block_expansion: 64 32 | max_features: 2048 33 | num_bins: 66 34 | generator_params: 35 | block_expansion: 64 36 | max_features: 512 37 | num_down_blocks: 2 38 | reshape_channel: 32 39 | reshape_depth: 16 # 512 = 32 * 16 40 | num_resblocks: 6 41 | estimate_occlusion_map: True 42 | dense_motion_params: 43 | 
block_expansion: 32 44 | max_features: 1024 45 | num_blocks: 5 46 | # reshape_channel: 32 47 | reshape_depth: 16 48 | compress: 4 49 | discriminator_params: 50 | scales: [1] 51 | block_expansion: 32 52 | max_features: 512 53 | num_blocks: 4 54 | sn: True 55 | 56 | train_params: 57 | num_epochs: 300 58 | num_repeats: 75 59 | epoch_milestones: [180,] 60 | lr_generator: 2.0e-4 61 | lr_discriminator: 2.0e-4 62 | lr_kp_detector: 2.0e-4 63 | lr_he_estimator: 2.0e-4 64 | gan_mode: 'hinge' # hinge or ls 65 | batch_size: 32 66 | scales: [1, 0.5, 0.25, 0.125] 67 | checkpoint_freq: 10 68 | hopenet_snapshot: "/mnt/bn/sa-ag-data/yezhenhui/myenv/cache/useful_ckpts/hopenet_robust_alpha1.pkl" # https://drive.google.com/open?id=1m25PrSE7g9D2q2XJVMR6IA7RaCvWSzCR 69 | transform_params: 70 | sigma_affine: 0.05 71 | sigma_tps: 0.005 72 | points_tps: 5 73 | loss_weights: 74 | generator_gan: 1 75 | discriminator_gan: 1 76 | feature_matching: [10, 10, 10, 10] 77 | perceptual: [10, 10, 10, 10, 10] 78 | equivariance_value: 10 79 | equivariance_jacobian: 0 # 10 80 | keypoint: 10 81 | headpose: 20 82 | expression: 5 83 | 84 | visualizer_params: 85 | kp_size: 5 86 | draw_border: True 87 | colormap: 'gist_rainbow' 88 | -------------------------------------------------------------------------------- /egs/egs_bases/postnet/base.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/audio2motion/vae_sync.yaml 3 | 4 | task_cls: tasks.postnet.lm3d_postnet_adv_sync.PostnetAdvSyncTask 5 | audio2motion_task_cls: tasks.audio2motion.lm3d_vae_sync.VAESyncAudio2MotionTask 6 | person_binary_data_dir: data/binary/videos 7 | # postnet training 8 | postnet_lr: 0.0001 9 | postnet_lambda_adv: 0.85 10 | postnet_lambda_sync: 0.1 11 | postnet_lambda_mse: 0.05 12 | 13 | # Discriminator 14 | postnet_disc_lr: 0.0001 15 | discriminator_scheduler_params: 16 | gamma: 0.5 17 | step_size: 40000 18 | postnet_disc_start_steps: 0 19 | postnet_disc_interval: 1 20 | 21 | # Training Schedule 22 | scheduler: none 23 | num_ckpt_keep: 500 24 | val_check_interval: 1000 25 | valid_infer_interval: 1000 26 | max_updates: 100000 # 20000 27 | 28 | # Pretrained Ckpts 29 | audio2motion_work_dir: checkpoints/th1kh/lm3d_vae_sync_pitch/ 30 | audio2motion_ckpt_steps: 160000 31 | syncnet_work_dir: checkpoints/th1kh/lm3d_syncnet 32 | syncnet_ckpt_steps: 160000 33 | syncnet_num_layers_per_block: 3 34 | syncnet_base_hid_size: 128 35 | 36 | infer_audio_source_name: data/raw/val_wavs/zozo.wav 37 | infer_out_npy_name: infer_out/May/pred_lm3d/zozo.npy 38 | infer_ckpt_steps: 6000 39 | 40 | load_db_to_memory: false # enable it for faster indexing 41 | -------------------------------------------------------------------------------- /egs/egs_bases/radnerf/lm3d_radnerf.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./base.yaml 3 | 4 | task_cls: tasks.radnerfs.radnerf.RADNeRFTask 5 | cond_type: idexp_lm3d_normalized 6 | cond_win_size: 1 7 | smo_win_size: 5 8 | lambda_lap_ambient_loss: 0. 9 | cond_dropout_rate: 0. 10 | zero_dummy: true 11 | 12 | ambient_coord_dim: 3 13 | -------------------------------------------------------------------------------- /egs/egs_bases/radnerf/radnerf.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./base.yaml 3 | 4 | task_cls: tasks.radnerfs.radnerf.RADNeRFTask 5 | cond_type: esperanto 6 | cond_win_size: 16 7 | smo_win_size: 8 8 | cond_dropout_rate: 0. 
9 | lambda_lap_ambient_loss: 0. 10 | mask_cond: false -------------------------------------------------------------------------------- /egs/egs_bases/syncnet/base.yaml: -------------------------------------------------------------------------------- 1 | # dataset-related 2 | binary_data_dir: data/binary/lrs3 3 | 4 | # project-related 5 | work_dir: '' 6 | load_ckpt: '' 7 | tb_log_interval: 100 8 | val_check_interval: 1000 9 | valid_infer_interval: 1000 10 | num_sanity_val_steps: 5 11 | num_valid_plots: 1 12 | eval_max_batches: 10 # num_test_plots 13 | print_nan_grads: false 14 | resume_from_checkpoint: 0 # specify the step, 0 for latest 15 | amp: false 16 | valid_monitor_key: val_loss 17 | valid_monitor_mode: min 18 | save_best: true 19 | debug: false 20 | save_codes: 21 | - tasks 22 | - modules 23 | - egs 24 | accumulate_grad_batches: 1 25 | clip_grad_norm: 1. 26 | 27 | # training-scheme-related 28 | task_cls: tasks.syncnet.lm3d_syncnet.SyncNetTask 29 | max_updates: 4_0000 30 | seed: 9999 31 | lr: 0.0005 32 | optimizer_adam_beta1: 0.9 33 | optimizer_adam_beta2: 0.999 34 | scheduler: none 35 | num_ckpt_keep: 100 36 | 37 | load_db_to_memory: false # enable it for faster indexing 38 | max_sentences_per_batch: 1024 39 | max_tokens_per_batch: 20000 40 | 41 | audio_type: hubert 42 | motion_type: idexp_lm3d 43 | use_kv_dataset: false 44 | 45 | syncnet_num_layers_per_block: 3 46 | syncnet_base_hid_size: 128 47 | use_fork: true -------------------------------------------------------------------------------- /egs/os_avatar/audio2motion_vae.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/audio2motion/vae.yaml 3 | 4 | ds_name: # 会覆盖下面的binary data dir 5 | binary_data_dir: data/binary/th1kh 6 | use_kv_dataset: true 7 | num_workers: 4 8 | 9 | task_cls: tasks.os_avatar.audio2motion_task.Audio2MotionTask 10 | max_updates: 40_0000 11 | 12 | motion_type: exp # exp | id_exp if finegrained_id 13 | sample_min_length: 32 14 | init_from_ckpt: '' 15 | 16 | lambda_mse_lm2d: 0. 
17 | ref_id_mode: 'first_frame' # first_frame | random_frame if finegrained_id 18 | 19 | blink_mode: blink_unit # eye_area_percent | blink_unit | none 20 | use_pitch: true 21 | use_flow: true 22 | 23 | use_eye_amp_embed: false 24 | use_mouth_amp_embed: true 25 | lambda_l2_reg_exp: 0.1 26 | syncnet_ckpt_dir: '' 27 | audio_type: hubert # hubert | mfcc | mel 28 | lambda_mse_exp: 0.5 29 | lambda_mse_lm3d: 0.5 30 | lambda_lap_exp: 1.0 31 | lambda_kl: 0.02 32 | lambda_kl_t1: 2000 33 | lambda_kl_t2: 2000 -------------------------------------------------------------------------------- /egs/os_avatar/audio_lm3d_syncnet.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/syncnet/base.yaml 3 | 4 | init_from_ckpt: '' 5 | binary_data_dir: data/binary/th1kh 6 | task_cls: tasks.os_avatar.audio_lm3d_syncnet.SyncNetTask 7 | use_kv_dataset: true 8 | num_workers: 8 # 4 9 | 10 | syncnet_num_clip_pairs: 8192 11 | max_sentences_per_batch: 1024 12 | max_tokens_per_batch: 20000 13 | sample_min_length: 64 14 | max_updates: 400_0000 15 | 16 | syncnet_num_layers_per_block: 3 # 3 17 | syncnet_base_hid_size: 128 18 | syncnet_out_hid_size: 1024 # 1024 19 | syncnet_keypoint_mode: lm468 20 | 21 | lr: 0.001 22 | lr_decay_rate: 0.98 23 | lr_decay_interval: 5000 24 | 25 | audio_type: hubert # hubert | mfcc 26 | -------------------------------------------------------------------------------- /egs/os_avatar/img2plane.yaml: -------------------------------------------------------------------------------- 1 | base_config: egs/egs_bases/eg3d/base.yaml 2 | ds_name: TH1KH_512 3 | binary_data_dir: data/binary/th1kh 4 | process_id: 0 # rank id when pre-processing dataset 5 | total_process: 1 # number of ranks when pre-processing dataset 6 | split_seed: 999 # random seed that split chunks during pre-processing dataset 7 | seed: 999 8 | batch_size: 4 9 | num_workers: 4 10 | use_kv_dataset: true 11 | ones_ws_for_sr: true 12 | 13 | # ray_near: 2.2 14 | # ray_far: 4.0 15 | ray_near: auto 16 | ray_far: auto 17 | 18 | batch_size: 4 # use smaller bs from 4 when using multiple machines to speed up training 19 | 20 | lr_g: 0.0001 # follow the setting of < Real-Time Radiance Fields for Single-Image Portrait View Synthesis > 21 | # lr_g: 0.0004 # larger lr leads to degradation, even using 32 gpus. 22 | lr_d: 0.0002 # follow the setting of EG3D 23 | 24 | warmup_updates: 4000 25 | 26 | flipped_to_world_coord: true 27 | random_sample_pose: true 28 | mimic_plane: false # minimize the error with EG3D plane 29 | 30 | pretrained_eg3d_ckpt: /mnt/bn/sa-ag-data/yezhenhui/projects/GeneFace_private/checkpoints/geneface2_ckpts/eg3d_baseline_run2/model_ckpt_steps_100000.ckpt 31 | seg_out_mode: none 32 | img2plane_backbone_mode: vit 33 | num_ckpt_keep: 1 34 | 35 | not_save_modules: ['criterion_lpips', 'eg3d_model'] 36 | task_cls: tasks.os_avatar.img2plane_task.OSAvatarImg2PlaneTask 37 | 38 | batch_size: 1 39 | normalize_radius: false 40 | 41 | optimizer_adam_beta1_g: 0. 42 | optimizer_adam_beta2_g: 0.99 43 | optimizer_adam_beta1_d: 0. 44 | optimizer_adam_beta2_d: 0.99 45 | 46 | lambda_mse_depth: 0. 
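# note: a few keys in this file (e.g. batch_size, lr_g, img2plane_backbone_mode) are defined more than once; with a standard YAML loader the last occurrence takes effect, so the later values are the ones actually used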
47 | 48 | start_adv_iters: 30000 49 | lr_g: 0.0001 50 | lr_d: 0.0002 51 | 52 | img2plane_backbone_mode: composite # composite | segformer 53 | 54 | ffhq_disc_inp_mode: eg3d_gen 55 | use_th1kh_disc: false # enable only when ds_name == FFHQ_and_TH1KH_512 56 | lpips_mode: vgg19_v2 # vgg19 | vgg16 | alex | vgg19_v2 57 | 58 | enable_rescale_plane_regulation: true 59 | img2plane_backbone_scale: standard # standard | large 60 | update_on_th1kh_samples: false 61 | 62 | init_from_ckpt: '' 63 | 64 | img2plane_input_mode: rgb # rgb_alpha | rgb_camera | rgb_alpha_camera 65 | triplane_feature_type: trigrid_v2 # triplane # trigrid 66 | triplane_depth: 3 # 1 67 | triplane_hid_dim: 32 # 32 68 | clip_grad_norm: 1.0 69 | neural_rendering_resolution: 128 # will be upscale 4x by SR 70 | 71 | use_th1kh_mv_adv: false 72 | torch_compile: true 73 | use_mse: false -------------------------------------------------------------------------------- /egs/os_avatar/real3d_orig/img2plane_orig.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ../../ffhq/img2plane.yaml 3 | - ../../ffhq/base.yaml 4 | 5 | not_save_modules: ['criterion_lpips', 'eg3d_model'] 6 | ds_name: FFHQ # FFHQ | FFHQ_and_TH1KH_512 # 发现引入视频数据会导致画质变差 7 | task_cls: tasks.os_avatar.img2plane_task.OSAvatarImg2PlaneTask 8 | 9 | batch_size: 1 10 | normalize_radius: false 11 | 12 | optimizer_adam_beta1_g: 0. 13 | optimizer_adam_beta2_g: 0.99 14 | optimizer_adam_beta1_d: 0. 15 | optimizer_adam_beta2_d: 0.99 16 | 17 | lambda_mse_depth: 0. 18 | 19 | start_adv_iters: 30000 20 | lr_g: 0.0001 21 | lr_d: 0.0002 22 | 23 | img2plane_backbone_mode: composite # composite | segformer 24 | 25 | ffhq_disc_inp_mode: eg3d_gen 26 | use_th1kh_disc: false # enable only when ds_name == FFHQ_and_TH1KH_512 27 | lpips_mode: vgg19_v2 # vgg19 | vgg16 | alex | vgg19_v2 28 | 29 | enable_rescale_plane_regulation: true 30 | img2plane_backbone_scale: standard # standard | large 31 | update_on_th1kh_samples: false 32 | 33 | init_from_ckpt: 'checkpoints/0823_img2plane/img2plane' 34 | 35 | triplane_feature_type: triplane # triplane # trigrid # trigrid_v2 36 | triplane_depth: 1 # now use 3 37 | triplane_hid_dim: 32 # 32 38 | clip_grad_norm: 1.0 39 | 40 | use_th1kh_mv_adv: false 41 | torch_compile: true 42 | use_mse: false -------------------------------------------------------------------------------- /egs/os_avatar/real3d_orig/secc_img2plane_orig.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./img2plane_orig.yaml 3 | 4 | task_cls: tasks.os_avatar.secc_img2plane_task.SECC_Img2PlaneEG3DTask 5 | # ds_name: Concat_VFHQ_CelebVHQ_TH1KH_RAVDESS # CelebV_HQ | Concat_CelebVHQ_TH1KH | Concat_CelebVHQ_TH1KH_RAVDESS 6 | ds_name: FULL_Concat_VFHQ_CelebVHQ_TH1KH_RAVDESS 7 | binary_data_dir: data/binary/CelebV-HQ 8 | 9 | img2plane_backbone_mode: composite # composite | segformer 10 | num_workers: 8 # 4 11 | pncc_cond_mode: cano_src_tgt # cano_tgt | cano_src_tgt 12 | seg_out_mode: head 13 | 14 | # 目前发现adv之后控制不了嘴了,见checkpoints/0702_img2planes/osavatar_secc_img2plane_baseline_vit_from_pretrained 15 | start_adv_iters: 25_0000 # 如果是从img2plane过来的,25w;如果是从secc2plane过来了,见机行事,5w~10w左右也行。 16 | max_updates: 25_0000 # 25_0000 17 | lambda_th1kh_mv_adv: 0.002 # 0.005 # 0.01 18 | add_ffhq_singe_disc: false 19 | lambda_ffhq_mv_adv: 0.002 # enable when add_ffhq_singe_disc is True 20 | lr_mul_cano_img2plane: 1.0 # 1.0 | 0. 
| 0.1 21 | lambda_mse: 1.0 22 | lr_decay_rate: 0.95 23 | lr_decay_interval: 5000 24 | 25 | secc_segformer_scale: b0 # b0-b5 26 | use_motion_smo_net: false 27 | motion_smo_win_size: 5 28 | 29 | # regularization on Spatial plane 30 | density_reg_p_dist: 0.004 # distance at which to sample perturbed points for density regularization 31 | 32 | # regularization on SECC plane 33 | reg_interval_g: 4 34 | enable_rescale_plane_regulation: false # 试了下rescale发现效果不大 35 | min_rescale_factor: 0.25 36 | # how we fuse the secc 37 | phase1_plane_fusion_mode: add # add | mul 38 | init_from_ckpt: checkpoints/240126_real3dportrait_orig/img2plane_orig 39 | 40 | disable_highreso_at_stage1: true 41 | secc_pertube_mode: randn # randn | tv | laplacian | none 42 | secc_pertube_randn_scale: 0.01 # enable when pertube_mode==randn 43 | # target_pertube_blink_secc_loss: 0.05 # task会自动tune对应的lambda以使pertube loss逼近这个目标 44 | target_pertube_blink_secc_loss: 0.15 # task会自动tune对应的lambda以使pertube loss逼近这个目标 45 | target_pertube_secc_loss: 0.5 # 0.3 # task会自动tune对应的lambda以使pertube loss逼近这个目标 46 | lr_lambda_pertube_secc: 0.01 # 自动tune lambda的学习率 47 | 48 | sr_type: vanilla # vanillda | spade 49 | two_stage_training: true # is yes, when adv starts, fix the nerf and only finetune the sr. We found it necessary, otherwise the i2p could produce bad cases (such as darken face) 50 | also_update_decoder: false # update decoder at stage 2 51 | lambda_weights_l1: 0.1 # 0.5 52 | lambda_weights_entropy: 0.01 # 0.05 53 | lambda_density_reg: 0.25 # default 0.25 in EG3D, strength of pertube density regularization for Generator 54 | reg_interval_g_cond: 4 55 | ckpt_milestone_interval: 50000 56 | update_src2src_interval: 16 57 | -------------------------------------------------------------------------------- /egs/os_avatar/real3d_orig/secc_img2plane_torso_orig.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./secc_img2plane_orig.yaml 3 | 4 | task_cls: tasks.os_avatar.secc_img2plane_torso_task.SECC_Img2PlaneEG3D_TorsoTask 5 | torso_ref_segout_mode: torso # torso | torso_with_bg | person | full (person_with_bg) 6 | 7 | lr_g: 0.00001 8 | 9 | weight_fuse: true 10 | 11 | start_adv_iters: 40000 12 | max_updates: 10_0000 # 25_0000 13 | lambda_th1kh_mv_adv: 0.003 14 | add_ffhq_singe_disc: false 15 | lambda_ffhq_mv_adv: 0.002 # enable when add_ffhq_singe_disc is True 16 | lambda_mse: 1.0 17 | init_from_ckpt: checkpoints/240207_robust_secc2plane/secc2plane_orig_blink0.3_pertubeNone/model_ckpt_steps_150000.ckpt # checkpoints/0725_img2planes/secc_img2plane_torso | can be either a secc_img2plane or a secc_img2plane_torso ckpt 18 | reload_head_ckpt: '' # checkpoints/0804_secc2plane/secc_img2plane_lap0.1_blink0.05_run2 | will override the secc_img2plane from init_from_ckpt and be reloaded during training 19 | 20 | fuse_with_deform_source: false # fuse source会有严重的artifact 21 | lam_occlusion_2_reg_l1: 0.0 # 0.001 22 | torso_occlusion_reg_unmask_factor: 0.3 23 | lam_occlusion_weights_entropy: 0.001 # 0.0001 24 | 25 | lam_occlusion_reg_l1: 0.00 # 设置成0.02导致脸部和torso都有色差,并且摇头晃脑时只有脖子动,身体不太动,不真实。 26 | torso_kp_num: 4 27 | torso_inp_mode: rgb_alpha 28 | htbsr_head_threshold: 0.9 29 | torso_model_version: v2 30 | htbsr_head_weight_fuse_mode: v2 31 | appearance_feat_mul_torso_mask: true -------------------------------------------------------------------------------- /egs/os_avatar/secc_img2plane.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - 
./img2plane.yaml 3 | 4 | task_cls: tasks.os_avatar.secc_img2plane_task.SECC_Img2PlaneEG3DTask 5 | ds_name: TH1KH_512 # CelebV_HQ | Concat_CelebVHQ_TH1KH | Concat_CelebVHQ_TH1KH_RAVDESS 6 | binary_data_dir: data/binary/th1kh 7 | 8 | img2plane_backbone_mode: composite # composite | segformer 9 | num_workers: 8 # 4 10 | pncc_cond_mode: cano_src_tgt # cano_tgt | cano_src_tgt 11 | seg_out_mode: head 12 | 13 | # 目前发现adv之后控制不了嘴了,见checkpoints/0702_img2planes/osavatar_secc_img2plane_baseline_vit_from_pretrained 14 | start_adv_iters: 20_0000 # 如果是从img2plane过来的,15w;如果是从secc2plane过来了,见机行事,5w~10w左右也行。 15 | stop_update_i2p_iters: 7_0000 16 | max_updates: 25_0000 # 发现到20w的时候会过拟合,对ood identity效果不好 17 | lambda_th1kh_mv_adv: 0.002 # 0.005 # 0.01 18 | add_ffhq_singe_disc: false 19 | lambda_ffhq_mv_adv: 0.002 # enable when add_ffhq_singe_disc is True 20 | lr_mul_cano_img2plane: 1.0 # 1.0 | 0. | 0.1 21 | lambda_mse: 1.0 22 | lr_decay_rate: 0.95 23 | lr_decay_interval: 5000 24 | 25 | secc_segformer_scale: b0 # b0-b5 26 | use_motion_smo_net: false 27 | motion_smo_win_size: 5 28 | 29 | # regularization on Spatial plane 30 | density_reg_p_dist: 0.004 # distance at which to sample perturbed points for density regularization 31 | 32 | # regularization on SECC plane 33 | reg_interval_g: 4 34 | enable_rescale_plane_regulation: false # 试了下rescale发现效果不大 35 | min_rescale_factor: 0.25 36 | # how we fuse the secc 37 | phase1_plane_fusion_mode: add # add | mul 38 | init_from_ckpt: '' # checkpoints/240126_improve_i2p/img2plane_rgb_alpha 39 | 40 | disable_highreso_at_stage1: true 41 | secc_pertube_mode: randn # randn | tv | laplacian | none 42 | secc_pertube_randn_scale: 0.01 # enable when pertube_mode==randn 43 | target_pertube_blink_secc_loss: 0.3 # task会自动tune对应的lambda以使pertube loss逼近这个目标 44 | target_pertube_secc_loss: 0. # 0.5 # task会自动tune对应的lambda以使pertube loss逼近这个目标 45 | pertube_ref_prob: 0.25 46 | lr_lambda_pertube_secc: 0.01 # 自动tune lambda的学习率 47 | 48 | sr_type: vanilla # vanillda | spade 49 | two_stage_training: true # is yes, when adv starts, fix the nerf and only finetune the sr. 
We found it necessary, otherwise the i2p could produce bad cases (such as darken face) 50 | also_update_decoder: false # update decoder at stage 2 51 | lambda_weights_l1: 0.1 # 0.5 52 | lambda_weights_entropy: 0.01 # 0.05 53 | lambda_density_reg: 0.25 # default 0.25 in EG3D, strength of pertube density regularization for Generator 54 | reg_interval_g_cond: 4 55 | ckpt_milestone_interval: 50000 56 | update_src2src_interval: 16 57 | -------------------------------------------------------------------------------- /egs/os_avatar/secc_img2plane_torso.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./secc_img2plane.yaml 3 | 4 | task_cls: tasks.os_avatar.secc_img2plane_torso_task.SECC_Img2PlaneEG3D_TorsoTask 5 | torso_ref_segout_mode: torso # torso | torso_with_bg | person | full (person_with_bg) 6 | 7 | lr_g: 0.00001 8 | 9 | weight_fuse: true 10 | 11 | start_adv_iters: 40000 12 | max_updates: 10_0000 # 25_0000 13 | lambda_th1kh_mv_adv: 0.001 14 | add_ffhq_singe_disc: false 15 | lambda_ffhq_mv_adv: 0.002 # enable when add_ffhq_singe_disc is True 16 | lambda_mse: 1.0 17 | init_from_ckpt: '' # checkpoints/0725_img2planes/secc_img2plane_torso | can be either a secc_img2plane or a secc_img2plane_torso ckpt 18 | reload_head_ckpt: '' # checkpoints/0804_secc2plane/secc_img2plane_lap0.1_blink0.05_run2 | will override the secc_img2plane from init_from_ckpt and be reloaded during training 19 | 20 | 21 | fuse_with_deform_source: false # fuse source会有严重的artifact 22 | lam_occlusion_2_reg_l1: 0.0 # 0.001 23 | torso_occlusion_reg_unmask_factor: 0.3 24 | lam_occlusion_weights_entropy: 0.001 # 0.0001 25 | 26 | lam_occlusion_reg_l1: 0.00 # 设置成0.02导致脸部和torso都有色差,并且摇头晃脑时只有脖子动,身体不太动,不真实。 27 | occlusion_fuse: true 28 | torso_kp_num: 4 29 | htbsr_head_weight_fuse_mode: v2 30 | htbsr_head_threshold: 0.9 31 | torso_model_version: v2 32 | -------------------------------------------------------------------------------- /egs/th1kh_512/base.yaml: -------------------------------------------------------------------------------- 1 | ds_name: TH1KH_512 2 | raw_data_dir: /mnt/bn/sa-ag-data/yezhenhui/datasets/raw/TH1KH_512 3 | binary_data_dir: data/binary/TH1KH_512 4 | # binary_data_dir: /dev/shm/TH1KH 5 | process_id: 0 # rank id when pre-processing dataset 6 | total_process: 1 # number of ranks when pre-processing dataset 7 | split_seed: 999 # random seed that split chunks during pre-processing dataset 8 | 9 | max_sentences_per_batch: 1024 10 | max_tokens_per_batch: 200000 11 | 12 | load_db_to_memory: false 13 | 14 | num_workers: 4 15 | use_kv_dataset: true 16 | 17 | binarization_args: 18 | with_hubert: false 19 | with_mel: false 20 | with_coeff: true 21 | 22 | -------------------------------------------------------------------------------- /egs/th1kh_512/secc_img2plane.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ../os_avatar/secc_img2plane.yaml 3 | - ./base.yaml 4 | 5 | 6 | init_from_ckpt: /mnt/bn/sa-ag-data/yezhenhui/projects/GeneFace_private/checkpoints/0720_img2planes/secc_img2plane_one_stage 7 | lr_g: 0.0001 # 1e-4, larger than ravdess, because th1kh_512 is larger 8 | lr_d: 0.0002 # 2e-4 -------------------------------------------------------------------------------- /egs/th1kh_512/secc_img2plane_torso.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ../os_avatar/secc_img2plane_torso.yaml 3 | - ./base.yaml 4 | 5 | 6 | init_from_ckpt: 
/mnt/bn/sa-ag-data/yezhenhui/projects/GeneFace_private/checkpoints/0729_th1kh/secc_img2plane 7 | lr_g: 0.00001 # 1e-5 8 | lr_d: 0.0002 # 2e-4 -------------------------------------------------------------------------------- /egs/th1kh_512_audio2motion/base.yaml: -------------------------------------------------------------------------------- 1 | ds_name: TH1KH_512 2 | raw_data_dir: /mnt/bn/sa-ag-data/yezhenhui/datasets/raw/TH1KH_512 3 | binary_data_dir: data/binary/TH1KH_512_audio2motion 4 | # binary_data_dir: /dev/shm/TH1KH_512 5 | process_id: 0 # rank id when pre-processing dataset 6 | total_process: 1 # number of ranks when pre-processing dataset 7 | split_seed: 999 # random seed that split chunks during pre-processing dataset 8 | 9 | smo_win_size: 5 10 | batch_size: 4 11 | num_workers: 4 12 | 13 | use_kv_dataset: true 14 | 15 | binarization_args: 16 | with_hubert: true 17 | with_mel: true 18 | with_coeff: true 19 | 20 | sample_min_length: 0 -------------------------------------------------------------------------------- /egs/th1kh_512_audio2motion/lm3d_syncnet.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/syncnet/base.yaml 3 | - ./base.yaml 4 | 5 | max_updates: 250000 6 | motion_type: idexp_lm3d 7 | audio_type: hubert 8 | 9 | syncnet_num_layers_per_block: 3 10 | syncnet_base_hid_size: 128 11 | 12 | # max_sentences_per_batch: 1024 13 | max_sentences_per_batch: 2048 14 | max_tokens_per_batch: 40_000 15 | # max_tokens_per_batch: 20_000 16 | 17 | num_workers: 16 -------------------------------------------------------------------------------- /egs/th1kh_512_audio2motion/lm3d_vae.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/audio2motion/vae.yaml 3 | - ./base.yaml 4 | 5 | lambda_kl: 0.02 6 | motion_type: idexp_lm3d 7 | audio_type: hubert 8 | 9 | max_updates: 160000 10 | -------------------------------------------------------------------------------- /egs/th1kh_512_audio2motion/lm3d_vae_pitch.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/audio2motion/vae.yaml 3 | - ./base.yaml 4 | 5 | lambda_kl: 0.02 6 | motion_type: idexp_lm3d 7 | audio_type: hubert 8 | 9 | task_cls: tasks.audio2motion.lm3d_vae_sync_pitch.VAESyncAudio2MotionTask 10 | max_updates: 160000 11 | -------------------------------------------------------------------------------- /egs/th1kh_512_audio2motion/lm3d_vae_sync.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/audio2motion/vae_sync.yaml 3 | - ./base.yaml 4 | 5 | syncnet_work_dir: checkpoints/th1kh/lm3d_syncnet 6 | syncnet_ckpt_steps: 250000 7 | lambda_kl: 0.02 8 | max_updates: 160000 9 | motion_type: idexp_lm3d 10 | audio_type: hubert 11 | 12 | syncnet_num_layers_per_block: 3 13 | syncnet_base_hid_size: 128 -------------------------------------------------------------------------------- /egs/th1kh_512_audio2motion/lm3d_vae_sync_pitch.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./lm3d_vae_sync.yaml 3 | - ./base.yaml 4 | 5 | lambda_kl: 0.02 6 | syncnet_work_dir: checkpoints/th1kh/lm3d_syncnet 7 | syncnet_ckpt_steps: 230000 8 | task_cls: tasks.audio2motion.lm3d_vae_sync_pitch.VAESyncAudio2MotionTask 9 | max_updates: 160000 10 | motion_type: idexp_lm3d 11 | audio_type: hubert 12 | 13 | 
syncnet_num_layers_per_block: 3 14 | syncnet_base_hid_size: 128 -------------------------------------------------------------------------------- /modules/audio2motion/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def squeeze(x, x_mask=None, n_sqz=2): 5 | b, c, t = x.size() 6 | 7 | t = (t // n_sqz) * n_sqz 8 | x = x[:, :, :t] 9 | x_sqz = x.view(b, c, t // n_sqz, n_sqz) 10 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) 11 | 12 | if x_mask is not None: 13 | x_mask = x_mask[:, :, n_sqz - 1::n_sqz] 14 | else: 15 | x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) 16 | return x_sqz * x_mask, x_mask 17 | 18 | 19 | def unsqueeze(x, x_mask=None, n_sqz=2): 20 | b, c, t = x.size() 21 | 22 | x_unsqz = x.view(b, n_sqz, c // n_sqz, t) 23 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) 24 | 25 | if x_mask is not None: 26 | x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) 27 | else: 28 | x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) 29 | return x_unsqz * x_mask, x_mask 30 | -------------------------------------------------------------------------------- /modules/commons/attention/simple_attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def split_heads(x, num_heads): 7 | """ Split heads 8 | :param x: A tensor with shape [batch, length, channels] 9 | :param num_heads: An integer 10 | :returns: A tensor with shape [batch, heads, length, channels / heads] 11 | """ 12 | assert x.shape[-1] % num_heads == 0, str(x.shape) 13 | return x.reshape(x.shape[:-1] + (num_heads, x.shape[-1] // num_heads)).permute(0, 2, 1, 3) 14 | 15 | 16 | def combine_heads(x): 17 | """ Combine heads 18 | :param x: A tensor with shape [batch, heads, length, channels] 19 | :returns: A tensor with shape [batch, length, heads * channels] 20 | """ 21 | x = x.permute([0, 2, 1, 3]) 22 | return x.reshape(x.shape[:-2] + (x.shape[-1] * x.shape[-2],)) 23 | 24 | 25 | class SimpleAttention(nn.Module): 26 | def __init__(self, query_size=192, key_size=192, value_size=192, num_heads=1): 27 | super(SimpleAttention, self).__init__() 28 | self.q_transform = nn.Linear(query_size, query_size, bias=False) 29 | self.k_transform = nn.Linear(key_size, query_size, bias=False) 30 | self.v_transform = nn.Linear(value_size, query_size, bias=False) 31 | self.output_transform = nn.Linear(query_size, query_size, bias=False) 32 | self.query_size = query_size 33 | self.key_size = key_size 34 | self.value_size = value_size 35 | self.num_heads = num_heads 36 | 37 | def forward(self, query, key, value, attn_mask=None, bias=None): 38 | q = self.q_transform(query) 39 | k = self.k_transform(key) 40 | v = self.v_transform(value) 41 | 42 | logits = torch.bmm(q, k.transpose(1, 2)) # [batch, length_q, length_k] 43 | if bias is not None: 44 | logits += bias 45 | if attn_mask is not None: 46 | logits = logits + attn_mask * -1e9 47 | weights = F.softmax(logits, dim=-1) 48 | out = torch.bmm(weights, v) 49 | out = self.output_transform(out) 50 | return out, weights 51 | -------------------------------------------------------------------------------- /modules/commons/improved_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Codebase for "Improved Denoising Diffusion 
Probabilistic Models". 3 | """ 4 | -------------------------------------------------------------------------------- /modules/commons/improved_diffusion/dist_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for distributed training. 3 | """ 4 | 5 | import io 6 | import os 7 | import socket 8 | 9 | import blobfile as bf 10 | from mpi4py import MPI 11 | import torch as th 12 | import torch.distributed as dist 13 | 14 | # Change this to reflect your cluster layout. 15 | # The GPU for a given rank is (rank % GPUS_PER_NODE). 16 | GPUS_PER_NODE = 8 17 | 18 | SETUP_RETRY_COUNT = 3 19 | 20 | 21 | def setup_dist(): 22 | """ 23 | Setup a distributed process group. 24 | """ 25 | if dist.is_initialized(): 26 | return 27 | 28 | comm = MPI.COMM_WORLD 29 | backend = "gloo" if not th.cuda.is_available() else "nccl" 30 | 31 | if backend == "gloo": 32 | hostname = "localhost" 33 | else: 34 | hostname = socket.gethostbyname(socket.getfqdn()) 35 | os.environ["MASTER_ADDR"] = comm.bcast(hostname, root=0) 36 | os.environ["RANK"] = str(comm.rank) 37 | os.environ["WORLD_SIZE"] = str(comm.size) 38 | 39 | port = comm.bcast(_find_free_port(), root=0) 40 | os.environ["MASTER_PORT"] = str(port) 41 | dist.init_process_group(backend=backend, init_method="env://") 42 | 43 | 44 | def dev(): 45 | """ 46 | Get the device to use for torch.distributed. 47 | """ 48 | if th.cuda.is_available(): 49 | return th.device(f"cuda:{MPI.COMM_WORLD.Get_rank() % GPUS_PER_NODE}") 50 | return th.device("cpu") 51 | 52 | 53 | def load_state_dict(path, **kwargs): 54 | """ 55 | Load a PyTorch file without redundant fetches across MPI ranks. 56 | """ 57 | if MPI.COMM_WORLD.Get_rank() == 0: 58 | with bf.BlobFile(path, "rb") as f: 59 | data = f.read() 60 | else: 61 | data = None 62 | data = MPI.COMM_WORLD.bcast(data) 63 | return th.load(io.BytesIO(data), **kwargs) 64 | 65 | 66 | def sync_params(params): 67 | """ 68 | Synchronize a sequence of Tensors across ranks from rank 0. 69 | """ 70 | for p in params: 71 | with th.no_grad(): 72 | dist.broadcast(p, 0) 73 | 74 | 75 | def _find_free_port(): 76 | try: 77 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 78 | s.bind(("", 0)) 79 | s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 80 | return s.getsockname()[1] 81 | finally: 82 | s.close() 83 | -------------------------------------------------------------------------------- /modules/commons/improved_diffusion/fp16_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers to train with 16-bit precision. 3 | """ 4 | 5 | import torch.nn as nn 6 | from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors 7 | 8 | 9 | def convert_module_to_f16(l): 10 | """ 11 | Convert primitive modules to float16. 12 | """ 13 | if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): 14 | l.weight.data = l.weight.data.half() 15 | l.bias.data = l.bias.data.half() 16 | 17 | 18 | def convert_module_to_f32(l): 19 | """ 20 | Convert primitive modules to float32, undoing convert_module_to_f16(). 21 | """ 22 | if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): 23 | l.weight.data = l.weight.data.float() 24 | l.bias.data = l.bias.data.float() 25 | 26 | 27 | def make_master_params(model_params): 28 | """ 29 | Copy model parameters into a (differently-shaped) list of full-precision 30 | parameters. 
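    The returned list holds a single flattened FP32 tensor (the "master" copy that the
    optimizer updates); the FP16 model weights are synced back from it via
    master_params_to_model_params().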
31 | """ 32 | master_params = _flatten_dense_tensors( 33 | [param.detach().float() for param in model_params] 34 | ) 35 | master_params = nn.Parameter(master_params) 36 | master_params.requires_grad = True 37 | return [master_params] 38 | 39 | 40 | def model_grads_to_master_grads(model_params, master_params): 41 | """ 42 | Copy the gradients from the model parameters into the master parameters 43 | from make_master_params(). 44 | """ 45 | master_params[0].grad = _flatten_dense_tensors( 46 | [param.grad.data.detach().float() for param in model_params] 47 | ) 48 | 49 | 50 | def master_params_to_model_params(model_params, master_params): 51 | """ 52 | Copy the master parameter data back into the model parameters. 53 | """ 54 | # Without copying to a list, if a generator is passed, this will 55 | # silently not copy any parameters. 56 | model_params = list(model_params) 57 | 58 | for param, master_param in zip( 59 | model_params, unflatten_master_params(model_params, master_params) 60 | ): 61 | param.detach().copy_(master_param) 62 | 63 | 64 | def unflatten_master_params(model_params, master_params): 65 | """ 66 | Unflatten the master parameters to look like model_params. 67 | """ 68 | return _unflatten_dense_tensors(master_params[0].detach(), model_params) 69 | 70 | 71 | def zero_grad(model_params): 72 | for param in model_params: 73 | # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group 74 | if param.grad is not None: 75 | param.grad.detach_() 76 | param.grad.zero_() 77 | -------------------------------------------------------------------------------- /modules/commons/improved_diffusion/losses.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for various likelihood-based losses. These are ported from the original 3 | Ho et al. diffusion models codebase: 4 | https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py 5 | """ 6 | 7 | import numpy as np 8 | 9 | import torch as th 10 | 11 | 12 | def normal_kl(mean1, logvar1, mean2, logvar2): 13 | """ 14 | Compute the KL divergence between two gaussians. 15 | 16 | Shapes are automatically broadcasted, so batches can be compared to 17 | scalars, among other use cases. 18 | """ 19 | tensor = None 20 | for obj in (mean1, logvar1, mean2, logvar2): 21 | if isinstance(obj, th.Tensor): 22 | tensor = obj 23 | break 24 | assert tensor is not None, "at least one argument must be a Tensor" 25 | 26 | # Force variances to be Tensors. Broadcasting helps convert scalars to 27 | # Tensors, but it does not work for th.exp(). 28 | logvar1, logvar2 = [ 29 | x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) 30 | for x in (logvar1, logvar2) 31 | ] 32 | 33 | return 0.5 * ( 34 | -1.0 35 | + logvar2 36 | - logvar1 37 | + th.exp(logvar1 - logvar2) 38 | + ((mean1 - mean2) ** 2) * th.exp(-logvar2) 39 | ) 40 | 41 | 42 | def approx_standard_normal_cdf(x): 43 | """ 44 | A fast approximation of the cumulative distribution function of the 45 | standard normal. 46 | """ 47 | return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) 48 | 49 | 50 | def discretized_gaussian_log_likelihood(x, *, means, log_scales): 51 | """ 52 | Compute the log-likelihood of a Gaussian distribution discretizing to a 53 | given image. 54 | 55 | :param x: the target images. It is assumed that this was uint8 values, 56 | rescaled to the range [-1, 1]. 57 | :param means: the Gaussian mean Tensor. 
58 | :param log_scales: the Gaussian log stddev Tensor. 59 | :return: a tensor like x of log probabilities (in nats). 60 | """ 61 | assert x.shape == means.shape == log_scales.shape 62 | centered_x = x - means 63 | inv_stdv = th.exp(-log_scales) 64 | plus_in = inv_stdv * (centered_x + 1.0 / 255.0) 65 | cdf_plus = approx_standard_normal_cdf(plus_in) 66 | min_in = inv_stdv * (centered_x - 1.0 / 255.0) 67 | cdf_min = approx_standard_normal_cdf(min_in) 68 | log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) 69 | log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) 70 | cdf_delta = cdf_plus - cdf_min 71 | log_probs = th.where( 72 | x < -0.999, 73 | log_cdf_plus, 74 | th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), 75 | ) 76 | assert log_probs.shape == x.shape 77 | return log_probs 78 | -------------------------------------------------------------------------------- /modules/commons/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class LayerNorm(torch.nn.LayerNorm): 6 | """Layer normalization module. 7 | :param int nout: output dim size 8 | :param int dim: dimension to be normalized 9 | """ 10 | 11 | def __init__(self, nout, dim=-1, eps=1e-5): 12 | """Construct an LayerNorm object.""" 13 | super(LayerNorm, self).__init__(nout, eps=eps) 14 | self.dim = dim 15 | 16 | def forward(self, x): 17 | """Apply layer normalization. 18 | :param torch.Tensor x: input tensor 19 | :return: layer normalized tensor 20 | :rtype torch.Tensor 21 | """ 22 | if self.dim == -1: 23 | return super(LayerNorm, self).forward(x) 24 | return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1) 25 | 26 | 27 | class Reshape(nn.Module): 28 | def __init__(self, *args): 29 | super(Reshape, self).__init__() 30 | self.shape = args 31 | 32 | def forward(self, x): 33 | return x.view(self.shape) 34 | 35 | 36 | class Permute(nn.Module): 37 | def __init__(self, *args): 38 | super(Permute, self).__init__() 39 | self.args = args 40 | 41 | def forward(self, x): 42 | return x.permute(self.args) 43 | 44 | 45 | def Embedding(num_embeddings, embedding_dim, padding_idx=None): 46 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) 47 | nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) 48 | if padding_idx is not None: 49 | nn.init.constant_(m.weight[padding_idx], 0) 50 | return m 51 | -------------------------------------------------------------------------------- /modules/commons/loralib/__init__.py: -------------------------------------------------------------------------------- 1 | name = "lora" 2 | 3 | from .layers import * 4 | from .utils import * -------------------------------------------------------------------------------- /modules/commons/loralib/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
4 | # ------------------------------------------------------------------------------------------ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from typing import Dict 9 | 10 | from .layers import LoRALayer 11 | 12 | 13 | def mark_only_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: 14 | for n, p in model.named_parameters(): 15 | p.requires_grad = True 16 | for n, p in model.named_parameters(): 17 | if 'lora_' not in n: 18 | p.requires_grad = False 19 | if bias == 'none': 20 | return 21 | elif bias == 'all': 22 | for n, p in model.named_parameters(): 23 | if 'bias' in n: 24 | p.requires_grad = True 25 | elif bias == 'lora_only': 26 | for m in model.modules(): 27 | if isinstance(m, LoRALayer) and \ 28 | hasattr(m, 'bias') and \ 29 | m.bias is not None: 30 | m.bias.requires_grad = True 31 | else: 32 | raise NotImplementedError 33 | 34 | 35 | def lora_state_dict(model: nn.Module, bias: str = 'none') -> Dict[str, torch.Tensor]: 36 | my_state_dict = model.state_dict() 37 | if bias == 'none': 38 | return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k} 39 | elif bias == 'all': 40 | return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k or 'bias' in k} 41 | elif bias == 'lora_only': 42 | to_return = {} 43 | for k in my_state_dict: 44 | if 'lora_' in k: 45 | to_return[k] = my_state_dict[k] 46 | bias_name = k.split('lora_')[0]+'bias' 47 | if bias_name in my_state_dict: 48 | to_return[bias_name] = my_state_dict[bias_name] 49 | return to_return 50 | else: 51 | raise NotImplementedError 52 | -------------------------------------------------------------------------------- /modules/commons/normalizing_flow/res_flow.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from modules.commons.conv import ConditionalConvBlocks 4 | from modules.commons.wavenet import WN 5 | 6 | 7 | class FlipLayer(nn.Module): 8 | def forward(self, x, *args, **kwargs): 9 | x = torch.flip(x, [1]) 10 | return x 11 | 12 | 13 | class CouplingLayer(nn.Module): 14 | def __init__(self, c_in, hidden_size, kernel_size, n_layers, p_dropout=0, c_in_g=0, nn_type='wn'): 15 | super().__init__() 16 | self.channels = c_in 17 | self.hidden_size = hidden_size 18 | self.kernel_size = kernel_size 19 | self.n_layers = n_layers 20 | self.c_half = c_in // 2 21 | 22 | self.pre = nn.Conv1d(self.c_half, hidden_size, 1) 23 | if nn_type == 'wn': 24 | self.enc = WN(hidden_size, kernel_size, 1, n_layers, p_dropout=p_dropout, 25 | c_cond=c_in_g) 26 | elif nn_type == 'conv': 27 | self.enc = ConditionalConvBlocks( 28 | hidden_size, c_in_g, hidden_size, None, kernel_size, 29 | layers_in_block=1, is_BTC=False, num_layers=n_layers) 30 | self.post = nn.Conv1d(hidden_size, self.c_half, 1) 31 | 32 | def forward(self, x, nonpadding, cond=None, reverse=False): 33 | x0, x1 = x[:, :self.c_half], x[:, self.c_half:] 34 | x_ = self.pre(x0) * nonpadding 35 | x_ = self.enc(x_, nonpadding=nonpadding, cond=cond) 36 | m = self.post(x_) 37 | x1 = m + x1 if not reverse else x1 - m 38 | x = torch.cat([x0, x1], 1) 39 | return x * nonpadding 40 | 41 | 42 | class ResFlow(nn.Module): 43 | def __init__(self, 44 | c_in, 45 | hidden_size, 46 | kernel_size, 47 | n_flow_layers, 48 | n_flow_steps=4, 49 | c_cond=0, 50 | nn_type='wn'): 51 | super().__init__() 52 | self.flows = nn.ModuleList() 53 | for i in range(n_flow_steps): 54 | self.flows.append( 55 | CouplingLayer(c_in, hidden_size, kernel_size, n_flow_layers, c_in_g=c_cond, nn_type=nn_type)) 56 | 
self.flows.append(FlipLayer()) 57 | 58 | def forward(self, x, nonpadding, cond=None, reverse=False): 59 | for flow in (self.flows if not reverse else reversed(self.flows)): 60 | x = flow(x, nonpadding, cond=cond, reverse=reverse) 61 | return x 62 | -------------------------------------------------------------------------------- /modules/commons/normalizing_flow/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def squeeze(x, x_mask=None, n_sqz=2): 5 | b, c, t = x.size() 6 | 7 | t = (t // n_sqz) * n_sqz 8 | x = x[:, :, :t] 9 | x_sqz = x.view(b, c, t // n_sqz, n_sqz) 10 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) 11 | 12 | if x_mask is not None: 13 | x_mask = x_mask[:, :, n_sqz - 1::n_sqz] 14 | else: 15 | x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) 16 | return x_sqz * x_mask, x_mask 17 | 18 | 19 | def unsqueeze(x, x_mask=None, n_sqz=2): 20 | b, c, t = x.size() 21 | 22 | x_unsqz = x.view(b, n_sqz, c // n_sqz, t) 23 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) 24 | 25 | if x_mask is not None: 26 | x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) 27 | else: 28 | x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) 29 | return x_unsqz * x_mask, x_mask 30 | -------------------------------------------------------------------------------- /modules/commons/vqvae_fsq.py: -------------------------------------------------------------------------------- 1 | """ 2 | Finite Scalar Quantization: VQ-VAE Made Simple - https://arxiv.org/abs/2309.15505 3 | Code adapted from Jax version in Appendix A.1 4 | """ 5 | 6 | from typing import List 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch import Tensor, int32 11 | 12 | 13 | def round_ste(z: Tensor) -> Tensor: 14 | """Round with straight through gradients.""" 15 | zhat = z.round() 16 | return z + (zhat - z).detach() 17 | 18 | 19 | class FSQ(nn.Module): 20 | def __init__(self, levels: List[int]): 21 | super().__init__() 22 | _levels = torch.tensor(levels, dtype=int32) 23 | self.register_buffer("_levels", _levels) 24 | 25 | _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32) 26 | self.register_buffer("_basis", _basis) 27 | 28 | self.dim = len(levels) 29 | self.n_codes = self._levels.prod().item() 30 | implicit_codebook = self.indices_to_codes(torch.arange(self.n_codes)) 31 | self.register_buffer("implicit_codebook", implicit_codebook) 32 | 33 | def forward(self, z: Tensor) -> Tensor: 34 | zhat = self.quantize(z) 35 | indices = self.codes_to_indices(zhat) 36 | return zhat, indices 37 | 38 | def bound(self, z: Tensor, eps: float = 1e-3) -> Tensor: 39 | """Bound `z`, an array of shape (..., d).""" 40 | half_l = (self._levels - 1) * (1 - eps) / 2 41 | offset = torch.where(self._levels % 2 == 0, 0.5, 0.0) 42 | shift = (offset / half_l).tan() 43 | return (z + shift).tanh() * half_l - offset 44 | 45 | def quantize(self, z: Tensor) -> Tensor: 46 | """Quantizes z, returns quantized zhat, same shape as z.""" 47 | quantized = round_ste(self.bound(z)) 48 | half_width = self._levels // 2 # Renormalize to [-1, 1]. 
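        # `quantized` lies on an integer grid spanning roughly [-half_width, half_width], so dividing by half_width maps the codes into [-1, 1]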
49 | return quantized / half_width 50 | 51 | def _scale_and_shift(self, zhat_normalized: Tensor) -> Tensor: 52 | half_width = self._levels // 2 53 | return (zhat_normalized * half_width) + half_width 54 | 55 | def _scale_and_shift_inverse(self, zhat: Tensor) -> Tensor: 56 | half_width = self._levels // 2 57 | return (zhat - half_width) / half_width 58 | 59 | def codes_to_indices(self, zhat: Tensor) -> Tensor: 60 | """Converts a `code` to an index in the codebook.""" 61 | assert zhat.shape[-1] == self.dim 62 | zhat = self._scale_and_shift(zhat) 63 | return (zhat * self._basis).sum(dim=-1).to(int32) 64 | 65 | def indices_to_codes(self, indices: Tensor) -> Tensor: 66 | """Inverse of `codes_to_indices`.""" 67 | indices = indices.unsqueeze(-1) 68 | codes_non_centered = (indices // self._basis) % self._levels 69 | return self._scale_and_shift_inverse(codes_non_centered) 70 | 71 | def get_codebook_entry(self, encoding_indices): 72 | return self.indices_to_codes(encoding_indices) 73 | -------------------------------------------------------------------------------- /modules/eg3ds/dnnlib/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | from .util import EasyDict, make_cache_dir_path 12 | -------------------------------------------------------------------------------- /modules/eg3ds/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | # empty 12 | -------------------------------------------------------------------------------- /modules/eg3ds/metrics/frechet_inception_distance.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 
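# compute_fid() below fits Gaussians to real and generated Inception features and evaluates
#   FID = ||mu_r - mu_g||^2 + Tr(Sigma_r + Sigma_g - 2 * sqrtm(Sigma_g @ Sigma_r)),
# with the matrix square root taken via scipy.linalg.sqrtm.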
10 | 11 | """Frechet Inception Distance (FID) from the paper 12 | "GANs trained by a two time-scale update rule converge to a local Nash 13 | equilibrium". Matches the original implementation by Heusel et al. at 14 | https://github.com/bioinf-jku/TTUR/blob/master/fid.py""" 15 | 16 | import numpy as np 17 | import scipy.linalg 18 | from . import metric_utils 19 | 20 | #---------------------------------------------------------------------------- 21 | 22 | def compute_fid(opts, max_real, num_gen): 23 | # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz 24 | # detector_url = 'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/metrics/inception-2015-12-05.pkl' 25 | detector_url = 'file:///home/tiger/nfs/myenv/cache/useful_ckpts/inception-2015-12-05.pkl' 26 | detector_kwargs = dict(return_features=True) # Return raw features before the softmax layer. 27 | 28 | mu_real, sigma_real = metric_utils.compute_feature_stats_for_dataset( 29 | opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, 30 | rel_lo=0, rel_hi=0, capture_mean_cov=True, max_items=max_real).get_mean_cov() 31 | 32 | mu_gen, sigma_gen = metric_utils.compute_feature_stats_for_generator( 33 | opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, 34 | rel_lo=0, rel_hi=1, capture_mean_cov=True, max_items=num_gen).get_mean_cov() 35 | 36 | if opts.rank != 0: 37 | return float('nan') 38 | 39 | m = np.square(mu_gen - mu_real).sum() 40 | s, _ = scipy.linalg.sqrtm(np.dot(sigma_gen, sigma_real), disp=False) # pylint: disable=no-member 41 | fid = np.real(m + np.trace(sigma_gen + sigma_real - s * 2)) 42 | return float(fid) 43 | 44 | #---------------------------------------------------------------------------- 45 | 46 | -------------------------------------------------------------------------------- /modules/eg3ds/metrics/inception_score.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | """Inception Score (IS) from the paper "Improved techniques for training 12 | GANs". Matches the original implementation by Salimans et al. at 13 | https://github.com/openai/improved-gan/blob/master/inception_score/model.py""" 14 | 15 | import numpy as np 16 | from . import metric_utils 17 | 18 | #---------------------------------------------------------------------------- 19 | 20 | def compute_is(opts, num_gen, num_splits): 21 | # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz 22 | # detector_url = 'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/metrics/inception-2015-12-05.pkl' 23 | detector_url = 'file:///home/tiger/nfs/myenv/cache/useful_ckpts/inception-2015-12-05.pkl' 24 | detector_kwargs = dict(no_output_bias=True) # Match the original implementation by not applying bias in the softmax layer. 
25 | 26 | gen_probs = metric_utils.compute_feature_stats_for_generator( 27 | opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, 28 | capture_all=True, max_items=num_gen).get_all() 29 | 30 | if opts.rank != 0: 31 | return float('nan'), float('nan') 32 | 33 | scores = [] 34 | for i in range(num_splits): 35 | part = gen_probs[i * num_gen // num_splits : (i + 1) * num_gen // num_splits] 36 | kl = part * (np.log(part) - np.log(np.mean(part, axis=0, keepdims=True))) 37 | kl = np.mean(np.sum(kl, axis=1)) 38 | scores.append(np.exp(kl)) 39 | return float(np.mean(scores)), float(np.std(scores)) 40 | 41 | #---------------------------------------------------------------------------- 42 | -------------------------------------------------------------------------------- /modules/eg3ds/metrics/kernel_inception_distance.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | """Kernel Inception Distance (KID) from the paper "Demystifying MMD 12 | GANs". Matches the original implementation by Binkowski et al. at 13 | https://github.com/mbinkowski/MMD-GAN/blob/master/gan/compute_scores.py""" 14 | 15 | import numpy as np 16 | from . import metric_utils 17 | 18 | #---------------------------------------------------------------------------- 19 | 20 | def compute_kid(opts, max_real, num_gen, num_subsets, max_subset_size): 21 | # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz 22 | # detector_url = 'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/metrics/inception-2015-12-05.pkl' 23 | detector_url = 'file:///home/tiger/nfs/myenv/cache/useful_ckpts/inception-2015-12-05.pkl' 24 | detector_kwargs = dict(return_features=True) # Return raw features before the softmax layer. 
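    # KID estimates the squared MMD between real and generated Inception features under the
    # polynomial kernel k(x, y) = (x.y / d + 1)^3 (d = feature dimension); the loop below
    # averages a block estimate over num_subsets random subsets of at most max_subset_size
    # samples each.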
25 | 26 | real_features = metric_utils.compute_feature_stats_for_dataset( 27 | opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, 28 | rel_lo=0, rel_hi=0, capture_all=True, max_items=max_real).get_all() 29 | 30 | gen_features = metric_utils.compute_feature_stats_for_generator( 31 | opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, 32 | rel_lo=0, rel_hi=1, capture_all=True, max_items=num_gen).get_all() 33 | 34 | if opts.rank != 0: 35 | return float('nan') 36 | 37 | n = real_features.shape[1] 38 | m = min(min(real_features.shape[0], gen_features.shape[0]), max_subset_size) 39 | t = 0 40 | for _subset_idx in range(num_subsets): 41 | x = gen_features[np.random.choice(gen_features.shape[0], m, replace=False)] 42 | y = real_features[np.random.choice(real_features.shape[0], m, replace=False)] 43 | a = (x @ x.T / n + 1) ** 3 + (y @ y.T / n + 1) ** 3 44 | b = (x @ y.T / n + 1) ** 3 45 | t += (a.sum() - np.diag(a).sum()) / (m - 1) - b.sum() * 2 / m 46 | kid = t / num_subsets / m 47 | return float(kid) 48 | 49 | #---------------------------------------------------------------------------- 50 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | # empty 12 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | # empty 12 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/bias_act.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | * 5 | * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 6 | * property and proprietary rights in and to this material, related 7 | * documentation and any modifications thereto. 
Any use, reproduction, 8 | * disclosure or distribution of this material and related documentation 9 | * without an express license agreement from NVIDIA CORPORATION or 10 | * its affiliates is strictly prohibited. 11 | */ 12 | 13 | //------------------------------------------------------------------------ 14 | // CUDA kernel parameters. 15 | 16 | struct bias_act_kernel_params 17 | { 18 | const void* x; // [sizeX] 19 | const void* b; // [sizeB] or NULL 20 | const void* xref; // [sizeX] or NULL 21 | const void* yref; // [sizeX] or NULL 22 | const void* dy; // [sizeX] or NULL 23 | void* y; // [sizeX] 24 | 25 | int grad; 26 | int act; 27 | float alpha; 28 | float gain; 29 | float clamp; 30 | 31 | int sizeX; 32 | int sizeB; 33 | int stepB; 34 | int loopX; 35 | }; 36 | 37 | //------------------------------------------------------------------------ 38 | // CUDA kernel selection. 39 | 40 | template void* choose_bias_act_kernel(const bias_act_kernel_params& p); 41 | 42 | //------------------------------------------------------------------------ 43 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/filtered_lrelu_ns.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | * 5 | * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 6 | * property and proprietary rights in and to this material, related 7 | * documentation and any modifications thereto. Any use, reproduction, 8 | * disclosure or distribution of this material and related documentation 9 | * without an express license agreement from NVIDIA CORPORATION or 10 | * its affiliates is strictly prohibited. 11 | */ 12 | 13 | #include "filtered_lrelu.cu" 14 | 15 | // Template/kernel specializations for no signs mode (no gradients required). 16 | 17 | // Full op, 32-bit indexing. 18 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 19 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 20 | 21 | // Full op, 64-bit indexing. 22 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 23 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 24 | 25 | // Activation/signs only for generic variant. 64-bit indexing. 26 | template void* choose_filtered_lrelu_act_kernel(void); 27 | template void* choose_filtered_lrelu_act_kernel(void); 28 | template void* choose_filtered_lrelu_act_kernel(void); 29 | 30 | // Copy filters to constant memory. 31 | template cudaError_t copy_filters(cudaStream_t stream); 32 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/filtered_lrelu_rd.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | * 5 | * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 6 | * property and proprietary rights in and to this material, related 7 | * documentation and any modifications thereto. 
Any use, reproduction, 8 | * disclosure or distribution of this material and related documentation 9 | * without an express license agreement from NVIDIA CORPORATION or 10 | * its affiliates is strictly prohibited. 11 | */ 12 | 13 | #include "filtered_lrelu.cu" 14 | 15 | // Template/kernel specializations for sign read mode. 16 | 17 | // Full op, 32-bit indexing. 18 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 19 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 20 | 21 | // Full op, 64-bit indexing. 22 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 23 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 24 | 25 | // Activation/signs only for generic variant. 64-bit indexing. 26 | template void* choose_filtered_lrelu_act_kernel(void); 27 | template void* choose_filtered_lrelu_act_kernel(void); 28 | template void* choose_filtered_lrelu_act_kernel(void); 29 | 30 | // Copy filters to constant memory. 31 | template cudaError_t copy_filters(cudaStream_t stream); 32 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/filtered_lrelu_wr.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | * 5 | * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 6 | * property and proprietary rights in and to this material, related 7 | * documentation and any modifications thereto. Any use, reproduction, 8 | * disclosure or distribution of this material and related documentation 9 | * without an express license agreement from NVIDIA CORPORATION or 10 | * its affiliates is strictly prohibited. 11 | */ 12 | 13 | #include "filtered_lrelu.cu" 14 | 15 | // Template/kernel specializations for sign write mode. 16 | 17 | // Full op, 32-bit indexing. 18 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 19 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 20 | 21 | // Full op, 64-bit indexing. 22 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 23 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 24 | 25 | // Activation/signs only for generic variant. 64-bit indexing. 26 | template void* choose_filtered_lrelu_act_kernel(void); 27 | template void* choose_filtered_lrelu_act_kernel(void); 28 | template void* choose_filtered_lrelu_act_kernel(void); 29 | 30 | // Copy filters to constant memory. 31 | template cudaError_t copy_filters(cudaStream_t stream); 32 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/fma.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | """Fused multiply-add, with slightly faster gradients than `torch.addcmul()`.""" 12 | 13 | import torch 14 | 15 | #---------------------------------------------------------------------------- 16 | 17 | def fma(a, b, c): # => a * b + c 18 | return _FusedMultiplyAdd.apply(a, b, c) 19 | 20 | #---------------------------------------------------------------------------- 21 | 22 | class _FusedMultiplyAdd(torch.autograd.Function): # a * b + c 23 | @staticmethod 24 | def forward(ctx, a, b, c): # pylint: disable=arguments-differ 25 | out = torch.addcmul(c, a, b) 26 | ctx.save_for_backward(a, b) 27 | ctx.c_shape = c.shape 28 | return out 29 | 30 | @staticmethod 31 | def backward(ctx, dout): # pylint: disable=arguments-differ 32 | a, b = ctx.saved_tensors 33 | c_shape = ctx.c_shape 34 | da = None 35 | db = None 36 | dc = None 37 | 38 | if ctx.needs_input_grad[0]: 39 | da = _unbroadcast(dout * b, a.shape) 40 | 41 | if ctx.needs_input_grad[1]: 42 | db = _unbroadcast(dout * a, b.shape) 43 | 44 | if ctx.needs_input_grad[2]: 45 | dc = _unbroadcast(dout, c_shape) 46 | 47 | return da, db, dc 48 | 49 | #---------------------------------------------------------------------------- 50 | 51 | def _unbroadcast(x, shape): 52 | extra_dims = x.ndim - len(shape) 53 | assert extra_dims >= 0 54 | dim = [i for i in range(x.ndim) if x.shape[i] > 1 and (i < extra_dims or shape[i - extra_dims] == 1)] 55 | if len(dim): 56 | x = x.sum(dim=dim, keepdim=True) 57 | if extra_dims: 58 | x = x.reshape(-1, *x.shape[extra_dims+1:]) 59 | assert x.shape == shape 60 | return x 61 | 62 | #---------------------------------------------------------------------------- 63 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/upfirdn2d.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | * 5 | * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 6 | * property and proprietary rights in and to this material, related 7 | * documentation and any modifications thereto. Any use, reproduction, 8 | * disclosure or distribution of this material and related documentation 9 | * without an express license agreement from NVIDIA CORPORATION or 10 | * its affiliates is strictly prohibited. 11 | */ 12 | 13 | #include 14 | 15 | //------------------------------------------------------------------------ 16 | // CUDA kernel parameters. 
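// upfirdn2d applies: upsample by `up`, pad by `pad0`, convolve with the FIR filter `f`,
// then downsample by `down`. The struct below carries the raw tensor pointers together with
// the per-axis geometry (sizes/strides laid out as [width, height, channel, batch]) and the
// tiling/loop factors used to launch the kernel.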
17 | 18 | struct upfirdn2d_kernel_params 19 | { 20 | const void* x; 21 | const float* f; 22 | void* y; 23 | 24 | int2 up; 25 | int2 down; 26 | int2 pad0; 27 | int flip; 28 | float gain; 29 | 30 | int4 inSize; // [width, height, channel, batch] 31 | int4 inStride; 32 | int2 filterSize; // [width, height] 33 | int2 filterStride; 34 | int4 outSize; // [width, height, channel, batch] 35 | int4 outStride; 36 | int sizeMinor; 37 | int sizeMajor; 38 | 39 | int loopMinor; 40 | int loopMajor; 41 | int loopX; 42 | int launchMinor; 43 | int launchMajor; 44 | }; 45 | 46 | //------------------------------------------------------------------------ 47 | // CUDA kernel specialization. 48 | 49 | struct upfirdn2d_kernel_spec 50 | { 51 | void* kernel; 52 | int tileOutW; 53 | int tileOutH; 54 | int loopMinor; 55 | int loopX; 56 | }; 57 | 58 | //------------------------------------------------------------------------ 59 | // CUDA kernel selection. 60 | 61 | template upfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params& p); 62 | 63 | //------------------------------------------------------------------------ 64 | -------------------------------------------------------------------------------- /modules/eg3ds/volumetric_rendering/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 
10 | 11 | # empty -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/__init__.py: -------------------------------------------------------------------------------- 1 | from .decoders.my_model import DeepLabV3 -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import SegmentationModel 2 | 3 | from .modules import ( 4 | Conv2dReLU, 5 | Attention, 6 | ) 7 | 8 | from .heads import ( 9 | SegmentationHead, 10 | ClassificationHead, 11 | ) 12 | -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/base/heads.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from .modules import Activation 3 | 4 | 5 | class SegmentationHead(nn.Sequential): 6 | def __init__(self, in_channels, out_channels, kernel_size=3, activation=None, upsampling=1): 7 | conv2d = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=kernel_size // 2) 8 | upsampling = nn.UpsamplingBilinear2d(scale_factor=upsampling) if upsampling > 1 else nn.Identity() 9 | activation = Activation(activation) 10 | super().__init__(conv2d, upsampling, activation) 11 | 12 | 13 | class ClassificationHead(nn.Sequential): 14 | def __init__(self, in_channels, classes, pooling="avg", dropout=0.2, activation=None): 15 | if pooling not in ("max", "avg"): 16 | raise ValueError("Pooling should be one of ('max', 'avg'), got {}.".format(pooling)) 17 | pool = nn.AdaptiveAvgPool2d(1) if pooling == "avg" else nn.AdaptiveMaxPool2d(1) 18 | flatten = nn.Flatten() 19 | dropout = nn.Dropout(p=dropout, inplace=True) if dropout else nn.Identity() 20 | linear = nn.Linear(in_channels, classes, bias=True) 21 | activation = Activation(activation) 22 | super().__init__(pool, flatten, dropout, linear, activation) 23 | -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/base/initialization.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def initialize_decoder(module): 5 | for m in module.modules(): 6 | 7 | if isinstance(m, nn.Conv2d): 8 | nn.init.kaiming_uniform_(m.weight, mode="fan_in", nonlinearity="relu") 9 | if m.bias is not None: 10 | nn.init.constant_(m.bias, 0) 11 | 12 | elif isinstance(m, nn.BatchNorm2d): 13 | nn.init.constant_(m.weight, 1) 14 | nn.init.constant_(m.bias, 0) 15 | 16 | elif isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform_(m.weight) 18 | if m.bias is not None: 19 | nn.init.constant_(m.bias, 0) 20 | 21 | 22 | def initialize_head(module): 23 | for m in module.modules(): 24 | if isinstance(m, (nn.Linear, nn.Conv2d)): 25 | nn.init.xavier_uniform_(m.weight) 26 | if m.bias is not None: 27 | nn.init.constant_(m.bias, 0) 28 | -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/base/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from . 
import initialization as init 3 | 4 | 5 | class SegmentationModel(torch.nn.Module): 6 | def initialize(self): 7 | init.initialize_decoder(self.decoder) 8 | init.initialize_head(self.segmentation_head) 9 | if self.classification_head is not None: 10 | init.initialize_head(self.classification_head) 11 | 12 | def check_input_shape(self, x): 13 | 14 | h, w = x.shape[-2:] 15 | output_stride = self.encoder.output_stride 16 | if h % output_stride != 0 or w % output_stride != 0: 17 | new_h = (h // output_stride + 1) * output_stride if h % output_stride != 0 else h 18 | new_w = (w // output_stride + 1) * output_stride if w % output_stride != 0 else w 19 | raise RuntimeError( 20 | f"Wrong input shape height={h}, width={w}. Expected image height and width " 21 | f"divisible by {output_stride}. Consider pad your images to shape ({new_h}, {new_w})." 22 | ) 23 | 24 | def forward(self, x): 25 | """Sequentially pass `x` trough model`s encoder, decoder and heads""" 26 | 27 | self.check_input_shape(x) 28 | 29 | features = self.encoder(x) 30 | decoder_output = self.decoder(*features) 31 | 32 | masks = self.segmentation_head(decoder_output) 33 | 34 | if self.classification_head is not None: 35 | labels = self.classification_head(features[-1]) 36 | return masks, labels 37 | 38 | return masks 39 | 40 | @torch.no_grad() 41 | def predict(self, x): 42 | """Inference method. Switch model to `eval` mode, call `.forward(x)` with `torch.no_grad()` 43 | 44 | Args: 45 | x: 4D torch tensor with shape (batch_size, channels, height, width) 46 | 47 | Return: 48 | prediction: 4D torch tensor with shape (batch_size, classes, height, width) 49 | 50 | """ 51 | if self.training: 52 | self.eval() 53 | 54 | x = self.forward(x) 55 | 56 | return x 57 | -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | import timm 2 | import functools 3 | import torch.utils.model_zoo as model_zoo 4 | 5 | from .resnet import resnet_encoders 6 | 7 | 8 | 9 | encoders = {} 10 | encoders.update(resnet_encoders) 11 | 12 | def get_encoder(name, in_channels=3, depth=5, weights=None, output_stride=32, **kwargs): 13 | 14 | try: 15 | Encoder = encoders[name]["encoder"] 16 | except KeyError: 17 | raise KeyError("Wrong encoder name `{}`, supported encoders: {}".format(name, list(encoders.keys()))) 18 | 19 | params = encoders[name]["params"] 20 | params.update(depth=depth) 21 | encoder = Encoder(**params) 22 | 23 | if weights is not None: 24 | try: 25 | settings = encoders[name]["pretrained_settings"][weights] 26 | except KeyError: 27 | raise KeyError( 28 | "Wrong pretrained weights `{}` for encoder `{}`. 
Available options are: {}".format( 29 | weights, 30 | name, 31 | list(encoders[name]["pretrained_settings"].keys()), 32 | ) 33 | ) 34 | encoder.load_state_dict(model_zoo.load_url(settings["url"])) 35 | 36 | encoder.set_in_channels(in_channels, pretrained=weights is not None) 37 | if output_stride != 32: 38 | encoder.make_dilated(output_stride) 39 | 40 | return encoder 41 | 42 | 43 | def get_encoder_names(): 44 | return list(encoders.keys()) 45 | 46 | 47 | def get_preprocessing_params(encoder_name, pretrained="imagenet"): 48 | 49 | if encoder_name.startswith("tu-"): 50 | encoder_name = encoder_name[3:] 51 | if not timm.models.is_model_pretrained(encoder_name): 52 | raise ValueError(f"{encoder_name} does not have pretrained weights and preprocessing parameters") 53 | settings = timm.models.get_pretrained_cfg(encoder_name) 54 | else: 55 | all_settings = encoders[encoder_name]["pretrained_settings"] 56 | if pretrained not in all_settings.keys(): 57 | raise ValueError("Available pretrained options {}".format(all_settings.keys())) 58 | settings = all_settings[pretrained] 59 | 60 | formatted_settings = {} 61 | formatted_settings["input_space"] = settings.get("input_space", "RGB") 62 | formatted_settings["input_range"] = list(settings.get("input_range", [0, 1])) 63 | formatted_settings["mean"] = list(settings.get("mean")) 64 | formatted_settings["std"] = list(settings.get("std")) 65 | 66 | return formatted_settings 67 | 68 | 69 | def get_preprocessing_fn(encoder_name, pretrained="imagenet"): 70 | params = get_preprocessing_params(encoder_name, pretrained=pretrained) 71 | return functools.partial(preprocess_input, **params) 72 | -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/encoders/_base.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from typing import List 4 | from collections import OrderedDict 5 | 6 | from . 
import _utils as utils 7 | 8 | 9 | class EncoderMixin: 10 | """Add encoder functionality such as: 11 | - output channels specification of feature tensors (produced by encoder) 12 | - patching first convolution for arbitrary input channels 13 | """ 14 | 15 | _output_stride = 32 16 | 17 | @property 18 | def out_channels(self): 19 | """Return channels dimensions for each tensor of forward output of encoder""" 20 | return self._out_channels[: self._depth + 1] 21 | 22 | @property 23 | def output_stride(self): 24 | return min(self._output_stride, 2**self._depth) 25 | 26 | def set_in_channels(self, in_channels, pretrained=True): 27 | """Change first convolution channels""" 28 | if in_channels == 3: 29 | return 30 | 31 | self._in_channels = in_channels 32 | if self._out_channels[0] == 3: 33 | self._out_channels = tuple([in_channels] + list(self._out_channels)[1:]) 34 | 35 | utils.patch_first_conv(model=self, new_in_channels=in_channels, pretrained=pretrained) 36 | 37 | def get_stages(self): 38 | """Override it in your implementation""" 39 | raise NotImplementedError 40 | 41 | def make_dilated(self, output_stride): 42 | 43 | if output_stride == 16: 44 | stage_list = [ 45 | 5, 46 | ] 47 | dilation_list = [ 48 | 2, 49 | ] 50 | 51 | elif output_stride == 8: 52 | stage_list = [4, 5] 53 | dilation_list = [2, 4] 54 | 55 | else: 56 | raise ValueError("Output stride should be 16 or 8, got {}.".format(output_stride)) 57 | 58 | self._output_stride = output_stride 59 | 60 | stages = self.get_stages() 61 | for stage_indx, dilation_rate in zip(stage_list, dilation_list): 62 | utils.replace_strides_with_dilation( 63 | module=stages[stage_indx], 64 | dilation_rate=dilation_rate, 65 | ) 66 | -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/encoders/_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def patch_first_conv(model, new_in_channels, default_in_channels=3, pretrained=True): 6 | """Change first convolution layer input channels. 
7 | In case: 8 | in_channels == 1 or in_channels == 2 -> reuse original weights 9 | in_channels > 3 -> make random kaiming normal initialization 10 | """ 11 | 12 | # get first conv 13 | for module in model.modules(): 14 | if isinstance(module, nn.Conv2d) and module.in_channels == default_in_channels: 15 | break 16 | 17 | weight = module.weight.detach() 18 | module.in_channels = new_in_channels 19 | 20 | if not pretrained: 21 | module.weight = nn.parameter.Parameter( 22 | torch.Tensor(module.out_channels, new_in_channels // module.groups, *module.kernel_size) 23 | ) 24 | module.reset_parameters() 25 | 26 | elif new_in_channels == 1: 27 | new_weight = weight.sum(1, keepdim=True) 28 | module.weight = nn.parameter.Parameter(new_weight) 29 | 30 | else: 31 | new_weight = torch.Tensor(module.out_channels, new_in_channels // module.groups, *module.kernel_size) 32 | 33 | for i in range(new_in_channels): 34 | new_weight[:, i] = weight[:, i % default_in_channels] 35 | 36 | new_weight = new_weight * (default_in_channels / new_in_channels) 37 | module.weight = nn.parameter.Parameter(new_weight) 38 | 39 | 40 | def replace_strides_with_dilation(module, dilation_rate): 41 | """Patch Conv2d modules replacing strides with dilation""" 42 | for mod in module.modules(): 43 | if isinstance(mod, nn.Conv2d): 44 | mod.stride = (1, 1) 45 | mod.dilation = (dilation_rate, dilation_rate) 46 | kh, kw = mod.kernel_size 47 | mod.padding = ((kh // 2) * dilation_rate, (kh // 2) * dilation_rate) 48 | 49 | # Kostyl for EfficientNet 50 | if hasattr(mod, "static_padding"): 51 | mod.static_padding = nn.Identity() 52 | -------------------------------------------------------------------------------- /modules/img2plane/segformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import LowResolutionViT, TriplanePredictorViT -------------------------------------------------------------------------------- /modules/img2plane/simple_encoders/high_resolution_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class HighResoEncoder(nn.Module): 7 | def __init__(self, 8 | in_dim=5, # 3 for rgb and 2 for coordinate 9 | out_dim=96, 10 | ): 11 | super().__init__() 12 | self.first = nn.Conv2d(in_channels=in_dim, out_channels=64, kernel_size=7, stride=2, padding=3) 13 | self.activation = nn.LeakyReLU(negative_slope=0.01) 14 | 15 | self.conv_layers = nn.Sequential(*[ 16 | nn.Conv2d(in_channels=64, out_channels=96, kernel_size=3, stride=1, padding=1), 17 | nn.LeakyReLU(negative_slope=0.01), 18 | nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1), 19 | nn.LeakyReLU(negative_slope=0.01), 20 | nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1), 21 | nn.LeakyReLU(negative_slope=0.01), 22 | nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1), 23 | nn.LeakyReLU(negative_slope=0.01), 24 | ]) 25 | 26 | self.final = nn.Conv2d(in_channels=96, out_channels=out_dim, kernel_size=3, stride=1, padding=1) 27 | 28 | def forward(self, x): 29 | """ 30 | x: [B, C=5, 256, 256] 31 | return: [B, C=96, 256, 256] 32 | """ 33 | h = self.first(x) 34 | h = self.conv_layers(h) 35 | h = self.final(h) 36 | return h 37 | -------------------------------------------------------------------------------- /tasks/run.py: -------------------------------------------------------------------------------- 1 | # import 
utils.commons.single_thread_env # NOQA 2 | import os 3 | import sys 4 | sys.path.append(os.path.abspath("./")) 5 | 6 | from utils.commons.hparams import hparams, set_hparams 7 | import importlib 8 | 9 | 10 | def run_task(): 11 | assert hparams['task_cls'] != '' 12 | pkg = ".".join(hparams["task_cls"].split(".")[:-1]) 13 | cls_name = hparams["task_cls"].split(".")[-1] 14 | task_cls = getattr(importlib.import_module(pkg), cls_name) 15 | task_cls.start() 16 | 17 | def clear_gpus(): 18 | devices = os.environ.get('CUDA_VISIBLE_DEVICES', '').split(",") 19 | for d in devices: 20 | os.system(f'pkill -f "voidgpu{d}"') 21 | 22 | if __name__ == '__main__': 23 | if os.environ.get('CUDA_VISIBLE_DEVICES', '') == '': 24 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 25 | try: 26 | set_hparams() 27 | run_task() 28 | except KeyboardInterrupt: 29 | if hparams['init_method'] == 'file': 30 | # on exit, remove the shared file in nfs for DDP 31 | exp_name = hparams['exp_name'] 32 | shared_file_name = f'/mnt/bn/sa-ag-data/yezhenhui/nfs/pytorch_ddp_sharedfile/{exp_name}' 33 | if os.path.exists(shared_file_name): 34 | os.system(f"rm -r {shared_file_name}") 35 | -------------------------------------------------------------------------------- /utils/audio/dct.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def dct(x, norm=None): 6 | x_shape = x.shape 7 | N = x_shape[-1] 8 | x = x.contiguous().view(-1, N) 9 | 10 | v = torch.cat([x[:, ::2], x[:, 1::2].flip([1])], dim=1) 11 | 12 | Vc = torch.view_as_real(torch.fft.fft(v, dim=1)) # add this line 13 | 14 | k = - torch.arange(N, dtype=x.dtype, device=x.device)[None, :] * np.pi / (2 * N) 15 | W_r = torch.cos(k) 16 | W_i = torch.sin(k) 17 | 18 | V = Vc[:, :, 0] * W_r - Vc[:, :, 1] * W_i 19 | 20 | if norm == 'ortho': 21 | V[:, 0] /= np.sqrt(N) * 2 22 | V[:, 1:] /= np.sqrt(N / 2) * 2 23 | 24 | V = 2 * V.view(*x_shape) 25 | 26 | return V 27 | 28 | 29 | def idct(X, norm=None): 30 | x_shape = X.shape 31 | N = x_shape[-1] 32 | 33 | X_v = X.contiguous().view(-1, x_shape[-1]) / 2 34 | 35 | if norm == 'ortho': 36 | X_v[:, 0] *= np.sqrt(N) * 2 37 | X_v[:, 1:] *= np.sqrt(N / 2) * 2 38 | 39 | k = torch.arange(x_shape[-1], dtype=X.dtype, device=X.device)[None, :] * np.pi / (2 * N) 40 | W_r = torch.cos(k) 41 | W_i = torch.sin(k) 42 | 43 | V_t_r = X_v 44 | V_t_i = torch.cat([X_v[:, :1] * 0, -X_v.flip([1])[:, :-1]], dim=1) 45 | 46 | V_r = V_t_r * W_r - V_t_i * W_i 47 | V_i = V_t_r * W_i + V_t_i * W_r 48 | 49 | V = torch.cat([V_r.unsqueeze(2), V_i.unsqueeze(2)], dim=2) 50 | 51 | # v = torch.irfft(V, 1, onesided=False) # comment this line 52 | v = torch.fft.irfft(torch.view_as_complex(V), n=V.shape[1], dim=1) # add this line 53 | 54 | x = v.new_zeros(v.shape) 55 | x[:, ::2] += v[:, :N - (N // 2)] 56 | x[:, 1::2] += v.flip([1])[:, :N // 2] 57 | 58 | return x.view(*x_shape) 59 | -------------------------------------------------------------------------------- /utils/audio/io.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | import numpy as np 4 | from scipy.io import wavfile 5 | import pyloudnorm as pyln 6 | 7 | 8 | def save_wav(wav, path, sr, norm=False): 9 | wav = wav.astype(float) 10 | if norm: 11 | meter = pyln.Meter(sr) # create BS.1770 meter 12 | loudness = meter.integrated_loudness(wav) 13 | wav = pyln.normalize.loudness(wav, loudness, -18.0) 14 | if np.abs(wav).max() >= 1: 15 | wav = wav / np.abs(wav).max() * 0.95 16 | wav = wav * 32767 17 
| wavfile.write(path[:-4] + '.wav', sr, wav.astype(np.int16)) 18 | if path[-4:] == '.mp3': 19 | to_mp3(path[:-4]) 20 | 21 | 22 | def to_mp3(out_path): 23 | if out_path[-4:] == '.wav': 24 | out_path = out_path[:-4] 25 | subprocess.check_call( 26 | f'ffmpeg -threads 1 -loglevel error -i "{out_path}.wav" -vn -b:a 192k -y -hide_banner -async 1 "{out_path}.mp3"', 27 | shell=True, stdin=subprocess.PIPE) 28 | subprocess.check_call(f'rm -f "{out_path}.wav"', shell=True) 29 | -------------------------------------------------------------------------------- /utils/audio/pitch/bin/ExtractF0ByStraight: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/utils/audio/pitch/bin/ExtractF0ByStraight -------------------------------------------------------------------------------- /utils/audio/pitch/bin/InterpF0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/utils/audio/pitch/bin/InterpF0 -------------------------------------------------------------------------------- /utils/audio/pitch/bin/ReaperF0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/utils/audio/pitch/bin/ReaperF0 -------------------------------------------------------------------------------- /utils/audio/pitch/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def to_lf0(f0): 6 | f0[f0 < 1.0e-5] = 1.0e-6 7 | lf0 = f0.log() if isinstance(f0, torch.Tensor) else np.log(f0) 8 | lf0[f0 < 1.0e-5] = - 1.0E+10 9 | return lf0 10 | 11 | 12 | def to_f0(lf0): 13 | f0 = np.where(lf0 <= 0, 0.0, np.exp(lf0)) 14 | return f0.flatten() 15 | 16 | 17 | def f0_to_coarse(f0, f0_bin=256, f0_max=900.0, f0_min=50.0): 18 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 19 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 20 | is_torch = isinstance(f0, torch.Tensor) 21 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) 22 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 23 | 24 | f0_mel[f0_mel <= 1] = 1 25 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 26 | f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int) 27 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min(), f0.min(), f0.max()) 28 | return f0_coarse 29 | 30 | 31 | def coarse_to_f0(f0_coarse, f0_bin=256, f0_max=900.0, f0_min=50.0): 32 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 33 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 34 | uv = f0_coarse == 1 35 | f0 = f0_mel_min + (f0_coarse - 1) * (f0_mel_max - f0_mel_min) / (f0_bin - 2) 36 | f0 = ((f0 / 1127).exp() - 1) * 700 37 | f0[uv] = 0 38 | return f0 39 | 40 | 41 | def norm_f0(f0, uv, pitch_norm='log', f0_mean=400, f0_std=100): 42 | is_torch = isinstance(f0, torch.Tensor) 43 | if pitch_norm == 'standard': 44 | f0 = (f0 - f0_mean) / f0_std 45 | if pitch_norm == 'log': 46 | f0 = torch.log2(f0 + 1e-8) if is_torch else np.log2(f0 + 1e-8) 47 | if uv is not None: 48 | f0[uv > 0] = 0 49 | return f0 50 | 51 | 52 | def norm_interp_f0(f0, pitch_norm='log', f0_mean=None, f0_std=None): 53 | is_torch = isinstance(f0, torch.Tensor) 54 | if is_torch: 55 | device = f0.device 56 
| f0 = f0.data.cpu().numpy() 57 | uv = f0 == 0 58 | f0 = norm_f0(f0, uv, pitch_norm, f0_mean, f0_std) 59 | if sum(uv) == len(f0): 60 | f0[uv] = 0 61 | elif sum(uv) > 0: 62 | f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv]) 63 | if is_torch: 64 | uv = torch.FloatTensor(uv) 65 | f0 = torch.FloatTensor(f0) 66 | f0 = f0.to(device) 67 | uv = uv.to(device) 68 | return f0, uv 69 | 70 | 71 | def denorm_f0(f0, uv, pitch_norm='log', f0_mean=400, f0_std=100, pitch_padding=None, min=50, max=900): 72 | is_torch = isinstance(f0, torch.Tensor) 73 | if pitch_norm == 'standard': 74 | f0 = f0 * f0_std + f0_mean 75 | if pitch_norm == 'log': 76 | f0 = 2 ** f0 77 | f0 = f0.clamp(min=min, max=max) if is_torch else np.clip(f0, a_min=min, a_max=max) 78 | if uv is not None: 79 | f0[uv > 0] = 0 80 | if pitch_padding is not None: 81 | f0[pitch_padding] = 0 82 | return f0 83 | -------------------------------------------------------------------------------- /utils/audio/pitch/uv_utils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | from scipy.interpolate import interp1d 4 | 5 | 6 | def uv_energy_corrector(wav_data_16k, f0_func, f0_min=50, f0_max=1000): 7 | hop_size = 256 8 | win_size = hop_size * 6 9 | sr = 16000 10 | 11 | spec = np.abs(librosa.stft(wav_data_16k, n_fft=win_size, hop_length=hop_size, 12 | win_length=win_size, pad_mode="constant").T) 13 | T = spec.shape[0] 14 | x_h256 = np.arange(0, 1, 1 / T)[:T] 15 | x_h256[-1] = 1 16 | f0 = f0_func(x_h256) 17 | freqs = librosa.fft_frequencies(sr=sr, n_fft=win_size) 18 | x_idx = np.arange(T) 19 | 20 | def find_nearest_stft_bin(f0_): 21 | return np.abs(freqs[None, :] - f0_[:, None]).argmin(-1) 22 | 23 | def get_energy_mask(f0_lambda, hars=None, win_size=3): 24 | if hars is None: 25 | hars = [1] 26 | mask = np.zeros([T, 10000]).astype(bool) 27 | mask_bins = [] 28 | for multiple in hars: 29 | f0_bin_idx = find_nearest_stft_bin(f0_lambda(f0, multiple)) 30 | for delta in range(-win_size // 2, 1 + win_size // 2): 31 | y_idx = f0_bin_idx + delta 32 | if np.max(y_idx) < spec.shape[1]: 33 | mask_bins.append(spec[x_idx, y_idx]) 34 | mask[x_idx, y_idx] = 1 35 | mask_bins = np.stack(mask_bins, 1) 36 | energy_ = np.mean(mask_bins, 1) 37 | return energy_, mask 38 | 39 | # find uv first (for obtaining mean_energy_mharfhar) 40 | energy_har, mask_har = get_energy_mask(lambda f0, m: f0 * m, [1, 2], 3) 41 | energy_mhalfhar, mask_mhalfhar = get_energy_mask(lambda f0, m: f0 * (m - 0.5), [1], 5) 42 | r_energy = energy_har / np.clip(energy_mhalfhar, 1e-8, None) 43 | 44 | uv = np.zeros_like(f0).astype(bool) 45 | uv |= r_energy < 10 46 | uv |= (f0 > f0_max) | (f0 < f0_min) 47 | func_uv = interp1d(x_h256, uv, 'nearest', fill_value='extrapolate') 48 | return func_uv 49 | -------------------------------------------------------------------------------- /utils/commons/euler2rot.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from scipy.spatial.transform import Rotation as R 3 | from utils.commons.tensor_utils import convert_to_tensor 4 | 5 | 6 | def rot2euler(rot, use_radian=True): 7 | r = R.from_matrix(rot) 8 | return r.as_euler('xyz', degrees=not use_radian) 9 | 10 | def euler2rot(euler, use_radian=True): 11 | r = R.from_euler('xyz',euler, degrees=not use_radian) 12 | return r.as_matrix() 13 | 14 | def c2w_to_euler_trans(c2w): 15 | if c2w.ndim == 3: 16 | e = rot2euler(c2w[:, :3, :3]) # [B, 3] 17 | t = c2w[:, :3, 3].reshape([-1, 3]) 18 | else: 
19 | e = rot2euler(c2w[:3, :3]) # [B, 3] 20 | t = c2w[:3, 3].reshape([3]) 21 | return e, t # [3+3] 22 | 23 | def euler_trans_2_c2w(euler, trans): 24 | if euler.ndim == 2: 25 | rot = euler2rot(euler) # [b, 3, 3] 26 | bs = trans.shape[0] 27 | trans = trans.reshape([bs, 3, 1]) 28 | rot = convert_to_tensor(rot).float() 29 | trans = convert_to_tensor(trans).float() 30 | c2w = torch.cat([rot, trans], dim=-1) # [b, 3, 4] 31 | else: 32 | rot = euler2rot(euler) # [3, 3] 33 | trans = trans.reshape([3, 1]) 34 | rot = convert_to_tensor(rot).float() 35 | trans = convert_to_tensor(trans).float() 36 | c2w = torch.cat([rot, trans], dim=-1) # [3, 4] 37 | return c2w -------------------------------------------------------------------------------- /utils/commons/face_alignment_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | yaw_idx_in_mediapipe_mesh = [356, 454, 361, 288, 397, 379, 378, 377, 152, 148, 149, 150, 172,58, 132, 234, 127] 5 | brow_idx_in_mediapipe_mesh = [70, 63, 105, 66, 107, 336, 296, 334, 293, 300] 6 | nose_idx_in_mediapipe_mesh = [6, 5, 1, 2, 129, 240, 2, 460, 358] 7 | eye_idx_in_mediapipe_mesh = [33, 160, 158, 133, 153, 144, 362, 385, 387, 263, 373, 380] 8 | mouth_idx_in_mediapipe_mesh = [78,191,80,81,82,13,312,311,310,415,308,324,318,402,317,14,87,178,88,95] 9 | lm68_idx_in_mediapipe_mesh = yaw_idx_in_mediapipe_mesh + brow_idx_in_mediapipe_mesh + nose_idx_in_mediapipe_mesh + eye_idx_in_mediapipe_mesh + mouth_idx_in_mediapipe_mesh 10 | 11 | 12 | def mediapipe_lm478_to_face_alignment_lm68(lm478, H, W, return_2d=True): 13 | """ 14 | lm478: [B, 478, 3] or [478,3] 15 | """ 16 | lm478 = copy.deepcopy(lm478) 17 | lm478[..., 0] *= W 18 | lm478[..., 1] *= H 19 | n_dim = 2 if return_2d else False 20 | if lm478.ndim == 2: 21 | return lm478[lm68_idx_in_mediapipe_mesh, :n_dim].astype(np.int16) 22 | elif lm478.ndim == 3: 23 | return lm478[:, lm68_idx_in_mediapipe_mesh, :n_dim].astype(np.int16) 24 | else: 25 | raise ValueError("input lm478 ndim should in 2 or 3!") 26 | 27 | def mediapipe_lm478_to_lm68_3d(lm478): 28 | """ 29 | lm478: [B, 478, 3] or [478,3] 30 | also works for lm468 31 | """ 32 | if lm478.ndim == 2: 33 | return lm478[lm68_idx_in_mediapipe_mesh] 34 | elif lm478.ndim == 3: 35 | return lm478[:, lm68_idx_in_mediapipe_mesh] 36 | else: 37 | raise ValueError("input lm478 ndim should in 2 or 3!") -------------------------------------------------------------------------------- /utils/commons/image_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import cv2 6 | import os 7 | import imageio 8 | 9 | 10 | def dilate(bin_img, ksize=5): 11 | # bin_img 12 | pad = (ksize-1)//2 13 | bin_img = F.pad(bin_img, pad=[pad,pad,pad,pad], mode='reflect') 14 | out = F.max_pool2d(bin_img, kernel_size=ksize, stride=1, padding=0) 15 | return out 16 | 17 | def erode(bin_img, ksize=5): 18 | out = 1 - dilate(1-bin_img, ksize) 19 | return out 20 | 21 | def to8b(x): 22 | return (255*np.clip(x, 0, 1)).astype(np.uint8) 23 | 24 | def mse2psnr(x): 25 | return -10. 
* torch.log(x) / torch.log(torch.Tensor([10.])) 26 | 27 | def img2mse(x, y): 28 | return torch.mean((x - y) ** 2) 29 | 30 | def video2images(video_name, out_dir): 31 | cap = cv2.VideoCapture(video_name) 32 | frame_num = 0 33 | while(True): 34 | _, frame = cap.read() 35 | if frame is None: 36 | break 37 | out_frame_name = os.path.join(out_dir, str(frame_num) + '.jpg') 38 | cv2.imwrite(out_frame_name, frame) 39 | frame_num += + 1 40 | cap.release() 41 | 42 | def load_image_as_uint8_tensor(fname): 43 | """ 44 | img: (H, W, 3) floatTensor 45 | """ 46 | img = torch.as_tensor(imageio.imread(fname)) 47 | return img 48 | 49 | if __name__ =='__main__': 50 | video2images("test_data/May_val/AD-NeRF.mp4", "test_data/May_val/AD-NeRF") 51 | video2images("test_data/May_val/GeneFace.mp4", "test_data/May_val/GeneFace") 52 | video2images("test_data/May_val/GT.mp4", "test_data/May_val/GT") -------------------------------------------------------------------------------- /utils/commons/meters.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | 4 | 5 | class AvgrageMeter(object): 6 | 7 | def __init__(self): 8 | self.reset() 9 | 10 | def reset(self): 11 | self.avg = 0 12 | self.sum = 0 13 | self.cnt = 0 14 | 15 | def update(self, val, n=1): 16 | self.sum += val * n 17 | self.cnt += n 18 | self.avg = self.sum / self.cnt 19 | 20 | 21 | class Timer: 22 | timer_map = {} 23 | 24 | def __init__(self, name, enable=False): 25 | if name not in Timer.timer_map: 26 | Timer.timer_map[name] = 0 27 | self.name = name 28 | self.enable = enable 29 | 30 | def __enter__(self): 31 | if self.enable: 32 | # if torch.cuda.is_available(): 33 | # torch.cuda.synchronize() 34 | self.t = time.time() 35 | 36 | def __exit__(self, exc_type, exc_val, exc_tb): 37 | if self.enable: 38 | # if torch.cuda.is_available(): 39 | # torch.cuda.synchronize() 40 | Timer.timer_map[self.name] += time.time() - self.t 41 | if self.enable: 42 | print(f'[Timer] {self.name}: {Timer.timer_map[self.name]}') 43 | -------------------------------------------------------------------------------- /utils/commons/os_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import glob 4 | from utils.commons.multiprocess_utils import multiprocess_run_tqdm 5 | 6 | 7 | def link_file(from_file, to_file): 8 | subprocess.check_call( 9 | f'ln -s "`realpath --relative-to="{os.path.dirname(to_file)}" "{from_file}"`" "{to_file}"', shell=True) 10 | 11 | 12 | def move_file(from_file, to_file): 13 | subprocess.check_call(f'mv "{from_file}" "{to_file}"', shell=True) 14 | 15 | 16 | def copy_file(from_file, to_file): 17 | subprocess.check_call(f'cp -r "{from_file}" "{to_file}"', shell=True) 18 | 19 | 20 | def remove_file(*fns): 21 | for f in fns: 22 | subprocess.check_call(f'rm -rf "{f}"', shell=True) 23 | 24 | def glob_job(d, f): 25 | pattern = os.path.join(d, f) 26 | return glob.glob(pattern) 27 | 28 | def multiprocess_glob(pattern, num_workers=None): 29 | split_pattern = pattern.split("/") 30 | recursive_depth = 0 # number of recursive depth 31 | for split in split_pattern: 32 | if '*' in split: 33 | recursive_depth += 1 34 | if recursive_depth == 1: 35 | return glob.glob(pattern) 36 | else: 37 | dirs = glob.glob('/'.join(split_pattern[:-1])) 38 | ret = [] 39 | args = [(d, split_pattern[-1]) for d in dirs] 40 | for (i,res) in multiprocess_run_tqdm(glob_job, args=args, desc=f"globing {pattern}", num_workers=num_workers): 41 | ret += res 42 | 
return ret -------------------------------------------------------------------------------- /utils/commons/pitch_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | f0_bin = 256 5 | f0_max = 1100.0 6 | f0_min = 50.0 7 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 8 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 9 | 10 | def coarse_to_f0(coarse): 11 | uv = coarse == 1 12 | f0_mel = (coarse - 1) * (f0_mel_max - f0_mel_min) / (f0_bin - 2) + f0_mel_min 13 | f0 = ((f0_mel / 1127).exp() - 1) * 700 14 | f0[uv] = 0 15 | return f0 16 | 17 | def f0_to_coarse(f0): 18 | is_torch = isinstance(f0, torch.Tensor) 19 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) 20 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 21 | 22 | f0_mel[f0_mel <= 1] = 1 23 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 24 | f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int_) 25 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min(), f0.min(), f0.max()) 26 | return f0_coarse 27 | 28 | 29 | def norm_f0(f0, uv, hparams): 30 | is_torch = isinstance(f0, torch.Tensor) 31 | if hparams['pitch_norm'] == 'standard': 32 | f0 = (f0 - hparams['f0_mean']) / hparams['f0_std'] 33 | if hparams['pitch_norm'] == 'log': 34 | f0 = torch.log2(f0 + 1e-8) if is_torch else np.log2(f0 + 1e-8) 35 | if uv is not None and hparams['use_uv']: 36 | f0[uv > 0] = 0 37 | return f0 -------------------------------------------------------------------------------- /utils/nn/grad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def get_grad_norm(model, l=2): 4 | num_para = 0 5 | accu_grad = 0 6 | if isinstance(model, torch.nn.Module): 7 | params = model.parameters() 8 | else: 9 | params = model 10 | for p in params: 11 | if p.grad is None: 12 | continue 13 | num_para += p.numel() 14 | if l == 1: 15 | accu_grad += p.grad.abs(1).sum() 16 | elif l == 2: 17 | accu_grad += p.grad.pow(2).sum() 18 | else: 19 | raise ValueError("Now we only implement l1/l2 norm !") 20 | if l == 2: 21 | accu_grad = accu_grad ** 0.5 22 | if isinstance(accu_grad, float): 23 | return accu_grad 24 | return accu_grad.item() 25 | 26 | class GradBuffer: 27 | def __init__(self): 28 | self.buffer = {} 29 | 30 | def add(self, model): 31 | for item in model.named_parameters(): 32 | name, param = item 33 | if param.grad is None: 34 | continue 35 | self.buffer[name] = self.buffer.get(name, 0) + param.grad.data 36 | 37 | def apply(self, model): 38 | for item in model.named_parameters(): 39 | name, param = item 40 | if param.grad is None: 41 | continue 42 | if name in self.buffer.keys(): 43 | param.grad.data += self.buffer[name] 44 | self.buffer = {} -------------------------------------------------------------------------------- /utils/nn/model_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def print_arch(model, model_name='model'): 6 | print(f"| {model_name} Arch: ", model) 7 | num_params(model, model_name=model_name) 8 | 9 | 10 | def num_params(model, print_out=True, model_name="model"): 11 | parameters = filter(lambda p: p.requires_grad, model.parameters()) 12 | parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000 13 | if print_out: 14 | print(f'| {model_name} Trainable Parameters: %.3fM' % 
--------------------------------------------------------------------------------
/utils/nn/model_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import torch


def print_arch(model, model_name='model'):
    print(f"| {model_name} Arch: ", model)
    num_params(model, model_name=model_name)


def num_params(model, print_out=True, model_name="model"):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
    if print_out:
        print(f'| {model_name} Trainable Parameters: %.3fM' % parameters)
    return parameters

def get_device_of_model(model):
    return model.parameters().__next__().device

def requires_grad(model):
    if isinstance(model, torch.nn.Module):
        for p in model.parameters():
            p.requires_grad = True
    else:
        model.requires_grad = True

def not_requires_grad(model):
    if isinstance(model, torch.nn.Module):
        for p in model.parameters():
            p.requires_grad = False
    else:
        model.requires_grad = False
--------------------------------------------------------------------------------
/utils/useful_cmd_lines/clean_gpu.py:
--------------------------------------------------------------------------------
import os, re

def clean_gpu():
    # collect the PIDs that currently hold /dev/nvidia* open
    ret = os.popen("fuser -v /dev/nvidia*").read()
    ret = re.sub("kernel", " ", ret)
    ids = set(ret.split(" "))
    ids = [int(i) for i in ids if i != '']
    ids = [str(i) for i in sorted(ids)]
    ids_string = ' '.join(ids)
    cmd = f"kill -9 {ids_string}"
    os.system("fuser -v /dev/nvidia*")
    flag = input(f"You are going to run this command: \n ==> \"{cmd}\" \nEnter y/Y to proceed, or anything else to abort.\n[y/n]")
    if flag.lower() == 'y':
        os.system(cmd)
        print("All GPU processes cleaned!")
    else:
        print("Aborted!")

if __name__ == '__main__':
    clean_gpu()
--------------------------------------------------------------------------------
/utils/visualization/auto_plot_image.py:
--------------------------------------------------------------------------------
import torch
import numpy as np
import cv2

def plot_image(save_path, image, convert_RGB2BGR=True):
    if isinstance(image, torch.Tensor):
        image = image.detach().cpu().numpy()
    image = image.astype(float)
    if image.max() < 1.1 and image.min() > -0.1:  # [0, 1]
        image = image * 255
    elif image.max() < 1.1 and image.min() > -1.1:  # [-1, 1]
        image = (image + 1.0) * 0.5 * 255
    image = image.clip(0, 255)
    image = image.astype(np.uint8)
    if len(image.shape) == 4 and image.shape[0] == 1:
        image = image[0]
    if len(image.shape) == 3 and image.shape[0] <= 4:  # C, H, W
        image = torch.from_numpy(image).permute(1, 2, 0).numpy()
    if len(image.shape) == 3 and convert_RGB2BGR:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    cv2.imwrite(save_path, image)
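
if __name__ == '__main__':
    # Minimal smoke test (illustrative only; the shapes and output paths are hypothetical):
    # plot_image accepts HWC or CHW layouts, [0, 1] / [-1, 1] floats or uint8 values,
    # and either torch tensors or numpy arrays.
    dummy_chw = torch.rand(3, 64, 64)                                    # CHW float in [0, 1]
    plot_image('tmp_plot_image_chw.png', dummy_chw)
    dummy_hwc = np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8)   # HWC uint8
    plot_image('tmp_plot_image_hwc.png', dummy_hwc)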
--------------------------------------------------------------------------------
/utils/visualization/ffmpeg_utils.py:
--------------------------------------------------------------------------------
import os

def imgs_to_video(img_dir, video_path, audio_path=None, verbose=True):
    # read the numbered PNG sequence in img_dir, optionally mux in an audio track,
    # and encode an H.264 mp4; "-shortest" trims the output to the shorter stream
    cmd = f"ffmpeg -i {img_dir}/%5d.png "
    if audio_path is not None:
        cmd += f"-i {audio_path} "
    cmd += "-strict -2 "
    cmd += "-c:v libx264 -pix_fmt yuv420p -b:v 2000k -y -shortest "
    if verbose is False:
        cmd += " -v quiet "
    cmd += f"{video_path} "
    os.makedirs(os.path.dirname(video_path), exist_ok=True)
    os.system(cmd)


if __name__ == '__main__':
    imgs_to_video('infer_out/tmp_imgs', 'infer_out/tmp_imgs/out.mp4', 'data/raw/val_wavs/zozo.wav')
    imgs_to_video('infer_out/tmp_imgs', 'infer_out/tmp_imgs/out2.mp4', 'data/raw/val_wavs/zozo.wav')
--------------------------------------------------------------------------------
/utils/visualization/plot_attention.py:
--------------------------------------------------------------------------------
import numpy as np
import cv2
from utils.commons.tensor_utils import convert_to_np


def plot_attention_img(attention_img, color_bar='jet'):
    """
    attention_img: raw attention from the network, tensor or array in 0~1 scale, shape [H, W]
    color_bar: jet, summer, etc.; see https://blog.csdn.net/loveliuzz/article/details/73648505
    return: ready-to-visualize attention img in -1~1 scale.
    """
    attention_img = convert_to_np(attention_img)
    assert attention_img.ndim == 2
    attention_img = np.uint8(255 * attention_img)
    color_bar_dict = {
        'jet': cv2.COLORMAP_JET,
        'summer': cv2.COLORMAP_SUMMER,
        'hot': cv2.COLORMAP_HOT
    }
    color_bar = color_bar_dict.get(color_bar, getattr(cv2, f"COLORMAP_{color_bar.upper()}"))
    attention_img = cv2.applyColorMap(attention_img, color_bar) / 127.5 - 1
    attention_img = attention_img[:, :, ::-1]  # BGR -> RGB
    return attention_img
--------------------------------------------------------------------------------
/utils/visualization/plot_spec.py:
--------------------------------------------------------------------------------
import matplotlib

matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import torch

LINE_COLORS = ['w', 'r', 'orange', 'k', 'cyan', 'm', 'b', 'lime', 'g', 'brown', 'navy']


def spec_to_figure(spec, vmin=None, vmax=None, title='', f0s=None, dur_info=None):
    if isinstance(spec, torch.Tensor):
        spec = spec.cpu().numpy()
    H = spec.shape[1] // 2
    fig = plt.figure(figsize=(12, 6))
    plt.title(title)
    plt.pcolor(spec.T, vmin=vmin, vmax=vmax)

    if dur_info is not None:
        assert isinstance(dur_info, dict)
        txt = dur_info['txt']
        dur_gt = dur_info['dur_gt']
        if isinstance(dur_gt, torch.Tensor):
            dur_gt = dur_gt.cpu().numpy()
        dur_gt = np.cumsum(dur_gt).astype(int)
        for i in range(len(dur_gt)):
            shift = (i % 8) + 1
            plt.text(dur_gt[i], shift * 4, txt[i])
            plt.vlines(dur_gt[i], 0, H // 2, colors='b')  # blue is gt
        plt.xlim(0, dur_gt[-1])
        if 'dur_pred' in dur_info:
            dur_pred = dur_info['dur_pred']
            if isinstance(dur_pred, torch.Tensor):
                dur_pred = dur_pred.cpu().numpy()
            dur_pred = np.cumsum(dur_pred).astype(int)
            for i in range(len(dur_pred)):
                shift = (i % 8) + 1
                plt.text(dur_pred[i], H + shift * 4, txt[i])
                plt.vlines(dur_pred[i], H, H * 1.5, colors='r')  # red is pred
            plt.xlim(0, max(dur_gt[-1], dur_pred[-1]))
    if f0s is not None:
        ax = plt.gca()
        ax2 = ax.twinx()
        # ax.set_xticks()

        if not isinstance(f0s, dict):
            f0s = {'f0': f0s}
        for i, (k, f0) in enumerate(f0s.items()):
            if f0 is not None:
                if isinstance(f0, torch.Tensor):
                    f0 = f0.cpu().numpy()
                ax2.plot(
                    np.arange(len(f0)) + 0.5, f0, label=k, c=LINE_COLORS[i], linewidth=1, alpha=0.5)
        ax2.set_ylim(0, 1000)
        ax2.legend()
    return fig


def align_to_figure(align, dur_info):
    if isinstance(align, torch.Tensor):
        align = align.cpu().numpy()
    H = align.shape[1]
    fig = plt.figure(figsize=(12, 6))
    plt.pcolor(align.T, vmin=0, vmax=1)
    if dur_info is not None:
        assert isinstance(dur_info, dict)
        txt = dur_info['txt']
        dur_gt = dur_info['dur_gt']
        if isinstance(dur_gt, torch.Tensor):
            dur_gt = dur_gt.cpu().numpy()
        dur_gt = np.cumsum(dur_gt).astype(int) // 2
        for i in range(len(dur_gt)):
            plt.text(dur_gt[i], i, txt[i], color='red')
            plt.vlines(dur_gt[i], 0, H, colors='b')  # blue is gt
        # plt.xlim(0, dur_gt[-1])
    return fig
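
if __name__ == '__main__':
    # Minimal smoke test (illustrative only; the shapes and file name below are assumptions):
    # render a random "mel spectrogram" with a fake f0 curve overlaid on the twin axis.
    dummy_spec = np.random.rand(200, 80)                 # [T, n_mels]
    dummy_f0 = np.random.uniform(80, 300, size=200)      # Hz, one value per frame
    fig = spec_to_figure(dummy_spec, vmin=0, vmax=1, title='dummy mel', f0s={'f0': dummy_f0})
    fig.savefig('tmp_spec_to_figure.png')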
--------------------------------------------------------------------------------
/utils/visualization/vis_cam3d/camera_parameter_loader.py:
--------------------------------------------------------------------------------
import json
import os
import numpy as np
import quaternion

class CameraParameterLoader:
    def __init__(self):
        print('initialize camera parameter loader')

    def get_intrinsic(self, path):
        with open(os.path.join(path, '_camera_settings.json'), 'r') as f:
            param_cam = json.load(f)
        param_intrinsic = param_cam['camera_settings'][0]['intrinsic_settings']
        cx = param_intrinsic['cx']
        cy = param_intrinsic['cy']
        fx = param_intrinsic['fx']
        fy = param_intrinsic['fy']
        s = param_intrinsic['s']
        mat_intrinsic = np.array([[fx, s, cx],
                                  [0, fy, cy],
                                  [0, 0, 1]])
        return mat_intrinsic

    def get_extrinsic(self, path):
        with open(path, 'r') as f:
            param_cam = json.load(f)['camera_data']
        param_translation = param_cam['location_worldframe']
        param_rotation = param_cam['quaternion_xyzw_worldframe']

        mat_rotation = quaternion.as_rotation_matrix(
            np.quaternion(param_rotation[3], param_rotation[0], param_rotation[1], param_rotation[2]))
        mat_translation = np.array([[param_translation[0]], [param_translation[1]], [param_translation[2]]])
        mat_extrinsic = np.concatenate(
            [np.concatenate([mat_rotation, mat_translation], axis=1), np.array([[0, 0, 0, 1]])], axis=0)
        return mat_extrinsic
--------------------------------------------------------------------------------
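
# Illustrative usage sketch for CameraParameterLoader above (the paths and the projection
# convention are assumptions; the actual scene layout and world/camera convention may differ):
#
#   loader = CameraParameterLoader()
#   K = loader.get_intrinsic('path/to/scene')                   # reads path/to/scene/_camera_settings.json
#   E = loader.get_extrinsic('path/to/scene/frame_0000.json')   # 4x4 [R | t; 0 1] built from the xyzw quaternion
#   P = K @ E[:3, :]   # 3x4 projection matrix, assuming E maps world coordinates to the camera frame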