├── .gitignore ├── LICENSE ├── README-zh.md ├── README.md ├── assets ├── mimictalk.png └── real3dportrait.png ├── checkpoints └── .gitkeep ├── data └── raw │ └── examples │ ├── 80_vs_60_10s.wav │ ├── German_20s.mp4 │ └── bg.png ├── data_gen ├── eg3d │ └── convert_to_eg3d_convention.py ├── runs │ ├── binarizer_nerf.py │ ├── binarizer_th1kh.py │ └── nerf │ │ ├── process_guide.md │ │ └── run.sh └── utils │ ├── mp_feature_extractors │ ├── face_landmarker.py │ ├── face_landmarker.task │ ├── mp_segmenter.py │ └── selfie_multiclass_256x256.tflite │ ├── path_converter.py │ ├── process_audio │ ├── extract_hubert.py │ ├── extract_mel_f0.py │ └── resample_audio_to_16k.py │ ├── process_image │ ├── extract_lm2d.py │ ├── extract_segment_imgs.py │ └── fit_3dmm_landmark.py │ └── process_video │ ├── euler2quaterion.py │ ├── extract_blink.py │ ├── extract_lm2d.py │ ├── extract_segment_imgs.py │ ├── fit_3dmm_landmark.py │ ├── inpaint_torso_imgs.py │ ├── resample_video_to_25fps_resize_to_512.py │ └── split_video_to_imgs.py ├── data_util └── face3d_helper.py ├── deep_3drecon ├── BFM │ ├── .gitkeep │ ├── basel_53201.txt │ ├── index_mp468_from_mesh35709_v1.npy │ ├── index_mp468_from_mesh35709_v2.npy │ ├── index_mp468_from_mesh35709_v3.1.npy │ ├── index_mp468_from_mesh35709_v3.npy │ ├── select_vertex_id.mat │ └── similarity_Lm3D_all.mat ├── __init__.py ├── bfm_left_eye_faces.npy ├── bfm_right_eye_faces.npy ├── data_preparation.py ├── deep_3drecon_models │ ├── __init__.py │ ├── arcface_torch │ │ ├── README.md │ │ ├── backbones │ │ │ ├── __init__.py │ │ │ ├── iresnet.py │ │ │ ├── iresnet2060.py │ │ │ ├── mobilefacenet.py │ │ │ └── vit.py │ │ ├── configs │ │ │ ├── 3millions.py │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── glint360k_mbf.py │ │ │ ├── glint360k_r100.py │ │ │ ├── glint360k_r50.py │ │ │ ├── ms1mv2_mbf.py │ │ │ ├── ms1mv2_r100.py │ │ │ ├── ms1mv2_r50.py │ │ │ ├── ms1mv3_mbf.py │ │ │ ├── ms1mv3_r100.py │ │ │ ├── ms1mv3_r50.py │ │ │ ├── ms1mv3_r50_onegpu.py │ │ │ ├── wf12m_conflict_r50.py │ │ │ ├── wf12m_conflict_r50_pfc03_filter04.py │ │ │ ├── wf12m_flip_pfc01_filter04_r50.py │ │ │ ├── wf12m_flip_r50.py │ │ │ ├── wf12m_mbf.py │ │ │ ├── wf12m_pfc02_r100.py │ │ │ ├── wf12m_r100.py │ │ │ ├── wf12m_r50.py │ │ │ ├── wf42m_pfc0008_32gpu_r100.py │ │ │ ├── wf42m_pfc02_16gpus_mbf_bs8k.py │ │ │ ├── wf42m_pfc02_16gpus_r100.py │ │ │ ├── wf42m_pfc02_16gpus_r50_bs8k.py │ │ │ ├── wf42m_pfc02_32gpus_r50_bs4k.py │ │ │ ├── wf42m_pfc02_8gpus_r50_bs4k.py │ │ │ ├── wf42m_pfc02_r100.py │ │ │ ├── wf42m_pfc02_r100_16gpus.py │ │ │ ├── wf42m_pfc02_r100_32gpus.py │ │ │ ├── wf42m_pfc03_32gpu_r100.py │ │ │ ├── wf42m_pfc03_32gpu_r18.py │ │ │ ├── wf42m_pfc03_32gpu_r200.py │ │ │ ├── wf42m_pfc03_32gpu_r50.py │ │ │ ├── wf42m_pfc03_40epoch_64gpu_vit_b.py │ │ │ ├── wf42m_pfc03_40epoch_64gpu_vit_l.py │ │ │ ├── wf42m_pfc03_40epoch_64gpu_vit_s.py │ │ │ ├── wf42m_pfc03_40epoch_64gpu_vit_t.py │ │ │ ├── wf42m_pfc03_40epoch_8gpu_vit_b.py │ │ │ ├── wf42m_pfc03_40epoch_8gpu_vit_t.py │ │ │ ├── wf4m_mbf.py │ │ │ ├── wf4m_r100.py │ │ │ └── wf4m_r50.py │ │ ├── dataset.py │ │ ├── dist.sh │ │ ├── docs │ │ │ ├── eval.md │ │ │ ├── install.md │ │ │ ├── install_dali.md │ │ │ ├── modelzoo.md │ │ │ ├── prepare_custom_dataset.md │ │ │ ├── prepare_webface42m.md │ │ │ └── speed_benchmark.md │ │ ├── eval │ │ │ ├── __init__.py │ │ │ └── verification.py │ │ ├── eval_ijbc.py │ │ ├── flops.py │ │ ├── inference.py │ │ ├── losses.py │ │ ├── lr_scheduler.py │ │ ├── onnx_helper.py │ │ ├── onnx_ijbc.py │ │ ├── partial_fc.py │ │ ├── partial_fc_v2.py │ │ ├── 
requirement.txt │ │ ├── run.sh │ │ ├── scripts │ │ │ └── shuffle_rec.py │ │ ├── torch2onnx.py │ │ ├── train.py │ │ ├── train_v2.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── plot.py │ │ │ ├── utils_callbacks.py │ │ │ ├── utils_config.py │ │ │ ├── utils_distributed_sampler.py │ │ │ └── utils_logging.py │ ├── base_model.py │ ├── bfm.py │ ├── facerecon_model.py │ ├── losses.py │ ├── networks.py │ └── template_model.py ├── generate_reconstructor_opt_for_geneface.py ├── ncc_code.npy ├── options │ ├── __init__.py │ ├── base_options.py │ ├── test_options.py │ └── train_options.py ├── reconstructor.py ├── reconstructor_opt.pkl ├── secc_renderer.py ├── test.py ├── train.py └── util │ ├── BBRegressorParam_r.mat │ ├── __init__.py │ ├── detect_lm68.py │ ├── generate_list.py │ ├── html.py │ ├── load_mats.py │ ├── mesh_renderer.py │ ├── preprocess.py │ ├── skin_mask.py │ ├── test_mean_face.txt │ ├── util.py │ └── visualizer.py ├── docs ├── prepare_env │ ├── install_guide-zh.md │ ├── install_guide.md │ └── requirements.txt ├── process_data │ └── process_th1kh.md └── train_models │ ├── train_audio2motion.md │ └── train_motion2video.md ├── egs ├── egs_bases │ ├── audio2motion │ │ ├── base.yaml │ │ ├── vae.yaml │ │ └── vae_sync.yaml │ ├── audio2pose │ │ └── base.yaml │ ├── eg3d │ │ ├── base.yaml │ │ └── base_mse.yaml │ ├── nerf │ │ ├── adnerf.yaml │ │ ├── adnerf_torso.yaml │ │ ├── base.yaml │ │ ├── lm3d_nerf.yaml │ │ └── lm3d_nerf_torso.yaml │ ├── os_facev2v │ │ └── base.yaml │ ├── postnet │ │ └── base.yaml │ ├── radnerf │ │ ├── base.yaml │ │ ├── lm3d_radnerf.yaml │ │ └── radnerf.yaml │ └── syncnet │ │ └── base.yaml ├── os_avatar │ ├── audio2motion_vae.yaml │ ├── audio_lm3d_syncnet.yaml │ ├── img2plane.yaml │ ├── real3d_orig │ │ ├── img2plane_orig.yaml │ │ ├── secc_img2plane_orig.yaml │ │ └── secc_img2plane_torso_orig.yaml │ ├── secc_img2plane.yaml │ └── secc_img2plane_torso.yaml ├── th1kh_512 │ ├── base.yaml │ ├── secc_img2plane.yaml │ └── secc_img2plane_torso.yaml └── th1kh_512_audio2motion │ ├── base.yaml │ ├── lm3d_syncnet.yaml │ ├── lm3d_vae.yaml │ ├── lm3d_vae_pitch.yaml │ ├── lm3d_vae_sync.yaml │ └── lm3d_vae_sync_pitch.yaml ├── inference ├── app_mimictalk.py ├── app_real3dportrait.py ├── edit_secc.py ├── infer_utils.py ├── mimictalk_infer.py ├── real3d_infer.py ├── real3dportrait_demo.ipynb └── train_mimictalk_on_a_video.py ├── modules ├── audio2motion │ ├── cfm │ │ ├── attend.py │ │ ├── cfm_wrapper.py │ │ ├── icl_audio2motion_model.py │ │ ├── icl_audio2motion_pose_model.py │ │ ├── icl_transformer.py │ │ ├── module.py │ │ └── utils.py │ ├── cnn_models.py │ ├── flow_base.py │ ├── multi_length_disc.py │ ├── transformer_base.py │ ├── transformer_models.py │ ├── utils.py │ ├── vae.py │ └── vqvae.py ├── commons │ ├── attention │ │ ├── attentions.py │ │ └── simple_attention.py │ ├── conformer │ │ ├── conformer.py │ │ ├── espnet_positional_embedding.py │ │ ├── espnet_transformer_attn.py │ │ └── layers.py │ ├── conv.py │ ├── gpt.py │ ├── improved_diffusion │ │ ├── __init__.py │ │ ├── dist_util.py │ │ ├── fp16_util.py │ │ ├── gaussian_diffusion.py │ │ ├── image_datasets.py │ │ ├── logger.py │ │ ├── losses.py │ │ ├── nn.py │ │ ├── resample.py │ │ ├── respace.py │ │ └── train_util.py │ ├── layers.py │ ├── loralib │ │ ├── __init__.py │ │ ├── layers.py │ │ └── utils.py │ ├── normalizing_flow │ │ ├── glow_modules.py │ │ ├── res_flow.py │ │ └── utils.py │ ├── rel_transformer.py │ ├── rnn.py │ ├── rot_transformer.py │ ├── taming_tfm_modules.py │ ├── transformer.py │ ├── unet1d.py │ ├── vqvae.py │ ├── 
vqvae_cvq.py │ ├── vqvae_fsq.py │ ├── vqvae_lfq.py │ ├── vqvae_lfq_y.py │ ├── vqvae_taming.py │ └── wavenet.py ├── eg3ds │ ├── camera_utils │ │ └── pose_sampler.py │ ├── dnnlib │ │ ├── __init__.py │ │ └── util.py │ ├── metrics │ │ ├── __init__.py │ │ ├── equivariance.py │ │ ├── frechet_inception_distance.py │ │ ├── inception_score.py │ │ ├── kernel_inception_distance.py │ │ ├── metric_main.py │ │ ├── metric_utils.py │ │ ├── perceptual_path_length.py │ │ └── precision_recall.py │ ├── models │ │ ├── dual_discriminator.py │ │ ├── dual_discriminator_cond.py │ │ ├── networks_stylegan2.py │ │ ├── networks_stylegan3.py │ │ ├── superresolution.py │ │ └── triplane.py │ ├── torch_utils │ │ ├── __init__.py │ │ ├── custom_ops.py │ │ ├── misc.py │ │ ├── ops │ │ │ ├── __init__.py │ │ │ ├── bias_act.cpp │ │ │ ├── bias_act.cu │ │ │ ├── bias_act.h │ │ │ ├── bias_act.py │ │ │ ├── conv2d_gradfix.py │ │ │ ├── conv2d_resample.py │ │ │ ├── filtered_lrelu.cpp │ │ │ ├── filtered_lrelu.cu │ │ │ ├── filtered_lrelu.h │ │ │ ├── filtered_lrelu.py │ │ │ ├── filtered_lrelu_ns.cu │ │ │ ├── filtered_lrelu_rd.cu │ │ │ ├── filtered_lrelu_wr.cu │ │ │ ├── fma.py │ │ │ ├── grid_sample_gradfix.py │ │ │ ├── upfirdn2d.cpp │ │ │ ├── upfirdn2d.cu │ │ │ ├── upfirdn2d.h │ │ │ └── upfirdn2d.py │ │ ├── persistence.py │ │ └── training_stats.py │ └── volumetric_rendering │ │ ├── __init__.py │ │ ├── math_utils.py │ │ ├── ray_marcher.py │ │ ├── ray_sampler.py │ │ └── renderer.py ├── img2plane │ ├── deeplabv3 │ │ ├── __init__.py │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── heads.py │ │ │ ├── initialization.py │ │ │ ├── model.py │ │ │ └── modules.py │ │ ├── decoders │ │ │ ├── decoder.py │ │ │ ├── model.py │ │ │ ├── my_decoder.py │ │ │ └── my_model.py │ │ └── encoders │ │ │ ├── __init__.py │ │ │ ├── _base.py │ │ │ ├── _utils.py │ │ │ └── resnet.py │ ├── img2plane_model.py │ ├── segformer │ │ ├── __init__.py │ │ ├── base.py │ │ └── models.py │ ├── simple_encoders │ │ └── high_resolution_encoder.py │ ├── triplane.py │ └── unit_test.ipynb ├── real3d │ ├── facev2v_warp │ │ ├── func_utils.py │ │ ├── layers.py │ │ ├── losses.py │ │ ├── model.py │ │ ├── model2.py │ │ ├── network.py │ │ └── network2.py │ ├── img2plane_baseline.py │ ├── secc_img2plane.py │ ├── secc_img2plane_torso.py │ ├── segformer.py │ └── super_resolution │ │ └── sr_with_ref.py └── syncnet │ ├── models.py │ └── syncnet_v2.py ├── tasks ├── os_avatar │ ├── audio2motion_task.py │ ├── audio_lm3d_syncnet.py │ ├── dataset_utils │ │ ├── audio2motion_dataset.py │ │ ├── motion2video_dataset.py │ │ └── syncnet_dataset.py │ ├── img2plane_task.py │ ├── loss_utils │ │ └── vgg19_loss.py │ ├── secc_img2plane_task.py │ └── secc_img2plane_torso_task.py └── run.py └── utils ├── audio ├── __init__.py ├── align.py ├── dct.py ├── griffin_lim.py ├── io.py ├── pitch │ ├── bin │ │ ├── ExtractF0ByStraight │ │ ├── InterpF0 │ │ └── ReaperF0 │ ├── crepe_utils.py │ ├── extractor_utils.py │ ├── utils.py │ └── uv_utils.py ├── pitch_extractors.py └── vad.py ├── commons ├── base_task.py ├── ckpt_utils.py ├── crop_head.py ├── dataset_utils.py ├── ddp_utils.py ├── euler2rot.py ├── face_alignment_utils.py ├── hparams.py ├── image_utils.py ├── indexed_datasets.py ├── mesh_utils.py ├── meters.py ├── multiprocess_utils.py ├── os_utils.py ├── pitch_utils.py ├── tensor_utils.py └── trainer.py ├── nn ├── grad.py ├── model_utils.py ├── schedulers.py └── seq_utils.py ├── useful_cmd_lines └── clean_gpu.py └── visualization ├── auto_plot_image.py ├── draw_3d_landmark.py ├── ffmpeg_utils.py ├── lm_visualizer.py ├── 
plot_attention.py │ ├── plot_spec.py │ ├── t-sne.py │ ├── t-sne_0423.py │ └── vis_cam3d │ ├── camera_parameter_loader.py │ └── camera_pose_visualizer.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 ZhenhuiYe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /assets/mimictalk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/assets/mimictalk.png -------------------------------------------------------------------------------- /assets/real3dportrait.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/assets/real3dportrait.png -------------------------------------------------------------------------------- /checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/checkpoints/.gitkeep -------------------------------------------------------------------------------- /data/raw/examples/80_vs_60_10s.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/data/raw/examples/80_vs_60_10s.wav -------------------------------------------------------------------------------- /data/raw/examples/German_20s.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/data/raw/examples/German_20s.mp4 -------------------------------------------------------------------------------- /data/raw/examples/bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/data/raw/examples/bg.png -------------------------------------------------------------------------------- /data_gen/runs/nerf/process_guide.md: -------------------------------------------------------------------------------- 1 | # Tip: the first time you run this, go through the command lines below step by step to make sure the environment works; after that, you can simply run run.sh in the same directory to complete all the steps below in one go. 2 | 3 | # Step0. 
Crop the video to 512x512 resolution and 25 FPS, and make sure every frame contains the target face 4 | ``` 5 | ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 data/raw/videos/${VIDEO_ID}_512.mp4 6 | mv data/raw/videos/${VIDEO_ID}.mp4 data/raw/videos/${VIDEO_ID}_to_rm.mp4 7 | mv data/raw/videos/${VIDEO_ID}_512.mp4 data/raw/videos/${VIDEO_ID}.mp4 8 | ``` 9 | # Step1. Extract audio features such as mel, f0, hubert, esperanto 10 | ``` 11 | export CUDA_VISIBLE_DEVICES=0 12 | export VIDEO_ID=May 13 | mkdir -p data/processed/videos/${VIDEO_ID} 14 | ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -f wav -ar 16000 data/processed/videos/${VIDEO_ID}/aud.wav 15 | python data_gen/utils/process_audio/extract_hubert.py --video_id=${VIDEO_ID} 16 | python data_gen/utils/process_audio/extract_mel_f0.py --video_id=${VIDEO_ID} 17 | ``` 18 | 19 | # Step2. Extract images 20 | ``` 21 | export VIDEO_ID=May 22 | export CUDA_VISIBLE_DEVICES=0 23 | mkdir -p data/processed/videos/${VIDEO_ID}/gt_imgs 24 | ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 data/processed/videos/${VIDEO_ID}/gt_imgs/%08d.jpg 25 | python data_gen/utils/process_video/extract_segment_imgs.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 # extract image, segmap, and background 26 | ``` 27 | 28 | # Step3. Extract lm2d_mediapipe 29 | ### Extract 2D landmarks for the later 3DMM fitting 30 | ### num_workers is the number of CPU workers on this machine; total_process is the number of machines used; process_id is the index of this machine (a sketch of this sharding convention is given after run.sh below) 31 | 32 | ``` 33 | export VIDEO_ID=May 34 | python data_gen/utils/process_video/extract_lm2d.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 35 | ``` 36 | 37 | # Step4. Fit 3DMM 38 | ``` 39 | export VIDEO_ID=May 40 | export CUDA_VISIBLE_DEVICES=0 41 | python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 --reset --debug --id_mode=global 42 | ``` 43 | 44 | # Step5. 
Binarize 45 | ``` 46 | export VIDEO_ID=May 47 | python data_gen/runs/binarizer_nerf.py --video_id=${VIDEO_ID} 48 | ``` 49 | You can now find the binarized dataset under the `data/binary/videos/May` directory. -------------------------------------------------------------------------------- /data_gen/runs/nerf/run.sh: -------------------------------------------------------------------------------- 1 | # usage: CUDA_VISIBLE_DEVICES=0 bash data_gen/runs/nerf/run.sh ${VIDEO_ID} 2 | # please place the video at data/raw/videos/${VIDEO_ID}.mp4 3 | VIDEO_ID=$1 4 | echo Processing $VIDEO_ID 5 | 6 | echo Resizing the video to 512x512 7 | ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -y data/raw/videos/${VIDEO_ID}_512.mp4 8 | mv data/raw/videos/${VIDEO_ID}.mp4 data/raw/videos/${VIDEO_ID}_to_rm.mp4 9 | mv data/raw/videos/${VIDEO_ID}_512.mp4 data/raw/videos/${VIDEO_ID}.mp4 10 | echo Done 11 | echo The original video has been moved to data/raw/videos/${VIDEO_ID}_to_rm.mp4 and the resized video is now data/raw/videos/${VIDEO_ID}.mp4 12 | 13 | echo mkdir -p data/processed/videos/${VIDEO_ID} 14 | mkdir -p data/processed/videos/${VIDEO_ID} 15 | echo Done 16 | 17 | # extract audio file from the training video 18 | echo ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -f wav -ar 16000 -v quiet -y data/processed/videos/${VIDEO_ID}/aud.wav 19 | ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -f wav -ar 16000 -v quiet -y data/processed/videos/${VIDEO_ID}/aud.wav 20 | echo Done 21 | 22 | # extract hubert_mel_f0 from audio 23 | echo python data_gen/utils/process_audio/extract_hubert.py --video_id=${VIDEO_ID} 24 | python data_gen/utils/process_audio/extract_hubert.py --video_id=${VIDEO_ID} 25 | echo python data_gen/utils/process_audio/extract_mel_f0.py --video_id=${VIDEO_ID} 26 | python data_gen/utils/process_audio/extract_mel_f0.py --video_id=${VIDEO_ID} 27 | echo Done 28 | 29 | # extract segment images 30 | echo mkdir -p data/processed/videos/${VIDEO_ID}/gt_imgs 31 | mkdir -p data/processed/videos/${VIDEO_ID}/gt_imgs 32 | echo ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 -v quiet data/processed/videos/${VIDEO_ID}/gt_imgs/%08d.jpg 33 | ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 -v quiet data/processed/videos/${VIDEO_ID}/gt_imgs/%08d.jpg 34 | echo Done 35 | 36 | echo python data_gen/utils/process_video/extract_segment_imgs.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 # extract image, segmap, and background 37 | python data_gen/utils/process_video/extract_segment_imgs.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 # extract image, segmap, and background 38 | echo Done 39 | 40 | echo python data_gen/utils/process_video/extract_lm2d.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 41 | python data_gen/utils/process_video/extract_lm2d.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 42 | echo Done 43 | 44 | pkill -f void* 45 | echo python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 --reset --debug --id_mode=global 46 | python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 --reset --debug --id_mode=global 47 | echo Done 48 | 49 | echo python data_gen/runs/binarizer_nerf.py --video_id=${VIDEO_ID} 50 | python data_gen/runs/binarizer_nerf.py --video_id=${VIDEO_ID} 51 | echo Done --------------------------------------------------------------------------------
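A note on the `--num_workers` / `--total_process` / `--process_id` flags mentioned in Step3 of the guide above: the `process_*` scripts in this repo (e.g. `resample_audio_to_16k.py` and `split_video_to_imgs.py` below) shard the sorted file list into `total_process` contiguous chunks, with each machine handling the chunk selected by its `process_id`, while `num_workers` only controls local multiprocessing. Below is a minimal, self-contained sketch of that slicing convention; the function name is illustrative and not part of the repo. Note that the repo scripts test `process_id == total_process` for the last shard, which the preceding assert makes unreachable, so the sketch uses `total_process - 1`, which appears to be the intended behavior.

```python
def shard_file_list(names, process_id, total_process):
    """Illustrative sketch of the sharding used by the data_gen/utils/process_* scripts:
    split the sorted list into `total_process` contiguous chunks and return the chunk
    assigned to this `process_id`; the last shard also keeps the remainder."""
    assert 0 <= process_id < total_process
    if total_process == 1:
        return names
    num_per_process = len(names) // total_process
    if process_id == total_process - 1:  # last machine also takes the leftover files
        return names[process_id * num_per_process:]
    return names[process_id * num_per_process: (process_id + 1) * num_per_process]


if __name__ == '__main__':
    fake_names = [f"{i:03d}.mp4" for i in range(10)]
    print(shard_file_list(fake_names, 0, 3))  # first 3 files
    print(shard_file_list(fake_names, 2, 3))  # last 4 files (remainder included)
```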
/data_gen/utils/mp_feature_extractors/face_landmarker.task: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/data_gen/utils/mp_feature_extractors/face_landmarker.task -------------------------------------------------------------------------------- /data_gen/utils/mp_feature_extractors/selfie_multiclass_256x256.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/data_gen/utils/mp_feature_extractors/selfie_multiclass_256x256.tflite -------------------------------------------------------------------------------- /data_gen/utils/path_converter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class PathConverter(): 5 | def __init__(self): 6 | self.prefixs = { 7 | "vid": "/video/", 8 | "gt": "/gt_imgs/", 9 | "head": "/head_imgs/", 10 | "torso": "/torso_imgs/", 11 | "person": "/person_imgs/", 12 | "torso_with_bg": "/torso_with_bg_imgs/", 13 | "single_bg": "/bg_img/", 14 | "bg": "/bg_imgs/", 15 | "segmaps": "/segmaps/", 16 | "inpaint_torso": "/inpaint_torso_imgs/", 17 | "com": "/com_imgs/", 18 | "inpaint_torso_with_com_bg": "/inpaint_torso_with_com_bg_imgs/", 19 | } 20 | 21 | def to(self, path: str, old_pattern: str, new_pattern: str): 22 | return path.replace(self.prefixs[old_pattern], self.prefixs[new_pattern], 1) 23 | 24 | pc = PathConverter() -------------------------------------------------------------------------------- /data_gen/utils/process_audio/resample_audio_to_16k.py: -------------------------------------------------------------------------------- 1 | import os, glob 2 | from utils.commons.os_utils import multiprocess_glob 3 | from utils.commons.multiprocess_utils import multiprocess_run_tqdm 4 | 5 | 6 | def extract_wav16k_job(audio_name:str): 7 | out_path = audio_name.replace("/audio_raw/","/audio/",1) 8 | assert out_path != audio_name # prevent inplace 9 | os.makedirs(os.path.dirname(out_path), exist_ok=True) 10 | ffmpeg_path = "/usr/bin/ffmpeg" 11 | 12 | cmd = f'{ffmpeg_path} -i {audio_name} -ar 16000 -v quiet -y {out_path}' 13 | os.system(cmd) 14 | 15 | if __name__ == '__main__': 16 | import argparse, glob, tqdm, random 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--aud_dir", default='/home/tiger/datasets/raw/CMLR/audio_raw/') 19 | parser.add_argument("--ds_name", default='CMLR') 20 | parser.add_argument("--num_workers", default=64, type=int) 21 | parser.add_argument("--process_id", default=0, type=int) 22 | parser.add_argument("--total_process", default=1, type=int) 23 | args = parser.parse_args() 24 | print(f"args {args}") 25 | 26 | aud_dir = args.aud_dir 27 | ds_name = args.ds_name 28 | if ds_name in ['CMLR']: 29 | aud_name_pattern = os.path.join(aud_dir, "*/*/*.wav") 30 | aud_names = multiprocess_glob(aud_name_pattern) 31 | else: 32 | raise NotImplementedError() 33 | aud_names = sorted(aud_names) 34 | print(f"total audio number : {len(aud_names)}") 35 | print(f"first {aud_names[0]} last {aud_names[-1]}") 36 | # exit() 37 | process_id = args.process_id 38 | total_process = args.total_process 39 | if total_process > 1: 40 | assert process_id <= total_process -1 41 | num_samples_per_process = len(aud_names) // total_process 42 | if process_id == total_process: 43 | aud_names = aud_names[process_id * num_samples_per_process : ] 44 | else: 45 | 
aud_names = aud_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process] 46 | 47 | for i, res in multiprocess_run_tqdm(extract_wav16k_job, aud_names, num_workers=args.num_workers, desc="resampling videos"): 48 | pass 49 | 50 | -------------------------------------------------------------------------------- /data_gen/utils/process_video/euler2quaterion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import math 4 | import numba 5 | from scipy.spatial.transform import Rotation as R 6 | 7 | def euler2quaterion(euler, use_radian=True): 8 | """ 9 | euler: np.array, [batch, 3] 10 | return: the quaterion, np.array, [batch, 4] 11 | """ 12 | r = R.from_euler('xyz',euler, degrees=not use_radian) 13 | return r.as_quat() 14 | 15 | def quaterion2euler(quat, use_radian=True): 16 | """ 17 | quat: np.array, [batch, 4] 18 | return: the euler, np.array, [batch, 3] 19 | """ 20 | r = R.from_quat(quat) 21 | return r.as_euler('xyz', degrees=not use_radian) 22 | 23 | def rot2quaterion(rot): 24 | r = R.from_matrix(rot) 25 | return r.as_quat() 26 | 27 | def quaterion2rot(quat): 28 | r = R.from_quat(quat) 29 | return r.as_matrix() 30 | 31 | if __name__ == '__main__': 32 | euler = np.array([89.999,89.999,89.999] * 100).reshape([100,3]) 33 | q = euler2quaterion(euler, use_radian=False) 34 | e = quaterion2euler(q, use_radian=False) 35 | print(" ") 36 | -------------------------------------------------------------------------------- /data_gen/utils/process_video/extract_blink.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from data_util.face3d_helper import Face3DHelper 3 | from utils.commons.tensor_utils import convert_to_tensor 4 | 5 | def polygon_area(x, y): 6 | """ 7 | x: [T, K=6] 8 | y: [T, K=6] 9 | return: [T,] 10 | """ 11 | x_ = x - x.mean(axis=-1, keepdims=True) 12 | y_ = y - y.mean(axis=-1, keepdims=True) 13 | correction = x_[:,-1] * y_[:,0] - y_[:,-1]* x_[:,0] 14 | main_area = (x_[:,:-1] * y_[:,1:]).sum(axis=-1) - (y_[:,:-1] * x_[:,1:]).sum(axis=-1) 15 | return 0.5 * np.abs(main_area + correction) 16 | 17 | def get_eye_area_percent(id, exp, face3d_helper): 18 | id = convert_to_tensor(id) 19 | exp = convert_to_tensor(exp) 20 | cano_lm3d = face3d_helper.reconstruct_cano_lm3d(id, exp) 21 | cano_lm2d = (cano_lm3d[..., :2] + 1) / 2 22 | lms = cano_lm2d.cpu().numpy() 23 | eyes_left = slice(36, 42) 24 | eyes_right = slice(42, 48) 25 | area_left = polygon_area(lms[:, eyes_left, 0], lms[:, eyes_left, 1]) 26 | area_right = polygon_area(lms[:, eyes_right, 0], lms[:, eyes_right, 1]) 27 | # area percentage of two eyes of the whole image... 
28 | area_percent = (area_left + area_right) / 1 * 100 # recommend threshold is 0.25% 29 | return area_percent # [T,] 30 | 31 | 32 | if __name__ == '__main__': 33 | import numpy as np 34 | import imageio 35 | import cv2 36 | import torch 37 | from data_gen.utils.process_video.extract_lm2d import extract_lms_mediapipe_job, read_video_to_frames, index_lm68_from_lm468 38 | from data_gen.utils.process_video.fit_3dmm_landmark import fit_3dmm_for_a_video 39 | from data_util.face3d_helper import Face3DHelper 40 | 41 | face3d_helper = Face3DHelper() 42 | video_name = 'data/raw/videos/May_10s.mp4' 43 | frames = read_video_to_frames(video_name) 44 | coeff = fit_3dmm_for_a_video(video_name, save=False) 45 | area_percent = get_eye_area_percent(torch.tensor(coeff['id']), torch.tensor(coeff['exp']), face3d_helper) 46 | writer = imageio.get_writer("1.mp4", fps=25) 47 | for idx, frame in enumerate(frames): 48 | frame = cv2.putText(frame, f"{area_percent[idx]:.2f}", org=(128,128), fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=1, color=(255,0,0), thickness=1) 49 | writer.append_data(frame) 50 | writer.close() -------------------------------------------------------------------------------- /data_gen/utils/process_video/split_video_to_imgs.py: -------------------------------------------------------------------------------- 1 | import os, glob 2 | from utils.commons.multiprocess_utils import multiprocess_run_tqdm 3 | 4 | from data_gen.utils.path_converter import PathConverter, pc 5 | 6 | # mp4_names = glob.glob("/home/tiger/datasets/raw/CelebV-HQ/video/*.mp4") 7 | 8 | def extract_img_job(video_name, raw_img_dir=None): 9 | if raw_img_dir is not None: 10 | out_path = raw_img_dir 11 | else: 12 | out_path = pc.to(video_name.replace(".mp4", ""), "vid", "gt") 13 | os.makedirs(out_path, exist_ok=True) 14 | ffmpeg_path = "/usr/bin/ffmpeg" 15 | cmd = f'{ffmpeg_path} -i {video_name} -vf fps={25},scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 -v quiet {os.path.join(out_path, "%8d.jpg")}' 16 | os.system(cmd) 17 | 18 | if __name__ == '__main__': 19 | import argparse, glob, tqdm, random 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--vid_dir", default='/home/tiger/datasets/raw/CelebV-HQ/video') 22 | parser.add_argument("--ds_name", default='CelebV-HQ') 23 | parser.add_argument("--num_workers", default=64, type=int) 24 | parser.add_argument("--process_id", default=0, type=int) 25 | parser.add_argument("--total_process", default=1, type=int) 26 | args = parser.parse_args() 27 | vid_dir = args.vid_dir 28 | ds_name = args.ds_name 29 | if ds_name in ['lrs3_trainval']: 30 | mp4_name_pattern = os.path.join(vid_dir, "*/*.mp4") 31 | elif ds_name in ['TH1KH_512', 'CelebV-HQ']: 32 | vid_names = glob.glob(os.path.join(vid_dir, "*.mp4")) 33 | elif ds_name in ['lrs2', 'lrs3', 'voxceleb2']: 34 | vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4") 35 | vid_names = glob.glob(vid_name_pattern) 36 | elif ds_name in ["RAVDESS", 'VFHQ']: 37 | vid_name_pattern = os.path.join(vid_dir, "*/*/*/*.mp4") 38 | vid_names = glob.glob(vid_name_pattern) 39 | vid_names = sorted(vid_names) 40 | 41 | process_id = args.process_id 42 | total_process = args.total_process 43 | if total_process > 1: 44 | assert process_id <= total_process -1 45 | num_samples_per_process = len(vid_names) // total_process 46 | if process_id == total_process: 47 | vid_names = vid_names[process_id * num_samples_per_process : ] 48 | else: 49 | vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process] 50 | 51 | for i, 
res in multiprocess_run_tqdm(extract_img_job, vid_names, num_workers=args.num_workers, desc="extracting images"): 52 | pass 53 | 54 | -------------------------------------------------------------------------------- /deep_3drecon/BFM/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/.gitkeep -------------------------------------------------------------------------------- /deep_3drecon/BFM/index_mp468_from_mesh35709_v1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/index_mp468_from_mesh35709_v1.npy -------------------------------------------------------------------------------- /deep_3drecon/BFM/index_mp468_from_mesh35709_v2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/index_mp468_from_mesh35709_v2.npy -------------------------------------------------------------------------------- /deep_3drecon/BFM/index_mp468_from_mesh35709_v3.1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/index_mp468_from_mesh35709_v3.1.npy -------------------------------------------------------------------------------- /deep_3drecon/BFM/index_mp468_from_mesh35709_v3.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/index_mp468_from_mesh35709_v3.npy -------------------------------------------------------------------------------- /deep_3drecon/BFM/select_vertex_id.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/select_vertex_id.mat -------------------------------------------------------------------------------- /deep_3drecon/BFM/similarity_Lm3D_all.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/BFM/similarity_Lm3D_all.mat -------------------------------------------------------------------------------- /deep_3drecon/__init__.py: -------------------------------------------------------------------------------- 1 | from .reconstructor import * 2 | -------------------------------------------------------------------------------- /deep_3drecon/bfm_left_eye_faces.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/bfm_left_eye_faces.npy -------------------------------------------------------------------------------- /deep_3drecon/bfm_right_eye_faces.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/bfm_right_eye_faces.npy -------------------------------------------------------------------------------- /deep_3drecon/data_preparation.py: 
-------------------------------------------------------------------------------- 1 | """This script is the data preparation script for Deep3DFaceRecon_pytorch 2 | """ 3 | 4 | import os 5 | import numpy as np 6 | import argparse 7 | from util.detect_lm68 import detect_68p,load_lm_graph 8 | from util.skin_mask import get_skin_mask 9 | from util.generate_list import check_list, write_list 10 | import warnings 11 | warnings.filterwarnings("ignore") 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--data_root', type=str, default='datasets', help='root directory for training data') 15 | parser.add_argument('--img_folder', nargs="+", required=True, help='folders of training images') 16 | parser.add_argument('--mode', type=str, default='train', help='train or val') 17 | opt = parser.parse_args() 18 | 19 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 20 | 21 | def data_prepare(folder_list,mode): 22 | 23 | lm_sess,input_op,output_op = load_lm_graph('./checkpoints/lm_model/68lm_detector.pb') # load a tensorflow version 68-landmark detector 24 | 25 | for img_folder in folder_list: 26 | detect_68p(img_folder,lm_sess,input_op,output_op) # detect landmarks for images 27 | get_skin_mask(img_folder) # generate skin attention mask for images 28 | 29 | # create files that record path to all training data 30 | msks_list = [] 31 | for img_folder in folder_list: 32 | path = os.path.join(img_folder, 'mask') 33 | msks_list += ['/'.join([img_folder, 'mask', i]) for i in sorted(os.listdir(path)) if 'jpg' in i or 34 | 'png' in i or 'jpeg' in i or 'PNG' in i] 35 | 36 | imgs_list = [i.replace('mask/', '') for i in msks_list] 37 | lms_list = [i.replace('mask', 'landmarks') for i in msks_list] 38 | lms_list = ['.'.join(i.split('.')[:-1]) + '.txt' for i in lms_list] 39 | 40 | lms_list_final, imgs_list_final, msks_list_final = check_list(lms_list, imgs_list, msks_list) # check if the path is valid 41 | write_list(lms_list_final, imgs_list_final, msks_list_final, mode=mode) # save files 42 | 43 | if __name__ == '__main__': 44 | print('Datasets:',opt.img_folder) 45 | data_prepare([os.path.join(opt.data_root,folder) for folder in opt.img_folder],opt.mode) 46 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/3millions.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # configs for test speed 4 | 5 | config = edict() 6 | config.margin_list = (1.0, 0.0, 0.4) 7 | config.network = "mbf" 8 | config.resume = False 9 | config.output = None 10 | config.embedding_size = 512 11 | config.sample_rate = 0.1 12 | config.fp16 = True 13 | config.momentum = 0.9 14 | config.weight_decay = 5e-4 15 | config.batch_size = 512 # total_batch_size = batch_size * num_gpus 16 | config.lr = 0.1 # batch size is 512 17 | 18 | config.rec = "synthetic" 19 | config.num_classes = 30 * 10000 20 | config.num_image = 100000 21 | config.num_epoch = 30 22 | config.warmup_epoch = -1 23 | config.val_targets = [] 24 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/deep_3drecon_models/arcface_torch/configs/__init__.py 
-------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/base.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | 9 | # Margin Base Softmax 10 | config.margin_list = (1.0, 0.5, 0.0) 11 | config.network = "r50" 12 | config.resume = False 13 | config.save_all_states = False 14 | config.output = "ms1mv3_arcface_r50" 15 | 16 | config.embedding_size = 512 17 | 18 | # Partial FC 19 | config.sample_rate = 1 20 | config.interclass_filtering_threshold = 0 21 | 22 | config.fp16 = False 23 | config.batch_size = 128 24 | 25 | # For SGD 26 | config.optimizer = "sgd" 27 | config.lr = 0.1 28 | config.momentum = 0.9 29 | config.weight_decay = 5e-4 30 | 31 | # For AdamW 32 | # config.optimizer = "adamw" 33 | # config.lr = 0.001 34 | # config.weight_decay = 0.1 35 | 36 | config.verbose = 2000 37 | config.frequent = 10 38 | 39 | # For Large Sacle Dataset, such as WebFace42M 40 | config.dali = False 41 | 42 | # Gradient ACC 43 | config.gradient_acc = 1 44 | 45 | # setup seed 46 | config.seed = 2048 47 | 48 | # dataload numworkers 49 | config.num_workers = 2 50 | 51 | # WandB Logger 52 | config.wandb_key = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" 53 | config.suffix_run_name = None 54 | config.using_wandb = False 55 | config.wandb_entity = "entity" 56 | config.wandb_project = "project" 57 | config.wandb_log_all = True 58 | config.save_artifacts = False 59 | config.wandb_resume = False # resume wandb run: Only if the you wand t resume the last run that it was interrupted -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/glint360k_mbf.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "mbf" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/glint360k" 23 | config.num_classes = 360232 24 | config.num_image = 17091657 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/glint360k_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 
22 | config.rec = "/train_tmp/glint360k" 23 | config.num_classes = 360232 24 | config.num_image = 17091657 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/glint360k_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/glint360k" 23 | config.num_classes = 360232 24 | config.num_image = 17091657 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv2_mbf.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "mbf" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/faces_emore" 23 | config.num_classes = 85742 24 | config.num_image = 5822653 25 | config.num_epoch = 40 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv2_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/faces_emore" 23 | config.num_classes = 85742 24 | config.num_image = 5822653 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv2_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our 
RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/faces_emore" 23 | config.num_classes = 85742 24 | config.num_image = 5822653 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv3_mbf.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "mbf" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/ms1m-retinaface-t1" 23 | config.num_classes = 93431 24 | config.num_image = 5179510 25 | config.num_epoch = 40 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv3_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/ms1m-retinaface-t1" 23 | config.num_classes = 93431 24 | config.num_image = 5179510 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv3_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/ms1m-retinaface-t1" 23 | 
config.num_classes = 93431 24 | config.num_image = 5179510 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/ms1mv3_r50_onegpu.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.5, 0.0) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.02 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/ms1m-retinaface-t1" 23 | config.num_classes = 93431 24 | config.num_image = 5179510 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_conflict_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.interclass_filtering_threshold = 0 15 | config.fp16 = True 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.optimizer = "sgd" 19 | config.lr = 0.1 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace12M_Conflict" 24 | config.num_classes = 1017970 25 | config.num_image = 12720066 26 | config.num_epoch = 20 27 | config.warmup_epoch = config.num_epoch // 10 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_conflict_r50_pfc03_filter04.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.interclass_filtering_threshold = 0.4 15 | config.fp16 = True 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.optimizer = "sgd" 19 | config.lr = 0.1 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace12M_Conflict" 24 | config.num_classes = 1017970 25 | config.num_image = 12720066 26 | config.num_epoch = 20 27 | config.warmup_epoch = config.num_epoch // 10 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_flip_pfc01_filter04_r50.py: 
-------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.1 14 | config.interclass_filtering_threshold = 0.4 15 | config.fp16 = True 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.optimizer = "sgd" 19 | config.lr = 0.1 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace12M_FLIP40" 24 | config.num_classes = 617970 25 | config.num_image = 12720066 26 | config.num_epoch = 20 27 | config.warmup_epoch = config.num_epoch // 10 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_flip_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.interclass_filtering_threshold = 0 15 | config.fp16 = True 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.optimizer = "sgd" 19 | config.lr = 0.1 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace12M_FLIP40" 24 | config.num_classes = 617970 25 | config.num_image = 12720066 26 | config.num_epoch = 20 27 | config.warmup_epoch = config.num_epoch // 10 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_mbf.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "mbf" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.interclass_filtering_threshold = 0 15 | config.fp16 = True 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.optimizer = "sgd" 19 | config.lr = 0.1 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace12M" 24 | config.num_classes = 617970 25 | config.num_image = 12720066 26 | config.num_epoch = 20 27 | config.warmup_epoch = 0 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_pfc02_r100.py: -------------------------------------------------------------------------------- 1 | 2 | from easydict import EasyDict as edict 3 | 4 | # make training faster 5 | # our RAM is 256G 6 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 7 | 8 | config = edict() 9 | config.margin_list = (1.0, 0.0, 0.4) 10 | config.network = "r100" 11 | config.resume = False 12 | config.output = None 13 | config.embedding_size 
= 512 14 | config.sample_rate = 0.2 15 | config.interclass_filtering_threshold = 0 16 | config.fp16 = True 17 | config.weight_decay = 5e-4 18 | config.batch_size = 128 19 | config.optimizer = "sgd" 20 | config.lr = 0.1 21 | config.verbose = 2000 22 | config.dali = False 23 | 24 | config.rec = "/train_tmp/WebFace12M" 25 | config.num_classes = 617970 26 | config.num_image = 12720066 27 | config.num_epoch = 20 28 | config.warmup_epoch = 0 29 | config.val_targets = [] 30 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_r100.py: -------------------------------------------------------------------------------- 1 | 2 | from easydict import EasyDict as edict 3 | 4 | # make training faster 5 | # our RAM is 256G 6 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 7 | 8 | config = edict() 9 | config.margin_list = (1.0, 0.0, 0.4) 10 | config.network = "r100" 11 | config.resume = False 12 | config.output = None 13 | config.embedding_size = 512 14 | config.sample_rate = 1.0 15 | config.interclass_filtering_threshold = 0 16 | config.fp16 = True 17 | config.weight_decay = 5e-4 18 | config.batch_size = 128 19 | config.optimizer = "sgd" 20 | config.lr = 0.1 21 | config.verbose = 2000 22 | config.dali = False 23 | 24 | config.rec = "/train_tmp/WebFace12M" 25 | config.num_classes = 617970 26 | config.num_image = 12720066 27 | config.num_epoch = 20 28 | config.warmup_epoch = 0 29 | config.val_targets = [] 30 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf12m_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.interclass_filtering_threshold = 0 15 | config.fp16 = True 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.optimizer = "sgd" 19 | config.lr = 0.1 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace12M" 24 | config.num_classes = 617970 25 | config.num_image = 12720066 26 | config.num_epoch = 20 27 | config.warmup_epoch = 0 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc0008_32gpu_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 512 18 | config.lr = 0.4 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = ["lfw", "cfp_fp", 
"agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_16gpus_mbf_bs8k.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "mbf" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 512 18 | config.lr = 0.4 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = 2 27 | config.val_targets = [] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_16gpus_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 256 18 | config.lr = 0.3 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = 1 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_16gpus_r50_bs8k.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 512 18 | config.lr = 0.6 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = 4 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_32gpus_r50_bs4k.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | 
config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.4 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = 2 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_8gpus_r50_bs4k.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 512 18 | config.lr = 0.4 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = 2 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_r100_16gpus.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.2 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | 
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc02_r100_32gpus.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.2 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.4 19 | config.verbose = 10000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_32gpu_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.4 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_32gpu_r18.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r18" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.4 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_32gpu_r200.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs 
/train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r200" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.4 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_32gpu_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.4 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 20 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = ["lfw", "cfp_fp", "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_b.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "vit_b_dp005_mask_005" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.weight_decay = 0.1 16 | config.batch_size = 384 17 | config.optimizer = "adamw" 18 | config.lr = 0.001 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 40 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = [] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_l.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "vit_l_dp005_mask_005" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.weight_decay = 0.1 16 | config.batch_size = 384 17 | config.optimizer = "adamw" 18 | config.lr = 0.001 19 | config.verbose = 2000 20 | config.dali = 
False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 40 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = [] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_s.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "vit_s_dp005_mask_0" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.weight_decay = 0.1 16 | config.batch_size = 384 17 | config.optimizer = "adamw" 18 | config.lr = 0.001 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 40 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = [] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_t.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "vit_t_dp005_mask0" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.weight_decay = 0.1 16 | config.batch_size = 384 17 | config.optimizer = "adamw" 18 | config.lr = 0.001 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 40 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = [] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_40epoch_8gpu_vit_b.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "vit_b_dp005_mask_005" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.weight_decay = 0.1 16 | config.batch_size = 256 17 | config.gradient_acc = 12 # total batchsize is 256 * 12 18 | config.optimizer = "adamw" 19 | config.lr = 0.001 20 | config.verbose = 2000 21 | config.dali = False 22 | 23 | config.rec = "/train_tmp/WebFace42M" 24 | config.num_classes = 2059906 25 | config.num_image = 42474557 26 | config.num_epoch = 40 27 | config.warmup_epoch = config.num_epoch // 10 28 | config.val_targets = [] 29 | -------------------------------------------------------------------------------- 
/deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf42m_pfc03_40epoch_8gpu_vit_t.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "vit_t_dp005_mask0" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 0.3 14 | config.fp16 = True 15 | config.weight_decay = 0.1 16 | config.batch_size = 512 17 | config.optimizer = "adamw" 18 | config.lr = 0.001 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace42M" 23 | config.num_classes = 2059906 24 | config.num_image = 42474557 25 | config.num_epoch = 40 26 | config.warmup_epoch = config.num_epoch // 10 27 | config.val_targets = [] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf4m_mbf.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "mbf" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 1e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace4M" 23 | config.num_classes = 205990 24 | config.num_image = 4235242 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf4m_r100.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r100" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace4M" 23 | config.num_classes = 205990 24 | config.num_image = 4235242 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/configs/wf4m_r50.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | # make training faster 4 | # our RAM is 256G 5 | # mount -t tmpfs -o size=140G tmpfs /train_tmp 6 | 7 | config = edict() 8 | config.margin_list = (1.0, 0.0, 0.4) 9 | config.network = "r50" 10 | config.resume = False 11 | config.output = None 12 | config.embedding_size = 512 13 | config.sample_rate = 1.0 14 | 
config.fp16 = True 15 | config.momentum = 0.9 16 | config.weight_decay = 5e-4 17 | config.batch_size = 128 18 | config.lr = 0.1 19 | config.verbose = 2000 20 | config.dali = False 21 | 22 | config.rec = "/train_tmp/WebFace4M" 23 | config.num_classes = 205990 24 | config.num_image = 4235242 25 | config.num_epoch = 20 26 | config.warmup_epoch = 0 27 | config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/dist.sh: -------------------------------------------------------------------------------- 1 | ip_list=("ip1" "ip2" "ip3" "ip4") 2 | 3 | config=wf42m_pfc03_32gpu_r100 4 | 5 | for((node_rank=0;node_rank<${#ip_list[*]};node_rank++)); 6 | do 7 | ssh ubuntu@${ip_list[node_rank]} "cd `pwd`;PATH=$PATH \ 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ 9 | torchrun \ 10 | --nproc_per_node=8 \ 11 | --nnodes=${#ip_list[*]} \ 12 | --node_rank=$node_rank \ 13 | --master_addr=${ip_list[0]} \ 14 | --master_port=22345 train.py configs/$config" & 15 | done 16 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/docs/eval.md: -------------------------------------------------------------------------------- 1 | ## Eval on ICCV2021-MFR 2 | 3 | coming soon. 4 | 5 | 6 | ## Eval IJBC 7 | You can eval ijbc with pytorch or onnx. 8 | 9 | 10 | 1. Eval IJBC With Onnx 11 | ```shell 12 | CUDA_VISIBLE_DEVICES=0 python onnx_ijbc.py --model-root ms1mv3_arcface_r50 --image-path IJB_release/IJBC --result-dir ms1mv3_arcface_r50 13 | ``` 14 | 15 | 2. Eval IJBC With Pytorch 16 | ```shell 17 | CUDA_VISIBLE_DEVICES=0,1 python eval_ijbc.py \ 18 | --model-prefix ms1mv3_arcface_r50/backbone.pth \ 19 | --image-path IJB_release/IJBC \ 20 | --result-dir ms1mv3_arcface_r50 \ 21 | --batch-size 128 \ 22 | --job ms1mv3_arcface_r50 \ 23 | --target IJBC \ 24 | --network iresnet50 25 | ``` 26 | 27 | 28 | ## Inference 29 | 30 | ```shell 31 | python inference.py --weight ms1mv3_arcface_r50/backbone.pth --network r50 32 | ``` 33 | 34 | 35 | ## Result 36 | 37 | | Datasets | Backbone | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | 38 | |:---------------|:--------------------|:------------|:------------|:------------| 39 | | WF12M-PFC-0.05 | r100 | 94.05 | 97.51 | 95.75 | 40 | | WF12M-PFC-0.1 | r100 | 94.49 | 97.56 | 95.92 | 41 | | WF12M-PFC-0.2 | r100 | 94.75 | 97.60 | 95.90 | 42 | | WF12M-PFC-0.3 | r100 | 94.71 | 97.64 | 96.01 | 43 | | WF12M | r100 | 94.69 | 97.59 | 95.97 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/docs/install.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ### [Torch v1.11.0](https://pytorch.org/get-started/previous-versions/#v1110) 4 | #### Linux and Windows 5 | - CUDA 11.3 6 | ```shell 7 | 8 | pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 9 | ``` 10 | 11 | - CUDA 10.2 12 | ```shell 13 | pip install torch==1.11.0+cu102 torchvision==0.12.0+cu102 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu102 14 | ``` 15 | 16 | ### [Torch v1.9.0](https://pytorch.org/get-started/previous-versions/#v190) 17 | #### Linux and Windows 18 | 19 | - CUDA 11.1 20 | ```shell 21 | pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f 
https://download.pytorch.org/whl/torch_stable.html 22 | ``` 23 | 24 | - CUDA 10.2 25 | ```shell 26 | pip install torch==1.9.0+cu102 torchvision==0.10.0+cu102 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html 27 | ``` 28 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/docs/modelzoo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/deep_3drecon_models/arcface_torch/docs/modelzoo.md -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/docs/prepare_custom_dataset.md: -------------------------------------------------------------------------------- 1 | First, your face images must be detected and aligned so they are ready for processing. Then place each individual's face images (those sharing the same id) into a separate folder. 2 | 3 | 4 | ```shell 5 | # directories and files for your dataset 6 | /image_folder 7 | ├── 0_0_0000000 8 | │   ├── 0_0.jpg 9 | │   ├── 0_1.jpg 10 | │   ├── 0_2.jpg 11 | │   ├── 0_3.jpg 12 | │   └── 0_4.jpg 13 | ├── 0_0_0000001 14 | │   ├── 0_5.jpg 15 | │   ├── 0_6.jpg 16 | │   ├── 0_7.jpg 17 | │   ├── 0_8.jpg 18 | │   └── 0_9.jpg 19 | ├── 0_0_0000002 20 | │   ├── 0_10.jpg 21 | │   ├── 0_11.jpg 22 | │   ├── 0_12.jpg 23 | │   ├── 0_13.jpg 24 | │   ├── 0_14.jpg 25 | │   ├── 0_15.jpg 26 | │   ├── 0_16.jpg 27 | │   └── 0_17.jpg 28 | ├── 0_0_0000003 29 | │   ├── 0_18.jpg 30 | │   ├── 0_19.jpg 31 | │   └── 0_20.jpg 32 | ├── 0_0_0000004 33 | 34 | 35 | # 0) Dependencies installation 36 | pip install opencv-python 37 | apt-get update 38 | apt-get install ffmpeg libsm6 libxext6 -y 39 | 40 | 41 | # 1) create train.lst using the following command 42 | python -m mxnet.tools.im2rec --list --recursive train image_folder 43 | 44 | # 2) create train.rec and train.idx from train.lst using the following command 45 | python -m mxnet.tools.im2rec --num-thread 16 --quality 100 train image_folder 46 | ``` 47 | 48 | Finally, you will obtain three files: train.lst, train.rec, and train.idx, where train.idx and train.rec are utilized for training. 49 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/docs/prepare_webface42m.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## 1. Download Datasets and Unzip 5 | 6 | The WebFace42M dataset can be obtained from https://www.face-benchmark.org/download.html. 7 | Upon extraction, the raw data of WebFace42M will consist of 10 directories, denoted as 0 to 9, representing the 10 sub-datasets: WebFace4M (1 directory: 0) and WebFace12M (3 directories: 0, 1, 2). 8 | 9 | ## 2. Create Shuffled Rec File for DALI 10 | 11 | Shuffled .rec files are crucial for DALI: training on an unshuffled .rec file can degrade performance. Original .rec files generated in the InsightFace style are not compatible with NVIDIA DALI, so you must use the [mxnet.tools.im2rec](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py) tool to generate a shuffled .rec file. 
12 | 13 | 14 | ```shell 15 | # directories and files for your dataset 16 | /WebFace42M_Root 17 | ├── 0_0_0000000 18 | │   ├── 0_0.jpg 19 | │   ├── 0_1.jpg 20 | │   ├── 0_2.jpg 21 | │   ├── 0_3.jpg 22 | │   └── 0_4.jpg 23 | ├── 0_0_0000001 24 | │   ├── 0_5.jpg 25 | │   ├── 0_6.jpg 26 | │   ├── 0_7.jpg 27 | │   ├── 0_8.jpg 28 | │   └── 0_9.jpg 29 | ├── 0_0_0000002 30 | │   ├── 0_10.jpg 31 | │   ├── 0_11.jpg 32 | │   ├── 0_12.jpg 33 | │   ├── 0_13.jpg 34 | │   ├── 0_14.jpg 35 | │   ├── 0_15.jpg 36 | │   ├── 0_16.jpg 37 | │   └── 0_17.jpg 38 | ├── 0_0_0000003 39 | │   ├── 0_18.jpg 40 | │   ├── 0_19.jpg 41 | │   └── 0_20.jpg 42 | ├── 0_0_0000004 43 | 44 | 45 | # 0) Dependencies installation 46 | pip install opencv-python 47 | apt-get update 48 | apt-get install ffmpeg libsm6 libxext6 -y 49 | 50 | 51 | # 1) create train.lst using the following command 52 | python -m mxnet.tools.im2rec --list --recursive train WebFace42M_Root 53 | 54 | # 2) create train.rec and train.idx from train.lst using the following command 55 | python -m mxnet.tools.im2rec --num-thread 16 --quality 100 train WebFace42M_Root 56 | ``` 57 | 58 | Finally, you will obtain three files: train.lst, train.rec, and train.idx, where train.idx and train.rec are utilized for training. 59 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/deep_3drecon_models/arcface_torch/eval/__init__.py -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/flops.py: -------------------------------------------------------------------------------- 1 | from ptflops import get_model_complexity_info 2 | from backbones import get_model 3 | import argparse 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser(description='') 7 | parser.add_argument('n', type=str, default="r100") 8 | args = parser.parse_args() 9 | net = get_model(args.n) 10 | macs, params = get_model_complexity_info( 11 | net, (3, 112, 112), as_strings=False, 12 | print_per_layer_stat=True, verbose=True) 13 | gmacs = macs / (1000**3) 14 | print("%.3f GFLOPs"%gmacs) 15 | print("%.3f Mparams"%(params/(1000**2))) 16 | 17 | if hasattr(net, "extra_gflops"): 18 | print("%.3f Extra-GFLOPs"%net.extra_gflops) 19 | print("%.3f Total-GFLOPs"%(gmacs+net.extra_gflops)) 20 | 21 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/inference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import cv2 4 | import numpy as np 5 | import torch 6 | 7 | from backbones import get_model 8 | 9 | 10 | @torch.no_grad() 11 | def inference(weight, name, img): 12 | if img is None: 13 | img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.uint8) 14 | else: 15 | img = cv2.imread(img) 16 | img = cv2.resize(img, (112, 112)) 17 | 18 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 19 | img = np.transpose(img, (2, 0, 1)) 20 | img = torch.from_numpy(img).unsqueeze(0).float() 21 | img.div_(255).sub_(0.5).div_(0.5) 22 | net = get_model(name, fp16=False) 23 | net.load_state_dict(torch.load(weight)) 24 | net.eval() 25 | feat = net(img).numpy() 26 | print(feat) 27 | 28 | 29 | if __name__ == "__main__": 30 | parser = 
argparse.ArgumentParser(description='PyTorch ArcFace Training') 31 | parser.add_argument('--network', type=str, default='r50', help='backbone network') 32 | parser.add_argument('--weight', type=str, default='') 33 | parser.add_argument('--img', type=str, default=None) 34 | args = parser.parse_args() 35 | inference(args.weight, args.network, args.img) 36 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import _LRScheduler 2 | 3 | 4 | class PolyScheduler(_LRScheduler): 5 | def __init__(self, optimizer, base_lr, max_steps, warmup_steps, last_epoch=-1): 6 | self.base_lr = base_lr 7 | self.warmup_lr_init = 0.0001 8 | self.max_steps: int = max_steps 9 | self.warmup_steps: int = warmup_steps 10 | self.power = 2 11 | super(PolyScheduler, self).__init__(optimizer, -1, False) 12 | self.last_epoch = last_epoch 13 | 14 | def get_warmup_lr(self): 15 | alpha = float(self.last_epoch) / float(self.warmup_steps) 16 | return [self.base_lr * alpha for _ in self.optimizer.param_groups] 17 | 18 | def get_lr(self): 19 | if self.last_epoch == -1: 20 | return [self.warmup_lr_init for _ in self.optimizer.param_groups] 21 | if self.last_epoch < self.warmup_steps: 22 | return self.get_warmup_lr() 23 | else: 24 | alpha = pow( 25 | 1 26 | - float(self.last_epoch - self.warmup_steps) 27 | / float(self.max_steps - self.warmup_steps), 28 | self.power, 29 | ) 30 | return [self.base_lr * alpha for _ in self.optimizer.param_groups] 31 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/requirement.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | easydict 3 | mxnet 4 | onnx 5 | sklearn 6 | opencv-python -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/run.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 train_v2.py $@ 2 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/scripts/shuffle_rec.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing 3 | import os 4 | import time 5 | 6 | import mxnet as mx 7 | import numpy as np 8 | 9 | 10 | def read_worker(args, q_in): 11 | path_imgidx = os.path.join(args.input, "train.idx") 12 | path_imgrec = os.path.join(args.input, "train.rec") 13 | imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, "r") 14 | 15 | s = imgrec.read_idx(0) 16 | header, _ = mx.recordio.unpack(s) 17 | assert header.flag > 0 18 | 19 | imgidx = np.array(range(1, int(header.label[0]))) 20 | np.random.shuffle(imgidx) 21 | 22 | for idx in imgidx: 23 | item = imgrec.read_idx(idx) 24 | q_in.put(item) 25 | 26 | q_in.put(None) 27 | imgrec.close() 28 | 29 | 30 | def write_worker(args, q_out): 31 | pre_time = time.time() 32 | 33 | if args.input[-1] == '/': 34 | args.input = args.input[:-1] 35 | dirname = os.path.dirname(args.input) 36 | basename = os.path.basename(args.input) 37 | output = os.path.join(dirname, f"shuffled_{basename}") 38 | os.makedirs(output, exist_ok=True) 39 | 40 | path_imgidx = os.path.join(output, "train.idx") 41 
| path_imgrec = os.path.join(output, "train.rec") 42 | save_record = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, "w") 43 | more = True 44 | count = 0 45 | while more: 46 | deq = q_out.get() 47 | if deq is None: 48 | more = False 49 | else: 50 | header, jpeg = mx.recordio.unpack(deq) 51 | # TODO it is currently not fully developed 52 | if isinstance(header.label, float): 53 | label = header.label 54 | else: 55 | label = header.label[0] 56 | 57 | header = mx.recordio.IRHeader(flag=header.flag, label=label, id=header.id, id2=header.id2) 58 | save_record.write_idx(count, mx.recordio.pack(header, jpeg)) 59 | count += 1 60 | if count % 10000 == 0: 61 | cur_time = time.time() 62 | print('save time:', cur_time - pre_time, ' count:', count) 63 | pre_time = cur_time 64 | print(count) 65 | save_record.close() 66 | 67 | 68 | def main(args): 69 | queue = multiprocessing.Queue(10240) 70 | read_process = multiprocessing.Process(target=read_worker, args=(args, queue)) 71 | read_process.daemon = True 72 | read_process.start() 73 | write_process = multiprocessing.Process(target=write_worker, args=(args, queue)) 74 | write_process.start() 75 | write_process.join() 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument('input', help='path to source rec.') 81 | main(parser.parse_args()) 82 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/torch2onnx.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import onnx 3 | import torch 4 | 5 | 6 | def convert_onnx(net, path_module, output, opset=11, simplify=False): 7 | assert isinstance(net, torch.nn.Module) 8 | img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.int32) 9 | img = img.astype(np.float) 10 | img = (img / 255. 
- 0.5) / 0.5 # torch style norm 11 | img = img.transpose((2, 0, 1)) 12 | img = torch.from_numpy(img).unsqueeze(0).float() 13 | 14 | weight = torch.load(path_module) 15 | net.load_state_dict(weight, strict=True) 16 | net.eval() 17 | torch.onnx.export(net, img, output, input_names=["data"], keep_initializers_as_inputs=False, verbose=False, opset_version=opset) 18 | model = onnx.load(output) 19 | graph = model.graph 20 | graph.input[0].type.tensor_type.shape.dim[0].dim_param = 'None' 21 | if simplify: 22 | from onnxsim import simplify 23 | model, check = simplify(model) 24 | assert check, "Simplified ONNX model could not be validated" 25 | onnx.save(model, output) 26 | 27 | 28 | if __name__ == '__main__': 29 | import os 30 | import argparse 31 | from backbones import get_model 32 | 33 | parser = argparse.ArgumentParser(description='ArcFace PyTorch to onnx') 34 | parser.add_argument('input', type=str, help='input backbone.pth file or path') 35 | parser.add_argument('--output', type=str, default=None, help='output onnx path') 36 | parser.add_argument('--network', type=str, default=None, help='backbone network') 37 | parser.add_argument('--simplify', type=bool, default=False, help='onnx simplify') 38 | args = parser.parse_args() 39 | input_file = args.input 40 | if os.path.isdir(input_file): 41 | input_file = os.path.join(input_file, "model.pt") 42 | assert os.path.exists(input_file) 43 | # model_name = os.path.basename(os.path.dirname(input_file)).lower() 44 | # params = model_name.split("_") 45 | # if len(params) >= 3 and params[1] in ('arcface', 'cosface'): 46 | # if args.network is None: 47 | # args.network = params[2] 48 | assert args.network is not None 49 | print(args) 50 | backbone_onnx = get_model(args.network, dropout=0.0, fp16=False, num_features=512) 51 | if args.output is None: 52 | args.output = os.path.join(os.path.dirname(args.input), "model.onnx") 53 | convert_onnx(backbone_onnx, input_file, args.output, simplify=args.simplify) 54 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/deep_3drecon_models/arcface_torch/utils/__init__.py -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/utils/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | from menpo.visualize.viewmatplotlib import sample_colours_from_colourmap 8 | from prettytable import PrettyTable 9 | from sklearn.metrics import roc_curve, auc 10 | 11 | with open(sys.argv[1], "r") as f: 12 | files = f.readlines() 13 | 14 | files = [x.strip() for x in files] 15 | image_path = "/train_tmp/IJB_release/IJBC" 16 | 17 | 18 | def read_template_pair_list(path): 19 | pairs = pd.read_csv(path, sep=' ', header=None).values 20 | t1 = pairs[:, 0].astype(np.int_) 21 | t2 = pairs[:, 1].astype(np.int_) 22 | label = pairs[:, 2].astype(np.int_) 23 | return t1, t2, label 24 | 25 | 26 | p1, p2, label = read_template_pair_list( 27 | os.path.join('%s/meta' % image_path, 28 | '%s_template_pair_label.txt' % 'ijbc')) 29 | 30 | methods = [] 31 | scores = [] 32 | for file in files: 33 | methods.append(file) 34 | scores.append(np.load(file)) 
35 | 36 | methods = np.array(methods) 37 | scores = dict(zip(methods, scores)) 38 | colours = dict( 39 | zip(methods, sample_colours_from_colourmap(methods.shape[0], 'Set2'))) 40 | x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] 41 | tpr_fpr_table = PrettyTable(['Methods'] + [str(x) for x in x_labels]) 42 | fig = plt.figure() 43 | for method in methods: 44 | fpr, tpr, _ = roc_curve(label, scores[method]) 45 | roc_auc = auc(fpr, tpr) 46 | fpr = np.flipud(fpr) 47 | tpr = np.flipud(tpr) # select largest tpr at same fpr 48 | plt.plot(fpr, 49 | tpr, 50 | color=colours[method], 51 | lw=1, 52 | label=('[%s (AUC = %0.4f %%)]' % 53 | (method.split('-')[-1], roc_auc * 100))) 54 | tpr_fpr_row = [] 55 | tpr_fpr_row.append(method) 56 | for fpr_iter in np.arange(len(x_labels)): 57 | _, min_index = min( 58 | list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) 59 | tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) 60 | tpr_fpr_table.add_row(tpr_fpr_row) 61 | plt.xlim([10 ** -6, 0.1]) 62 | plt.ylim([0.3, 1.0]) 63 | plt.grid(linestyle='--', linewidth=1) 64 | plt.xticks(x_labels) 65 | plt.yticks(np.linspace(0.3, 1.0, 8, endpoint=True)) 66 | plt.xscale('log') 67 | plt.xlabel('False Positive Rate') 68 | plt.ylabel('True Positive Rate') 69 | plt.title('ROC on IJB') 70 | plt.legend(loc="lower right") 71 | print(tpr_fpr_table) 72 | -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/utils/utils_config.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os.path as osp 3 | 4 | 5 | def get_config(config_file): 6 | assert config_file.startswith('configs/'), 'config file setting must start with configs/' 7 | temp_config_name = osp.basename(config_file) 8 | temp_module_name = osp.splitext(temp_config_name)[0] 9 | config = importlib.import_module("configs.base") 10 | cfg = config.config 11 | config = importlib.import_module("configs.%s" % temp_module_name) 12 | job_cfg = config.config 13 | cfg.update(job_cfg) 14 | if cfg.output is None: 15 | cfg.output = osp.join('work_dirs', temp_module_name) 16 | return cfg -------------------------------------------------------------------------------- /deep_3drecon/deep_3drecon_models/arcface_torch/utils/utils_logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 5 | 6 | class AverageMeter(object): 7 | """Computes and stores the average and current value 8 | """ 9 | 10 | def __init__(self): 11 | self.val = None 12 | self.avg = None 13 | self.sum = None 14 | self.count = None 15 | self.reset() 16 | 17 | def reset(self): 18 | self.val = 0 19 | self.avg = 0 20 | self.sum = 0 21 | self.count = 0 22 | 23 | def update(self, val, n=1): 24 | self.val = val 25 | self.sum += val * n 26 | self.count += n 27 | self.avg = self.sum / self.count 28 | 29 | 30 | def init_logging(rank, models_root): 31 | if rank == 0: 32 | log_root = logging.getLogger() 33 | log_root.setLevel(logging.INFO) 34 | formatter = logging.Formatter("Training: %(asctime)s-%(message)s") 35 | handler_file = logging.FileHandler(os.path.join(models_root, "training.log")) 36 | handler_stream = logging.StreamHandler(sys.stdout) 37 | handler_file.setFormatter(formatter) 38 | handler_stream.setFormatter(formatter) 39 | log_root.addHandler(handler_file) 40 | log_root.addHandler(handler_stream) 41 | log_root.info('rank_id: %d' % rank) 42 | 
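A minimal usage sketch for the two helpers above (`AverageMeter` and `init_logging`), added here for illustration. It assumes a single-process run (rank 0), that the script is launched from the `arcface_torch` root so `utils.utils_logging` is importable, and that `work_dirs/demo` is just a placeholder output directory:

```python
import logging
import os

from utils.utils_logging import AverageMeter, init_logging  # the helpers defined above

os.makedirs("work_dirs/demo", exist_ok=True)    # init_logging writes training.log into this directory
init_logging(rank=0, models_root="work_dirs/demo")

loss_meter = AverageMeter()
for step in range(100):
    loss = 1.0 / (step + 1)         # placeholder loss value
    loss_meter.update(loss, n=1)    # accumulate running sum and count
    if step % 20 == 0:
        logging.info("step %d, running avg loss %.4f", step, loss_meter.avg)
```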
-------------------------------------------------------------------------------- /deep_3drecon/generate_reconstructor_opt_for_geneface.py: -------------------------------------------------------------------------------- 1 | from options.test_options import TestOptions 2 | import pickle as pkl 3 | 4 | # run in the root dir! 5 | opt = TestOptions().parse() # get test options 6 | opt.name='facerecon' 7 | opt.epoch=20 8 | opt.bfm_folder='deep_3drecon/BFM/' 9 | opt.checkpoints_dir='deep_3drecon/checkpoints/' 10 | 11 | with open("deep_3drecon/reconstructor_opt.pkl", 'wb') as f: 12 | pkl.dump(opt, f) 13 | -------------------------------------------------------------------------------- /deep_3drecon/ncc_code.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/ncc_code.npy -------------------------------------------------------------------------------- /deep_3drecon/options/__init__.py: -------------------------------------------------------------------------------- 1 | """This package options includes option modules: training options, test options, and basic options (used in both training and test).""" 2 | -------------------------------------------------------------------------------- /deep_3drecon/options/test_options.py: -------------------------------------------------------------------------------- 1 | """This script contains the test options for Deep3DFaceRecon_pytorch 2 | """ 3 | 4 | from .base_options import BaseOptions 5 | 6 | 7 | class TestOptions(BaseOptions): 8 | """This class includes test options. 9 | 10 | It also includes shared options defined in BaseOptions. 11 | """ 12 | 13 | def initialize(self, parser): 14 | parser = BaseOptions.initialize(self, parser) # define shared options 15 | parser.add_argument('--phase', type=str, default='test', help='train, val, test, etc') 16 | parser.add_argument('--dataset_mode', type=str, default=None, help='chooses how datasets are loaded. [None | flist]') 17 | parser.add_argument('--img_folder', type=str, default='examples', help='folder for test images.') 18 | 19 | # Dropout and Batchnorm has different behavior during training and test. 
20 | self.isTrain = False 21 | return parser 22 | -------------------------------------------------------------------------------- /deep_3drecon/reconstructor_opt.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/reconstructor_opt.pkl -------------------------------------------------------------------------------- /deep_3drecon/util/BBRegressorParam_r.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/deep_3drecon/util/BBRegressorParam_r.mat -------------------------------------------------------------------------------- /deep_3drecon/util/__init__.py: -------------------------------------------------------------------------------- 1 | """This package includes a miscellaneous collection of useful helper functions.""" 2 | from .util import * 3 | -------------------------------------------------------------------------------- /deep_3drecon/util/generate_list.py: -------------------------------------------------------------------------------- 1 | """This script is to generate training list files for Deep3DFaceRecon_pytorch 2 | """ 3 | 4 | import os 5 | 6 | # save path to training data 7 | def write_list(lms_list, imgs_list, msks_list, mode='train',save_folder='datalist', save_name=''): 8 | save_path = os.path.join(save_folder, mode) 9 | if not os.path.isdir(save_path): 10 | os.makedirs(save_path) 11 | with open(os.path.join(save_path, save_name + 'landmarks.txt'), 'w') as fd: 12 | fd.writelines([i + '\n' for i in lms_list]) 13 | 14 | with open(os.path.join(save_path, save_name + 'images.txt'), 'w') as fd: 15 | fd.writelines([i + '\n' for i in imgs_list]) 16 | 17 | with open(os.path.join(save_path, save_name + 'masks.txt'), 'w') as fd: 18 | fd.writelines([i + '\n' for i in msks_list]) 19 | 20 | # check if the path is valid 21 | def check_list(rlms_list, rimgs_list, rmsks_list): 22 | lms_list, imgs_list, msks_list = [], [], [] 23 | for i in range(len(rlms_list)): 24 | flag = 'false' 25 | lm_path = rlms_list[i] 26 | im_path = rimgs_list[i] 27 | msk_path = rmsks_list[i] 28 | if os.path.isfile(lm_path) and os.path.isfile(im_path) and os.path.isfile(msk_path): 29 | flag = 'true' 30 | lms_list.append(rlms_list[i]) 31 | imgs_list.append(rimgs_list[i]) 32 | msks_list.append(rmsks_list[i]) 33 | print(i, rlms_list[i], flag) 34 | return lms_list, imgs_list, msks_list 35 | -------------------------------------------------------------------------------- /docs/prepare_env/install_guide-zh.md: -------------------------------------------------------------------------------- 1 | # 环境配置 2 | [English Doc](./install_guide.md) 3 | 4 | 本文档陈述了搭建MimicTalk Python环境的步骤,我们使用了Conda来管理依赖(与`Real3D-Portrait`的依赖一致)。 5 | 6 | 以下配置已在 A100/V100 + CUDA12.1 中进行了验证。 7 | 8 | 9 | # 安装Python依赖与CUDA 10 | ```bash 11 | cd 12 | source /bin/activate 13 | conda create -n mimictalk python=3.9 14 | conda activate mimictalk 15 | 16 | # MMCV for SegFormer network structure 17 | # 其他依赖项 18 | pip install -r docs/prepare_env/requirements.txt -v 19 | pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 20 | pip install cython 21 | pip install openmim==0.3.9 22 | mim install mmcv==2.1.0 # 使用mim来加速mmcv安装 23 | ## 从源代码build pytorch3d 24 | ## 这可能会花费较长时间(可能数十分钟左右);由于要连接Github,可能经常面临time-out问题,请考虑使用代理。 25 | # 安装pytorch3d之前, 
需要安装CUDA-12.1 (https://developer.nvidia.com/cuda-toolkit-archive) 并确保 /usr/local/cuda 指向了 `cuda-12.1` 目录 26 | pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" 27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /docs/prepare_env/install_guide.md: -------------------------------------------------------------------------------- 1 | # Prepare the Environment 2 | [中文文档](./install_guide-zh.md) 3 | 4 | This guide is about building a python environment for MimicTalk with Conda (the same as `Real3D-Portrait`). 5 | 6 | The following installation process is verified in A100/V100 + CUDA12.1. 7 | 8 | # Install Python Packages & CUDA 9 | ```bash 10 | cd 11 | source /bin/activate 12 | conda create -n mimictalk python=3.9 13 | conda activate mimictalk 14 | 15 | # MMCV for SegFormer network structure 16 | # other dependencies 17 | pip install -r docs/prepare_env/requirements.txt -v 18 | pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 19 | pip install cython 20 | pip install openmim==0.3.9 21 | mim install mmcv==2.1.0 # use mim to speed up installation for mmcv 22 | ## build pytorch3d from Github's source code. 23 | ## It may take a long time (maybe tens of minutes), Proxy is recommended if encountering the time-out problem 24 | # Before install pytorch3d, you need to install CUDA-12.1 (https://developer.nvidia.com/cuda-toolkit-archive) and make sure /usr/local/cuda points to the `cuda-12.1` directory 25 | pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" 26 | 27 | ``` -------------------------------------------------------------------------------- /docs/prepare_env/requirements.txt: -------------------------------------------------------------------------------- 1 | Cython 2 | numpy # ==1.23.0 3 | numba==0.56.4 4 | pandas 5 | transformers 6 | scipy==1.11.1 # required by cal_fid. 
https://github.com/mseitzer/pytorch-fid/issues/103 7 | scikit-learn 8 | scikit-image 9 | # tensorflow # you can flexible it, this is gpu version 10 | tensorboard 11 | tensorboardX 12 | python_speech_features 13 | resampy 14 | opencv_python 15 | face_alignment 16 | matplotlib 17 | configargparse 18 | librosa==0.9.2 19 | praat-parselmouth # ==0.4.3 20 | trimesh 21 | kornia==0.5.0 22 | PyMCubes 23 | lpips 24 | setuptools # ==59.5.0 25 | ffmpeg-python 26 | moviepy 27 | dearpygui 28 | ninja 29 | pyaudio # for extract esperanto 30 | mediapipe 31 | protobuf 32 | decord 33 | soundfile 34 | pillow 35 | # torch # it's better to install torch with conda 36 | av 37 | timm 38 | pretrainedmodels 39 | faiss-cpu # for fast nearest camera pose retriveal 40 | einops 41 | # mmcv # use mim install is faster 42 | 43 | # conditional flow matching 44 | beartype 45 | torchode 46 | torchdiffeq 47 | 48 | # tts 49 | cython 50 | textgrid 51 | pyloudnorm 52 | websocket-client 53 | pyworld==0.2.1rc0 54 | pypinyin==0.42.0 55 | webrtcvad 56 | torchshow 57 | 58 | # cal spk sim 59 | # s3prl 60 | # fire 61 | 62 | # cal LMD 63 | # dlib 64 | 65 | # debug 66 | # ipykernel 67 | 68 | # lama 69 | # hydra-core 70 | # pytorch_lightning 71 | # setproctitle 72 | 73 | # Gradio GUI 74 | # httpx==0.23.3 75 | # gradio==4.16.0 76 | gradio==4.43.0 77 | httpx==0.23.3 78 | # gradio_client==0.8.1 79 | fastapi==0.112.2 -------------------------------------------------------------------------------- /docs/process_data/process_th1kh.md: -------------------------------------------------------------------------------- 1 | # process dataset 2 | we use Talking-Head-1K-Hour as the example. 3 | 4 | ## download and crop the talking person video clips 5 | - Please follow the step in [https://github.com/tcwang0509/TalkingHead-1KH](https://github.com/tcwang0509/TalkingHead-1KH) 6 | - Put all extracted video clips in a directory like `/home/xxx/TH1KH_512/video_raw/*.mp4` 7 | 8 | ## resample & resize video clips to 512x512 resolution and 25FPS 9 | - You can use the example code in `data_gen/utils/process_video/resample_video_to_25fps_resize_to_512.py` 10 | - It will generate processed video clips in `/home/xxx/TH1KH_512/video/*.mp4` 11 | 12 | ## extract segment images 13 | - You can use the example code in `data_gen/utils/process_video/extract_segment_imgs.py` 14 | - It will generate segment images in `/home/xxx/TH1KH_512/{gt_imgs, head_imgs, inpaint_torso_imgs, com_imgs}/*` 15 | 16 | ## extract 2d facial landmark 17 | - You can use the example code in `data_gen/utils/process_video/extract_lm2d.py` 18 | - It will generate 2d landmarks in `/home/xxx/TH1KH_512/lms_2d/*_lms_2d.npy` 19 | 20 | ## extract 3dmm coefficients 21 | - You can use the example code in `data_gen/utils/process_video/fit_3dmm_landmark.py` 22 | - It will generate 3dmm coefficients in `/home/xxx/TH1KH_512/coeff_fit_mp/*_coeff_fit_mp.npy` 23 | 24 | ## extract audio features 25 | - You can use the example code in `data_gen/utils/process_audio/extract_mel_f0.py` 26 | - It will generate raw wav in `/home/xxx/TH1KH_512/audio/*.wav` and mel_f0 in `/home/xxx/TH1KH_512/mel_f0/*_mel_f0.npy` 27 | - You can use the example code in `data_gen/utils/process_audio/extract_hubert.py` 28 | - It will generate hubert in `/home/xxx/TH1KH_512/hubert/*_hubert.npy` 29 | 30 | ## Binarize the dataset 31 | - You can use the example code in `data_gen/runs/binarizer_th1kh.py` 32 | - You will see a binarized dataset at `data/binary/th1kh` 33 | 
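As a quick sanity check after running the steps above, you can count the generated files and confirm the binarized output directory exists. This is only a sketch: `/home/xxx/TH1KH_512` is the placeholder dataset root used throughout this guide, and the glob patterns come from the output names listed in each step.

```python
import glob
import os

root = "/home/xxx/TH1KH_512"  # replace with your own dataset root
expected = {
    "video": "*.mp4",
    "lms_2d": "*_lms_2d.npy",
    "coeff_fit_mp": "*_coeff_fit_mp.npy",
    "audio": "*.wav",
    "mel_f0": "*_mel_f0.npy",
    "hubert": "*_hubert.npy",
}
for sub_dir, pattern in expected.items():
    num_files = len(glob.glob(os.path.join(root, sub_dir, pattern)))
    print(f"{sub_dir}: {num_files} files")
print("binarized dataset exists:", os.path.exists("data/binary/th1kh"))
```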
-------------------------------------------------------------------------------- /docs/train_models/train_audio2motion.md: -------------------------------------------------------------------------------- 1 | # 0. Get pre-trained models & Data 2 | - Get the Binarized dataset following `docs/process_data/process_th1kh.md`. You will see `data/binary/th1kh/train.data` 3 | 4 | # 1. Train audio_lm3d_syncnet 5 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python tasks/run.py --config=egs/os_avatar/audio_lm3d_syncnet.yaml --exp_name=audio_lm3d_syncnet --reset 6 | 7 | 8 | # 2. Train audio2motion model 9 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python tasks/run.py --config=egs/os_avatar/audio2motion_vae.yaml --exp_name=audio2motion_vae --hparams=syncnet_ckpt_dir=checkpoints/audio_lm3d_syncnet --reset 10 | 11 | # 3. Inference 12 | - See `README.md`, and change the checkpoint name to your own audio2motion_vae model. 13 | -------------------------------------------------------------------------------- /docs/train_models/train_motion2video.md: -------------------------------------------------------------------------------- 1 | # 0. Get pre-trained models & Data 2 | - Get the Binarized dataset following `docs/process_data/process_th1kh.md`. You will see `data/binary/th1kh/train.data` 3 | - Download `pretrained_ckpts.zip` from this [Google Drive](https://drive.google.com/drive/folders/1MAveJf7RvJ-Opg1f5qhLdoRoC_Gc6nD9?usp=sharing), unzip it, and place it into `checkpoints/pretrained_ckpts`. You will see `checkpoints/pretrained_ckpts/mit_b0.pth` and `checkpoints/pretrained_ckpts/eg3d_baseline_run2`. 4 | 5 | 6 | # 1. Train Img-to-Plane Model 7 | ## 1.1 image-to-triplane model in real3d-portrait 8 | ``` 9 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python tasks/run.py --config=egs/os_avatar/img2plane.yaml --hparams=triplane_feature_type=triplane --exp_name=img2plane --reset 10 | ``` 11 | ## 1.2 image-to-grid model in zera-portrait (Recommended) 12 | ``` 13 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python tasks/run.py --config=egs/os_avatar/img2plane.yaml --exp_name=img2grid --reset 14 | ``` 15 | 16 | # 2. Train Motion-to-Video Model 17 | ``` 18 | # secc2plane_head 19 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python tasks/run.py --config=egs/os_avatar/secc_img2plane.yaml --exp_name=secc2plane --hparams=init_from_ckpt=checkpoints/img2grid --reset 20 | 21 | # secc2plane_torso 22 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python tasks/run.py --config=egs/os_avatar/secc_img2plane_torso.yaml --exp_name=secc2plane_torso --hparams=init_from_ckpt=checkpoints/secc2plane --reset 23 | ``` 24 | 25 | # 3. Inference 26 | - See `README.md`, and change the checkpoint name to your own secc2plane_torso model.
27 | -------------------------------------------------------------------------------- /egs/egs_bases/audio2motion/base.yaml: -------------------------------------------------------------------------------- 1 | # project-related 2 | work_dir: '' 3 | load_ckpt: '' 4 | tb_log_interval: 100 5 | 6 | # testing related 7 | gen_dir_name: '' 8 | save_gt: true 9 | 10 | # training-scheme-related 11 | num_ckpt_keep: 100 12 | val_check_interval: 2000 13 | valid_infer_interval: 2000 14 | max_updates: 4_0000 15 | seed: 9999 16 | lr: 0.0005 17 | scheduler: exponential # exponential|rsqrt|warmup|none|step_lr 18 | warmup_updates: 1000 19 | optimizer_adam_beta1: 0.9 20 | optimizer_adam_beta2: 0.999 21 | weight_decay: 0 22 | accumulate_grad_batches: 1 23 | clip_grad_norm: 1 24 | clip_grad_value: 0 25 | num_sanity_val_steps: 5 26 | num_valid_plots: 1 27 | eval_max_batches: 10 # num_test_plots 28 | print_nan_grads: false 29 | resume_from_checkpoint: 0 # specify the step, 0 for latest 30 | amp: false 31 | valid_monitor_key: val_loss 32 | valid_monitor_mode: min 33 | save_best: false 34 | debug: false 35 | save_codes: 36 | - tasks 37 | - modules 38 | - egs 39 | 40 | # model-related 41 | hidden_size: 256 42 | 43 | # infer-related 44 | infer_audio_source_name: '' 45 | infer_out_npy_name: '' 46 | infer_ckpt_steps: 40000 47 | 48 | load_db_to_memory: false # enable it for faster indexing 49 | 50 | max_sentences_per_batch: 512 51 | max_tokens_per_batch: 20000 52 | num_workers: 4 53 | 54 | audio_type: hubert 55 | motion_type: idexp_lm3d 56 | use_kv_dataset: false 57 | use_fork: true -------------------------------------------------------------------------------- /egs/egs_bases/audio2motion/vae.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./base.yaml 3 | 4 | # VAE related 5 | task_cls: tasks.audio2motion.lm3d_vae.VAEAudio2MotionTask 6 | lambda_kl: 0.5 7 | 8 | -------------------------------------------------------------------------------- /egs/egs_bases/audio2motion/vae_sync.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./base.yaml 3 | 4 | # VAE related 5 | task_cls: tasks.audio2motion.lm3d_vae_sync.VAESyncAudio2MotionTask 6 | lambda_kl: 0.5 7 | 8 | # SyncNet related 9 | syncnet_work_dir: checkpoints/lrs3/syncnet 10 | syncnet_ckpt_steps: 40000 11 | -------------------------------------------------------------------------------- /egs/egs_bases/audio2pose/base.yaml: -------------------------------------------------------------------------------- 1 | # dataset-related 2 | raw_data_dir: data/raw/videos 3 | processed_data_dir: data/processed/videos 4 | binary_data_dir: data/binary/videos 5 | video_id: '' 6 | task_cls: '' 7 | 8 | # project-related 9 | work_dir: '' 10 | load_ckpt: '' 11 | tb_log_interval: 100 12 | val_check_interval: 1000 13 | valid_infer_interval: 1000 14 | num_sanity_val_steps: 5 15 | num_valid_plots: 1 16 | eval_max_batches: 10 # num_test_plots 17 | print_nan_grads: false 18 | resume_from_checkpoint: 0 # specify the step, 0 for latest 19 | amp: false 20 | valid_monitor_key: val_loss 21 | valid_monitor_mode: min 22 | save_best: true 23 | debug: false 24 | save_codes: 25 | - tasks 26 | - modules 27 | - egs 28 | accumulate_grad_batches: 1 29 | clip_grad_norm: 1. 
30 | 31 | # training-scheme-related 32 | task_cls: tasks.audio2pose.audio2pose.Audio2PoseTask 33 | max_updates: 1_0000 34 | seed: 9999 35 | lr: 0.0005 36 | optimizer_adam_beta1: 0.9 37 | optimizer_adam_beta2: 0.999 38 | scheduler: exponential # exponential|rsqrt|warmup|none|step_lr 39 | warmup_updates: 1000 40 | 41 | valid_infer_interval: 1000 42 | val_check_interval: 1000 43 | num_ckpt_keep: 10 44 | 45 | source_name: '' 46 | infer_out_npy_name: '' 47 | reception_field: 100 -------------------------------------------------------------------------------- /egs/egs_bases/eg3d/base_mse.yaml: -------------------------------------------------------------------------------- 1 | # dataset-related 2 | raw_data_dir: data/raw/videos 3 | processed_data_dir: data/processed/videos 4 | binary_data_dir: data/binary/videos 5 | video_id: May 6 | 7 | # feature-related 8 | cond_type: idexp_lm3d_normalized 9 | smo_win_size: 5 10 | cond_hid_dim: 32 11 | cond_out_dim: 16 12 | # generator_condition_on_pose: false # pose is camera extrinsic and intrinsic 13 | generator_condition_on_pose: true # pose is camera extrinsic and intrinsic 14 | gpc_reg_prob: 0.5 15 | gpc_reg_fade_kimg: 1000 16 | 17 | # network-related 18 | task_cls: tasks.eg3ds.eg3d_task.EG3DTask 19 | z_dim: 512 20 | w_dim: 512 21 | neural_rendering_resolution: 128 22 | final_resolution: 512 23 | 24 | base_channel: 32768 # Capacity multiplier 25 | max_channel: 512 # Max. feature maps 26 | mapping_network_depth: 2 # num of layers in mapping network 27 | num_fp16_layers_in_super_resolution: 4 28 | num_fp16_layers_in_generator: 0 29 | num_fp16_layers_in_discriminator: 4 30 | 31 | 32 | # GAN-related 33 | blur_raw_target: true 34 | blur_init_sigma: 10 35 | # blur_fade_kimg: 200 # Fade out the blur during the first N kimg. 36 | blur_fade_kimg: 20 # Fade out the blur during the first N kimg. 37 | # neural rendering-related 38 | num_samples_coarse: 48 # number of uniform samples to take per ray. 39 | num_samples_fine: 48 # number of importance samples to take per ray. 40 | ray_near: 2.25 41 | ray_far: 4.05 42 | box_warp: 1 # the side-length of the bounding box spanned by the tri-planes; box_warp=1 means [-0.5, -0.5, -0.5] -> [0.5, 0.5, 0.5]. 43 | 44 | # loss related 45 | group_size_for_mini_batch_std: 2 # 4 46 | lambda_gradient_penalty: 5. # gradient penalty to discriminator 47 | 48 | 49 | lambda_G_supervise_adv: 0. 50 | lambda_G_supervise_mse_raw: 1.0 51 | lambda_G_supervise_mse: 0. 52 | lambda_G_adversarial_adv: 0. 53 | 54 | lambda_density_reg: 0.25 # strength of density regularization for Generator 55 | density_reg_p_dist: 0.004 # distance at which to sample perturbed points for density regularization 56 | 57 | 58 | # trainer related 59 | seed: 9999 60 | lr_g: 0.0025 61 | lr_d: 0.002 62 | optimizer_adam_beta1_g: 0. 63 | optimizer_adam_beta2_g: 0.99 64 | optimizer_adam_beta1_d: 0. 
65 | optimizer_adam_beta2_d: 0.99 66 | reg_interval_g: 4 67 | reg_interval_d: 16 68 | 69 | batch_size: 4 70 | ema_interval: 400 # bs * 10 / 32 kimg 71 | max_updates: 25000_000 # 25000 kimg 72 | num_workers: 4 73 | work_dir: '' 74 | load_ckpt: '' 75 | tb_log_interval: 100 76 | num_ckpt_keep: 1 77 | val_check_interval: 2000 78 | valid_infer_interval: 2000 79 | num_sanity_val_steps: 1 80 | num_valid_plots: 25 81 | eval_max_batches: 100 # num_test_plots 82 | print_nan_grads: false 83 | resume_from_checkpoint: 0 # specify the step, 0 for latest 84 | amp: false 85 | valid_monitor_key: val_loss 86 | valid_monitor_mode: min 87 | save_best: true 88 | debug: false 89 | save_codes: 90 | - tasks 91 | - modules 92 | - egs 93 | accumulate_grad_batches: 1 94 | clip_grad_norm: 0 #1 95 | clip_grad_value: 0 96 | 97 | -------------------------------------------------------------------------------- /egs/egs_bases/nerf/adnerf.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/nerf/base.yaml 3 | 4 | task_cls: tasks.nerfs.adnerf.ADNeRFTask 5 | cond_type: deepspeech 6 | no_smo_iterations: 20_0000 7 | cond_win_size: 16 8 | smo_win_size: 8 -------------------------------------------------------------------------------- /egs/egs_bases/nerf/adnerf_torso.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/nerf/adnerf.yaml 3 | 4 | task_cls: tasks.nerfs.adnerf_torso.ADNeRFTorsoTask 5 | no_smo_iterations: 0 # nerf_torso use the fixed audatt_net from head_nerf 6 | head_model_dir: '' 7 | use_color: false 8 | -------------------------------------------------------------------------------- /egs/egs_bases/nerf/base.yaml: -------------------------------------------------------------------------------- 1 | # dataset-related 2 | raw_data_dir: data/raw/videos 3 | processed_data_dir: data/processed/videos 4 | binary_data_dir: data/binary/videos 5 | video_id: '' 6 | task_cls: '' 7 | 8 | # project-related 9 | work_dir: '' 10 | load_ckpt: '' 11 | tb_log_interval: 100 12 | num_ckpt_keep: 1 13 | val_check_interval: 10000 14 | valid_infer_interval: 10000 15 | num_sanity_val_steps: 0 16 | num_valid_plots: 5 17 | eval_max_batches: 100 # num_test_plots 18 | print_nan_grads: false 19 | resume_from_checkpoint: 0 # specify the step, 0 for latest 20 | amp: false 21 | valid_monitor_key: val_loss 22 | valid_monitor_mode: min 23 | save_best: true 24 | debug: false 25 | save_codes: 26 | - tasks 27 | - modules 28 | - egs 29 | 30 | # testing related 31 | gen_dir_name: '' 32 | save_gt: true 33 | 34 | # training-scheme-related 35 | max_updates: 40_0000 36 | seed: 9999 37 | lr: 0.0005 38 | scheduler: exponential # exponential|rsqrt|warmup|none|step_lr 39 | warmup_updates: 0 40 | optimizer_adam_beta1: 0.9 41 | optimizer_adam_beta2: 0.999 42 | weight_decay: 0 43 | clip_grad_norm: 0 # disable grad clipping 44 | clip_grad_value: 0 # disable grad clipping 45 | rays_sampler_type: uniform 46 | in_rect_percent: 0.95 47 | accumulate_grad_batches: 1 48 | 49 | # model-related 50 | use_window_cond: true 51 | with_att: true # only available when use win_cond, use a attention Net in AD-NeRF 52 | cond_type: '' 53 | cond_dim: 64 54 | hidden_size: 256 55 | 56 | # NeRF-related 57 | near: 0.3 58 | far: 0.9 59 | n_rays: 1600 # default 2048, 1600 for RTX2080Ti 60 | n_samples_per_ray: 64 61 | n_samples_per_ray_fine: 128 62 | embedding_args: 63 | multi_res_pos: 10 # log2+1 of max freq for positional encoding (3D location) 64 | 
multi_res_views: 4 # log2+1 of max freq for positional encoding (2D direction) 65 | 66 | infer_cond_name: '' 67 | infer_out_video_name: '' 68 | infer_scale_factor: 1.0 69 | infer_smo_std: 0. 70 | infer_audio_source_name: '' 71 | infer_c2w_name: '' 72 | 73 | # postprocessing params 74 | infer_lm3d_clamp_std: 1.5 75 | infer_lm3d_lle_percent: 0.25 # percent of lle fused feature to compose the processed lm3d 76 | infer_lm3d_smooth_sigma: 0. # sigma of gaussian kernel to smooth the predicted lm3d 77 | infer_pose_smooth_sigma: 2. 78 | 79 | load_imgs_to_memory: false # load uint8 training img to memory, which reduce io costs, at the expense of more memory occupation -------------------------------------------------------------------------------- /egs/egs_bases/nerf/lm3d_nerf.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/nerf/base.yaml 3 | 4 | task_cls: tasks.nerfs.lm3d_nerf.Lm3dNeRFTask 5 | cond_type: idexp_lm3d_normalized 6 | no_smo_iterations: 20_0000 7 | 8 | use_window_cond: true # the NeRF only takes the exp at current frame as condition 9 | with_att: true # only available when use win_cond, use a attention Net in AD-NeRF 10 | cond_win_size: 1 11 | smo_win_size: 5 12 | 13 | infer_inject_eye_blink_mode: none # none|gt|period. `gt` uses the eye blink sequence from GT dataset, `period` use a ref blink sequence from GT dataset and repeat it to the final length 14 | infer_eye_blink_ref_frames_start_idx: '' # start index of the ref blink sequence in the GT dataset 15 | infer_eye_blink_ref_frames_end_idx: '' # end index of the ref blink sequence in the GT dataset 16 | 17 | infer_close_mouth_when_sil: False # detect sil frames, then set the mouth to close in these frames 18 | infer_sil_ref_frame_idx: '' # index of the ref frame with a closed mouth in the GT dataset -------------------------------------------------------------------------------- /egs/egs_bases/nerf/lm3d_nerf_torso.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/nerf/lm3d_nerf.yaml 3 | 4 | task_cls: tasks.nerfs.lm3d_nerf_torso.Lm3dNeRFTorsoTask 5 | 6 | no_smo_iterations: 0 # nerf_torso use the fixed audatt_net from head_nerf 7 | use_color: true 8 | 9 | head_model_dir: '' 10 | -------------------------------------------------------------------------------- /egs/egs_bases/os_facev2v/base.yaml: -------------------------------------------------------------------------------- 1 | dataset_params: 2 | root_dir: /zlh/VoxCeleb/first-order-256 3 | frame_shape: [256, 256, 3] 4 | id_sampling: True 5 | pairs_list: None 6 | augmentation_params: 7 | flip_param: 8 | horizontal_flip: True 9 | time_flip: True 10 | jitter_param: 11 | brightness: 0.1 12 | contrast: 0.1 13 | saturation: 0.1 14 | hue: 0.1 15 | 16 | model_params: 17 | common_params: 18 | num_kp: 15 19 | image_channel: 3 20 | feature_channel: 32 21 | estimate_jacobian: False # True 22 | kp_detector_params: 23 | temperature: 0.1 24 | block_expansion: 32 25 | max_features: 1024 26 | scale_factor: 0.25 # 0.25 27 | num_blocks: 5 28 | reshape_channel: 16384 # 16384 = 1024 * 16 29 | reshape_depth: 16 30 | he_estimator_params: 31 | block_expansion: 64 32 | max_features: 2048 33 | num_bins: 66 34 | generator_params: 35 | block_expansion: 64 36 | max_features: 512 37 | num_down_blocks: 2 38 | reshape_channel: 32 39 | reshape_depth: 16 # 512 = 32 * 16 40 | num_resblocks: 6 41 | estimate_occlusion_map: True 42 | dense_motion_params: 43 | 
block_expansion: 32 44 | max_features: 1024 45 | num_blocks: 5 46 | # reshape_channel: 32 47 | reshape_depth: 16 48 | compress: 4 49 | discriminator_params: 50 | scales: [1] 51 | block_expansion: 32 52 | max_features: 512 53 | num_blocks: 4 54 | sn: True 55 | 56 | train_params: 57 | num_epochs: 300 58 | num_repeats: 75 59 | epoch_milestones: [180,] 60 | lr_generator: 2.0e-4 61 | lr_discriminator: 2.0e-4 62 | lr_kp_detector: 2.0e-4 63 | lr_he_estimator: 2.0e-4 64 | gan_mode: 'hinge' # hinge or ls 65 | batch_size: 32 66 | scales: [1, 0.5, 0.25, 0.125] 67 | checkpoint_freq: 10 68 | hopenet_snapshot: "/mnt/bn/sa-ag-data/yezhenhui/myenv/cache/useful_ckpts/hopenet_robust_alpha1.pkl" # https://drive.google.com/open?id=1m25PrSE7g9D2q2XJVMR6IA7RaCvWSzCR 69 | transform_params: 70 | sigma_affine: 0.05 71 | sigma_tps: 0.005 72 | points_tps: 5 73 | loss_weights: 74 | generator_gan: 1 75 | discriminator_gan: 1 76 | feature_matching: [10, 10, 10, 10] 77 | perceptual: [10, 10, 10, 10, 10] 78 | equivariance_value: 10 79 | equivariance_jacobian: 0 # 10 80 | keypoint: 10 81 | headpose: 20 82 | expression: 5 83 | 84 | visualizer_params: 85 | kp_size: 5 86 | draw_border: True 87 | colormap: 'gist_rainbow' 88 | -------------------------------------------------------------------------------- /egs/egs_bases/postnet/base.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/audio2motion/vae_sync.yaml 3 | 4 | task_cls: tasks.postnet.lm3d_postnet_adv_sync.PostnetAdvSyncTask 5 | audio2motion_task_cls: tasks.audio2motion.lm3d_vae_sync.VAESyncAudio2MotionTask 6 | person_binary_data_dir: data/binary/videos 7 | # postnet training 8 | postnet_lr: 0.0001 9 | postnet_lambda_adv: 0.85 10 | postnet_lambda_sync: 0.1 11 | postnet_lambda_mse: 0.05 12 | 13 | # Discriminator 14 | postnet_disc_lr: 0.0001 15 | discriminator_scheduler_params: 16 | gamma: 0.5 17 | step_size: 40000 18 | postnet_disc_start_steps: 0 19 | postnet_disc_interval: 1 20 | 21 | # Training Schedule 22 | scheduler: none 23 | num_ckpt_keep: 500 24 | val_check_interval: 1000 25 | valid_infer_interval: 1000 26 | max_updates: 100000 # 20000 27 | 28 | # Pretrained Ckpts 29 | audio2motion_work_dir: checkpoints/th1kh/lm3d_vae_sync_pitch/ 30 | audio2motion_ckpt_steps: 160000 31 | syncnet_work_dir: checkpoints/th1kh/lm3d_syncnet 32 | syncnet_ckpt_steps: 160000 33 | syncnet_num_layers_per_block: 3 34 | syncnet_base_hid_size: 128 35 | 36 | infer_audio_source_name: data/raw/val_wavs/zozo.wav 37 | infer_out_npy_name: infer_out/May/pred_lm3d/zozo.npy 38 | infer_ckpt_steps: 6000 39 | 40 | load_db_to_memory: false # enable it for faster indexing 41 | -------------------------------------------------------------------------------- /egs/egs_bases/radnerf/lm3d_radnerf.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./base.yaml 3 | 4 | task_cls: tasks.radnerfs.radnerf.RADNeRFTask 5 | cond_type: idexp_lm3d_normalized 6 | cond_win_size: 1 7 | smo_win_size: 5 8 | lambda_lap_ambient_loss: 0. 9 | cond_dropout_rate: 0. 10 | zero_dummy: true 11 | 12 | ambient_coord_dim: 3 13 | -------------------------------------------------------------------------------- /egs/egs_bases/radnerf/radnerf.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./base.yaml 3 | 4 | task_cls: tasks.radnerfs.radnerf.RADNeRFTask 5 | cond_type: esperanto 6 | cond_win_size: 16 7 | smo_win_size: 8 8 | cond_dropout_rate: 0. 
9 | lambda_lap_ambient_loss: 0. 10 | mask_cond: false -------------------------------------------------------------------------------- /egs/egs_bases/syncnet/base.yaml: -------------------------------------------------------------------------------- 1 | # dataset-related 2 | binary_data_dir: data/binary/lrs3 3 | 4 | # project-related 5 | work_dir: '' 6 | load_ckpt: '' 7 | tb_log_interval: 100 8 | val_check_interval: 1000 9 | valid_infer_interval: 1000 10 | num_sanity_val_steps: 5 11 | num_valid_plots: 1 12 | eval_max_batches: 10 # num_test_plots 13 | print_nan_grads: false 14 | resume_from_checkpoint: 0 # specify the step, 0 for latest 15 | amp: false 16 | valid_monitor_key: val_loss 17 | valid_monitor_mode: min 18 | save_best: true 19 | debug: false 20 | save_codes: 21 | - tasks 22 | - modules 23 | - egs 24 | accumulate_grad_batches: 1 25 | clip_grad_norm: 1. 26 | 27 | # training-scheme-related 28 | task_cls: tasks.syncnet.lm3d_syncnet.SyncNetTask 29 | max_updates: 4_0000 30 | seed: 9999 31 | lr: 0.0005 32 | optimizer_adam_beta1: 0.9 33 | optimizer_adam_beta2: 0.999 34 | scheduler: none 35 | num_ckpt_keep: 100 36 | 37 | load_db_to_memory: false # enable it for faster indexing 38 | max_sentences_per_batch: 1024 39 | max_tokens_per_batch: 20000 40 | 41 | audio_type: hubert 42 | motion_type: idexp_lm3d 43 | use_kv_dataset: false 44 | 45 | syncnet_num_layers_per_block: 3 46 | syncnet_base_hid_size: 128 47 | use_fork: true -------------------------------------------------------------------------------- /egs/os_avatar/audio2motion_vae.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/audio2motion/vae.yaml 3 | 4 | ds_name: # 会覆盖下面的binary data dir 5 | binary_data_dir: data/binary/th1kh 6 | use_kv_dataset: true 7 | num_workers: 4 8 | 9 | task_cls: tasks.os_avatar.audio2motion_task.Audio2MotionTask 10 | max_updates: 40_0000 11 | 12 | motion_type: exp # exp | id_exp if finegrained_id 13 | sample_min_length: 32 14 | init_from_ckpt: '' 15 | 16 | lambda_mse_lm2d: 0. 
17 | ref_id_mode: 'first_frame' # first_frame | random_frame if finegrained_id 18 | 19 | blink_mode: blink_unit # eye_area_percent | blink_unit | none 20 | use_pitch: true 21 | use_flow: true 22 | 23 | use_eye_amp_embed: false 24 | use_mouth_amp_embed: true 25 | lambda_l2_reg_exp: 0.1 26 | syncnet_ckpt_dir: '' 27 | audio_type: hubert # hubert | mfcc | mel 28 | lambda_mse_exp: 0.5 29 | lambda_mse_lm3d: 0.5 30 | lambda_lap_exp: 1.0 31 | lambda_kl: 0.02 32 | lambda_kl_t1: 2000 33 | lambda_kl_t2: 2000 -------------------------------------------------------------------------------- /egs/os_avatar/audio_lm3d_syncnet.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/syncnet/base.yaml 3 | 4 | init_from_ckpt: '' 5 | binary_data_dir: data/binary/th1kh 6 | task_cls: tasks.os_avatar.audio_lm3d_syncnet.SyncNetTask 7 | use_kv_dataset: true 8 | num_workers: 8 # 4 9 | 10 | syncnet_num_clip_pairs: 8192 11 | max_sentences_per_batch: 1024 12 | max_tokens_per_batch: 20000 13 | sample_min_length: 64 14 | max_updates: 400_0000 15 | 16 | syncnet_num_layers_per_block: 3 # 3 17 | syncnet_base_hid_size: 128 18 | syncnet_out_hid_size: 1024 # 1024 19 | syncnet_keypoint_mode: lm468 20 | 21 | lr: 0.001 22 | lr_decay_rate: 0.98 23 | lr_decay_interval: 5000 24 | 25 | audio_type: hubert # hubert | mfcc 26 | -------------------------------------------------------------------------------- /egs/os_avatar/img2plane.yaml: -------------------------------------------------------------------------------- 1 | base_config: egs/egs_bases/eg3d/base.yaml 2 | ds_name: TH1KH_512 3 | binary_data_dir: data/binary/th1kh 4 | process_id: 0 # rank id when pre-processing dataset 5 | total_process: 1 # number of ranks when pre-processing dataset 6 | split_seed: 999 # random seed that split chunks during pre-processing dataset 7 | seed: 999 8 | batch_size: 4 9 | num_workers: 4 10 | use_kv_dataset: true 11 | ones_ws_for_sr: true 12 | 13 | # ray_near: 2.2 14 | # ray_far: 4.0 15 | ray_near: auto 16 | ray_far: auto 17 | 18 | batch_size: 4 # use smaller bs from 4 when using multiple machines to speed up training 19 | 20 | lr_g: 0.0001 # follow the setting of < Real-Time Radiance Fields for Single-Image Portrait View Synthesis > 21 | # lr_g: 0.0004 # larger lr leads to degradation, even using 32 gpus. 22 | lr_d: 0.0002 # follow the setting of EG3D 23 | 24 | warmup_updates: 4000 25 | 26 | flipped_to_world_coord: true 27 | random_sample_pose: true 28 | mimic_plane: false # minimize the error with EG3D plane 29 | 30 | pretrained_eg3d_ckpt: /mnt/bn/sa-ag-data/yezhenhui/projects/GeneFace_private/checkpoints/geneface2_ckpts/eg3d_baseline_run2/model_ckpt_steps_100000.ckpt 31 | seg_out_mode: none 32 | img2plane_backbone_mode: vit 33 | num_ckpt_keep: 1 34 | 35 | not_save_modules: ['criterion_lpips', 'eg3d_model'] 36 | task_cls: tasks.os_avatar.img2plane_task.OSAvatarImg2PlaneTask 37 | 38 | batch_size: 1 39 | normalize_radius: false 40 | 41 | optimizer_adam_beta1_g: 0. 42 | optimizer_adam_beta2_g: 0.99 43 | optimizer_adam_beta1_d: 0. 44 | optimizer_adam_beta2_d: 0.99 45 | 46 | lambda_mse_depth: 0. 
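# note: a few keys in this file (e.g. batch_size, lr_g, img2plane_backbone_mode) are defined more than once; with a standard YAML loader the last occurrence takes effect, so the later values are the ones actually used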
47 | 48 | start_adv_iters: 30000 49 | lr_g: 0.0001 50 | lr_d: 0.0002 51 | 52 | img2plane_backbone_mode: composite # composite | segformer 53 | 54 | ffhq_disc_inp_mode: eg3d_gen 55 | use_th1kh_disc: false # enable only when ds_name == FFHQ_and_TH1KH_512 56 | lpips_mode: vgg19_v2 # vgg19 | vgg16 | alex | vgg19_v2 57 | 58 | enable_rescale_plane_regulation: true 59 | img2plane_backbone_scale: standard # standard | large 60 | update_on_th1kh_samples: false 61 | 62 | init_from_ckpt: '' 63 | 64 | img2plane_input_mode: rgb # rgb_alpha | rgb_camera | rgb_alpha_camera 65 | triplane_feature_type: trigrid_v2 # triplane # trigrid 66 | triplane_depth: 3 # 1 67 | triplane_hid_dim: 32 # 32 68 | clip_grad_norm: 1.0 69 | neural_rendering_resolution: 128 # will be upscale 4x by SR 70 | 71 | use_th1kh_mv_adv: false 72 | torch_compile: true 73 | use_mse: false -------------------------------------------------------------------------------- /egs/os_avatar/real3d_orig/img2plane_orig.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ../../ffhq/img2plane.yaml 3 | - ../../ffhq/base.yaml 4 | 5 | not_save_modules: ['criterion_lpips', 'eg3d_model'] 6 | ds_name: FFHQ # FFHQ | FFHQ_and_TH1KH_512 # 发现引入视频数据会导致画质变差 7 | task_cls: tasks.os_avatar.img2plane_task.OSAvatarImg2PlaneTask 8 | 9 | batch_size: 1 10 | normalize_radius: false 11 | 12 | optimizer_adam_beta1_g: 0. 13 | optimizer_adam_beta2_g: 0.99 14 | optimizer_adam_beta1_d: 0. 15 | optimizer_adam_beta2_d: 0.99 16 | 17 | lambda_mse_depth: 0. 18 | 19 | start_adv_iters: 30000 20 | lr_g: 0.0001 21 | lr_d: 0.0002 22 | 23 | img2plane_backbone_mode: composite # composite | segformer 24 | 25 | ffhq_disc_inp_mode: eg3d_gen 26 | use_th1kh_disc: false # enable only when ds_name == FFHQ_and_TH1KH_512 27 | lpips_mode: vgg19_v2 # vgg19 | vgg16 | alex | vgg19_v2 28 | 29 | enable_rescale_plane_regulation: true 30 | img2plane_backbone_scale: standard # standard | large 31 | update_on_th1kh_samples: false 32 | 33 | init_from_ckpt: 'checkpoints/0823_img2plane/img2plane' 34 | 35 | triplane_feature_type: triplane # triplane # trigrid # trigrid_v2 36 | triplane_depth: 1 # now use 3 37 | triplane_hid_dim: 32 # 32 38 | clip_grad_norm: 1.0 39 | 40 | use_th1kh_mv_adv: false 41 | torch_compile: true 42 | use_mse: false -------------------------------------------------------------------------------- /egs/os_avatar/real3d_orig/secc_img2plane_orig.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./img2plane_orig.yaml 3 | 4 | task_cls: tasks.os_avatar.secc_img2plane_task.SECC_Img2PlaneEG3DTask 5 | # ds_name: Concat_VFHQ_CelebVHQ_TH1KH_RAVDESS # CelebV_HQ | Concat_CelebVHQ_TH1KH | Concat_CelebVHQ_TH1KH_RAVDESS 6 | ds_name: FULL_Concat_VFHQ_CelebVHQ_TH1KH_RAVDESS 7 | binary_data_dir: data/binary/CelebV-HQ 8 | 9 | img2plane_backbone_mode: composite # composite | segformer 10 | num_workers: 8 # 4 11 | pncc_cond_mode: cano_src_tgt # cano_tgt | cano_src_tgt 12 | seg_out_mode: head 13 | 14 | # 目前发现adv之后控制不了嘴了,见checkpoints/0702_img2planes/osavatar_secc_img2plane_baseline_vit_from_pretrained 15 | start_adv_iters: 25_0000 # 如果是从img2plane过来的,25w;如果是从secc2plane过来了,见机行事,5w~10w左右也行。 16 | max_updates: 25_0000 # 25_0000 17 | lambda_th1kh_mv_adv: 0.002 # 0.005 # 0.01 18 | add_ffhq_singe_disc: false 19 | lambda_ffhq_mv_adv: 0.002 # enable when add_ffhq_singe_disc is True 20 | lr_mul_cano_img2plane: 1.0 # 1.0 | 0. 
| 0.1 21 | lambda_mse: 1.0 22 | lr_decay_rate: 0.95 23 | lr_decay_interval: 5000 24 | 25 | secc_segformer_scale: b0 # b0-b5 26 | use_motion_smo_net: false 27 | motion_smo_win_size: 5 28 | 29 | # regularization on Spatial plane 30 | density_reg_p_dist: 0.004 # distance at which to sample perturbed points for density regularization 31 | 32 | # regularization on SECC plane 33 | reg_interval_g: 4 34 | enable_rescale_plane_regulation: false # 试了下rescale发现效果不大 35 | min_rescale_factor: 0.25 36 | # how we fuse the secc 37 | phase1_plane_fusion_mode: add # add | mul 38 | init_from_ckpt: checkpoints/240126_real3dportrait_orig/img2plane_orig 39 | 40 | disable_highreso_at_stage1: true 41 | secc_pertube_mode: randn # randn | tv | laplacian | none 42 | secc_pertube_randn_scale: 0.01 # enable when pertube_mode==randn 43 | # target_pertube_blink_secc_loss: 0.05 # task会自动tune对应的lambda以使pertube loss逼近这个目标 44 | target_pertube_blink_secc_loss: 0.15 # task会自动tune对应的lambda以使pertube loss逼近这个目标 45 | target_pertube_secc_loss: 0.5 # 0.3 # task会自动tune对应的lambda以使pertube loss逼近这个目标 46 | lr_lambda_pertube_secc: 0.01 # 自动tune lambda的学习率 47 | 48 | sr_type: vanilla # vanillda | spade 49 | two_stage_training: true # is yes, when adv starts, fix the nerf and only finetune the sr. We found it necessary, otherwise the i2p could produce bad cases (such as darken face) 50 | also_update_decoder: false # update decoder at stage 2 51 | lambda_weights_l1: 0.1 # 0.5 52 | lambda_weights_entropy: 0.01 # 0.05 53 | lambda_density_reg: 0.25 # default 0.25 in EG3D, strength of pertube density regularization for Generator 54 | reg_interval_g_cond: 4 55 | ckpt_milestone_interval: 50000 56 | update_src2src_interval: 16 57 | -------------------------------------------------------------------------------- /egs/os_avatar/real3d_orig/secc_img2plane_torso_orig.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./secc_img2plane_orig.yaml 3 | 4 | task_cls: tasks.os_avatar.secc_img2plane_torso_task.SECC_Img2PlaneEG3D_TorsoTask 5 | torso_ref_segout_mode: torso # torso | torso_with_bg | person | full (person_with_bg) 6 | 7 | lr_g: 0.00001 8 | 9 | weight_fuse: true 10 | 11 | start_adv_iters: 40000 12 | max_updates: 10_0000 # 25_0000 13 | lambda_th1kh_mv_adv: 0.003 14 | add_ffhq_singe_disc: false 15 | lambda_ffhq_mv_adv: 0.002 # enable when add_ffhq_singe_disc is True 16 | lambda_mse: 1.0 17 | init_from_ckpt: checkpoints/240207_robust_secc2plane/secc2plane_orig_blink0.3_pertubeNone/model_ckpt_steps_150000.ckpt # checkpoints/0725_img2planes/secc_img2plane_torso | can be either a secc_img2plane or a secc_img2plane_torso ckpt 18 | reload_head_ckpt: '' # checkpoints/0804_secc2plane/secc_img2plane_lap0.1_blink0.05_run2 | will override the secc_img2plane from init_from_ckpt and be reloaded during training 19 | 20 | fuse_with_deform_source: false # fuse source会有严重的artifact 21 | lam_occlusion_2_reg_l1: 0.0 # 0.001 22 | torso_occlusion_reg_unmask_factor: 0.3 23 | lam_occlusion_weights_entropy: 0.001 # 0.0001 24 | 25 | lam_occlusion_reg_l1: 0.00 # 设置成0.02导致脸部和torso都有色差,并且摇头晃脑时只有脖子动,身体不太动,不真实。 26 | torso_kp_num: 4 27 | torso_inp_mode: rgb_alpha 28 | htbsr_head_threshold: 0.9 29 | torso_model_version: v2 30 | htbsr_head_weight_fuse_mode: v2 31 | appearance_feat_mul_torso_mask: true -------------------------------------------------------------------------------- /egs/os_avatar/secc_img2plane.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - 
./img2plane.yaml 3 | 4 | task_cls: tasks.os_avatar.secc_img2plane_task.SECC_Img2PlaneEG3DTask 5 | ds_name: TH1KH_512 # CelebV_HQ | Concat_CelebVHQ_TH1KH | Concat_CelebVHQ_TH1KH_RAVDESS 6 | binary_data_dir: data/binary/th1kh 7 | 8 | img2plane_backbone_mode: composite # composite | segformer 9 | num_workers: 8 # 4 10 | pncc_cond_mode: cano_src_tgt # cano_tgt | cano_src_tgt 11 | seg_out_mode: head 12 | 13 | # 目前发现adv之后控制不了嘴了,见checkpoints/0702_img2planes/osavatar_secc_img2plane_baseline_vit_from_pretrained 14 | start_adv_iters: 20_0000 # 如果是从img2plane过来的,15w;如果是从secc2plane过来了,见机行事,5w~10w左右也行。 15 | stop_update_i2p_iters: 7_0000 16 | max_updates: 25_0000 # 发现到20w的时候会过拟合,对ood identity效果不好 17 | lambda_th1kh_mv_adv: 0.002 # 0.005 # 0.01 18 | add_ffhq_singe_disc: false 19 | lambda_ffhq_mv_adv: 0.002 # enable when add_ffhq_singe_disc is True 20 | lr_mul_cano_img2plane: 1.0 # 1.0 | 0. | 0.1 21 | lambda_mse: 1.0 22 | lr_decay_rate: 0.95 23 | lr_decay_interval: 5000 24 | 25 | secc_segformer_scale: b0 # b0-b5 26 | use_motion_smo_net: false 27 | motion_smo_win_size: 5 28 | 29 | # regularization on Spatial plane 30 | density_reg_p_dist: 0.004 # distance at which to sample perturbed points for density regularization 31 | 32 | # regularization on SECC plane 33 | reg_interval_g: 4 34 | enable_rescale_plane_regulation: false # 试了下rescale发现效果不大 35 | min_rescale_factor: 0.25 36 | # how we fuse the secc 37 | phase1_plane_fusion_mode: add # add | mul 38 | init_from_ckpt: '' # checkpoints/240126_improve_i2p/img2plane_rgb_alpha 39 | 40 | disable_highreso_at_stage1: true 41 | secc_pertube_mode: randn # randn | tv | laplacian | none 42 | secc_pertube_randn_scale: 0.01 # enable when pertube_mode==randn 43 | target_pertube_blink_secc_loss: 0.3 # task会自动tune对应的lambda以使pertube loss逼近这个目标 44 | target_pertube_secc_loss: 0. # 0.5 # task会自动tune对应的lambda以使pertube loss逼近这个目标 45 | pertube_ref_prob: 0.25 46 | lr_lambda_pertube_secc: 0.01 # 自动tune lambda的学习率 47 | 48 | sr_type: vanilla # vanillda | spade 49 | two_stage_training: true # is yes, when adv starts, fix the nerf and only finetune the sr. 
We found it necessary, otherwise the i2p could produce bad cases (such as darken face) 50 | also_update_decoder: false # update decoder at stage 2 51 | lambda_weights_l1: 0.1 # 0.5 52 | lambda_weights_entropy: 0.01 # 0.05 53 | lambda_density_reg: 0.25 # default 0.25 in EG3D, strength of pertube density regularization for Generator 54 | reg_interval_g_cond: 4 55 | ckpt_milestone_interval: 50000 56 | update_src2src_interval: 16 57 | -------------------------------------------------------------------------------- /egs/os_avatar/secc_img2plane_torso.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./secc_img2plane.yaml 3 | 4 | task_cls: tasks.os_avatar.secc_img2plane_torso_task.SECC_Img2PlaneEG3D_TorsoTask 5 | torso_ref_segout_mode: torso # torso | torso_with_bg | person | full (person_with_bg) 6 | 7 | lr_g: 0.00001 8 | 9 | weight_fuse: true 10 | 11 | start_adv_iters: 40000 12 | max_updates: 10_0000 # 25_0000 13 | lambda_th1kh_mv_adv: 0.001 14 | add_ffhq_singe_disc: false 15 | lambda_ffhq_mv_adv: 0.002 # enable when add_ffhq_singe_disc is True 16 | lambda_mse: 1.0 17 | init_from_ckpt: '' # checkpoints/0725_img2planes/secc_img2plane_torso | can be either a secc_img2plane or a secc_img2plane_torso ckpt 18 | reload_head_ckpt: '' # checkpoints/0804_secc2plane/secc_img2plane_lap0.1_blink0.05_run2 | will override the secc_img2plane from init_from_ckpt and be reloaded during training 19 | 20 | 21 | fuse_with_deform_source: false # fuse source会有严重的artifact 22 | lam_occlusion_2_reg_l1: 0.0 # 0.001 23 | torso_occlusion_reg_unmask_factor: 0.3 24 | lam_occlusion_weights_entropy: 0.001 # 0.0001 25 | 26 | lam_occlusion_reg_l1: 0.00 # 设置成0.02导致脸部和torso都有色差,并且摇头晃脑时只有脖子动,身体不太动,不真实。 27 | occlusion_fuse: true 28 | torso_kp_num: 4 29 | htbsr_head_weight_fuse_mode: v2 30 | htbsr_head_threshold: 0.9 31 | torso_model_version: v2 32 | -------------------------------------------------------------------------------- /egs/th1kh_512/base.yaml: -------------------------------------------------------------------------------- 1 | ds_name: TH1KH_512 2 | raw_data_dir: /mnt/bn/sa-ag-data/yezhenhui/datasets/raw/TH1KH_512 3 | binary_data_dir: data/binary/TH1KH_512 4 | # binary_data_dir: /dev/shm/TH1KH 5 | process_id: 0 # rank id when pre-processing dataset 6 | total_process: 1 # number of ranks when pre-processing dataset 7 | split_seed: 999 # random seed that split chunks during pre-processing dataset 8 | 9 | max_sentences_per_batch: 1024 10 | max_tokens_per_batch: 200000 11 | 12 | load_db_to_memory: false 13 | 14 | num_workers: 4 15 | use_kv_dataset: true 16 | 17 | binarization_args: 18 | with_hubert: false 19 | with_mel: false 20 | with_coeff: true 21 | 22 | -------------------------------------------------------------------------------- /egs/th1kh_512/secc_img2plane.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ../os_avatar/secc_img2plane.yaml 3 | - ./base.yaml 4 | 5 | 6 | init_from_ckpt: /mnt/bn/sa-ag-data/yezhenhui/projects/GeneFace_private/checkpoints/0720_img2planes/secc_img2plane_one_stage 7 | lr_g: 0.0001 # 1e-4, larger than ravdess, because th1kh_512 is larger 8 | lr_d: 0.0002 # 2e-4 -------------------------------------------------------------------------------- /egs/th1kh_512/secc_img2plane_torso.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ../os_avatar/secc_img2plane_torso.yaml 3 | - ./base.yaml 4 | 5 | 6 | init_from_ckpt: 
/mnt/bn/sa-ag-data/yezhenhui/projects/GeneFace_private/checkpoints/0729_th1kh/secc_img2plane 7 | lr_g: 0.00001 # 1e-5 8 | lr_d: 0.0002 # 2e-4 -------------------------------------------------------------------------------- /egs/th1kh_512_audio2motion/base.yaml: -------------------------------------------------------------------------------- 1 | ds_name: TH1KH_512 2 | raw_data_dir: /mnt/bn/sa-ag-data/yezhenhui/datasets/raw/TH1KH_512 3 | binary_data_dir: data/binary/TH1KH_512_audio2motion 4 | # binary_data_dir: /dev/shm/TH1KH_512 5 | process_id: 0 # rank id when pre-processing dataset 6 | total_process: 1 # number of ranks when pre-processing dataset 7 | split_seed: 999 # random seed that split chunks during pre-processing dataset 8 | 9 | smo_win_size: 5 10 | batch_size: 4 11 | num_workers: 4 12 | 13 | use_kv_dataset: true 14 | 15 | binarization_args: 16 | with_hubert: true 17 | with_mel: true 18 | with_coeff: true 19 | 20 | sample_min_length: 0 -------------------------------------------------------------------------------- /egs/th1kh_512_audio2motion/lm3d_syncnet.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/syncnet/base.yaml 3 | - ./base.yaml 4 | 5 | max_updates: 250000 6 | motion_type: idexp_lm3d 7 | audio_type: hubert 8 | 9 | syncnet_num_layers_per_block: 3 10 | syncnet_base_hid_size: 128 11 | 12 | # max_sentences_per_batch: 1024 13 | max_sentences_per_batch: 2048 14 | max_tokens_per_batch: 40_000 15 | # max_tokens_per_batch: 20_000 16 | 17 | num_workers: 16 -------------------------------------------------------------------------------- /egs/th1kh_512_audio2motion/lm3d_vae.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/audio2motion/vae.yaml 3 | - ./base.yaml 4 | 5 | lambda_kl: 0.02 6 | motion_type: idexp_lm3d 7 | audio_type: hubert 8 | 9 | max_updates: 160000 10 | -------------------------------------------------------------------------------- /egs/th1kh_512_audio2motion/lm3d_vae_pitch.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/audio2motion/vae.yaml 3 | - ./base.yaml 4 | 5 | lambda_kl: 0.02 6 | motion_type: idexp_lm3d 7 | audio_type: hubert 8 | 9 | task_cls: tasks.audio2motion.lm3d_vae_sync_pitch.VAESyncAudio2MotionTask 10 | max_updates: 160000 11 | -------------------------------------------------------------------------------- /egs/th1kh_512_audio2motion/lm3d_vae_sync.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - egs/egs_bases/audio2motion/vae_sync.yaml 3 | - ./base.yaml 4 | 5 | syncnet_work_dir: checkpoints/th1kh/lm3d_syncnet 6 | syncnet_ckpt_steps: 250000 7 | lambda_kl: 0.02 8 | max_updates: 160000 9 | motion_type: idexp_lm3d 10 | audio_type: hubert 11 | 12 | syncnet_num_layers_per_block: 3 13 | syncnet_base_hid_size: 128 -------------------------------------------------------------------------------- /egs/th1kh_512_audio2motion/lm3d_vae_sync_pitch.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./lm3d_vae_sync.yaml 3 | - ./base.yaml 4 | 5 | lambda_kl: 0.02 6 | syncnet_work_dir: checkpoints/th1kh/lm3d_syncnet 7 | syncnet_ckpt_steps: 230000 8 | task_cls: tasks.audio2motion.lm3d_vae_sync_pitch.VAESyncAudio2MotionTask 9 | max_updates: 160000 10 | motion_type: idexp_lm3d 11 | audio_type: hubert 12 | 13 | 
syncnet_num_layers_per_block: 3 14 | syncnet_base_hid_size: 128 -------------------------------------------------------------------------------- /modules/audio2motion/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def squeeze(x, x_mask=None, n_sqz=2): 5 | b, c, t = x.size() 6 | 7 | t = (t // n_sqz) * n_sqz 8 | x = x[:, :, :t] 9 | x_sqz = x.view(b, c, t // n_sqz, n_sqz) 10 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) 11 | 12 | if x_mask is not None: 13 | x_mask = x_mask[:, :, n_sqz - 1::n_sqz] 14 | else: 15 | x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) 16 | return x_sqz * x_mask, x_mask 17 | 18 | 19 | def unsqueeze(x, x_mask=None, n_sqz=2): 20 | b, c, t = x.size() 21 | 22 | x_unsqz = x.view(b, n_sqz, c // n_sqz, t) 23 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) 24 | 25 | if x_mask is not None: 26 | x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) 27 | else: 28 | x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) 29 | return x_unsqz * x_mask, x_mask 30 | -------------------------------------------------------------------------------- /modules/commons/attention/simple_attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def split_heads(x, num_heads): 7 | """ Split heads 8 | :param x: A tensor with shape [batch, length, channels] 9 | :param num_heads: An integer 10 | :returns: A tensor with shape [batch, heads, length, channels / heads] 11 | """ 12 | assert x.shape[-1] % num_heads == 0, str(x.shape) 13 | return x.reshape(x.shape[:-1] + (num_heads, x.shape[-1] // num_heads)).permute(0, 2, 1, 3) 14 | 15 | 16 | def combine_heads(x): 17 | """ Combine heads 18 | :param x: A tensor with shape [batch, heads, length, channels] 19 | :returns: A tensor with shape [batch, length, heads * channels] 20 | """ 21 | x = x.permute([0, 2, 1, 3]) 22 | return x.reshape(x.shape[:-2] + (x.shape[-1] * x.shape[-2],)) 23 | 24 | 25 | class SimpleAttention(nn.Module): 26 | def __init__(self, query_size=192, key_size=192, value_size=192, num_heads=1): 27 | super(SimpleAttention, self).__init__() 28 | self.q_transform = nn.Linear(query_size, query_size, bias=False) 29 | self.k_transform = nn.Linear(key_size, query_size, bias=False) 30 | self.v_transform = nn.Linear(value_size, query_size, bias=False) 31 | self.output_transform = nn.Linear(query_size, query_size, bias=False) 32 | self.query_size = query_size 33 | self.key_size = key_size 34 | self.value_size = value_size 35 | self.num_heads = num_heads 36 | 37 | def forward(self, query, key, value, attn_mask=None, bias=None): 38 | q = self.q_transform(query) 39 | k = self.k_transform(key) 40 | v = self.v_transform(value) 41 | 42 | logits = torch.bmm(q, k.transpose(1, 2)) # [batch, length_q, length_k] 43 | if bias is not None: 44 | logits += bias 45 | if attn_mask is not None: 46 | logits = logits + attn_mask * -1e9 47 | weights = F.softmax(logits, dim=-1) 48 | out = torch.bmm(weights, v) 49 | out = self.output_transform(out) 50 | return out, weights 51 | -------------------------------------------------------------------------------- /modules/commons/improved_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Codebase for "Improved Denoising Diffusion 
Probabilistic Models". 3 | """ 4 | -------------------------------------------------------------------------------- /modules/commons/improved_diffusion/dist_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for distributed training. 3 | """ 4 | 5 | import io 6 | import os 7 | import socket 8 | 9 | import blobfile as bf 10 | from mpi4py import MPI 11 | import torch as th 12 | import torch.distributed as dist 13 | 14 | # Change this to reflect your cluster layout. 15 | # The GPU for a given rank is (rank % GPUS_PER_NODE). 16 | GPUS_PER_NODE = 8 17 | 18 | SETUP_RETRY_COUNT = 3 19 | 20 | 21 | def setup_dist(): 22 | """ 23 | Setup a distributed process group. 24 | """ 25 | if dist.is_initialized(): 26 | return 27 | 28 | comm = MPI.COMM_WORLD 29 | backend = "gloo" if not th.cuda.is_available() else "nccl" 30 | 31 | if backend == "gloo": 32 | hostname = "localhost" 33 | else: 34 | hostname = socket.gethostbyname(socket.getfqdn()) 35 | os.environ["MASTER_ADDR"] = comm.bcast(hostname, root=0) 36 | os.environ["RANK"] = str(comm.rank) 37 | os.environ["WORLD_SIZE"] = str(comm.size) 38 | 39 | port = comm.bcast(_find_free_port(), root=0) 40 | os.environ["MASTER_PORT"] = str(port) 41 | dist.init_process_group(backend=backend, init_method="env://") 42 | 43 | 44 | def dev(): 45 | """ 46 | Get the device to use for torch.distributed. 47 | """ 48 | if th.cuda.is_available(): 49 | return th.device(f"cuda:{MPI.COMM_WORLD.Get_rank() % GPUS_PER_NODE}") 50 | return th.device("cpu") 51 | 52 | 53 | def load_state_dict(path, **kwargs): 54 | """ 55 | Load a PyTorch file without redundant fetches across MPI ranks. 56 | """ 57 | if MPI.COMM_WORLD.Get_rank() == 0: 58 | with bf.BlobFile(path, "rb") as f: 59 | data = f.read() 60 | else: 61 | data = None 62 | data = MPI.COMM_WORLD.bcast(data) 63 | return th.load(io.BytesIO(data), **kwargs) 64 | 65 | 66 | def sync_params(params): 67 | """ 68 | Synchronize a sequence of Tensors across ranks from rank 0. 69 | """ 70 | for p in params: 71 | with th.no_grad(): 72 | dist.broadcast(p, 0) 73 | 74 | 75 | def _find_free_port(): 76 | try: 77 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 78 | s.bind(("", 0)) 79 | s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 80 | return s.getsockname()[1] 81 | finally: 82 | s.close() 83 | -------------------------------------------------------------------------------- /modules/commons/improved_diffusion/fp16_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers to train with 16-bit precision. 3 | """ 4 | 5 | import torch.nn as nn 6 | from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors 7 | 8 | 9 | def convert_module_to_f16(l): 10 | """ 11 | Convert primitive modules to float16. 12 | """ 13 | if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): 14 | l.weight.data = l.weight.data.half() 15 | l.bias.data = l.bias.data.half() 16 | 17 | 18 | def convert_module_to_f32(l): 19 | """ 20 | Convert primitive modules to float32, undoing convert_module_to_f16(). 21 | """ 22 | if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): 23 | l.weight.data = l.weight.data.float() 24 | l.bias.data = l.bias.data.float() 25 | 26 | 27 | def make_master_params(model_params): 28 | """ 29 | Copy model parameters into a (differently-shaped) list of full-precision 30 | parameters. 
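    The returned list holds a single flattened FP32 tensor (the "master" copy that the
    optimizer updates); the FP16 model weights are synced back from it via
    master_params_to_model_params().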
31 | """ 32 | master_params = _flatten_dense_tensors( 33 | [param.detach().float() for param in model_params] 34 | ) 35 | master_params = nn.Parameter(master_params) 36 | master_params.requires_grad = True 37 | return [master_params] 38 | 39 | 40 | def model_grads_to_master_grads(model_params, master_params): 41 | """ 42 | Copy the gradients from the model parameters into the master parameters 43 | from make_master_params(). 44 | """ 45 | master_params[0].grad = _flatten_dense_tensors( 46 | [param.grad.data.detach().float() for param in model_params] 47 | ) 48 | 49 | 50 | def master_params_to_model_params(model_params, master_params): 51 | """ 52 | Copy the master parameter data back into the model parameters. 53 | """ 54 | # Without copying to a list, if a generator is passed, this will 55 | # silently not copy any parameters. 56 | model_params = list(model_params) 57 | 58 | for param, master_param in zip( 59 | model_params, unflatten_master_params(model_params, master_params) 60 | ): 61 | param.detach().copy_(master_param) 62 | 63 | 64 | def unflatten_master_params(model_params, master_params): 65 | """ 66 | Unflatten the master parameters to look like model_params. 67 | """ 68 | return _unflatten_dense_tensors(master_params[0].detach(), model_params) 69 | 70 | 71 | def zero_grad(model_params): 72 | for param in model_params: 73 | # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group 74 | if param.grad is not None: 75 | param.grad.detach_() 76 | param.grad.zero_() 77 | -------------------------------------------------------------------------------- /modules/commons/improved_diffusion/losses.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for various likelihood-based losses. These are ported from the original 3 | Ho et al. diffusion models codebase: 4 | https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py 5 | """ 6 | 7 | import numpy as np 8 | 9 | import torch as th 10 | 11 | 12 | def normal_kl(mean1, logvar1, mean2, logvar2): 13 | """ 14 | Compute the KL divergence between two gaussians. 15 | 16 | Shapes are automatically broadcasted, so batches can be compared to 17 | scalars, among other use cases. 18 | """ 19 | tensor = None 20 | for obj in (mean1, logvar1, mean2, logvar2): 21 | if isinstance(obj, th.Tensor): 22 | tensor = obj 23 | break 24 | assert tensor is not None, "at least one argument must be a Tensor" 25 | 26 | # Force variances to be Tensors. Broadcasting helps convert scalars to 27 | # Tensors, but it does not work for th.exp(). 28 | logvar1, logvar2 = [ 29 | x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) 30 | for x in (logvar1, logvar2) 31 | ] 32 | 33 | return 0.5 * ( 34 | -1.0 35 | + logvar2 36 | - logvar1 37 | + th.exp(logvar1 - logvar2) 38 | + ((mean1 - mean2) ** 2) * th.exp(-logvar2) 39 | ) 40 | 41 | 42 | def approx_standard_normal_cdf(x): 43 | """ 44 | A fast approximation of the cumulative distribution function of the 45 | standard normal. 46 | """ 47 | return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) 48 | 49 | 50 | def discretized_gaussian_log_likelihood(x, *, means, log_scales): 51 | """ 52 | Compute the log-likelihood of a Gaussian distribution discretizing to a 53 | given image. 54 | 55 | :param x: the target images. It is assumed that this was uint8 values, 56 | rescaled to the range [-1, 1]. 57 | :param means: the Gaussian mean Tensor. 
58 | :param log_scales: the Gaussian log stddev Tensor. 59 | :return: a tensor like x of log probabilities (in nats). 60 | """ 61 | assert x.shape == means.shape == log_scales.shape 62 | centered_x = x - means 63 | inv_stdv = th.exp(-log_scales) 64 | plus_in = inv_stdv * (centered_x + 1.0 / 255.0) 65 | cdf_plus = approx_standard_normal_cdf(plus_in) 66 | min_in = inv_stdv * (centered_x - 1.0 / 255.0) 67 | cdf_min = approx_standard_normal_cdf(min_in) 68 | log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) 69 | log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) 70 | cdf_delta = cdf_plus - cdf_min 71 | log_probs = th.where( 72 | x < -0.999, 73 | log_cdf_plus, 74 | th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), 75 | ) 76 | assert log_probs.shape == x.shape 77 | return log_probs 78 | -------------------------------------------------------------------------------- /modules/commons/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class LayerNorm(torch.nn.LayerNorm): 6 | """Layer normalization module. 7 | :param int nout: output dim size 8 | :param int dim: dimension to be normalized 9 | """ 10 | 11 | def __init__(self, nout, dim=-1, eps=1e-5): 12 | """Construct an LayerNorm object.""" 13 | super(LayerNorm, self).__init__(nout, eps=eps) 14 | self.dim = dim 15 | 16 | def forward(self, x): 17 | """Apply layer normalization. 18 | :param torch.Tensor x: input tensor 19 | :return: layer normalized tensor 20 | :rtype torch.Tensor 21 | """ 22 | if self.dim == -1: 23 | return super(LayerNorm, self).forward(x) 24 | return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1) 25 | 26 | 27 | class Reshape(nn.Module): 28 | def __init__(self, *args): 29 | super(Reshape, self).__init__() 30 | self.shape = args 31 | 32 | def forward(self, x): 33 | return x.view(self.shape) 34 | 35 | 36 | class Permute(nn.Module): 37 | def __init__(self, *args): 38 | super(Permute, self).__init__() 39 | self.args = args 40 | 41 | def forward(self, x): 42 | return x.permute(self.args) 43 | 44 | 45 | def Embedding(num_embeddings, embedding_dim, padding_idx=None): 46 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) 47 | nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) 48 | if padding_idx is not None: 49 | nn.init.constant_(m.weight[padding_idx], 0) 50 | return m 51 | -------------------------------------------------------------------------------- /modules/commons/loralib/__init__.py: -------------------------------------------------------------------------------- 1 | name = "lora" 2 | 3 | from .layers import * 4 | from .utils import * -------------------------------------------------------------------------------- /modules/commons/loralib/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
4 | # ------------------------------------------------------------------------------------------ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from typing import Dict 9 | 10 | from .layers import LoRALayer 11 | 12 | 13 | def mark_only_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: 14 | for n, p in model.named_parameters(): 15 | p.requires_grad = True 16 | for n, p in model.named_parameters(): 17 | if 'lora_' not in n: 18 | p.requires_grad = False 19 | if bias == 'none': 20 | return 21 | elif bias == 'all': 22 | for n, p in model.named_parameters(): 23 | if 'bias' in n: 24 | p.requires_grad = True 25 | elif bias == 'lora_only': 26 | for m in model.modules(): 27 | if isinstance(m, LoRALayer) and \ 28 | hasattr(m, 'bias') and \ 29 | m.bias is not None: 30 | m.bias.requires_grad = True 31 | else: 32 | raise NotImplementedError 33 | 34 | 35 | def lora_state_dict(model: nn.Module, bias: str = 'none') -> Dict[str, torch.Tensor]: 36 | my_state_dict = model.state_dict() 37 | if bias == 'none': 38 | return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k} 39 | elif bias == 'all': 40 | return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k or 'bias' in k} 41 | elif bias == 'lora_only': 42 | to_return = {} 43 | for k in my_state_dict: 44 | if 'lora_' in k: 45 | to_return[k] = my_state_dict[k] 46 | bias_name = k.split('lora_')[0]+'bias' 47 | if bias_name in my_state_dict: 48 | to_return[bias_name] = my_state_dict[bias_name] 49 | return to_return 50 | else: 51 | raise NotImplementedError 52 | -------------------------------------------------------------------------------- /modules/commons/normalizing_flow/res_flow.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from modules.commons.conv import ConditionalConvBlocks 4 | from modules.commons.wavenet import WN 5 | 6 | 7 | class FlipLayer(nn.Module): 8 | def forward(self, x, *args, **kwargs): 9 | x = torch.flip(x, [1]) 10 | return x 11 | 12 | 13 | class CouplingLayer(nn.Module): 14 | def __init__(self, c_in, hidden_size, kernel_size, n_layers, p_dropout=0, c_in_g=0, nn_type='wn'): 15 | super().__init__() 16 | self.channels = c_in 17 | self.hidden_size = hidden_size 18 | self.kernel_size = kernel_size 19 | self.n_layers = n_layers 20 | self.c_half = c_in // 2 21 | 22 | self.pre = nn.Conv1d(self.c_half, hidden_size, 1) 23 | if nn_type == 'wn': 24 | self.enc = WN(hidden_size, kernel_size, 1, n_layers, p_dropout=p_dropout, 25 | c_cond=c_in_g) 26 | elif nn_type == 'conv': 27 | self.enc = ConditionalConvBlocks( 28 | hidden_size, c_in_g, hidden_size, None, kernel_size, 29 | layers_in_block=1, is_BTC=False, num_layers=n_layers) 30 | self.post = nn.Conv1d(hidden_size, self.c_half, 1) 31 | 32 | def forward(self, x, nonpadding, cond=None, reverse=False): 33 | x0, x1 = x[:, :self.c_half], x[:, self.c_half:] 34 | x_ = self.pre(x0) * nonpadding 35 | x_ = self.enc(x_, nonpadding=nonpadding, cond=cond) 36 | m = self.post(x_) 37 | x1 = m + x1 if not reverse else x1 - m 38 | x = torch.cat([x0, x1], 1) 39 | return x * nonpadding 40 | 41 | 42 | class ResFlow(nn.Module): 43 | def __init__(self, 44 | c_in, 45 | hidden_size, 46 | kernel_size, 47 | n_flow_layers, 48 | n_flow_steps=4, 49 | c_cond=0, 50 | nn_type='wn'): 51 | super().__init__() 52 | self.flows = nn.ModuleList() 53 | for i in range(n_flow_steps): 54 | self.flows.append( 55 | CouplingLayer(c_in, hidden_size, kernel_size, n_flow_layers, c_in_g=c_cond, nn_type=nn_type)) 56 | 
self.flows.append(FlipLayer()) 57 | 58 | def forward(self, x, nonpadding, cond=None, reverse=False): 59 | for flow in (self.flows if not reverse else reversed(self.flows)): 60 | x = flow(x, nonpadding, cond=cond, reverse=reverse) 61 | return x 62 | -------------------------------------------------------------------------------- /modules/commons/normalizing_flow/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def squeeze(x, x_mask=None, n_sqz=2): 5 | b, c, t = x.size() 6 | 7 | t = (t // n_sqz) * n_sqz 8 | x = x[:, :, :t] 9 | x_sqz = x.view(b, c, t // n_sqz, n_sqz) 10 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) 11 | 12 | if x_mask is not None: 13 | x_mask = x_mask[:, :, n_sqz - 1::n_sqz] 14 | else: 15 | x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) 16 | return x_sqz * x_mask, x_mask 17 | 18 | 19 | def unsqueeze(x, x_mask=None, n_sqz=2): 20 | b, c, t = x.size() 21 | 22 | x_unsqz = x.view(b, n_sqz, c // n_sqz, t) 23 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) 24 | 25 | if x_mask is not None: 26 | x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) 27 | else: 28 | x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) 29 | return x_unsqz * x_mask, x_mask 30 | -------------------------------------------------------------------------------- /modules/commons/vqvae_fsq.py: -------------------------------------------------------------------------------- 1 | """ 2 | Finite Scalar Quantization: VQ-VAE Made Simple - https://arxiv.org/abs/2309.15505 3 | Code adapted from Jax version in Appendix A.1 4 | """ 5 | 6 | from typing import List 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch import Tensor, int32 11 | 12 | 13 | def round_ste(z: Tensor) -> Tensor: 14 | """Round with straight through gradients.""" 15 | zhat = z.round() 16 | return z + (zhat - z).detach() 17 | 18 | 19 | class FSQ(nn.Module): 20 | def __init__(self, levels: List[int]): 21 | super().__init__() 22 | _levels = torch.tensor(levels, dtype=int32) 23 | self.register_buffer("_levels", _levels) 24 | 25 | _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32) 26 | self.register_buffer("_basis", _basis) 27 | 28 | self.dim = len(levels) 29 | self.n_codes = self._levels.prod().item() 30 | implicit_codebook = self.indices_to_codes(torch.arange(self.n_codes)) 31 | self.register_buffer("implicit_codebook", implicit_codebook) 32 | 33 | def forward(self, z: Tensor) -> Tensor: 34 | zhat = self.quantize(z) 35 | indices = self.codes_to_indices(zhat) 36 | return zhat, indices 37 | 38 | def bound(self, z: Tensor, eps: float = 1e-3) -> Tensor: 39 | """Bound `z`, an array of shape (..., d).""" 40 | half_l = (self._levels - 1) * (1 - eps) / 2 41 | offset = torch.where(self._levels % 2 == 0, 0.5, 0.0) 42 | shift = (offset / half_l).tan() 43 | return (z + shift).tanh() * half_l - offset 44 | 45 | def quantize(self, z: Tensor) -> Tensor: 46 | """Quantizes z, returns quantized zhat, same shape as z.""" 47 | quantized = round_ste(self.bound(z)) 48 | half_width = self._levels // 2 # Renormalize to [-1, 1]. 
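        # `quantized` lies on an integer grid spanning roughly [-half_width, half_width], so dividing by half_width maps the codes into [-1, 1]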
49 | return quantized / half_width 50 | 51 | def _scale_and_shift(self, zhat_normalized: Tensor) -> Tensor: 52 | half_width = self._levels // 2 53 | return (zhat_normalized * half_width) + half_width 54 | 55 | def _scale_and_shift_inverse(self, zhat: Tensor) -> Tensor: 56 | half_width = self._levels // 2 57 | return (zhat - half_width) / half_width 58 | 59 | def codes_to_indices(self, zhat: Tensor) -> Tensor: 60 | """Converts a `code` to an index in the codebook.""" 61 | assert zhat.shape[-1] == self.dim 62 | zhat = self._scale_and_shift(zhat) 63 | return (zhat * self._basis).sum(dim=-1).to(int32) 64 | 65 | def indices_to_codes(self, indices: Tensor) -> Tensor: 66 | """Inverse of `codes_to_indices`.""" 67 | indices = indices.unsqueeze(-1) 68 | codes_non_centered = (indices // self._basis) % self._levels 69 | return self._scale_and_shift_inverse(codes_non_centered) 70 | 71 | def get_codebook_entry(self, encoding_indices): 72 | return self.indices_to_codes(encoding_indices) 73 | -------------------------------------------------------------------------------- /modules/eg3ds/dnnlib/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | from .util import EasyDict, make_cache_dir_path 12 | -------------------------------------------------------------------------------- /modules/eg3ds/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | # empty 12 | -------------------------------------------------------------------------------- /modules/eg3ds/metrics/frechet_inception_distance.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 
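# compute_fid() below fits Gaussians to real and generated Inception features and evaluates
#   FID = ||mu_r - mu_g||^2 + Tr(Sigma_r + Sigma_g - 2 * sqrtm(Sigma_g @ Sigma_r)),
# with the matrix square root taken via scipy.linalg.sqrtm.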
10 | 11 | """Frechet Inception Distance (FID) from the paper 12 | "GANs trained by a two time-scale update rule converge to a local Nash 13 | equilibrium". Matches the original implementation by Heusel et al. at 14 | https://github.com/bioinf-jku/TTUR/blob/master/fid.py""" 15 | 16 | import numpy as np 17 | import scipy.linalg 18 | from . import metric_utils 19 | 20 | #---------------------------------------------------------------------------- 21 | 22 | def compute_fid(opts, max_real, num_gen): 23 | # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz 24 | # detector_url = 'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/metrics/inception-2015-12-05.pkl' 25 | detector_url = 'file:///home/tiger/nfs/myenv/cache/useful_ckpts/inception-2015-12-05.pkl' 26 | detector_kwargs = dict(return_features=True) # Return raw features before the softmax layer. 27 | 28 | mu_real, sigma_real = metric_utils.compute_feature_stats_for_dataset( 29 | opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, 30 | rel_lo=0, rel_hi=0, capture_mean_cov=True, max_items=max_real).get_mean_cov() 31 | 32 | mu_gen, sigma_gen = metric_utils.compute_feature_stats_for_generator( 33 | opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, 34 | rel_lo=0, rel_hi=1, capture_mean_cov=True, max_items=num_gen).get_mean_cov() 35 | 36 | if opts.rank != 0: 37 | return float('nan') 38 | 39 | m = np.square(mu_gen - mu_real).sum() 40 | s, _ = scipy.linalg.sqrtm(np.dot(sigma_gen, sigma_real), disp=False) # pylint: disable=no-member 41 | fid = np.real(m + np.trace(sigma_gen + sigma_real - s * 2)) 42 | return float(fid) 43 | 44 | #---------------------------------------------------------------------------- 45 | 46 | -------------------------------------------------------------------------------- /modules/eg3ds/metrics/inception_score.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | """Inception Score (IS) from the paper "Improved techniques for training 12 | GANs". Matches the original implementation by Salimans et al. at 13 | https://github.com/openai/improved-gan/blob/master/inception_score/model.py""" 14 | 15 | import numpy as np 16 | from . import metric_utils 17 | 18 | #---------------------------------------------------------------------------- 19 | 20 | def compute_is(opts, num_gen, num_splits): 21 | # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz 22 | # detector_url = 'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/metrics/inception-2015-12-05.pkl' 23 | detector_url = 'file:///home/tiger/nfs/myenv/cache/useful_ckpts/inception-2015-12-05.pkl' 24 | detector_kwargs = dict(no_output_bias=True) # Match the original implementation by not applying bias in the softmax layer. 
25 | 26 | gen_probs = metric_utils.compute_feature_stats_for_generator( 27 | opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, 28 | capture_all=True, max_items=num_gen).get_all() 29 | 30 | if opts.rank != 0: 31 | return float('nan'), float('nan') 32 | 33 | scores = [] 34 | for i in range(num_splits): 35 | part = gen_probs[i * num_gen // num_splits : (i + 1) * num_gen // num_splits] 36 | kl = part * (np.log(part) - np.log(np.mean(part, axis=0, keepdims=True))) 37 | kl = np.mean(np.sum(kl, axis=1)) 38 | scores.append(np.exp(kl)) 39 | return float(np.mean(scores)), float(np.std(scores)) 40 | 41 | #---------------------------------------------------------------------------- 42 | -------------------------------------------------------------------------------- /modules/eg3ds/metrics/kernel_inception_distance.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | """Kernel Inception Distance (KID) from the paper "Demystifying MMD 12 | GANs". Matches the original implementation by Binkowski et al. at 13 | https://github.com/mbinkowski/MMD-GAN/blob/master/gan/compute_scores.py""" 14 | 15 | import numpy as np 16 | from . import metric_utils 17 | 18 | #---------------------------------------------------------------------------- 19 | 20 | def compute_kid(opts, max_real, num_gen, num_subsets, max_subset_size): 21 | # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz 22 | # detector_url = 'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/metrics/inception-2015-12-05.pkl' 23 | detector_url = 'file:///home/tiger/nfs/myenv/cache/useful_ckpts/inception-2015-12-05.pkl' 24 | detector_kwargs = dict(return_features=True) # Return raw features before the softmax layer. 
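    # KID estimates the squared MMD between real and generated Inception features under the
    # polynomial kernel k(x, y) = (x.y / d + 1)^3 (d = feature dimension); the loop below
    # averages a block estimate over num_subsets random subsets of at most max_subset_size
    # samples each.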
25 | 26 | real_features = metric_utils.compute_feature_stats_for_dataset( 27 | opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, 28 | rel_lo=0, rel_hi=0, capture_all=True, max_items=max_real).get_all() 29 | 30 | gen_features = metric_utils.compute_feature_stats_for_generator( 31 | opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, 32 | rel_lo=0, rel_hi=1, capture_all=True, max_items=num_gen).get_all() 33 | 34 | if opts.rank != 0: 35 | return float('nan') 36 | 37 | n = real_features.shape[1] 38 | m = min(min(real_features.shape[0], gen_features.shape[0]), max_subset_size) 39 | t = 0 40 | for _subset_idx in range(num_subsets): 41 | x = gen_features[np.random.choice(gen_features.shape[0], m, replace=False)] 42 | y = real_features[np.random.choice(real_features.shape[0], m, replace=False)] 43 | a = (x @ x.T / n + 1) ** 3 + (y @ y.T / n + 1) ** 3 44 | b = (x @ y.T / n + 1) ** 3 45 | t += (a.sum() - np.diag(a).sum()) / (m - 1) - b.sum() * 2 / m 46 | kid = t / num_subsets / m 47 | return float(kid) 48 | 49 | #---------------------------------------------------------------------------- 50 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | # empty 12 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | # empty 12 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/bias_act.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | * 5 | * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 6 | * property and proprietary rights in and to this material, related 7 | * documentation and any modifications thereto. 
Any use, reproduction, 8 | * disclosure or distribution of this material and related documentation 9 | * without an express license agreement from NVIDIA CORPORATION or 10 | * its affiliates is strictly prohibited. 11 | */ 12 | 13 | //------------------------------------------------------------------------ 14 | // CUDA kernel parameters. 15 | 16 | struct bias_act_kernel_params 17 | { 18 | const void* x; // [sizeX] 19 | const void* b; // [sizeB] or NULL 20 | const void* xref; // [sizeX] or NULL 21 | const void* yref; // [sizeX] or NULL 22 | const void* dy; // [sizeX] or NULL 23 | void* y; // [sizeX] 24 | 25 | int grad; 26 | int act; 27 | float alpha; 28 | float gain; 29 | float clamp; 30 | 31 | int sizeX; 32 | int sizeB; 33 | int stepB; 34 | int loopX; 35 | }; 36 | 37 | //------------------------------------------------------------------------ 38 | // CUDA kernel selection. 39 | 40 | template void* choose_bias_act_kernel(const bias_act_kernel_params& p); 41 | 42 | //------------------------------------------------------------------------ 43 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/filtered_lrelu_ns.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | * 5 | * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 6 | * property and proprietary rights in and to this material, related 7 | * documentation and any modifications thereto. Any use, reproduction, 8 | * disclosure or distribution of this material and related documentation 9 | * without an express license agreement from NVIDIA CORPORATION or 10 | * its affiliates is strictly prohibited. 11 | */ 12 | 13 | #include "filtered_lrelu.cu" 14 | 15 | // Template/kernel specializations for no signs mode (no gradients required). 16 | 17 | // Full op, 32-bit indexing. 18 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 19 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 20 | 21 | // Full op, 64-bit indexing. 22 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 23 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 24 | 25 | // Activation/signs only for generic variant. 64-bit indexing. 26 | template void* choose_filtered_lrelu_act_kernel(void); 27 | template void* choose_filtered_lrelu_act_kernel(void); 28 | template void* choose_filtered_lrelu_act_kernel(void); 29 | 30 | // Copy filters to constant memory. 31 | template cudaError_t copy_filters(cudaStream_t stream); 32 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/filtered_lrelu_rd.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | * 5 | * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 6 | * property and proprietary rights in and to this material, related 7 | * documentation and any modifications thereto. 
Any use, reproduction, 8 | * disclosure or distribution of this material and related documentation 9 | * without an express license agreement from NVIDIA CORPORATION or 10 | * its affiliates is strictly prohibited. 11 | */ 12 | 13 | #include "filtered_lrelu.cu" 14 | 15 | // Template/kernel specializations for sign read mode. 16 | 17 | // Full op, 32-bit indexing. 18 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 19 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 20 | 21 | // Full op, 64-bit indexing. 22 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 23 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 24 | 25 | // Activation/signs only for generic variant. 64-bit indexing. 26 | template void* choose_filtered_lrelu_act_kernel(void); 27 | template void* choose_filtered_lrelu_act_kernel(void); 28 | template void* choose_filtered_lrelu_act_kernel(void); 29 | 30 | // Copy filters to constant memory. 31 | template cudaError_t copy_filters(cudaStream_t stream); 32 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/filtered_lrelu_wr.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | * 5 | * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 6 | * property and proprietary rights in and to this material, related 7 | * documentation and any modifications thereto. Any use, reproduction, 8 | * disclosure or distribution of this material and related documentation 9 | * without an express license agreement from NVIDIA CORPORATION or 10 | * its affiliates is strictly prohibited. 11 | */ 12 | 13 | #include "filtered_lrelu.cu" 14 | 15 | // Template/kernel specializations for sign write mode. 16 | 17 | // Full op, 32-bit indexing. 18 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 19 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 20 | 21 | // Full op, 64-bit indexing. 22 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 23 | template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB); 24 | 25 | // Activation/signs only for generic variant. 64-bit indexing. 26 | template void* choose_filtered_lrelu_act_kernel(void); 27 | template void* choose_filtered_lrelu_act_kernel(void); 28 | template void* choose_filtered_lrelu_act_kernel(void); 29 | 30 | // Copy filters to constant memory. 31 | template cudaError_t copy_filters(cudaStream_t stream); 32 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/fma.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 10 | 11 | """Fused multiply-add, with slightly faster gradients than `torch.addcmul()`.""" 12 | 13 | import torch 14 | 15 | #---------------------------------------------------------------------------- 16 | 17 | def fma(a, b, c): # => a * b + c 18 | return _FusedMultiplyAdd.apply(a, b, c) 19 | 20 | #---------------------------------------------------------------------------- 21 | 22 | class _FusedMultiplyAdd(torch.autograd.Function): # a * b + c 23 | @staticmethod 24 | def forward(ctx, a, b, c): # pylint: disable=arguments-differ 25 | out = torch.addcmul(c, a, b) 26 | ctx.save_for_backward(a, b) 27 | ctx.c_shape = c.shape 28 | return out 29 | 30 | @staticmethod 31 | def backward(ctx, dout): # pylint: disable=arguments-differ 32 | a, b = ctx.saved_tensors 33 | c_shape = ctx.c_shape 34 | da = None 35 | db = None 36 | dc = None 37 | 38 | if ctx.needs_input_grad[0]: 39 | da = _unbroadcast(dout * b, a.shape) 40 | 41 | if ctx.needs_input_grad[1]: 42 | db = _unbroadcast(dout * a, b.shape) 43 | 44 | if ctx.needs_input_grad[2]: 45 | dc = _unbroadcast(dout, c_shape) 46 | 47 | return da, db, dc 48 | 49 | #---------------------------------------------------------------------------- 50 | 51 | def _unbroadcast(x, shape): 52 | extra_dims = x.ndim - len(shape) 53 | assert extra_dims >= 0 54 | dim = [i for i in range(x.ndim) if x.shape[i] > 1 and (i < extra_dims or shape[i - extra_dims] == 1)] 55 | if len(dim): 56 | x = x.sum(dim=dim, keepdim=True) 57 | if extra_dims: 58 | x = x.reshape(-1, *x.shape[extra_dims+1:]) 59 | assert x.shape == shape 60 | return x 61 | 62 | #---------------------------------------------------------------------------- 63 | -------------------------------------------------------------------------------- /modules/eg3ds/torch_utils/ops/upfirdn2d.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | * 5 | * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 6 | * property and proprietary rights in and to this material, related 7 | * documentation and any modifications thereto. Any use, reproduction, 8 | * disclosure or distribution of this material and related documentation 9 | * without an express license agreement from NVIDIA CORPORATION or 10 | * its affiliates is strictly prohibited. 11 | */ 12 | 13 | #include 14 | 15 | //------------------------------------------------------------------------ 16 | // CUDA kernel parameters. 
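// upfirdn2d applies: upsample by `up`, pad by `pad0`, convolve with the FIR filter `f`,
// then downsample by `down`. The struct below carries the raw tensor pointers together with
// the per-axis geometry (sizes/strides laid out as [width, height, channel, batch]) and the
// tiling/loop factors used to launch the kernel.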
17 | 18 | struct upfirdn2d_kernel_params 19 | { 20 | const void* x; 21 | const float* f; 22 | void* y; 23 | 24 | int2 up; 25 | int2 down; 26 | int2 pad0; 27 | int flip; 28 | float gain; 29 | 30 | int4 inSize; // [width, height, channel, batch] 31 | int4 inStride; 32 | int2 filterSize; // [width, height] 33 | int2 filterStride; 34 | int4 outSize; // [width, height, channel, batch] 35 | int4 outStride; 36 | int sizeMinor; 37 | int sizeMajor; 38 | 39 | int loopMinor; 40 | int loopMajor; 41 | int loopX; 42 | int launchMinor; 43 | int launchMajor; 44 | }; 45 | 46 | //------------------------------------------------------------------------ 47 | // CUDA kernel specialization. 48 | 49 | struct upfirdn2d_kernel_spec 50 | { 51 | void* kernel; 52 | int tileOutW; 53 | int tileOutH; 54 | int loopMinor; 55 | int loopX; 56 | }; 57 | 58 | //------------------------------------------------------------------------ 59 | // CUDA kernel selection. 60 | 61 | template upfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params& p); 62 | 63 | //------------------------------------------------------------------------ 64 | -------------------------------------------------------------------------------- /modules/eg3ds/volumetric_rendering/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | # 4 | # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | # property and proprietary rights in and to this material, related 6 | # documentation and any modifications thereto. Any use, reproduction, 7 | # disclosure or distribution of this material and related documentation 8 | # without an express license agreement from NVIDIA CORPORATION or 9 | # its affiliates is strictly prohibited. 
10 | 11 | # empty -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/__init__.py: -------------------------------------------------------------------------------- 1 | from .decoders.my_model import DeepLabV3 -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import SegmentationModel 2 | 3 | from .modules import ( 4 | Conv2dReLU, 5 | Attention, 6 | ) 7 | 8 | from .heads import ( 9 | SegmentationHead, 10 | ClassificationHead, 11 | ) 12 | -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/base/heads.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from .modules import Activation 3 | 4 | 5 | class SegmentationHead(nn.Sequential): 6 | def __init__(self, in_channels, out_channels, kernel_size=3, activation=None, upsampling=1): 7 | conv2d = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=kernel_size // 2) 8 | upsampling = nn.UpsamplingBilinear2d(scale_factor=upsampling) if upsampling > 1 else nn.Identity() 9 | activation = Activation(activation) 10 | super().__init__(conv2d, upsampling, activation) 11 | 12 | 13 | class ClassificationHead(nn.Sequential): 14 | def __init__(self, in_channels, classes, pooling="avg", dropout=0.2, activation=None): 15 | if pooling not in ("max", "avg"): 16 | raise ValueError("Pooling should be one of ('max', 'avg'), got {}.".format(pooling)) 17 | pool = nn.AdaptiveAvgPool2d(1) if pooling == "avg" else nn.AdaptiveMaxPool2d(1) 18 | flatten = nn.Flatten() 19 | dropout = nn.Dropout(p=dropout, inplace=True) if dropout else nn.Identity() 20 | linear = nn.Linear(in_channels, classes, bias=True) 21 | activation = Activation(activation) 22 | super().__init__(pool, flatten, dropout, linear, activation) 23 | -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/base/initialization.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def initialize_decoder(module): 5 | for m in module.modules(): 6 | 7 | if isinstance(m, nn.Conv2d): 8 | nn.init.kaiming_uniform_(m.weight, mode="fan_in", nonlinearity="relu") 9 | if m.bias is not None: 10 | nn.init.constant_(m.bias, 0) 11 | 12 | elif isinstance(m, nn.BatchNorm2d): 13 | nn.init.constant_(m.weight, 1) 14 | nn.init.constant_(m.bias, 0) 15 | 16 | elif isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform_(m.weight) 18 | if m.bias is not None: 19 | nn.init.constant_(m.bias, 0) 20 | 21 | 22 | def initialize_head(module): 23 | for m in module.modules(): 24 | if isinstance(m, (nn.Linear, nn.Conv2d)): 25 | nn.init.xavier_uniform_(m.weight) 26 | if m.bias is not None: 27 | nn.init.constant_(m.bias, 0) 28 | -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/base/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from . 
import initialization as init 3 | 4 | 5 | class SegmentationModel(torch.nn.Module): 6 | def initialize(self): 7 | init.initialize_decoder(self.decoder) 8 | init.initialize_head(self.segmentation_head) 9 | if self.classification_head is not None: 10 | init.initialize_head(self.classification_head) 11 | 12 | def check_input_shape(self, x): 13 | 14 | h, w = x.shape[-2:] 15 | output_stride = self.encoder.output_stride 16 | if h % output_stride != 0 or w % output_stride != 0: 17 | new_h = (h // output_stride + 1) * output_stride if h % output_stride != 0 else h 18 | new_w = (w // output_stride + 1) * output_stride if w % output_stride != 0 else w 19 | raise RuntimeError( 20 | f"Wrong input shape height={h}, width={w}. Expected image height and width " 21 | f"divisible by {output_stride}. Consider pad your images to shape ({new_h}, {new_w})." 22 | ) 23 | 24 | def forward(self, x): 25 | """Sequentially pass `x` trough model`s encoder, decoder and heads""" 26 | 27 | self.check_input_shape(x) 28 | 29 | features = self.encoder(x) 30 | decoder_output = self.decoder(*features) 31 | 32 | masks = self.segmentation_head(decoder_output) 33 | 34 | if self.classification_head is not None: 35 | labels = self.classification_head(features[-1]) 36 | return masks, labels 37 | 38 | return masks 39 | 40 | @torch.no_grad() 41 | def predict(self, x): 42 | """Inference method. Switch model to `eval` mode, call `.forward(x)` with `torch.no_grad()` 43 | 44 | Args: 45 | x: 4D torch tensor with shape (batch_size, channels, height, width) 46 | 47 | Return: 48 | prediction: 4D torch tensor with shape (batch_size, classes, height, width) 49 | 50 | """ 51 | if self.training: 52 | self.eval() 53 | 54 | x = self.forward(x) 55 | 56 | return x 57 | -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | import timm 2 | import functools 3 | import torch.utils.model_zoo as model_zoo 4 | 5 | from .resnet import resnet_encoders 6 | 7 | 8 | 9 | encoders = {} 10 | encoders.update(resnet_encoders) 11 | 12 | def get_encoder(name, in_channels=3, depth=5, weights=None, output_stride=32, **kwargs): 13 | 14 | try: 15 | Encoder = encoders[name]["encoder"] 16 | except KeyError: 17 | raise KeyError("Wrong encoder name `{}`, supported encoders: {}".format(name, list(encoders.keys()))) 18 | 19 | params = encoders[name]["params"] 20 | params.update(depth=depth) 21 | encoder = Encoder(**params) 22 | 23 | if weights is not None: 24 | try: 25 | settings = encoders[name]["pretrained_settings"][weights] 26 | except KeyError: 27 | raise KeyError( 28 | "Wrong pretrained weights `{}` for encoder `{}`. 
Available options are: {}".format( 29 | weights, 30 | name, 31 | list(encoders[name]["pretrained_settings"].keys()), 32 | ) 33 | ) 34 | encoder.load_state_dict(model_zoo.load_url(settings["url"])) 35 | 36 | encoder.set_in_channels(in_channels, pretrained=weights is not None) 37 | if output_stride != 32: 38 | encoder.make_dilated(output_stride) 39 | 40 | return encoder 41 | 42 | 43 | def get_encoder_names(): 44 | return list(encoders.keys()) 45 | 46 | 47 | def get_preprocessing_params(encoder_name, pretrained="imagenet"): 48 | 49 | if encoder_name.startswith("tu-"): 50 | encoder_name = encoder_name[3:] 51 | if not timm.models.is_model_pretrained(encoder_name): 52 | raise ValueError(f"{encoder_name} does not have pretrained weights and preprocessing parameters") 53 | settings = timm.models.get_pretrained_cfg(encoder_name) 54 | else: 55 | all_settings = encoders[encoder_name]["pretrained_settings"] 56 | if pretrained not in all_settings.keys(): 57 | raise ValueError("Available pretrained options {}".format(all_settings.keys())) 58 | settings = all_settings[pretrained] 59 | 60 | formatted_settings = {} 61 | formatted_settings["input_space"] = settings.get("input_space", "RGB") 62 | formatted_settings["input_range"] = list(settings.get("input_range", [0, 1])) 63 | formatted_settings["mean"] = list(settings.get("mean")) 64 | formatted_settings["std"] = list(settings.get("std")) 65 | 66 | return formatted_settings 67 | 68 | 69 | def get_preprocessing_fn(encoder_name, pretrained="imagenet"): 70 | params = get_preprocessing_params(encoder_name, pretrained=pretrained) 71 | return functools.partial(preprocess_input, **params) 72 | -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/encoders/_base.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from typing import List 4 | from collections import OrderedDict 5 | 6 | from . 
import _utils as utils 7 | 8 | 9 | class EncoderMixin: 10 | """Add encoder functionality such as: 11 | - output channels specification of feature tensors (produced by encoder) 12 | - patching first convolution for arbitrary input channels 13 | """ 14 | 15 | _output_stride = 32 16 | 17 | @property 18 | def out_channels(self): 19 | """Return channels dimensions for each tensor of forward output of encoder""" 20 | return self._out_channels[: self._depth + 1] 21 | 22 | @property 23 | def output_stride(self): 24 | return min(self._output_stride, 2**self._depth) 25 | 26 | def set_in_channels(self, in_channels, pretrained=True): 27 | """Change first convolution channels""" 28 | if in_channels == 3: 29 | return 30 | 31 | self._in_channels = in_channels 32 | if self._out_channels[0] == 3: 33 | self._out_channels = tuple([in_channels] + list(self._out_channels)[1:]) 34 | 35 | utils.patch_first_conv(model=self, new_in_channels=in_channels, pretrained=pretrained) 36 | 37 | def get_stages(self): 38 | """Override it in your implementation""" 39 | raise NotImplementedError 40 | 41 | def make_dilated(self, output_stride): 42 | 43 | if output_stride == 16: 44 | stage_list = [ 45 | 5, 46 | ] 47 | dilation_list = [ 48 | 2, 49 | ] 50 | 51 | elif output_stride == 8: 52 | stage_list = [4, 5] 53 | dilation_list = [2, 4] 54 | 55 | else: 56 | raise ValueError("Output stride should be 16 or 8, got {}.".format(output_stride)) 57 | 58 | self._output_stride = output_stride 59 | 60 | stages = self.get_stages() 61 | for stage_indx, dilation_rate in zip(stage_list, dilation_list): 62 | utils.replace_strides_with_dilation( 63 | module=stages[stage_indx], 64 | dilation_rate=dilation_rate, 65 | ) 66 | -------------------------------------------------------------------------------- /modules/img2plane/deeplabv3/encoders/_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def patch_first_conv(model, new_in_channels, default_in_channels=3, pretrained=True): 6 | """Change first convolution layer input channels. 
7 | In case: 8 | in_channels == 1 or in_channels == 2 -> reuse original weights 9 | in_channels > 3 -> make random kaiming normal initialization 10 | """ 11 | 12 | # get first conv 13 | for module in model.modules(): 14 | if isinstance(module, nn.Conv2d) and module.in_channels == default_in_channels: 15 | break 16 | 17 | weight = module.weight.detach() 18 | module.in_channels = new_in_channels 19 | 20 | if not pretrained: 21 | module.weight = nn.parameter.Parameter( 22 | torch.Tensor(module.out_channels, new_in_channels // module.groups, *module.kernel_size) 23 | ) 24 | module.reset_parameters() 25 | 26 | elif new_in_channels == 1: 27 | new_weight = weight.sum(1, keepdim=True) 28 | module.weight = nn.parameter.Parameter(new_weight) 29 | 30 | else: 31 | new_weight = torch.Tensor(module.out_channels, new_in_channels // module.groups, *module.kernel_size) 32 | 33 | for i in range(new_in_channels): 34 | new_weight[:, i] = weight[:, i % default_in_channels] 35 | 36 | new_weight = new_weight * (default_in_channels / new_in_channels) 37 | module.weight = nn.parameter.Parameter(new_weight) 38 | 39 | 40 | def replace_strides_with_dilation(module, dilation_rate): 41 | """Patch Conv2d modules replacing strides with dilation""" 42 | for mod in module.modules(): 43 | if isinstance(mod, nn.Conv2d): 44 | mod.stride = (1, 1) 45 | mod.dilation = (dilation_rate, dilation_rate) 46 | kh, kw = mod.kernel_size 47 | mod.padding = ((kh // 2) * dilation_rate, (kh // 2) * dilation_rate) 48 | 49 | # Kostyl for EfficientNet 50 | if hasattr(mod, "static_padding"): 51 | mod.static_padding = nn.Identity() 52 | -------------------------------------------------------------------------------- /modules/img2plane/segformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import LowResolutionViT, TriplanePredictorViT -------------------------------------------------------------------------------- /modules/img2plane/simple_encoders/high_resolution_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class HighResoEncoder(nn.Module): 7 | def __init__(self, 8 | in_dim=5, # 3 for rgb and 2 for coordinate 9 | out_dim=96, 10 | ): 11 | super().__init__() 12 | self.first = nn.Conv2d(in_channels=in_dim, out_channels=64, kernel_size=7, stride=2, padding=3) 13 | self.activation = nn.LeakyReLU(negative_slope=0.01) 14 | 15 | self.conv_layers = nn.Sequential(*[ 16 | nn.Conv2d(in_channels=64, out_channels=96, kernel_size=3, stride=1, padding=1), 17 | nn.LeakyReLU(negative_slope=0.01), 18 | nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1), 19 | nn.LeakyReLU(negative_slope=0.01), 20 | nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1), 21 | nn.LeakyReLU(negative_slope=0.01), 22 | nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1), 23 | nn.LeakyReLU(negative_slope=0.01), 24 | ]) 25 | 26 | self.final = nn.Conv2d(in_channels=96, out_channels=out_dim, kernel_size=3, stride=1, padding=1) 27 | 28 | def forward(self, x): 29 | """ 30 | x: [B, C=5, 256, 256] 31 | return: [B, C=96, 256, 256] 32 | """ 33 | h = self.first(x) 34 | h = self.conv_layers(h) 35 | h = self.final(h) 36 | return h 37 | -------------------------------------------------------------------------------- /tasks/run.py: -------------------------------------------------------------------------------- 1 | # import 
utils.commons.single_thread_env # NOQA 2 | import os 3 | import sys 4 | sys.path.append(os.path.abspath("./")) 5 | 6 | from utils.commons.hparams import hparams, set_hparams 7 | import importlib 8 | 9 | 10 | def run_task(): 11 | assert hparams['task_cls'] != '' 12 | pkg = ".".join(hparams["task_cls"].split(".")[:-1]) 13 | cls_name = hparams["task_cls"].split(".")[-1] 14 | task_cls = getattr(importlib.import_module(pkg), cls_name) 15 | task_cls.start() 16 | 17 | def clear_gpus(): 18 | devices = os.environ.get('CUDA_VISIBLE_DEVICES', '').split(",") 19 | for d in devices: 20 | os.system(f'pkill -f "voidgpu{d}"') 21 | 22 | if __name__ == '__main__': 23 | if os.environ.get('CUDA_VISIBLE_DEVICES', '') == '': 24 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 25 | try: 26 | set_hparams() 27 | run_task() 28 | except KeyboardInterrupt: 29 | if hparams['init_method'] == 'file': 30 | # on exit, remove the shared file in nfs for DDP 31 | exp_name = hparams['exp_name'] 32 | shared_file_name = f'/mnt/bn/sa-ag-data/yezhenhui/nfs/pytorch_ddp_sharedfile/{exp_name}' 33 | if os.path.exists(shared_file_name): 34 | os.system(f"rm -r {shared_file_name}") 35 | -------------------------------------------------------------------------------- /utils/audio/dct.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def dct(x, norm=None): 6 | x_shape = x.shape 7 | N = x_shape[-1] 8 | x = x.contiguous().view(-1, N) 9 | 10 | v = torch.cat([x[:, ::2], x[:, 1::2].flip([1])], dim=1) 11 | 12 | Vc = torch.view_as_real(torch.fft.fft(v, dim=1)) # add this line 13 | 14 | k = - torch.arange(N, dtype=x.dtype, device=x.device)[None, :] * np.pi / (2 * N) 15 | W_r = torch.cos(k) 16 | W_i = torch.sin(k) 17 | 18 | V = Vc[:, :, 0] * W_r - Vc[:, :, 1] * W_i 19 | 20 | if norm == 'ortho': 21 | V[:, 0] /= np.sqrt(N) * 2 22 | V[:, 1:] /= np.sqrt(N / 2) * 2 23 | 24 | V = 2 * V.view(*x_shape) 25 | 26 | return V 27 | 28 | 29 | def idct(X, norm=None): 30 | x_shape = X.shape 31 | N = x_shape[-1] 32 | 33 | X_v = X.contiguous().view(-1, x_shape[-1]) / 2 34 | 35 | if norm == 'ortho': 36 | X_v[:, 0] *= np.sqrt(N) * 2 37 | X_v[:, 1:] *= np.sqrt(N / 2) * 2 38 | 39 | k = torch.arange(x_shape[-1], dtype=X.dtype, device=X.device)[None, :] * np.pi / (2 * N) 40 | W_r = torch.cos(k) 41 | W_i = torch.sin(k) 42 | 43 | V_t_r = X_v 44 | V_t_i = torch.cat([X_v[:, :1] * 0, -X_v.flip([1])[:, :-1]], dim=1) 45 | 46 | V_r = V_t_r * W_r - V_t_i * W_i 47 | V_i = V_t_r * W_i + V_t_i * W_r 48 | 49 | V = torch.cat([V_r.unsqueeze(2), V_i.unsqueeze(2)], dim=2) 50 | 51 | # v = torch.irfft(V, 1, onesided=False) # comment this line 52 | v = torch.fft.irfft(torch.view_as_complex(V), n=V.shape[1], dim=1) # add this line 53 | 54 | x = v.new_zeros(v.shape) 55 | x[:, ::2] += v[:, :N - (N // 2)] 56 | x[:, 1::2] += v.flip([1])[:, :N // 2] 57 | 58 | return x.view(*x_shape) 59 | -------------------------------------------------------------------------------- /utils/audio/io.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | import numpy as np 4 | from scipy.io import wavfile 5 | import pyloudnorm as pyln 6 | 7 | 8 | def save_wav(wav, path, sr, norm=False): 9 | wav = wav.astype(float) 10 | if norm: 11 | meter = pyln.Meter(sr) # create BS.1770 meter 12 | loudness = meter.integrated_loudness(wav) 13 | wav = pyln.normalize.loudness(wav, loudness, -18.0) 14 | if np.abs(wav).max() >= 1: 15 | wav = wav / np.abs(wav).max() * 0.95 16 | wav = wav * 32767 17 
| wavfile.write(path[:-4] + '.wav', sr, wav.astype(np.int16)) 18 | if path[-4:] == '.mp3': 19 | to_mp3(path[:-4]) 20 | 21 | 22 | def to_mp3(out_path): 23 | if out_path[-4:] == '.wav': 24 | out_path = out_path[:-4] 25 | subprocess.check_call( 26 | f'ffmpeg -threads 1 -loglevel error -i "{out_path}.wav" -vn -b:a 192k -y -hide_banner -async 1 "{out_path}.mp3"', 27 | shell=True, stdin=subprocess.PIPE) 28 | subprocess.check_call(f'rm -f "{out_path}.wav"', shell=True) 29 | -------------------------------------------------------------------------------- /utils/audio/pitch/bin/ExtractF0ByStraight: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/utils/audio/pitch/bin/ExtractF0ByStraight -------------------------------------------------------------------------------- /utils/audio/pitch/bin/InterpF0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/utils/audio/pitch/bin/InterpF0 -------------------------------------------------------------------------------- /utils/audio/pitch/bin/ReaperF0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yerfor/MimicTalk/a79f90ed2a91a8ff93f7dd08977b709aeff37d12/utils/audio/pitch/bin/ReaperF0 -------------------------------------------------------------------------------- /utils/audio/pitch/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def to_lf0(f0): 6 | f0[f0 < 1.0e-5] = 1.0e-6 7 | lf0 = f0.log() if isinstance(f0, torch.Tensor) else np.log(f0) 8 | lf0[f0 < 1.0e-5] = - 1.0E+10 9 | return lf0 10 | 11 | 12 | def to_f0(lf0): 13 | f0 = np.where(lf0 <= 0, 0.0, np.exp(lf0)) 14 | return f0.flatten() 15 | 16 | 17 | def f0_to_coarse(f0, f0_bin=256, f0_max=900.0, f0_min=50.0): 18 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 19 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 20 | is_torch = isinstance(f0, torch.Tensor) 21 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) 22 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 23 | 24 | f0_mel[f0_mel <= 1] = 1 25 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 26 | f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int) 27 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min(), f0.min(), f0.max()) 28 | return f0_coarse 29 | 30 | 31 | def coarse_to_f0(f0_coarse, f0_bin=256, f0_max=900.0, f0_min=50.0): 32 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 33 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 34 | uv = f0_coarse == 1 35 | f0 = f0_mel_min + (f0_coarse - 1) * (f0_mel_max - f0_mel_min) / (f0_bin - 2) 36 | f0 = ((f0 / 1127).exp() - 1) * 700 37 | f0[uv] = 0 38 | return f0 39 | 40 | 41 | def norm_f0(f0, uv, pitch_norm='log', f0_mean=400, f0_std=100): 42 | is_torch = isinstance(f0, torch.Tensor) 43 | if pitch_norm == 'standard': 44 | f0 = (f0 - f0_mean) / f0_std 45 | if pitch_norm == 'log': 46 | f0 = torch.log2(f0 + 1e-8) if is_torch else np.log2(f0 + 1e-8) 47 | if uv is not None: 48 | f0[uv > 0] = 0 49 | return f0 50 | 51 | 52 | def norm_interp_f0(f0, pitch_norm='log', f0_mean=None, f0_std=None): 53 | is_torch = isinstance(f0, torch.Tensor) 54 | if is_torch: 55 | device = f0.device 56 
| f0 = f0.data.cpu().numpy() 57 | uv = f0 == 0 58 | f0 = norm_f0(f0, uv, pitch_norm, f0_mean, f0_std) 59 | if sum(uv) == len(f0): 60 | f0[uv] = 0 61 | elif sum(uv) > 0: 62 | f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv]) 63 | if is_torch: 64 | uv = torch.FloatTensor(uv) 65 | f0 = torch.FloatTensor(f0) 66 | f0 = f0.to(device) 67 | uv = uv.to(device) 68 | return f0, uv 69 | 70 | 71 | def denorm_f0(f0, uv, pitch_norm='log', f0_mean=400, f0_std=100, pitch_padding=None, min=50, max=900): 72 | is_torch = isinstance(f0, torch.Tensor) 73 | if pitch_norm == 'standard': 74 | f0 = f0 * f0_std + f0_mean 75 | if pitch_norm == 'log': 76 | f0 = 2 ** f0 77 | f0 = f0.clamp(min=min, max=max) if is_torch else np.clip(f0, a_min=min, a_max=max) 78 | if uv is not None: 79 | f0[uv > 0] = 0 80 | if pitch_padding is not None: 81 | f0[pitch_padding] = 0 82 | return f0 83 | -------------------------------------------------------------------------------- /utils/audio/pitch/uv_utils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | from scipy.interpolate import interp1d 4 | 5 | 6 | def uv_energy_corrector(wav_data_16k, f0_func, f0_min=50, f0_max=1000): 7 | hop_size = 256 8 | win_size = hop_size * 6 9 | sr = 16000 10 | 11 | spec = np.abs(librosa.stft(wav_data_16k, n_fft=win_size, hop_length=hop_size, 12 | win_length=win_size, pad_mode="constant").T) 13 | T = spec.shape[0] 14 | x_h256 = np.arange(0, 1, 1 / T)[:T] 15 | x_h256[-1] = 1 16 | f0 = f0_func(x_h256) 17 | freqs = librosa.fft_frequencies(sr=sr, n_fft=win_size) 18 | x_idx = np.arange(T) 19 | 20 | def find_nearest_stft_bin(f0_): 21 | return np.abs(freqs[None, :] - f0_[:, None]).argmin(-1) 22 | 23 | def get_energy_mask(f0_lambda, hars=None, win_size=3): 24 | if hars is None: 25 | hars = [1] 26 | mask = np.zeros([T, 10000]).astype(bool) 27 | mask_bins = [] 28 | for multiple in hars: 29 | f0_bin_idx = find_nearest_stft_bin(f0_lambda(f0, multiple)) 30 | for delta in range(-win_size // 2, 1 + win_size // 2): 31 | y_idx = f0_bin_idx + delta 32 | if np.max(y_idx) < spec.shape[1]: 33 | mask_bins.append(spec[x_idx, y_idx]) 34 | mask[x_idx, y_idx] = 1 35 | mask_bins = np.stack(mask_bins, 1) 36 | energy_ = np.mean(mask_bins, 1) 37 | return energy_, mask 38 | 39 | # find uv first (for obtaining mean_energy_mharfhar) 40 | energy_har, mask_har = get_energy_mask(lambda f0, m: f0 * m, [1, 2], 3) 41 | energy_mhalfhar, mask_mhalfhar = get_energy_mask(lambda f0, m: f0 * (m - 0.5), [1], 5) 42 | r_energy = energy_har / np.clip(energy_mhalfhar, 1e-8, None) 43 | 44 | uv = np.zeros_like(f0).astype(bool) 45 | uv |= r_energy < 10 46 | uv |= (f0 > f0_max) | (f0 < f0_min) 47 | func_uv = interp1d(x_h256, uv, 'nearest', fill_value='extrapolate') 48 | return func_uv 49 | -------------------------------------------------------------------------------- /utils/commons/euler2rot.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from scipy.spatial.transform import Rotation as R 3 | from utils.commons.tensor_utils import convert_to_tensor 4 | 5 | 6 | def rot2euler(rot, use_radian=True): 7 | r = R.from_matrix(rot) 8 | return r.as_euler('xyz', degrees=not use_radian) 9 | 10 | def euler2rot(euler, use_radian=True): 11 | r = R.from_euler('xyz',euler, degrees=not use_radian) 12 | return r.as_matrix() 13 | 14 | def c2w_to_euler_trans(c2w): 15 | if c2w.ndim == 3: 16 | e = rot2euler(c2w[:, :3, :3]) # [B, 3] 17 | t = c2w[:, :3, 3].reshape([-1, 3]) 18 | else: 
19 | e = rot2euler(c2w[:3, :3]) # [B, 3] 20 | t = c2w[:3, 3].reshape([3]) 21 | return e, t # [3+3] 22 | 23 | def euler_trans_2_c2w(euler, trans): 24 | if euler.ndim == 2: 25 | rot = euler2rot(euler) # [b, 3, 3] 26 | bs = trans.shape[0] 27 | trans = trans.reshape([bs, 3, 1]) 28 | rot = convert_to_tensor(rot).float() 29 | trans = convert_to_tensor(trans).float() 30 | c2w = torch.cat([rot, trans], dim=-1) # [b, 3, 4] 31 | else: 32 | rot = euler2rot(euler) # [3, 3] 33 | trans = trans.reshape([3, 1]) 34 | rot = convert_to_tensor(rot).float() 35 | trans = convert_to_tensor(trans).float() 36 | c2w = torch.cat([rot, trans], dim=-1) # [3, 4] 37 | return c2w -------------------------------------------------------------------------------- /utils/commons/face_alignment_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | yaw_idx_in_mediapipe_mesh = [356, 454, 361, 288, 397, 379, 378, 377, 152, 148, 149, 150, 172,58, 132, 234, 127] 5 | brow_idx_in_mediapipe_mesh = [70, 63, 105, 66, 107, 336, 296, 334, 293, 300] 6 | nose_idx_in_mediapipe_mesh = [6, 5, 1, 2, 129, 240, 2, 460, 358] 7 | eye_idx_in_mediapipe_mesh = [33, 160, 158, 133, 153, 144, 362, 385, 387, 263, 373, 380] 8 | mouth_idx_in_mediapipe_mesh = [78,191,80,81,82,13,312,311,310,415,308,324,318,402,317,14,87,178,88,95] 9 | lm68_idx_in_mediapipe_mesh = yaw_idx_in_mediapipe_mesh + brow_idx_in_mediapipe_mesh + nose_idx_in_mediapipe_mesh + eye_idx_in_mediapipe_mesh + mouth_idx_in_mediapipe_mesh 10 | 11 | 12 | def mediapipe_lm478_to_face_alignment_lm68(lm478, H, W, return_2d=True): 13 | """ 14 | lm478: [B, 478, 3] or [478,3] 15 | """ 16 | lm478 = copy.deepcopy(lm478) 17 | lm478[..., 0] *= W 18 | lm478[..., 1] *= H 19 | n_dim = 2 if return_2d else False 20 | if lm478.ndim == 2: 21 | return lm478[lm68_idx_in_mediapipe_mesh, :n_dim].astype(np.int16) 22 | elif lm478.ndim == 3: 23 | return lm478[:, lm68_idx_in_mediapipe_mesh, :n_dim].astype(np.int16) 24 | else: 25 | raise ValueError("input lm478 ndim should in 2 or 3!") 26 | 27 | def mediapipe_lm478_to_lm68_3d(lm478): 28 | """ 29 | lm478: [B, 478, 3] or [478,3] 30 | also works for lm468 31 | """ 32 | if lm478.ndim == 2: 33 | return lm478[lm68_idx_in_mediapipe_mesh] 34 | elif lm478.ndim == 3: 35 | return lm478[:, lm68_idx_in_mediapipe_mesh] 36 | else: 37 | raise ValueError("input lm478 ndim should in 2 or 3!") -------------------------------------------------------------------------------- /utils/commons/image_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import cv2 6 | import os 7 | import imageio 8 | 9 | 10 | def dilate(bin_img, ksize=5): 11 | # bin_img 12 | pad = (ksize-1)//2 13 | bin_img = F.pad(bin_img, pad=[pad,pad,pad,pad], mode='reflect') 14 | out = F.max_pool2d(bin_img, kernel_size=ksize, stride=1, padding=0) 15 | return out 16 | 17 | def erode(bin_img, ksize=5): 18 | out = 1 - dilate(1-bin_img, ksize) 19 | return out 20 | 21 | def to8b(x): 22 | return (255*np.clip(x, 0, 1)).astype(np.uint8) 23 | 24 | def mse2psnr(x): 25 | return -10. 
* torch.log(x) / torch.log(torch.Tensor([10.])) 26 | 27 | def img2mse(x, y): 28 | return torch.mean((x - y) ** 2) 29 | 30 | def video2images(video_name, out_dir): 31 | cap = cv2.VideoCapture(video_name) 32 | frame_num = 0 33 | while(True): 34 | _, frame = cap.read() 35 | if frame is None: 36 | break 37 | out_frame_name = os.path.join(out_dir, str(frame_num) + '.jpg') 38 | cv2.imwrite(out_frame_name, frame) 39 | frame_num += + 1 40 | cap.release() 41 | 42 | def load_image_as_uint8_tensor(fname): 43 | """ 44 | img: (H, W, 3) floatTensor 45 | """ 46 | img = torch.as_tensor(imageio.imread(fname)) 47 | return img 48 | 49 | if __name__ =='__main__': 50 | video2images("test_data/May_val/AD-NeRF.mp4", "test_data/May_val/AD-NeRF") 51 | video2images("test_data/May_val/GeneFace.mp4", "test_data/May_val/GeneFace") 52 | video2images("test_data/May_val/GT.mp4", "test_data/May_val/GT") -------------------------------------------------------------------------------- /utils/commons/meters.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | 4 | 5 | class AvgrageMeter(object): 6 | 7 | def __init__(self): 8 | self.reset() 9 | 10 | def reset(self): 11 | self.avg = 0 12 | self.sum = 0 13 | self.cnt = 0 14 | 15 | def update(self, val, n=1): 16 | self.sum += val * n 17 | self.cnt += n 18 | self.avg = self.sum / self.cnt 19 | 20 | 21 | class Timer: 22 | timer_map = {} 23 | 24 | def __init__(self, name, enable=False): 25 | if name not in Timer.timer_map: 26 | Timer.timer_map[name] = 0 27 | self.name = name 28 | self.enable = enable 29 | 30 | def __enter__(self): 31 | if self.enable: 32 | # if torch.cuda.is_available(): 33 | # torch.cuda.synchronize() 34 | self.t = time.time() 35 | 36 | def __exit__(self, exc_type, exc_val, exc_tb): 37 | if self.enable: 38 | # if torch.cuda.is_available(): 39 | # torch.cuda.synchronize() 40 | Timer.timer_map[self.name] += time.time() - self.t 41 | if self.enable: 42 | print(f'[Timer] {self.name}: {Timer.timer_map[self.name]}') 43 | -------------------------------------------------------------------------------- /utils/commons/os_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import glob 4 | from utils.commons.multiprocess_utils import multiprocess_run_tqdm 5 | 6 | 7 | def link_file(from_file, to_file): 8 | subprocess.check_call( 9 | f'ln -s "`realpath --relative-to="{os.path.dirname(to_file)}" "{from_file}"`" "{to_file}"', shell=True) 10 | 11 | 12 | def move_file(from_file, to_file): 13 | subprocess.check_call(f'mv "{from_file}" "{to_file}"', shell=True) 14 | 15 | 16 | def copy_file(from_file, to_file): 17 | subprocess.check_call(f'cp -r "{from_file}" "{to_file}"', shell=True) 18 | 19 | 20 | def remove_file(*fns): 21 | for f in fns: 22 | subprocess.check_call(f'rm -rf "{f}"', shell=True) 23 | 24 | def glob_job(d, f): 25 | pattern = os.path.join(d, f) 26 | return glob.glob(pattern) 27 | 28 | def multiprocess_glob(pattern, num_workers=None): 29 | split_pattern = pattern.split("/") 30 | recursive_depth = 0 # number of recursive depth 31 | for split in split_pattern: 32 | if '*' in split: 33 | recursive_depth += 1 34 | if recursive_depth == 1: 35 | return glob.glob(pattern) 36 | else: 37 | dirs = glob.glob('/'.join(split_pattern[:-1])) 38 | ret = [] 39 | args = [(d, split_pattern[-1]) for d in dirs] 40 | for (i,res) in multiprocess_run_tqdm(glob_job, args=args, desc=f"globing {pattern}", num_workers=num_workers): 41 | ret += res 42 | 
return ret -------------------------------------------------------------------------------- /utils/commons/pitch_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | f0_bin = 256 5 | f0_max = 1100.0 6 | f0_min = 50.0 7 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 8 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 9 | 10 | def coarse_to_f0(coarse): 11 | uv = coarse == 1 12 | f0_mel = (coarse - 1) * (f0_mel_max - f0_mel_min) / (f0_bin - 2) + f0_mel_min 13 | f0 = ((f0_mel / 1127).exp() - 1) * 700 14 | f0[uv] = 0 15 | return f0 16 | 17 | def f0_to_coarse(f0): 18 | is_torch = isinstance(f0, torch.Tensor) 19 | f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) 20 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 21 | 22 | f0_mel[f0_mel <= 1] = 1 23 | f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 24 | f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int_) 25 | assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min(), f0.min(), f0.max()) 26 | return f0_coarse 27 | 28 | 29 | def norm_f0(f0, uv, hparams): 30 | is_torch = isinstance(f0, torch.Tensor) 31 | if hparams['pitch_norm'] == 'standard': 32 | f0 = (f0 - hparams['f0_mean']) / hparams['f0_std'] 33 | if hparams['pitch_norm'] == 'log': 34 | f0 = torch.log2(f0 + 1e-8) if is_torch else np.log2(f0 + 1e-8) 35 | if uv is not None and hparams['use_uv']: 36 | f0[uv > 0] = 0 37 | return f0 -------------------------------------------------------------------------------- /utils/nn/grad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def get_grad_norm(model, l=2): 4 | num_para = 0 5 | accu_grad = 0 6 | if isinstance(model, torch.nn.Module): 7 | params = model.parameters() 8 | else: 9 | params = model 10 | for p in params: 11 | if p.grad is None: 12 | continue 13 | num_para += p.numel() 14 | if l == 1: 15 | accu_grad += p.grad.abs(1).sum() 16 | elif l == 2: 17 | accu_grad += p.grad.pow(2).sum() 18 | else: 19 | raise ValueError("Now we only implement l1/l2 norm !") 20 | if l == 2: 21 | accu_grad = accu_grad ** 0.5 22 | if isinstance(accu_grad, float): 23 | return accu_grad 24 | return accu_grad.item() 25 | 26 | class GradBuffer: 27 | def __init__(self): 28 | self.buffer = {} 29 | 30 | def add(self, model): 31 | for item in model.named_parameters(): 32 | name, param = item 33 | if param.grad is None: 34 | continue 35 | self.buffer[name] = self.buffer.get(name, 0) + param.grad.data 36 | 37 | def apply(self, model): 38 | for item in model.named_parameters(): 39 | name, param = item 40 | if param.grad is None: 41 | continue 42 | if name in self.buffer.keys(): 43 | param.grad.data += self.buffer[name] 44 | self.buffer = {} -------------------------------------------------------------------------------- /utils/nn/model_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def print_arch(model, model_name='model'): 6 | print(f"| {model_name} Arch: ", model) 7 | num_params(model, model_name=model_name) 8 | 9 | 10 | def num_params(model, print_out=True, model_name="model"): 11 | parameters = filter(lambda p: p.requires_grad, model.parameters()) 12 | parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000 13 | if print_out: 14 | print(f'| {model_name} Trainable Parameters: %.3fM' % 
--------------------------------------------------------------------------------
/utils/nn/model_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import torch


def print_arch(model, model_name='model'):
    print(f"| {model_name} Arch: ", model)
    num_params(model, model_name=model_name)


def num_params(model, print_out=True, model_name="model"):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
    if print_out:
        print(f'| {model_name} Trainable Parameters: %.3fM' % parameters)
    return parameters

def get_device_of_model(model):
    return model.parameters().__next__().device

def requires_grad(model):
    if isinstance(model, torch.nn.Module):
        for p in model.parameters():
            p.requires_grad = True
    else:
        model.requires_grad = True

def not_requires_grad(model):
    if isinstance(model, torch.nn.Module):
        for p in model.parameters():
            p.requires_grad = False
    else:
        model.requires_grad = False
--------------------------------------------------------------------------------
/utils/useful_cmd_lines/clean_gpu.py:
--------------------------------------------------------------------------------
import os, re

def clean_gpu():
    # collect the PIDs that currently hold /dev/nvidia* open
    ret = os.popen("fuser -v /dev/nvidia*").read()
    ret = re.sub("kernel", " ", ret)
    ids = set(ret.split(" "))
    ids = [int(i) for i in ids if i != '']
    ids = [str(i) for i in sorted(ids)]
    ids_string = ' '.join(ids)
    cmd = f"kill -9 {ids_string}"
    os.system("fuser -v /dev/nvidia*")
    flag = input(f"You are going to run this command: \n ==> \"{cmd}\" \nEnter y/Y to proceed, or anything else to abort.\n[y/n]")
    if flag.lower() == 'y':
        os.system(cmd)
        print("All GPU processes cleaned!")
    else:
        print("Aborted!")

if __name__ == '__main__':
    clean_gpu()
--------------------------------------------------------------------------------
/utils/visualization/auto_plot_image.py:
--------------------------------------------------------------------------------
import torch
import numpy as np
import cv2

def plot_image(save_path, image, convert_RGB2BGR=True):
    if isinstance(image, torch.Tensor):
        image = image.detach().cpu().numpy()
    image = image.astype(float)
    if image.max() < 1.1 and image.min() > -0.1:  # [0, 1]
        image = image * 255
    elif image.max() < 1.1 and image.min() > -1.1:  # [-1, 1]
        image = (image + 1.0) * 0.5 * 255
    image = image.clip(0, 255)
    image = image.astype(np.uint8)
    if len(image.shape) == 4 and image.shape[0] == 1:
        image = image[0]
    if len(image.shape) == 3 and image.shape[0] <= 4:  # C, H, W
        image = torch.from_numpy(image).permute(1, 2, 0).numpy()
    if len(image.shape) == 3 and convert_RGB2BGR:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    cv2.imwrite(save_path, image)
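
if __name__ == '__main__':
    # Minimal smoke test (illustrative only; the shapes and output paths are hypothetical):
    # plot_image accepts HWC or CHW layouts, [0, 1] / [-1, 1] floats or uint8 values,
    # and either torch tensors or numpy arrays.
    dummy_chw = torch.rand(3, 64, 64)                                    # CHW float in [0, 1]
    plot_image('tmp_plot_image_chw.png', dummy_chw)
    dummy_hwc = np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8)   # HWC uint8
    plot_image('tmp_plot_image_hwc.png', dummy_hwc)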
--------------------------------------------------------------------------------
/utils/visualization/ffmpeg_utils.py:
--------------------------------------------------------------------------------
import os

def imgs_to_video(img_dir, video_path, audio_path=None, verbose=True):
    # read the numbered PNG sequence in img_dir, optionally mux in an audio track,
    # and encode an H.264 mp4; "-shortest" trims the output to the shorter stream
    cmd = f"ffmpeg -i {img_dir}/%5d.png "
    if audio_path is not None:
        cmd += f"-i {audio_path} "
    cmd += "-strict -2 "
    cmd += "-c:v libx264 -pix_fmt yuv420p -b:v 2000k -y -shortest "
    if verbose is False:
        cmd += " -v quiet "
    cmd += f"{video_path} "
    os.makedirs(os.path.dirname(video_path), exist_ok=True)
    os.system(cmd)


if __name__ == '__main__':
    imgs_to_video('infer_out/tmp_imgs', 'infer_out/tmp_imgs/out.mp4', 'data/raw/val_wavs/zozo.wav')
    imgs_to_video('infer_out/tmp_imgs', 'infer_out/tmp_imgs/out2.mp4', 'data/raw/val_wavs/zozo.wav')
--------------------------------------------------------------------------------
/utils/visualization/plot_attention.py:
--------------------------------------------------------------------------------
import numpy as np
import cv2
from utils.commons.tensor_utils import convert_to_np


def plot_attention_img(attention_img, color_bar='jet'):
    """
    attention_img: raw attention from the network, tensor or array in 0~1 scale, shape [H, W]
    color_bar: jet, summer, etc.; see https://blog.csdn.net/loveliuzz/article/details/73648505
    return: ready-to-visualize attention img in -1~1 scale.
    """
    attention_img = convert_to_np(attention_img)
    assert attention_img.ndim == 2
    attention_img = np.uint8(255 * attention_img)
    color_bar_dict = {
        'jet': cv2.COLORMAP_JET,
        'summer': cv2.COLORMAP_SUMMER,
        'hot': cv2.COLORMAP_HOT
    }
    color_bar = color_bar_dict.get(color_bar, getattr(cv2, f"COLORMAP_{color_bar.upper()}"))
    attention_img = cv2.applyColorMap(attention_img, color_bar) / 127.5 - 1
    attention_img = attention_img[:, :, ::-1]  # BGR -> RGB
    return attention_img
--------------------------------------------------------------------------------
/utils/visualization/plot_spec.py:
--------------------------------------------------------------------------------
import matplotlib

matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import torch

LINE_COLORS = ['w', 'r', 'orange', 'k', 'cyan', 'm', 'b', 'lime', 'g', 'brown', 'navy']


def spec_to_figure(spec, vmin=None, vmax=None, title='', f0s=None, dur_info=None):
    if isinstance(spec, torch.Tensor):
        spec = spec.cpu().numpy()
    H = spec.shape[1] // 2
    fig = plt.figure(figsize=(12, 6))
    plt.title(title)
    plt.pcolor(spec.T, vmin=vmin, vmax=vmax)

    if dur_info is not None:
        assert isinstance(dur_info, dict)
        txt = dur_info['txt']
        dur_gt = dur_info['dur_gt']
        if isinstance(dur_gt, torch.Tensor):
            dur_gt = dur_gt.cpu().numpy()
        dur_gt = np.cumsum(dur_gt).astype(int)
        for i in range(len(dur_gt)):
            shift = (i % 8) + 1
            plt.text(dur_gt[i], shift * 4, txt[i])
            plt.vlines(dur_gt[i], 0, H // 2, colors='b')  # blue is gt
        plt.xlim(0, dur_gt[-1])
        if 'dur_pred' in dur_info:
            dur_pred = dur_info['dur_pred']
            if isinstance(dur_pred, torch.Tensor):
                dur_pred = dur_pred.cpu().numpy()
            dur_pred = np.cumsum(dur_pred).astype(int)
            for i in range(len(dur_pred)):
                shift = (i % 8) + 1
                plt.text(dur_pred[i], H + shift * 4, txt[i])
                plt.vlines(dur_pred[i], H, H * 1.5, colors='r')  # red is pred
            plt.xlim(0, max(dur_gt[-1], dur_pred[-1]))
    if f0s is not None:
        ax = plt.gca()
        ax2 = ax.twinx()
        # ax.set_xticks()

        if not isinstance(f0s, dict):
            f0s = {'f0': f0s}
        for i, (k, f0) in enumerate(f0s.items()):
            if f0 is not None:
                if isinstance(f0, torch.Tensor):
                    f0 = f0.cpu().numpy()
                ax2.plot(
                    np.arange(len(f0)) + 0.5, f0, label=k, c=LINE_COLORS[i], linewidth=1, alpha=0.5)
        ax2.set_ylim(0, 1000)
        ax2.legend()
    return fig


def align_to_figure(align, dur_info):
    if isinstance(align, torch.Tensor):
        align = align.cpu().numpy()
    H = align.shape[1]
    fig = plt.figure(figsize=(12, 6))
    plt.pcolor(align.T, vmin=0, vmax=1)
    if dur_info is not None:
        assert isinstance(dur_info, dict)
        txt = dur_info['txt']
        dur_gt = dur_info['dur_gt']
        if isinstance(dur_gt, torch.Tensor):
            dur_gt = dur_gt.cpu().numpy()
        dur_gt = np.cumsum(dur_gt).astype(int) // 2
        for i in range(len(dur_gt)):
            plt.text(dur_gt[i], i, txt[i], color='red')
            plt.vlines(dur_gt[i], 0, H, colors='b')  # blue is gt
        # plt.xlim(0, dur_gt[-1])
    return fig
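
if __name__ == '__main__':
    # Minimal smoke test (illustrative only; the shapes and file name below are assumptions):
    # render a random "mel spectrogram" with a fake f0 curve overlaid on the twin axis.
    dummy_spec = np.random.rand(200, 80)                 # [T, n_mels]
    dummy_f0 = np.random.uniform(80, 300, size=200)      # Hz, one value per frame
    fig = spec_to_figure(dummy_spec, vmin=0, vmax=1, title='dummy mel', f0s={'f0': dummy_f0})
    fig.savefig('tmp_spec_to_figure.png')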
--------------------------------------------------------------------------------
/utils/visualization/vis_cam3d/camera_parameter_loader.py:
--------------------------------------------------------------------------------
import json
import os
import numpy as np
import quaternion

class CameraParameterLoader:
    def __init__(self):
        print('initialize camera parameter loader')

    def get_intrinsic(self, path):
        with open(os.path.join(path, '_camera_settings.json'), 'r') as f:
            param_cam = json.load(f)
        param_intrinsic = param_cam['camera_settings'][0]['intrinsic_settings']
        cx = param_intrinsic['cx']
        cy = param_intrinsic['cy']
        fx = param_intrinsic['fx']
        fy = param_intrinsic['fy']
        s = param_intrinsic['s']
        mat_intrinsic = np.array([[fx, s, cx],
                                  [0, fy, cy],
                                  [0, 0, 1]])
        return mat_intrinsic

    def get_extrinsic(self, path):
        with open(path, 'r') as f:
            param_cam = json.load(f)['camera_data']
        param_translation = param_cam['location_worldframe']
        param_rotation = param_cam['quaternion_xyzw_worldframe']

        mat_rotation = quaternion.as_rotation_matrix(
            np.quaternion(param_rotation[3], param_rotation[0], param_rotation[1], param_rotation[2]))
        mat_translation = np.array([[param_translation[0]], [param_translation[1]], [param_translation[2]]])
        mat_extrinsic = np.concatenate(
            [np.concatenate([mat_rotation, mat_translation], axis=1), np.array([[0, 0, 0, 1]])], axis=0)
        return mat_extrinsic
--------------------------------------------------------------------------------
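
# Illustrative usage sketch for CameraParameterLoader above (the paths and the projection
# convention are assumptions; the actual scene layout and world/camera convention may differ):
#
#   loader = CameraParameterLoader()
#   K = loader.get_intrinsic('path/to/scene')                   # reads path/to/scene/_camera_settings.json
#   E = loader.get_extrinsic('path/to/scene/frame_0000.json')   # 4x4 [R | t; 0 1] built from the xyzw quaternion
#   P = K @ E[:3, :]   # 3x4 projection matrix, assuming E maps world coordinates to the camera frame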