├── VERSION.md
├── .flake8
├── tools
│   ├── requirements.txt
│   ├── format_code.py
│   └── utils
│       └── render_markdown.py
├── .gitignore
├── docs
│   ├── resources
│   │   ├── mace_menu.png
│   │   ├── tf_graph.png
│   │   ├── audio_folder.png
│   │   ├── jaw_vertices.png
│   │   ├── maya_ace_mll.png
│   │   ├── model_files.png
│   │   ├── a2f_evolution.png
│   │   ├── a2f_tf_pipeline.png
│   │   ├── claire_animated.gif
│   │   ├── lip_dist_vertex.png
│   │   ├── lip_size_vertex.png
│   │   ├── mace_model_set.png
│   │   ├── train_full_name.png
│   │   ├── a2f_selection_set.png
│   │   ├── claire_prep_scene.png
│   │   ├── claire_usd_sample.png
│   │   ├── ds_files_level_0.png
│   │   ├── export_cache_menu.png
│   │   ├── mace_trt_manager.png
│   │   ├── mark_claire_james.png
│   │   ├── preproc_full_name.png
│   │   ├── windows_features.png
│   │   ├── claire_all_connected.png
│   │   ├── claire_sample_scene.png
│   │   ├── claire_skin_geom_vis.png
│   │   ├── mace_play_james_a2f.gif
│   │   ├── maya_cache_settings.png
│   │   ├── tongue_rigid_vertex.png
│   │   ├── upper_lower_eyelids.png
│   │   ├── cache_files_in_folder.png
│   │   ├── claire_mesh_disappear.png
│   │   ├── mace_attribute_editor.png
│   │   ├── tf_highlevel_overview.png
│   │   ├── tongue_bs_connections.png
│   │   ├── claire_maya_sample_scene.png
│   │   ├── functionality_data_layers.png
│   │   ├── mace_a2f_create_on_mesh.png
│   │   ├── tongue_mesh_connections.png
│   │   ├── turn_windows_features_on.png
│   │   └── claire_mesh_nodes_connection.png
│   └── a2f_introduction.md
├── .env.example
├── CHANGELOG.md
├── audio2face
│   ├── __init__.py
│   ├── deps
│   │   ├── __init__.py
│   │   ├── charsiu
│   │   │   ├── __init__.py
│   │   │   ├── src
│   │   │   │   └── __init__.py
│   │   │   ├── LICENSE
│   │   │   └── README.md
│   │   └── motion_diffusion_model
│   │       ├── __init__.py
│   │       ├── utils
│   │       │   ├── __init__.py
│   │       │   └── misc.py
│   │       ├── diffusion
│   │       │   ├── __init__.py
│   │       │   ├── script_utils.py
│   │       │   ├── losses.py
│   │       │   ├── respace.py
│   │       │   ├── resample.py
│   │       │   └── nn.py
│   │       └── LICENSE
│   ├── geometry
│   │   ├── __init__.py
│   │   ├── anim_cache.py
│   │   ├── xform.py
│   │   ├── point_cache.py
│   │   └── pca.py
│   ├── networks
│   │   ├── __init__.py
│   │   └── base.py
│   ├── preproc
│   │   ├── __init__.py
│   │   └── preproc.py
│   ├── config_base
│   │   ├── __init__.py
│   │   ├── config_inference_regression_base.py
│   │   ├── exposed.py
│   │   ├── config_inference_diffusion_base.py
│   │   ├── config_dataset_base.py
│   │   └── config_preproc_base.py
│   ├── audio
│   │   ├── __init__.py
│   │   ├── utils.py
│   │   └── audio_track.py
│   ├── phoneme.py
│   ├── emotion.py
│   └── convert_onnx.py
├── CITATION.md
├── .vscode
│   ├── settings.json
│   ├── tasks.json
│   └── launch.json
├── docker
│   ├── build_docker.sh
│   ├── pyproject.toml
│   ├── Dockerfile
│   ├── utils.sh
│   ├── run_deploy.sh
│   ├── run_shell.sh
│   ├── run_train.sh
│   ├── run_preproc.sh
│   └── run_inference.sh
├── configs
│   ├── example-regression
│   │   ├── config_inference.py
│   │   ├── config_preproc.py
│   │   └── config_dataset.py
│   ├── example-regression-min
│   │   ├── config_inference.py
│   │   ├── config_preproc.py
│   │   └── config_dataset.py
│   ├── example-diffusion
│   │   ├── config_preproc.py
│   │   ├── config_inference.py
│   │   └── config_dataset.py
│   └── example-diffusion-min
│       ├── config_inference.py
│       ├── config_preproc.py
│       └── config_dataset.py
├── run_train.py
├── run_preproc.py
├── run_deploy.py
├── SECURITY.md
├── run_inference.py
├── runners
│   ├── run_preproc_debug.py
│   ├── run_deploy_debug.py
│   ├── run_train_debug.py
│   ├── run_preproc.py
│   ├── run_deploy.py
│   ├── run_inference_debug.py
│   ├── run_train.py
│   └── run_inference.py
├── utils.py
└── CONTRIBUTING.md

/VERSION.md:
--------------------------------------------------------------------------------
1 | 1.0.1
2 | 
--------------------------------------------------------------------------------

/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E203,E402
3 | max-line-length = 120
4 | 
--------------------------------------------------------------------------------
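The `.flake8` settings above work together with the pinned tools in `/tools/requirements.txt` below and the Black wrapper in `/tools/format_code.py`. A minimal sketch of the lint/format workflow from the repo root — this workflow is implied by the configs rather than documented in the dump, so treat it as an assumption:

```bash
# Install the pinned lint/format tools (versions from tools/requirements.txt)
pip install -r tools/requirements.txt

# Reformat the repo with Black at the 120-column limit (wraps tools/format_code.py)
python tools/format_code.py

# Lint with the .flake8 config above (E203 and E402 are ignored there)
flake8 .
```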
/tools/requirements.txt:
--------------------------------------------------------------------------------
1 | black==23.10.0
2 | flake8==6.1.0
3 | markdown2==2.4.10
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.pyc
3 | .ipynb_checkpoints
4 | /.env
5 | /etc/
6 | /TODO.md
7 | /docker/requirements.txt
--------------------------------------------------------------------------------

/docs/resources/mace_menu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/mace_menu.png
--------------------------------------------------------------------------------

/docs/resources/tf_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/tf_graph.png
--------------------------------------------------------------------------------

/docs/resources/audio_folder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/audio_folder.png
--------------------------------------------------------------------------------

/docs/resources/jaw_vertices.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/jaw_vertices.png
--------------------------------------------------------------------------------

/docs/resources/maya_ace_mll.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/maya_ace_mll.png
--------------------------------------------------------------------------------

/docs/resources/model_files.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/model_files.png
--------------------------------------------------------------------------------

/docs/resources/a2f_evolution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/a2f_evolution.png
--------------------------------------------------------------------------------

/docs/resources/a2f_tf_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/a2f_tf_pipeline.png
--------------------------------------------------------------------------------

/docs/resources/claire_animated.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/claire_animated.gif
--------------------------------------------------------------------------------

/docs/resources/lip_dist_vertex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/lip_dist_vertex.png
--------------------------------------------------------------------------------
/docs/resources/lip_size_vertex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/lip_size_vertex.png
--------------------------------------------------------------------------------

/docs/resources/mace_model_set.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/mace_model_set.png
--------------------------------------------------------------------------------

/docs/resources/train_full_name.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/train_full_name.png
--------------------------------------------------------------------------------

/docs/resources/a2f_selection_set.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/a2f_selection_set.png
--------------------------------------------------------------------------------

/docs/resources/claire_prep_scene.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/claire_prep_scene.png
--------------------------------------------------------------------------------

/docs/resources/claire_usd_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/claire_usd_sample.png
--------------------------------------------------------------------------------

/docs/resources/ds_files_level_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/ds_files_level_0.png
--------------------------------------------------------------------------------

/docs/resources/export_cache_menu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/export_cache_menu.png
--------------------------------------------------------------------------------

/docs/resources/mace_trt_manager.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/mace_trt_manager.png
--------------------------------------------------------------------------------

/docs/resources/mark_claire_james.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/mark_claire_james.png
--------------------------------------------------------------------------------

/docs/resources/preproc_full_name.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/preproc_full_name.png
--------------------------------------------------------------------------------
/docs/resources/windows_features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/windows_features.png
--------------------------------------------------------------------------------

/docs/resources/claire_all_connected.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/claire_all_connected.png
--------------------------------------------------------------------------------

/docs/resources/claire_sample_scene.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/claire_sample_scene.png
--------------------------------------------------------------------------------

/docs/resources/claire_skin_geom_vis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/claire_skin_geom_vis.png
--------------------------------------------------------------------------------

/docs/resources/mace_play_james_a2f.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/mace_play_james_a2f.gif
--------------------------------------------------------------------------------

/docs/resources/maya_cache_settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/maya_cache_settings.png
--------------------------------------------------------------------------------

/docs/resources/tongue_rigid_vertex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/tongue_rigid_vertex.png
--------------------------------------------------------------------------------

/docs/resources/upper_lower_eyelids.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/upper_lower_eyelids.png
--------------------------------------------------------------------------------

/docs/resources/cache_files_in_folder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/cache_files_in_folder.png
--------------------------------------------------------------------------------

/docs/resources/claire_mesh_disappear.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/claire_mesh_disappear.png
--------------------------------------------------------------------------------

/docs/resources/mace_attribute_editor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/mace_attribute_editor.png
--------------------------------------------------------------------------------
/docs/resources/tf_highlevel_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/tf_highlevel_overview.png
--------------------------------------------------------------------------------

/docs/resources/tongue_bs_connections.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/tongue_bs_connections.png
--------------------------------------------------------------------------------

/docs/resources/claire_maya_sample_scene.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/claire_maya_sample_scene.png
--------------------------------------------------------------------------------

/docs/resources/functionality_data_layers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/functionality_data_layers.png
--------------------------------------------------------------------------------

/docs/resources/mace_a2f_create_on_mesh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/mace_a2f_create_on_mesh.png
--------------------------------------------------------------------------------

/docs/resources/tongue_mesh_connections.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/tongue_mesh_connections.png
--------------------------------------------------------------------------------

/docs/resources/turn_windows_features_on.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/turn_windows_features_on.png
--------------------------------------------------------------------------------

/docs/resources/claire_mesh_nodes_connection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/Audio2Face-3D-Training-Framework/main/docs/resources/claire_mesh_nodes_connection.png
--------------------------------------------------------------------------------

/.env.example:
--------------------------------------------------------------------------------
1 | # This will be mapped to "/datasets" path in the docker container
2 | A2F_DATASETS_ROOT="/path/to/local/audio2face/datasets"
3 | 
4 | # This will be mapped to "/workspace" path in the docker container
5 | A2F_WORKSPACE_ROOT="/path/to/local/audio2face/workspace"
--------------------------------------------------------------------------------
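These two variables are the only host-side paths the docker wrappers need: per the comments above, they are mounted into the container as `/datasets` and `/workspace`, and `docker/utils.sh` (shown further below) sources `.env` before each run. A minimal setup sketch — the example directories are placeholders:

```bash
# .env itself is gitignored, so start from the template
cp .env.example .env

# Then edit .env so both variables point at real host directories, e.g.:
# A2F_DATASETS_ROOT="/mnt/data/audio2face/datasets"
# A2F_WORKSPACE_ROOT="/mnt/data/audio2face/workspace"
```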
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 4 | 
 5 | ## [1.0.1] - 2025-09-25
 6 | ### Added
 7 | - Minor note about git LFS in the README
 8 | 
 9 | ## [1.0.0] - 2025-09-20
10 | ### Added
11 | - First public version of Audio2Face-3D Training Framework
12 | 
--------------------------------------------------------------------------------

/audio2face/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
--------------------------------------------------------------------------------

/CITATION.md:
--------------------------------------------------------------------------------
 1 | # Citation Guide
 2 | 
 3 | ## To Cite Audio2Face-3D Training Framework
 4 | 
 5 | If you use Audio2Face-3D Training Framework in a publication, please use citations in the following format (BibTeX entry for LaTeX):
 6 | ```bibtex
 7 | @misc{
 8 |     nvidia2025audio2face3d,
 9 |     title={Audio2Face-3D: Audio-driven Realistic Facial Animation For Digital Avatars},
10 |     author={Chaeyeon Chung and Ilya Fedorov and Michael Huang and Aleksey Karmanov and Dmitry Korobchenko and Roger Ribera and Yeongho Seol},
11 |     year={2025},
12 |     eprint={2508.16401},
13 |     archivePrefix={arXiv},
14 |     primaryClass={cs.GR},
15 |     url={https://arxiv.org/abs/2508.16401},
16 |     note={Authors listed in alphabetical order}
17 | }
18 | ```
19 | 
--------------------------------------------------------------------------------

/audio2face/deps/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
--------------------------------------------------------------------------------

/audio2face/geometry/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
--------------------------------------------------------------------------------
/audio2face/networks/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
--------------------------------------------------------------------------------

/audio2face/preproc/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
--------------------------------------------------------------------------------

/audio2face/config_base/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
--------------------------------------------------------------------------------
/audio2face/deps/charsiu/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
--------------------------------------------------------------------------------

/audio2face/deps/charsiu/src/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
--------------------------------------------------------------------------------

/audio2face/deps/motion_diffusion_model/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
--------------------------------------------------------------------------------

/audio2face/deps/motion_diffusion_model/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
--------------------------------------------------------------------------------
/audio2face/deps/motion_diffusion_model/diffusion/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
--------------------------------------------------------------------------------

/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "files.eol": "\n",
 3 |     "files.insertFinalNewline": true,
 4 |     "files.trimFinalNewlines": true,
 5 |     "files.trimTrailingWhitespace": true,
 6 |     "diffEditor.ignoreTrimWhitespace": false,
 7 |     "editor.defaultFormatter": "ms-python.black-formatter",
 8 |     "black-formatter.args": [
 9 |         "--line-length",
10 |         "120",
11 |     ],
12 |     "flake8.args": [
13 |         "--max-line-length=120",
14 |         "--ignore=E203,E402",
15 |     ],
16 |     "editor.rulers": [
17 |         120
18 |     ],
19 |     "python.analysis.exclude": [
20 |     ],
21 |     "spellright.language": [
22 |         "en"
23 |     ],
24 |     "spellright.documentTypes": [
25 |         "markdown",
26 |         "latex",
27 |         "plaintext",
28 |         "asciidoc",
29 |         "python"
30 |     ],
31 |     "spellright.notificationClass": "warning",
32 | }
33 | 
--------------------------------------------------------------------------------

/docker/build_docker.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 | # SPDX-License-Identifier: Apache-2.0
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
19 | 
20 | docker build -t audio2face-framework-env "${SCRIPT_DIR}"
21 | 
--------------------------------------------------------------------------------
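A sketch of the expected build-and-check flow, assuming Docker with the NVIDIA Container Toolkit is installed; the image name comes from `build_docker.sh` above, and the smoke test is a hypothetical check rather than part of the repo:

```bash
# Build the training image defined by docker/Dockerfile
./docker/build_docker.sh

# Hypothetical smoke test: confirm the container sees a GPU and the
# pinned CUDA 12.8 PyTorch build from docker/pyproject.toml
docker run --rm --gpus all audio2face-framework-env \
    python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
```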
/audio2face/audio/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from .audio_track import AudioTrack  # noqa
17 | from .utils import read_audio_track, read_and_preproc_audio_track, generate_audio_noise  # noqa
18 | 
--------------------------------------------------------------------------------

/docker/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "audio2face-framework"
 3 | version = "1.0.0"
 4 | description = "Audio2Face-3D Training Framework"
 5 | authors = ["NVIDIA"]
 6 | 
 7 | [tool.poetry.dependencies]
 8 | python = "~3.10.0"
 9 | numpy = "1.26.0"
10 | scipy = "1.15.3"
11 | scikit-learn = "1.7.0rc1"
12 | SoundFile = "0.11.0"
13 | protobuf = "4.25.1"
14 | tensorboard = "2.14.0"
15 | onnx = "1.17.0"
16 | cupy-cuda12x = "13.5.1"
17 | torch = { url = "https://download.pytorch.org/whl/cu128/torch-2.7.1%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl" }
18 | torchaudio = { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.7.1%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl" }
19 | transformers = "4.52.3"
20 | praatio = "6.2.0"
21 | timm = "1.0.11"
22 | tqdm = "4.66.5"
23 | audiomentations = "0.40.0"
24 | 
25 | [build-system]
26 | requires = ["poetry-core>=1.7.1"]
27 | build-backend = "poetry.core.masonry.api"
28 | 
--------------------------------------------------------------------------------

/.vscode/tasks.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": "2.0.0",
 3 |     "tasks": [
 4 |         {
 5 |             "label": "flake8 lint",
 6 |             "type": "shell",
 7 |             "command": "flake8",
 8 |             "args": [
 9 |                 "."
10 |             ],
11 |             "presentation": {
12 |                 "reveal": "always",
13 |                 "panel": "new"
14 |             },
15 |             "group": "build",
16 |             "problemMatcher": {
17 |                 "owner": "python",
18 |                 "fileLocation": ["relative", "${workspaceFolder}"],
19 |                 "pattern": {
20 |                     "regexp": "^(.*):(\\d+):(\\d+): (\\w\\d+) (.*)$",
21 |                     "file": 1,
22 |                     "line": 2,
23 |                     "column": 3,
24 |                     "code": 4,
25 |                     "message": 5
26 |                 }
27 |             }
28 |         }
29 |     ]
30 | }
31 | 
--------------------------------------------------------------------------------
/configs/example-regression/config_inference.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | # Audio2Face Inference Config (Post-processing params, etc)
17 | CONFIG = {
18 |     "source_shot": "cp1_neutral",
19 |     "source_frame": 220,
20 | }
21 | 
--------------------------------------------------------------------------------
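This per-experiment CONFIG only lists the two keys it changes; the remaining post-processing parameters presumably come from `audio2face/config_base/config_inference_regression_base.py`, shown further below. A minimal sketch of that override pattern — the `merged` helper here is a hypothetical illustration, not the framework's actual loader:

```python
# Hypothetical illustration of base config + per-experiment overrides
from audio2face.config_base.config_inference_regression_base import CONFIG as BASE_CONFIG

OVERRIDES = {"source_shot": "cp1_neutral", "source_frame": 220}

merged = {**BASE_CONFIG, **OVERRIDES}  # experiment values win over base defaults
print(merged["source_shot"], merged["input_strength"])  # -> cp1_neutral 1.3
```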
/configs/example-regression-min/config_inference.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | # Audio2Face Inference Config (Post-processing params, etc)
17 | CONFIG = {
18 |     "source_shot": "cp1_neutral",
19 |     "source_frame": 220,
20 | }
21 | 
--------------------------------------------------------------------------------

/tools/format_code.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | import os
17 | import subprocess
18 | 
19 | parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
20 | subprocess.run(["black", "--line-length", "120", parent_dir])
21 | 
--------------------------------------------------------------------------------

/configs/example-diffusion/config_preproc.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | # The name of the Preprocessing run (use different names for different hyper-parameters or datasets)
17 | RUN_NAME = "example"
18 | 
19 | # Additional information describing the Preprocessing run, will be saved to /configs/info.txt
20 | RUN_INFO = ""
21 | 
--------------------------------------------------------------------------------

/configs/example-regression/config_preproc.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | # The name of the Preprocessing run (use different names for different hyper-parameters or datasets)
17 | RUN_NAME = "example"
18 | 
19 | # Additional information describing the Preprocessing run, will be saved to /configs/info.txt
20 | RUN_INFO = ""
21 | 
--------------------------------------------------------------------------------

/configs/example-diffusion/config_inference.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | # Python Inference parameters for diffusion
17 | CONDITION = "claire"
18 | ACTOR_NAME = "claire"
19 | EMOTION_LABEL = "neutral"
20 | TIMESTEP_RESPACING = "2"
21 | INFERENCE_MODE = "offline"
22 | AUDIO_PATH = "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/audio/eg1_neutral.wav"
23 | INFERENCE_OUTPUT_ROOT = "/workspace/output_inference"
24 | 
--------------------------------------------------------------------------------
/configs/example-diffusion-min/config_inference.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | # Python Inference parameters for diffusion
17 | CONDITION = "claire"
18 | ACTOR_NAME = "claire"
19 | EMOTION_LABEL = "neutral"
20 | TIMESTEP_RESPACING = "2"
21 | INFERENCE_MODE = "offline"
22 | AUDIO_PATH = "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/audio/eg1_neutral.wav"
23 | INFERENCE_OUTPUT_ROOT = "/workspace/output_inference"
24 | 
--------------------------------------------------------------------------------

/configs/example-diffusion-min/config_preproc.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | # The name of the Preprocessing run (use different names for different hyper-parameters or datasets)
17 | RUN_NAME = "example-min"
18 | 
19 | # Additional information describing the Preprocessing run, will be saved to /configs/info.txt
20 | RUN_INFO = "Minimal preprocessing setup - only requires AUDIO_ROOT and SKIN_CACHE_ROOT"
21 | 
--------------------------------------------------------------------------------

/configs/example-regression-min/config_preproc.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | # The name of the Preprocessing run (use different names for different hyper-parameters or datasets)
17 | RUN_NAME = "example-min"
18 | 
19 | # Additional information describing the Preprocessing run, will be saved to /configs/info.txt
20 | RUN_INFO = "Minimal preprocessing setup - only requires AUDIO_ROOT and SKIN_CACHE_ROOT"
21 | 
--------------------------------------------------------------------------------
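The RUN_INFO strings above state that the minimal setups only require AUDIO_ROOT and SKIN_CACHE_ROOT. The repo's `config_dataset.py` files are not reproduced in this dump, so the following is a purely hypothetical sketch of what such a minimal dataset config might contain — the variable names come from RUN_INFO, the path style from the inference configs, and the directory names are placeholders:

```python
# Hypothetical minimal config_dataset.py: the two inputs RUN_INFO says are required
AUDIO_ROOT = "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/audio"  # per-shot .wav files
SKIN_CACHE_ROOT = "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/cache_skin"  # matching vertex caches
```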
/audio2face/deps/charsiu/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 jzhu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------

/audio2face/deps/motion_diffusion_model/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Guy Tevet
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | FROM nvidia/cuda:12.8.1-devel-ubuntu22.04
17 | 
18 | LABEL maintainer="Dmitry Korobchenko "
19 | 
20 | RUN apt-get update && apt-get install -y clang libtool-bin git python3-pip libsndfile1-dev
21 | 
22 | RUN ln -sf /usr/bin/python3 /usr/bin/python
23 | 
24 | RUN pip3 install poetry==1.8.2
25 | COPY poetry.lock pyproject.toml /poetry/
26 | RUN cd /poetry/ && \
27 |     poetry config virtualenvs.create false && \
28 |     poetry install --no-interaction --no-ansi --no-root
29 | 
--------------------------------------------------------------------------------

/audio2face/geometry/anim_cache.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | import os
17 | import numpy as np
18 | 
19 | from audio2face.geometry import maya_cache, point_cache
20 | 
21 | 
22 | def read_cache(fpath: str) -> np.ndarray:
23 |     _, ext = os.path.splitext(fpath)
24 |     if ext == ".npy":
25 |         return np.load(fpath)
26 |     elif ext == ".xml":
27 |         return maya_cache.read_cache_mc(fpath)
28 |     elif ext == ".pc2":
29 |         return point_cache.read_cache_pc2(fpath)
30 |     else:
31 |         raise ValueError(f"Unable to read Animation Cache, unrecognized file ext: {ext}")
32 | 
--------------------------------------------------------------------------------
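`read_cache` dispatches on file extension, so Maya `.xml` caches, `.pc2` point caches, and raw `.npy` arrays all come back as a single NumPy array through the repo-internal `maya_cache` and `point_cache` readers. A usage sketch — the cache path is a placeholder, and the frames-by-vertices-by-3 shape is an assumption based on how the geometry helpers consume poses:

```python
# Hypothetical usage of audio2face.geometry.anim_cache.read_cache
from audio2face.geometry.anim_cache import read_cache

cache = read_cache("/workspace/caches/claire_skin.pc2")  # placeholder path
print(cache.shape)  # assumed (num_frames, num_vertices, 3) for a vertex cache
```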
/audio2face/geometry/xform.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | import numpy as np
17 | 
18 | 
19 | def rigidXform(aPose: np.ndarray, bPose: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
20 |     aMean = np.mean(aPose, axis=0)
21 |     aDelta = aPose - aMean
22 | 
23 |     bMean = np.mean(bPose, axis=0)
24 |     bDelta = bPose - bMean
25 | 
26 |     H = np.dot(bDelta.T, aDelta)
27 |     U, s, V = np.linalg.svd(H)
28 | 
29 |     R = np.dot(V.T, U.T)
30 |     eye = np.eye(3)
31 |     eye[2, 2] = np.linalg.det(R)
32 |     R = np.dot(np.dot(V.T, eye), U.T)
33 | 
34 |     RR = R.T
35 |     tt = aMean - np.dot(bMean, R.T)
36 | 
37 |     return RR, tt
38 | 
--------------------------------------------------------------------------------
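`rigidXform` is a Kabsch-style rigid alignment: it centers both point sets, takes the SVD of the cross-covariance `H`, and uses the `det(R)` correction on the last singular direction so the result is a proper rotation rather than a reflection. A small self-check sketch, under the assumption (implied by `tt = aMean - np.dot(bMean, R.T)`) that poses are N-by-3 arrays of row vectors and the returned `RR`, `tt` map `bPose` onto `aPose` as `bPose @ RR + tt`:

```python
# Hypothetical self-check for audio2face.geometry.xform.rigidXform
import numpy as np
from audio2face.geometry.xform import rigidXform

rng = np.random.default_rng(0)
bPose = rng.standard_normal((100, 3))

# Build a known rigid transform: rotation about the z axis plus a translation
theta = 0.3
Q = np.array([
    [np.cos(theta), np.sin(theta), 0.0],
    [-np.sin(theta), np.cos(theta), 0.0],
    [0.0, 0.0, 1.0],
])
t = np.array([1.0, -2.0, 0.5])
aPose = bPose @ Q + t

RR, tt = rigidXform(aPose, bPose)
assert np.allclose(bPose @ RR + tt, aPose, atol=1e-8)  # recovered transform reproduces aPose
```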
/run_train.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | import os
17 | import json
18 | import argparse
19 | 
20 | from utils import validate_identifier_or_exit, load_config, run_process
21 | 
22 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
23 | 
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument("config_name", type=str, help="Config Name")
26 | args = parser.parse_args()
27 | 
28 | validate_identifier_or_exit(args.config_name, "CONFIG_NAME")
29 | 
30 | print(f"Using Config Name: {args.config_name}")
31 | 
32 | run_process(
33 |     [
34 |         os.path.join(ROOT_DIR, "docker", "run_train.sh"),
35 |         json.dumps(load_config(args.config_name, "train")),
36 |         json.dumps(load_config(args.config_name, "dataset")),
37 |         json.dumps(load_config(args.config_name, "inference")),
38 |     ]
39 | )
40 | 
--------------------------------------------------------------------------------

/run_preproc.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | import os
17 | import json
18 | import argparse
19 | 
20 | from utils import validate_identifier_or_exit, load_config, run_process
21 | 
22 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
23 | 
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument("config_name", type=str, help="Config Name")
26 | parser.add_argument("actor_name", type=str, help="Actor Name")
27 | args = parser.parse_args()
28 | 
29 | validate_identifier_or_exit(args.config_name, "CONFIG_NAME")
30 | validate_identifier_or_exit(args.actor_name, "ACTOR_NAME")
31 | 
32 | print(f"Using Config Name: {args.config_name}")
33 | print(f"Using Actor Name: {args.actor_name}")
34 | 
35 | run_process(
36 |     [
37 |         os.path.join(ROOT_DIR, "docker", "run_preproc.sh"),
38 |         args.actor_name,
39 |         json.dumps(load_config(args.config_name, "preproc")),
40 |         json.dumps(load_config(args.config_name, "dataset")),
41 |     ]
42 | )
43 | 
--------------------------------------------------------------------------------

/run_deploy.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | import os
17 | import json
18 | import argparse
19 | 
20 | from utils import validate_identifier_or_exit, load_config, run_process
21 | 
22 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
23 | 
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument("config_name", type=str, help="Config Name")
26 | parser.add_argument("training_run_name_full", type=str, help="Training Run Name Full")
27 | args = parser.parse_args()
28 | 
29 | validate_identifier_or_exit(args.config_name, "CONFIG_NAME")
30 | validate_identifier_or_exit(args.training_run_name_full, "TRAINING_RUN_NAME_FULL")
31 | 
32 | print(f"Using Config Name: {args.config_name}")
33 | print(f"Using Training Run Name Full: {args.training_run_name_full}")
34 | 
35 | run_process(
36 |     [
37 |         os.path.join(ROOT_DIR, "docker", "run_deploy.sh"),
38 |         args.training_run_name_full,
39 |         json.dumps(load_config(args.config_name, "train")),
40 |         json.dumps(load_config(args.config_name, "inference", optional=True)),
41 |     ]
42 | )
43 | 
--------------------------------------------------------------------------------
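These three wrappers share one pattern: validate the CLI identifiers, serialize the named configs from `configs/<config_name>/` to JSON, and hand them to the matching `docker/run_*.sh` script. A hedged usage sketch of the full pipeline with the shipped example config — the actor name matches the sample configs, while the training-run name in the deploy step is a hypothetical placeholder for the name generated by the training run:

```bash
# Preprocess the dataset for one actor using configs/example-regression/
python run_preproc.py example-regression claire

# Train with the same config name
python run_train.py example-regression

# Deploy a finished run; "example_claire_0001" stands in for a real
# TRAINING_RUN_NAME_FULL produced by the training step
python run_deploy.py example-regression example_claire_0001
```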
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | function read_env_file() {
17 |     SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
18 |     ENV_FPATH="${SCRIPT_DIR}/../.env"
19 |     if [ ! -e "$ENV_FPATH" ]; then
20 |         echo "File $ENV_FPATH does not exist."
21 |         exit 1
22 |     fi
23 |     echo "==============================================================================================================="
24 |     cat "$ENV_FPATH"
25 |     echo "==============================================================================================================="
26 |     set -o allexport
27 |     source "$ENV_FPATH"
28 |     set +o allexport
29 | }
30 | 
31 | function wrap_and_escape_args() {
32 |     # Wrapping each argument with single quotes and escaping
33 |     if [ "$#" -eq 0 ]; then
34 |         echo ""
35 |     else
36 |         for arg in "$@"; do
37 |             printf "'%s' " "$(printf "%s" "$arg" | sed "s/'/'\"'\"'/g")"
38 |         done
39 |     fi
40 | }
41 | 
42 | function compose_docker_hostname() {
43 |     echo "$(hostname)__${USER}__docker"
44 | }
45 | 
-------------------------------------------------------------------------------- /SECURITY.md: --------------------------------------------------------------------------------
1 | ## Security
2 | 
3 | NVIDIA is dedicated to the security and trust of our software products and services, including all source code repositories managed through our organization.
4 | 
5 | If you need to report a security issue, please use the appropriate contact points outlined below. **Please do not report security vulnerabilities through GitHub.**
6 | 
7 | ## Reporting Potential Security Vulnerability in an NVIDIA Product
8 | 
9 | To report a potential security vulnerability in any NVIDIA product:
10 | - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html)
11 | - E-Mail: psirt@nvidia.com
12 |     - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key)
13 |     - Please include the following information:
14 |         - Product/Driver name and version/branch that contains the vulnerability
15 |         - Type of vulnerability (code execution, denial of service, buffer overflow, etc.)
16 |         - Instructions to reproduce the vulnerability
17 |         - Proof-of-concept or exploit code
18 |         - Potential impact of the vulnerability, including how an attacker could exploit the vulnerability
19 | 
20 | While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information.
21 | 22 | ## NVIDIA Product Security 23 | 24 | For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security 25 | -------------------------------------------------------------------------------- /audio2face/config_base/config_inference_regression_base.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # Audio2Face Inference Config (Post-processing params, etc) 17 | CONFIG = { 18 | "input_strength": 1.3, 19 | "upper_face_smoothing": 0.001, 20 | "lower_face_smoothing": 0.0023, 21 | "upper_face_strength": 1.0, 22 | "lower_face_strength": 1.7, 23 | "face_mask_level": 0.6, 24 | "face_mask_softness": 0.0085, 25 | "source_shot": None, 26 | "source_frame": None, 27 | "skin_strength": 1.1, 28 | "blink_strength": 1.0, 29 | "lower_teeth_strength": 1.3, 30 | "lower_teeth_height_offset": -0.1, 31 | "lower_teeth_depth_offset": 0.25, 32 | "lip_open_offset": -0.05, 33 | "tongue_strength": 1.5, 34 | "tongue_height_offset": 0.2, 35 | "tongue_depth_offset": -0.3, 36 | "eyeballs_strength": 1.0, 37 | "saccade_strength": 0.9, 38 | "right_eye_rot_x_offset": 0.0, 39 | "right_eye_rot_y_offset": -2.0, 40 | "left_eye_rot_x_offset": 0.0, 41 | "left_eye_rot_y_offset": 2.0, 42 | "eyelid_open_offset": 0.06, 43 | "eye_saccade_seed": 0, 44 | } 45 | -------------------------------------------------------------------------------- /run_inference.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
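The regression inference defaults above are plain Python data, so experiments can start from `CONFIG` and override individual post-processing knobs; a minimal sketch (the overridden values are illustrative):

```python
from audio2face.config_base.config_inference_regression_base import CONFIG

assert CONFIG["input_strength"] == 1.3  # default straight from the base module

# Override selected knobs without touching the rest
tuned = {**CONFIG, "lower_teeth_strength": 1.0, "eyelid_open_offset": 0.0}
assert tuned["tongue_strength"] == 1.5  # untouched keys keep their defaults
```

Per-experiment overrides normally live in `configs/<name>/config_inference.py`, which `load_config()` in utils.py turns into a dict of the module's globals.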
15 | #
16 | import os
17 | import json
18 | import argparse
19 | 
20 | from utils import validate_identifier_or_exit, load_config, run_process
21 | 
22 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
23 | 
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument("config_name", type=str, help="Config Name")
26 | parser.add_argument("training_run_name_full", type=str, help="Training Run Name Full")
27 | args = parser.parse_args()
28 | 
29 | validate_identifier_or_exit(args.config_name, "CONFIG_NAME")
30 | validate_identifier_or_exit(args.training_run_name_full, "TRAINING_RUN_NAME_FULL")
31 | 
32 | print(f"Using Config Name: {args.config_name}")
33 | print(f"Using Training Run Name Full: {args.training_run_name_full}")
34 | 
35 | run_process(
36 |     [
37 |         os.path.join(ROOT_DIR, "docker", "run_inference.sh"),
38 |         args.training_run_name_full,
39 |         json.dumps(load_config(args.config_name, "train")),
40 |         json.dumps(load_config(args.config_name, "dataset")),
41 |         json.dumps(load_config(args.config_name, "inference")),
42 |     ]
43 | )
44 | 
-------------------------------------------------------------------------------- /docker/run_deploy.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: Apache-2.0
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | WORKING_DIR="/framework"
19 | RUN_CMD="python runners/run_deploy.py"
20 | 
21 | SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
22 | A2F_FRAMEWORK_ROOT="$(dirname "${SCRIPT_DIR}")"
23 | 
24 | source "${SCRIPT_DIR}/utils.sh"
25 | read_env_file
26 | 
27 | GIT_SAFE_DIR_CMD="git config --global --add safe.directory /framework"
28 | SCRIPT_ARGS_WRAPPED=$(wrap_and_escape_args "$@")
29 | DOCKER_CMD="${GIT_SAFE_DIR_CMD} && cd ${WORKING_DIR} && ${RUN_CMD} ${SCRIPT_ARGS_WRAPPED}"
30 | echo "${DOCKER_CMD}"
31 | echo "==============================================================================================================="
32 | 
33 | docker run \
34 |     --gpus all --cpus 20 \
35 |     -v "$A2F_DATASETS_ROOT":/datasets \
36 |     -v "$A2F_WORKSPACE_ROOT":/workspace \
37 |     -v "$A2F_FRAMEWORK_ROOT":/framework \
38 |     -e EXTERNAL_A2F_DATASETS_ROOT="$A2F_DATASETS_ROOT" \
39 |     -e EXTERNAL_A2F_WORKSPACE_ROOT="$A2F_WORKSPACE_ROOT" \
40 |     -e EXTERNAL_A2F_FRAMEWORK_ROOT="$A2F_FRAMEWORK_ROOT" \
41 |     --hostname $(compose_docker_hostname) \
42 |     audio2face-framework-env:latest \
43 |     /bin/bash -c "${DOCKER_CMD}"
44 | 
-------------------------------------------------------------------------------- /docker/run_shell.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
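`run_deploy.py` and `run_inference.py` above share the same calling convention: a config name plus the full name of a finished training run. The run name below is a placeholder produced by the training stage, not a real artifact:

```bash
python run_deploy.py example-diffusion-min my_training_run_full_name
python run_inference.py example-diffusion-min my_training_run_full_name
```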
4 | # SPDX-License-Identifier: Apache-2.0
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | WORKING_DIR="/framework"
19 | 
20 | SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
21 | A2F_FRAMEWORK_ROOT="$(dirname "${SCRIPT_DIR}")"
22 | 
23 | source "${SCRIPT_DIR}/utils.sh"
24 | read_env_file
25 | 
26 | GIT_SAFE_DIR_CMD="git config --global --add safe.directory /framework"
27 | DOCKER_CMD="${GIT_SAFE_DIR_CMD} && cd ${WORKING_DIR} && exec bash"
28 | echo "${DOCKER_CMD}"
29 | echo "==============================================================================================================="
30 | 
31 | docker run -it \
32 |     --gpus all --cpus 20 \
33 |     --privileged --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
34 |     -v "$A2F_DATASETS_ROOT":/datasets \
35 |     -v "$A2F_WORKSPACE_ROOT":/workspace \
36 |     -v "$A2F_FRAMEWORK_ROOT":/framework \
37 |     -e EXTERNAL_A2F_DATASETS_ROOT="$A2F_DATASETS_ROOT" \
38 |     -e EXTERNAL_A2F_WORKSPACE_ROOT="$A2F_WORKSPACE_ROOT" \
39 |     -e EXTERNAL_A2F_FRAMEWORK_ROOT="$A2F_FRAMEWORK_ROOT" \
40 |     -e PYTHONPATH=/usr/local/lib/python3.10/dist-packages \
41 |     --hostname $(compose_docker_hostname) \
42 |     audio2face-framework-env:latest \
43 |     /bin/bash -c "${DOCKER_CMD}"
44 | 
-------------------------------------------------------------------------------- /docker/run_train.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: Apache-2.0
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | WORKING_DIR="/framework"
19 | RUN_CMD="python runners/run_train.py"
20 | 
21 | SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
22 | A2F_FRAMEWORK_ROOT="$(dirname "${SCRIPT_DIR}")"
23 | 
24 | source "${SCRIPT_DIR}/utils.sh"
25 | read_env_file
26 | 
27 | GIT_SAFE_DIR_CMD="git config --global --add safe.directory /framework"
28 | SCRIPT_ARGS_WRAPPED=$(wrap_and_escape_args "$@")
29 | DOCKER_CMD="${GIT_SAFE_DIR_CMD} && cd ${WORKING_DIR} && ${RUN_CMD} ${SCRIPT_ARGS_WRAPPED}"
30 | echo "${DOCKER_CMD}"
31 | echo "==============================================================================================================="
32 | 
33 | docker run \
34 |     --gpus all --cpus 20 \
35 |     --privileged --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
36 |     -v "$A2F_DATASETS_ROOT":/datasets \
37 |     -v "$A2F_WORKSPACE_ROOT":/workspace \
38 |     -v "$A2F_FRAMEWORK_ROOT":/framework \
39 |     -e EXTERNAL_A2F_DATASETS_ROOT="$A2F_DATASETS_ROOT" \
40 |     -e EXTERNAL_A2F_WORKSPACE_ROOT="$A2F_WORKSPACE_ROOT" \
41 |     -e EXTERNAL_A2F_FRAMEWORK_ROOT="$A2F_FRAMEWORK_ROOT" \
42 |     --hostname $(compose_docker_hostname) \
43 |     audio2face-framework-env:latest \
44 |     /bin/bash -c "${DOCKER_CMD}"
45 | 
-------------------------------------------------------------------------------- /docker/run_preproc.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: Apache-2.0
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | WORKING_DIR="/framework"
19 | RUN_CMD="python runners/run_preproc.py"
20 | 
21 | SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
22 | A2F_FRAMEWORK_ROOT="$(dirname "${SCRIPT_DIR}")"
23 | 
24 | source "${SCRIPT_DIR}/utils.sh"
25 | read_env_file
26 | 
27 | GIT_SAFE_DIR_CMD="git config --global --add safe.directory /framework"
28 | SCRIPT_ARGS_WRAPPED=$(wrap_and_escape_args "$@")
29 | DOCKER_CMD="${GIT_SAFE_DIR_CMD} && cd ${WORKING_DIR} && ${RUN_CMD} ${SCRIPT_ARGS_WRAPPED}"
30 | echo "${DOCKER_CMD}"
31 | echo "==============================================================================================================="
32 | 
33 | docker run \
34 |     --gpus all --cpus 20 \
35 |     --privileged --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
36 |     -v "$A2F_DATASETS_ROOT":/datasets \
37 |     -v "$A2F_WORKSPACE_ROOT":/workspace \
38 |     -v "$A2F_FRAMEWORK_ROOT":/framework \
39 |     -e EXTERNAL_A2F_DATASETS_ROOT="$A2F_DATASETS_ROOT" \
40 |     -e EXTERNAL_A2F_WORKSPACE_ROOT="$A2F_WORKSPACE_ROOT" \
41 |     -e EXTERNAL_A2F_FRAMEWORK_ROOT="$A2F_FRAMEWORK_ROOT" \
42 |     --hostname $(compose_docker_hostname) \
43 |     audio2face-framework-env:latest \
44 |     /bin/bash -c "${DOCKER_CMD}"
45 | 
-------------------------------------------------------------------------------- /docker/run_inference.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: Apache-2.0
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | WORKING_DIR="/framework"
19 | RUN_CMD="python runners/run_inference.py"
20 | 
21 | SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
22 | A2F_FRAMEWORK_ROOT="$(dirname "${SCRIPT_DIR}")"
23 | 
24 | source "${SCRIPT_DIR}/utils.sh"
25 | read_env_file
26 | 
27 | GIT_SAFE_DIR_CMD="git config --global --add safe.directory /framework"
28 | SCRIPT_ARGS_WRAPPED=$(wrap_and_escape_args "$@")
29 | DOCKER_CMD="${GIT_SAFE_DIR_CMD} && cd ${WORKING_DIR} && ${RUN_CMD} ${SCRIPT_ARGS_WRAPPED}"
30 | echo "${DOCKER_CMD}"
31 | echo "==============================================================================================================="
32 | 
33 | docker run \
34 |     --gpus all --cpus 20 \
35 |     --privileged --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
36 |     -v "$A2F_DATASETS_ROOT":/datasets \
37 |     -v "$A2F_WORKSPACE_ROOT":/workspace \
38 |     -v "$A2F_FRAMEWORK_ROOT":/framework \
39 |     -e EXTERNAL_A2F_DATASETS_ROOT="$A2F_DATASETS_ROOT" \
40 |     -e EXTERNAL_A2F_WORKSPACE_ROOT="$A2F_WORKSPACE_ROOT" \
41 |     -e EXTERNAL_A2F_FRAMEWORK_ROOT="$A2F_FRAMEWORK_ROOT" \
42 |     --hostname $(compose_docker_hostname) \
43 |     audio2face-framework-env:latest \
44 |     /bin/bash -c "${DOCKER_CMD}"
45 | 
-------------------------------------------------------------------------------- /audio2face/geometry/point_cache.py: --------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | import struct
17 | import numpy as np
18 | 
19 | 
20 | def read_cache_pc2(fpath: str) -> np.ndarray:
21 |     header_format = "<12siiffi"
22 |     with open(fpath, "rb") as f:
23 |         header_size = struct.calcsize(header_format)
24 |         header = f.read(header_size)
25 |         signature, file_version, num_verts, start_frame, sample_rate, num_frames = struct.unpack(header_format, header)
26 |         if signature.decode() != "POINTCACHE2\0":
27 |             raise ValueError(f"Invalid pc2 file: {fpath}")
28 |         data = np.fromfile(f, dtype=np.float32, count=num_frames * num_verts * 3)
29 |         data = data.reshape(num_frames, num_verts, 3)
30 |     return data
31 | 
32 | 
33 | def write_cache_pc2(fpath: str, data: np.ndarray, sample_rate: float = 1.0) -> None:
34 |     header_format = "<12siiffi"
35 |     if len(data.shape) != 3:
36 |         raise ValueError(f"Invalid data shape: {data.shape}, must be (num_frames, num_verts, 3)")
37 |     num_frames = data.shape[0]
38 |     num_verts = data.shape[1]
39 |     start_frame = 0.0
40 |     file_version = 1
41 |     signature = b"POINTCACHE2\0"
42 |     header = struct.pack(header_format, signature, file_version, num_verts, start_frame, sample_rate, num_frames)
43 |     with open(fpath, "wb") as f:
44 |         f.write(header)
45 |         data.astype("<f4").tofile(f)
46 | 
-------------------------------------------------------------------------------- /audio2face/networks/base.py: --------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | import torch.nn as nn
17 | 
18 | 
19 | class NetworkBaseRegression(nn.Module):
20 |     def __init__(self) -> None:
21 |         super(NetworkBaseRegression, self).__init__()
22 |         self.supported_modes = [
23 |             "train",
24 |             "onnx",
25 |         ]
26 |         self.set_mode("train")
27 | 
28 |     def set_mode(self, mode: str) -> None:
29 |         if mode not in self.supported_modes:
30 |             raise ValueError(f"Unsupported network mode: {mode}")
31 |         self.mode = mode
32 | 
33 | 
34 | class NetworkBaseDiffusion(nn.Module):
35 |     def __init__(self) -> None:
36 |         super(NetworkBaseDiffusion, self).__init__()
37 |         self.supported_modes = [
38 |             "offline",
39 |             "streaming",
40 |             "streaming_stateless",
41 |             "streaming_stateless_output_delta",
42 |             "streaming_stateless_onnx",
43 |             "streaming_stateless_trt",
44 |             "streaming_stateless_output_delta_onnx",
45 |             "streaming_stateless_output_delta_trt",
46 |         ]
47 |         self.set_mode("offline")
48 | 
49 |     def set_mode(self, mode: str) -> None:
50 |         if mode not in self.supported_modes:
51 |             raise ValueError(f"Unsupported network mode: {mode}")
52 |         self.mode = mode
53 |         self.forward = {
54 |             "offline": self.forward_offline,
55 |             "streaming": self.forward_streaming,
56 |             "streaming_stateless": self.forward_streaming_stateless,
57 |             "streaming_stateless_output_delta": self.forward_streaming_stateless_output_delta,
58 |             "streaming_stateless_onnx": self.forward_streaming_stateless_onnx,
59 |             "streaming_stateless_trt": self.forward_streaming_stateless_trt,
60 |             "streaming_stateless_output_delta_onnx": self.forward_streaming_stateless_output_delta_onnx,
61 |             "streaming_stateless_output_delta_trt": self.forward_streaming_stateless_output_delta_trt,
62 |         }[self.mode]
63 | 
-------------------------------------------------------------------------------- /runners/run_train.py: --------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
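A round-trip sketch for the PC2 helpers above (audio2face/geometry/point_cache.py); the cache path is hypothetical:

```python
import numpy as np

from audio2face.geometry.point_cache import read_cache_pc2, write_cache_pc2

# 10 frames of a 4-vertex mesh sampled at 30 FPS: all vertices drift up over time
anim = np.zeros((10, 4, 3), dtype=np.float32)
anim[:, :, 1] = np.linspace(0.0, 1.0, 10)[:, None]

write_cache_pc2("/tmp/example.pc2", anim, sample_rate=30.0)
restored = read_cache_pc2("/tmp/example.pc2")
assert restored.shape == (10, 4, 3)
assert np.allclose(restored, anim)
```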
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import os 17 | import sys 18 | import json 19 | import argparse 20 | import pprint 21 | 22 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")) 23 | 24 | from audio2face import train_regression, train_diffusion, utils 25 | 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument("cfg_train_mod", type=str, nargs="?", default=None, help="Train Config Modifier") 28 | parser.add_argument("cfg_dataset_mod", type=str, nargs="?", default=None, help="Dataset Config Modifier") 29 | parser.add_argument("cfg_inference_mod", type=str, nargs="?", default=None, help="Inference Config Modifier") 30 | args = parser.parse_args() 31 | 32 | cfg_train_mod = json.loads(args.cfg_train_mod) if args.cfg_train_mod is not None else None 33 | cfg_dataset_mod = json.loads(args.cfg_dataset_mod) if args.cfg_dataset_mod is not None else None 34 | cfg_inference_mod = json.loads(args.cfg_inference_mod) if args.cfg_inference_mod is not None else None 35 | network_type = utils.get_network_type_or_raise(cfg_train_mod) 36 | 37 | print("===============================================================================================================") 38 | print(f"Using Train Config Modifier:\n{pprint.pformat(cfg_train_mod, width=120)}") 39 | print("===============================================================================================================") 40 | print(f"Using Dataset Config Modifier:\n{pprint.pformat(cfg_dataset_mod, width=120)}") 41 | print("===============================================================================================================") 42 | print(f"Using Inference Config Modifier:\n{pprint.pformat(cfg_inference_mod, width=120)}") 43 | print("===============================================================================================================") 44 | 45 | if network_type == "regression": 46 | result_train = train_regression.run(cfg_train_mod, cfg_dataset_mod) 47 | elif network_type == "diffusion": 48 | result_train = train_diffusion.run(cfg_train_mod, cfg_dataset_mod, cfg_inference_mod) 49 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
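Note that runners/run_train.py above never reads config files itself: the host-side scripts serialize each config module to JSON and pass the strings through Docker as argv, and the runner parses them back into plain dicts (the "config modifiers"). A tiny sketch of that contract; the keys shown are illustrative, not the framework's actual schema:

```python
import json

cfg_train_mod = {"NETWORK_TYPE": "diffusion", "RUN_NAME": "demo"}  # illustrative keys
payload = json.dumps(cfg_train_mod)  # what the host script puts on the command line
assert json.loads(payload) == cfg_train_mod  # what the in-container runner receives
```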
15 | #
16 | import os
17 | import sys
18 | import re
19 | import types
20 | import importlib.util
21 | import subprocess
22 | 
23 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
24 | 
25 | 
26 | def validate_identifier(identifier) -> bool:
27 |     if not isinstance(identifier, str):
28 |         return False
29 |     if not re.match(r"^[\w]([\w.-]*[\w])?$", identifier):
30 |         return False
31 |     if ".." in identifier:
32 |         return False
33 |     return True
34 | 
35 | 
36 | def validate_identifier_or_exit(identifier, name: str) -> None:
37 |     if not validate_identifier(identifier):
38 |         print(f"[ERROR] Unsupported {name} format: {identifier}")
39 |         sys.exit(1)  # exit with a non-zero code so callers can detect the failure
40 | 
41 | 
42 | def load_module(module_fpath: str) -> types.ModuleType:
43 |     module_name = "".join(c if c.isalpha() else "_" for c in module_fpath)
44 |     spec = importlib.util.spec_from_file_location(module_name, module_fpath)
45 |     module = importlib.util.module_from_spec(spec)
46 |     spec.loader.exec_module(module)
47 |     return module
48 | 
49 | 
50 | def module_to_dict(module: types.ModuleType) -> dict:
51 |     return {k: getattr(module, k) for k in dir(module) if not k.startswith("_")}
52 | 
53 | 
54 | def load_config(config_name: str, config_type: str, optional: bool = False) -> dict:
55 |     config_module_fnames = {
56 |         "dataset": "config_dataset.py",
57 |         "preproc": "config_preproc.py",
58 |         "train": "config_train.py",
59 |         "inference": "config_inference.py",
60 |     }
61 |     config_module_fpath = os.path.join(ROOT_DIR, "configs", config_name, config_module_fnames[config_type])
62 |     if not os.path.exists(config_module_fpath):
63 |         if optional:
64 |             return {}
65 |         else:
66 |             print(f"[ERROR] Unable to find config: {config_name} (type: {config_type})")
67 |             print(f"[ERROR] Make sure this file exists: {config_module_fpath}")
68 |             sys.exit(1)  # exit with a non-zero code so callers can detect the failure
69 |     config_module = load_module(config_module_fpath)
70 |     return module_to_dict(config_module)
71 | 
72 | 
73 | def run_process(cmd_with_args: list) -> None:
74 |     process = subprocess.Popen(cmd_with_args)  # start the child before the try-block so "process" always exists
75 |     try:
76 |         process.wait()
77 |     except KeyboardInterrupt:
78 |         process.wait()  # Ctrl+C also reaches the child; wait for it to shut down cleanly
79 | 
-------------------------------------------------------------------------------- /configs/example-diffusion-min/config_dataset.py: --------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
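The identifier rules in utils.py above follow directly from the regex and the ".." check; for example:

```python
from utils import validate_identifier

assert validate_identifier("example-diffusion-min")  # letters, digits, "_", ".", "-" allowed
assert validate_identifier("claire")
assert not validate_identifier("bad/name")   # path separators rejected
assert not validate_identifier("-leading")   # must start and end with [A-Za-z0-9_]
assert not validate_identifier("a..b")       # ".." is explicitly rejected
assert not validate_identifier(123)          # non-strings rejected
```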
15 | #
16 | ########################################################################################################################
17 | # Paths to various parts of the Audio2Face dataset (per actor)
18 | # Check the details for each of the parts in Audio2Face-3D-Dataset-v1.0.0-claire/docs/README.html file
19 | # Actor-specific parameters are represented as a dictionary: PARAM = {"actor1": value1, "actor2": value2, ...}
20 | # Only AUDIO_ROOT and SKIN_CACHE_ROOT are required - all other paths are optional
21 | ########################################################################################################################
22 | 
23 | # Audio data
24 | AUDIO_ROOT = {
25 |     "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/audio",
26 | }
27 | 
28 | # Skin data
29 | SKIN_CACHE_ROOT = {
30 |     "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/cache/skin",
31 | }
32 | 
33 | ########################################################################################################################
34 | # Dataset properties and meta-information
35 | ########################################################################################################################
36 | 
37 | # List of emotions used in the training dataset shots with facial animation performance
38 | SHOT_EMOTION_NAMES = [
39 |     "neutral",
40 |     "amazement",
41 |     "anger",
42 |     "cheekiness",
43 |     "disgust",
44 |     "fear",
45 |     "grief",
46 |     "joy",
47 |     "outofbreath",
48 |     "pain",
49 |     "sadness",
50 | ]
51 | 
52 | # Frames-Per-Second rate for all the animation caches in the training dataset (per actor)
53 | # During Preprocessing, this parameter is used to generate the shot list artifact and the augmented muted shots
54 | CACHE_FPS = {
55 |     "claire": 30.0,
56 | }
57 | 
58 | # List of the names of the actors performing the animation in the shots
59 | ACTOR_NAMES = [
60 |     "claire",
61 | ]
62 | 
63 | # Data transform scale: adjust these values to your dataset; some data may require a larger scale value
64 | # Format: {"actor_name": {"channel_name": scale_value}}
65 | TRANSFORM_SCALE = {
66 |     "claire": {
67 |         "skin": 1.0,
68 |         "tongue": 1.0,
69 |         "jaw": 1.0,
70 |         "eye": 1.0,
71 |     },
72 | }
73 | 
-------------------------------------------------------------------------------- /runners/run_inference.py: --------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
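The per-actor layout of the dataset config above extends naturally to multiple actors: every per-actor dict gains a key and the actor joins ACTOR_NAMES. A hypothetical sketch with a second actor "mark" (paths invented for illustration):

```python
AUDIO_ROOT = {
    "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/audio",
    "mark": "/datasets/my-mark-dataset/data/mark/audio",  # hypothetical path
}
SKIN_CACHE_ROOT = {
    "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/cache/skin",
    "mark": "/datasets/my-mark-dataset/data/mark/cache/skin",  # hypothetical path
}
CACHE_FPS = {"claire": 30.0, "mark": 30.0}
ACTOR_NAMES = ["claire", "mark"]
TRANSFORM_SCALE = {
    "claire": {"skin": 1.0, "tongue": 1.0, "jaw": 1.0, "eye": 1.0},
    "mark": {"skin": 1.0, "tongue": 1.0, "jaw": 1.0, "eye": 1.0},
}
```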
15 | # 16 | import os 17 | import sys 18 | import json 19 | import argparse 20 | import pprint 21 | 22 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")) 23 | 24 | from audio2face import infer_diffusion, utils 25 | 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument("training_run_name_full", type=str, help="Training Run Name Full") 28 | parser.add_argument("cfg_train_mod", type=str, nargs="?", default=None, help="Train Config Modifier") 29 | parser.add_argument("cfg_dataset_mod", type=str, nargs="?", default=None, help="Dataset Config Modifier") 30 | parser.add_argument("cfg_inference_mod", type=str, nargs="?", default=None, help="Inference Config Modifier") 31 | args = parser.parse_args() 32 | 33 | utils.validate_identifier_or_raise(args.training_run_name_full, "TRAINING_RUN_NAME_FULL") 34 | cfg_train_mod = json.loads(args.cfg_train_mod) if args.cfg_train_mod is not None else None 35 | cfg_dataset_mod = json.loads(args.cfg_dataset_mod) if args.cfg_dataset_mod is not None else None 36 | cfg_inference_mod = json.loads(args.cfg_inference_mod) if args.cfg_inference_mod is not None else None 37 | network_type = utils.get_network_type_or_raise(cfg_train_mod) 38 | 39 | print("===============================================================================================================") 40 | print(f"Using Training Run Name Full: {args.training_run_name_full}") 41 | print("===============================================================================================================") 42 | print(f"Using Train Config Modifier:\n{pprint.pformat(cfg_train_mod, width=120)}") 43 | print("===============================================================================================================") 44 | print(f"Using Dataset Config Modifier:\n{pprint.pformat(cfg_dataset_mod, width=120)}") 45 | print("===============================================================================================================") 46 | print(f"Using Inference Config Modifier:\n{pprint.pformat(cfg_inference_mod, width=120)}") 47 | print("===============================================================================================================") 48 | 49 | if network_type == "regression": 50 | raise NotImplementedError("Inference for regression networks is not implemented yet") 51 | elif network_type == "diffusion": 52 | result_infer = infer_diffusion.run(args.training_run_name_full, cfg_train_mod, cfg_dataset_mod, cfg_inference_mod) 53 | -------------------------------------------------------------------------------- /tools/utils/render_markdown.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #
16 | import os
17 | import argparse
18 | import html
19 | import re
20 | import markdown2
21 | 
22 | 
23 | def convert_markdown_to_html(md_filepath: str, html_filepath: str, title: str = "") -> None:
24 |     with open(md_filepath, "r", encoding="utf-8") as md_file:
25 |         md_content = md_file.read()
26 | 
27 |     link_patterns = [(re.compile(r"\b(http://\S+|https://\S+)"), r"\1")]
28 |     html_content = markdown2.markdown(md_content, extras={"fenced-code-blocks": {}, "link-patterns": link_patterns})
29 |     escaped_title = html.escape(title)
30 | 
31 |     html_template = f"""
32 | <!DOCTYPE html>
33 | <html>
34 | <head>
35 | <meta charset="utf-8">
36 | <title>{escaped_title}</title>
37 | <style>
38 | /* (stylesheet rules omitted in this listing) */
57 | </style>
58 | </head>
59 | <body>
60 | {html_content}
61 | </body>
62 | </html>
63 | 
64 | """
65 | 
66 |     with open(html_filepath, "w", encoding="utf-8") as html_file:
67 |         html_file.write(html_template)
68 | 
69 | 
70 | def main() -> None:
71 |     parser = argparse.ArgumentParser(description="Convert Markdown to HTML with styling")
72 |     parser.add_argument("md_filepath", help="Path to the input Markdown file")
73 |     parser.add_argument("-o", "--output", dest="html_filepath", help="Path to the output HTML file")
74 |     parser.add_argument("-t", "--title", help="Title for the HTML document")
75 |     args = parser.parse_args()
76 | 
77 |     if args.html_filepath is None:
78 |         args.html_filepath = os.path.splitext(args.md_filepath)[0] + ".html"
79 |     if args.title is None:
80 |         args.title = os.path.splitext(os.path.basename(args.md_filepath))[0]
81 | 
82 |     convert_markdown_to_html(args.md_filepath, args.html_filepath, args.title)
83 |     print(f"Converted {args.md_filepath} to {args.html_filepath}")
84 | 
85 | 
86 | if __name__ == "__main__":
87 |     main()
88 | 
-------------------------------------------------------------------------------- /audio2face/config_base/config_dataset_base.py: --------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
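Typical invocations of the converter above (docs/a2f_introduction.md ships with the repo; the output path and title default to the input file name):

```bash
python tools/utils/render_markdown.py docs/a2f_introduction.md
python tools/utils/render_markdown.py docs/a2f_introduction.md -o /tmp/intro.html -t "Audio2Face Introduction"
```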
15 | #
16 | ########################################################################################################################
17 | # Paths to various parts of the Audio2Face dataset (per actor)
18 | # Check the details for each of the parts in Audio2Face-3D-Dataset-v1.0.0-claire/docs/README.html file
19 | # Actor-specific parameters are represented as a dictionary: PARAM = {"actor1": value1, "actor2": value2, ...}
20 | ########################################################################################################################
21 | 
22 | # Audio data
23 | AUDIO_ROOT = {}
24 | 
25 | # Skin data
26 | SKIN_CACHE_ROOT = {}
27 | SKIN_NEUTRAL_FPATH = {}
28 | SKIN_NEUTRAL_INFERENCE_FPATH = {}
29 | SKIN_LIP_OPEN_POSE_DELTA_FPATH = {}
30 | SKIN_EYE_CLOSE_POSE_DELTA_FPATH = {}
31 | SKIN_LIP_DIST_VERTEX_LIST_FPATH = {}
32 | SKIN_LIP_SIZE_VERTEX_LIST_FPATH = {}
33 | SKIN_EYE_DIST_VERTEX_LIST_FPATH = {}
34 | 
35 | # Tongue data
36 | TONGUE_CACHE_ROOT = {}
37 | TONGUE_NEUTRAL_FPATH = {}
38 | TONGUE_NEUTRAL_INFERENCE_FPATH = {}
39 | TONGUE_RIGID_VERTEX_LIST_FPATH = {}
40 | 
41 | # Jaw data
42 | JAW_KEYPOINTS_NEUTRAL_FPATH = {}
43 | JAW_ANIM_DATA_FPATH = {}
44 | 
45 | # Eye data
46 | EYE_ANIM_DATA_FPATH = {}
47 | EYE_BLINK_KEYS_FPATH = {}
48 | EYE_SACCADE_ROTATIONS_FPATH = {}
49 | 
50 | # Blendshape data
51 | BLENDSHAPE_SKIN_FPATH = {}
52 | BLENDSHAPE_SKIN_CONFIG_FPATH = {}
53 | BLENDSHAPE_TONGUE_FPATH = {}
54 | BLENDSHAPE_TONGUE_CONFIG_FPATH = {}
55 | 
56 | ########################################################################################################################
57 | # Dataset properties and meta-information
58 | ########################################################################################################################
59 | 
60 | # List of emotions used in the training dataset shots with facial animation performance
61 | SHOT_EMOTION_NAMES = [
62 |     "neutral",
63 | ]
64 | 
65 | # By default, shot emotion name is inferred from the shot name automatically (if one is a substring of the other)
66 | # This list overrides the emotion names for the specified shots
67 | # Item format: (actor_name, shot_name, shot_emotion_name)
68 | # Example: ("mark", "shot2", "sadness")
69 | SHOT_EMOTION_MAP = []
70 | 
71 | # Frames-Per-Second rate for all the animation caches in the training dataset (per actor)
72 | # During Preprocessing, this parameter is used to generate the shot list artifact and the augmented muted shots
73 | CACHE_FPS = {}
74 | 
75 | # List of the names of the actors performing the animation in the shots
76 | ACTOR_NAMES = []
77 | 
78 | # Data transform scale: adjust these values to your dataset; some data may require a larger scale value
79 | # Format: {"actor_name": {"channel_name": scale_value}}
80 | TRANSFORM_SCALE = {}
81 | 
-------------------------------------------------------------------------------- /audio2face/geometry/pca.py: --------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import numpy as np 17 | import cupy as cp 18 | from cupy_backends.cuda.libs.cusolver import CUSOLVERError 19 | 20 | 21 | def pca_truncated( 22 | data: np.ndarray, 23 | variance_threshold: float, 24 | custom_mean: np.ndarray | None = None, 25 | force_components: int | None = None, 26 | use_cupy: bool = False, 27 | ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: 28 | """ 29 | Calculates PCA decomposition and truncates the result to have specific number of components. 30 | Number of components is determined by variance threshold and additionally (optionally) by the provided number. 31 | 32 | Parameters 33 | ---------- 34 | data : NumPy or cupy array 35 | Input data 36 | variance_threshold : float 37 | Variance threshold for determining number of components to truncate the result 38 | custom_mean : NumPy or cupy array, optional 39 | Tensor with user-defined custom mean to be subtracted from the data 40 | force_components : int 41 | Additional user-defined forced number of components to truncate the result 42 | use_cupy : bool 43 | If true -- use cupy backend for SVD and linear algebra, else -- use NumPy 44 | 45 | Returns 46 | ------- 47 | NumPy or cupy array: Eigen vectors (truncated) 48 | NumPy or cupy array: Eigen values (truncated) 49 | NumPy or cupy array: Mean of the data (calculated or user-defined) 50 | 51 | """ 52 | 53 | m, _ = data.shape 54 | if custom_mean is None: 55 | if use_cupy: 56 | mean = cp.mean(data, axis=0) 57 | else: 58 | mean = np.mean(data, axis=0) 59 | else: 60 | mean = custom_mean 61 | deltaData = data - mean 62 | 63 | if use_cupy: 64 | try: 65 | U, s, VT = cp.linalg.svd(deltaData, full_matrices=0) 66 | except CUSOLVERError: 67 | raise RuntimeError( 68 | f"Unable to compute SVD for a matrix with shape {deltaData.shape}, probably due to running out of GPU memory" 69 | ) from None 70 | else: 71 | U, s, VT = np.linalg.svd(deltaData, full_matrices=0) 72 | 73 | s = s * s / (m - 1) 74 | evals = s 75 | evecs = VT.T 76 | 77 | if use_cupy: 78 | evalRatio = cp.asnumpy(cp.cumsum(evals) / cp.sum(evals)) 79 | else: 80 | evalRatio = np.cumsum(evals) / np.sum(evals) 81 | t = np.argwhere(evalRatio > variance_threshold)[0] 82 | num_components = t[0] + 1 83 | 84 | if force_components is None: 85 | evecs_t = evecs[:, :num_components] 86 | evals_t = evals[:num_components] 87 | else: 88 | evecs_t = evecs[:, :force_components] 89 | evals_t = evals[:force_components] 90 | 91 | return evecs_t, evals_t, mean.flatten() 92 | -------------------------------------------------------------------------------- /audio2face/audio/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
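A usage sketch for `pca_truncated` above on the NumPy path (use_cupy=False); the synthetic low-rank data is illustrative:

```python
import numpy as np

from audio2face.geometry.pca import pca_truncated

# 200 flattened frames that live near a 5-dimensional subspace of R^300
rng = np.random.default_rng(0)
data = rng.normal(size=(200, 5)) @ rng.normal(size=(5, 300)) + 1e-3 * rng.normal(size=(200, 300))

evecs, evals, mean = pca_truncated(data, variance_threshold=0.999)
print(evecs.shape)  # (300, k), with k around 5 for this data

coeffs = (data - mean) @ evecs   # project frames onto the truncated basis
recon = coeffs @ evecs.T + mean  # reconstruction stays close to the original data
print(np.abs(recon - data).max())
```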
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import math 17 | import numpy as np 18 | import scipy.signal 19 | import soundfile 20 | 21 | import torch 22 | import torchaudio 23 | 24 | from audio2face.audio import AudioTrack 25 | 26 | 27 | def read_audio_track(fpath: str) -> AudioTrack: 28 | data, samplerate = soundfile.read(fpath, dtype="float32") 29 | if len(data.shape) > 1: 30 | data = np.average(data, axis=1) # convert to mono 31 | return AudioTrack(data, samplerate) 32 | 33 | 34 | def read_and_preproc_audio_track( 35 | fpath: str, 36 | preproc_method: str = "nva2f", 37 | new_samplerate: int | None = None, 38 | ) -> AudioTrack: 39 | if preproc_method == "nva2f": 40 | track = read_audio_track(fpath) 41 | track.normalize(threshold=0.01) 42 | if new_samplerate is not None: 43 | track.resample(new_samplerate) 44 | elif preproc_method == "w2v": 45 | track = read_audio_track(fpath) 46 | data, samplerate = torch.from_numpy(track.data), track.samplerate 47 | if new_samplerate is not None: 48 | data = torchaudio.functional.resample(data, samplerate, new_samplerate) 49 | samplerate = new_samplerate 50 | track = AudioTrack(data.numpy(), samplerate) 51 | track.update_norm_factor(threshold=0.01) 52 | else: 53 | raise ValueError(f"Unsupported audio preprocessing method: {preproc_method}") 54 | return track 55 | 56 | 57 | def generate_audio_noise( 58 | buffer_len: int, 59 | samplerate: int, 60 | noise_type: str | None = None, 61 | noise_scale: float = 1.0, 62 | rng: np.random.Generator | None = None, 63 | ) -> np.ndarray: 64 | if noise_type is None: 65 | return np.zeros(buffer_len, dtype=np.float32) 66 | if rng is None: 67 | rng = np.random.default_rng() 68 | if noise_type == "gauss": 69 | return rng.normal(0.0, 1.0, buffer_len) * noise_scale 70 | elif noise_type == "mic": 71 | # Simulate a microphone noise by creating a spectrogram with given distribution of frequencies 72 | nperseg = 256 73 | size_f = nperseg // 2 + 1 74 | size_t = math.ceil((buffer_len - nperseg) / (nperseg - nperseg // 2)) + 3 75 | frequencies = np.linspace(0, samplerate // 2, size_f) 76 | Sxx_log_mean = 39.0 * np.exp(-frequencies / 300.0) - 121.0 77 | Sxx_log_std = 5.85 78 | Sxx = [] 79 | for _ in range(size_t): 80 | Sxx_log_i = rng.normal(loc=Sxx_log_mean, scale=Sxx_log_std) 81 | Sxx_i = np.sqrt(np.power(10.0, (Sxx_log_i / 10.0))) 82 | Sxx.append(Sxx_i) 83 | Sxx = np.stack(Sxx).T 84 | phase = rng.random(size=(size_f, size_t)) * 2 * np.pi 85 | Zxx = Sxx * np.exp(1j * phase) 86 | _, audio_data = scipy.signal.istft(Zxx, samplerate) 87 | return audio_data[:buffer_len] * noise_scale 88 | else: 89 | raise ValueError(f"Unsupported audio noise type: {noise_type}") 90 | -------------------------------------------------------------------------------- /audio2face/deps/motion_diffusion_model/diffusion/losses.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
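A quick sketch of `generate_audio_noise` above; the buffer length and scales are illustrative:

```python
import numpy as np

from audio2face.audio.utils import generate_audio_noise

samplerate = 16000
rng = np.random.default_rng(0)

silence = generate_audio_noise(samplerate, samplerate)  # noise_type=None -> one second of zeros
gauss = generate_audio_noise(samplerate, samplerate, noise_type="gauss", noise_scale=0.01, rng=rng)
mic = generate_audio_noise(samplerate, samplerate, noise_type="mic", noise_scale=1.0, rng=rng)

assert silence.shape == gauss.shape == (samplerate,)
assert len(mic) == samplerate  # spectrogram-shaped noise, trimmed to the requested length
```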
2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | ############################################################################ 17 | # Modified from motion-diffusion-model 18 | # Copyright (c) 2022 Guy Tevet 19 | # 20 | # See https://github.com/GuyTevet/motion-diffusion-model/blob/main/LICENSE for details 21 | ############################################################################ 22 | # 23 | # This code is based on https://github.com/openai/guided-diffusion 24 | """ 25 | Helpers for various likelihood-based losses. These are ported from the original 26 | Ho et al. diffusion models codebase: 27 | https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py 28 | """ 29 | 30 | import numpy as np 31 | import torch as th 32 | 33 | 34 | def normal_kl(mean1, logvar1, mean2, logvar2): 35 | """ 36 | Compute the KL divergence between two gaussians. 37 | 38 | Shapes are automatically broadcasted, so batches can be compared to 39 | scalars, among other use cases. 40 | """ 41 | tensor = None 42 | for obj in (mean1, logvar1, mean2, logvar2): 43 | if isinstance(obj, th.Tensor): 44 | tensor = obj 45 | break 46 | assert tensor is not None, "at least one argument must be a Tensor" 47 | 48 | # Force variances to be Tensors. Broadcasting helps convert scalars to 49 | # Tensors, but it does not work for th.exp(). 50 | logvar1, logvar2 = [x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) for x in (logvar1, logvar2)] 51 | 52 | return 0.5 * (-1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * th.exp(-logvar2)) 53 | 54 | 55 | def approx_standard_normal_cdf(x): 56 | """ 57 | A fast approximation of the cumulative distribution function of the 58 | standard normal. 59 | """ 60 | return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) 61 | 62 | 63 | def discretized_gaussian_log_likelihood(x, *, means, log_scales): 64 | """ 65 | Compute the log-likelihood of a Gaussian distribution discretizing to a 66 | given image. 67 | 68 | :param x: the target images. It is assumed that this was uint8 values, 69 | rescaled to the range [-1, 1]. 70 | :param means: the Gaussian mean Tensor. 71 | :param log_scales: the Gaussian log stddev Tensor. 72 | :return: a tensor like x of log probabilities (in nats). 
73 | """ 74 | assert x.shape == means.shape == log_scales.shape 75 | centered_x = x - means 76 | inv_stdv = th.exp(-log_scales) 77 | plus_in = inv_stdv * (centered_x + 1.0 / 255.0) 78 | cdf_plus = approx_standard_normal_cdf(plus_in) 79 | min_in = inv_stdv * (centered_x - 1.0 / 255.0) 80 | cdf_min = approx_standard_normal_cdf(min_in) 81 | log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) 82 | log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) 83 | cdf_delta = cdf_plus - cdf_min 84 | log_probs = th.where( 85 | x < -0.999, 86 | log_cdf_plus, 87 | th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), 88 | ) 89 | assert log_probs.shape == x.shape 90 | return log_probs 91 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing to Audio2Face-3D Training Framework 2 | 3 | ### Contribution Policy 4 | 5 | This repository is maintained by NVIDIA's Audio2Face team. We welcome contributions to this repository including: 6 | 7 | - **Bug reports** with detailed reproduction steps 8 | - **Pull requests** for bug fixes, improvements, and new features 9 | - **Documentation improvements** and examples 10 | - **Community support** helping other users with questions and issues 11 | 12 | All contributions should follow the guidelines below and include proper testing. 13 | 14 | ### Development Setup 15 | 16 | #### Prerequisites 17 | - Linux/WSL2, CUDA GPU (6GB+ VRAM), Docker, NGC access 18 | - See [README.md](README.md#prerequisites) for details 19 | 20 | #### Environment Setup 21 | ```bash 22 | cp .env.example .env # Edit with your paths 23 | chmod +x docker/*.sh 24 | ./docker/build_docker.sh 25 | ``` 26 | 27 | #### VSCode Development (Recommended) 28 | The project includes pre-configured VSCode settings for: 29 | - Black formatting (120 characters) 30 | - Debug configurations for all framework stages 31 | - Dev container support 32 | 33 | For detailed instructions, see [VSCode Development and Debugging](docs/training_framework.md#vscode-development-and-debugging--advanced) in the Training Framework documentation. 34 | 35 | ### Code Standards 36 | 37 | #### Before Committing 38 | **Always run before committing:** 39 | ```bash 40 | python tools/format_code.py 41 | ``` 42 | 43 | This handles Black formatting (120 characters) and ensures consistency. VSCode will partially auto-format during development, but this tool ensures final consistency. 44 | 45 | #### Required for Any Contributions 46 | - Python type hints 47 | - License headers (see existing files for format) 48 | - Test with example dataset 49 | - Docker compatibility 50 | 51 | ### Developer Certificate of Origin (DCO) 52 | 53 | All contributions must comply with the [Developer Certificate of Origin (DCO)](https://developercertificate.org/). 54 | 55 | #### Signing Your Commits 56 | 57 | Sign off on all commits using the `--signoff` (or `-s`) option: 58 | 59 | ```bash 60 | git commit -s -m "Add feature X" 61 | ``` 62 | 63 | This appends a sign-off line to your commit message: 64 | ``` 65 | Signed-off-by: Your Name 66 | ``` 67 | 68 | #### DCO Requirements 69 | 70 | ``` 71 | Developer Certificate of Origin 72 | Version 1.1 73 | 74 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 75 | 76 | Everyone is permitted to copy and distribute verbatim copies of this 77 | license document, but changing it is not allowed. 
78 | 79 | 80 | Developer's Certificate of Origin 1.1 81 | 82 | By making a contribution to this project, I certify that: 83 | 84 | (a) The contribution was created in whole or in part by me and I 85 | have the right to submit it under the open source license 86 | indicated in the file; or 87 | 88 | (b) The contribution is based upon previous work that, to the best 89 | of my knowledge, is covered under an appropriate open source 90 | license and I have the right under that license to submit that 91 | work with modifications, whether created in whole or in part 92 | by me, under the same open source license (unless I am 93 | permitted to submit under a different license), as indicated 94 | in the file; or 95 | 96 | (c) The contribution was provided directly to me by some other 97 | person who certified (a), (b) or (c) and I have not modified 98 | it. 99 | 100 | (d) I understand and agree that this project and the contribution 101 | are public and that a record of the contribution (including all 102 | personal information I submit with it, including my sign-off) is 103 | maintained indefinitely and may be redistributed consistent with 104 | this project or the open source license(s) involved. 105 | ``` 106 | 107 | ### License 108 | 109 | This project is licensed under the [Apache License 2.0](LICENSE). All contributions must be compatible with this license. 110 | -------------------------------------------------------------------------------- /audio2face/deps/motion_diffusion_model/utils/misc.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
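`normal_kl` from the diffusion losses module shown earlier (audio2face/deps/motion_diffusion_model/diffusion/losses.py) takes log-variances; a numeric check against the closed-form KL between two univariate Gaussians (values arbitrary):

```python
import numpy as np
import torch

from audio2face.deps.motion_diffusion_model.diffusion.losses import normal_kl

# KL(N(mu1, s1^2) || N(mu2, s2^2)) = log(s2/s1) + (s1^2 + (mu1 - mu2)^2) / (2 * s2^2) - 1/2
mu1, s1, mu2, s2 = 0.5, 1.2, -0.3, 0.8
expected = np.log(s2 / s1) + (s1**2 + (mu1 - mu2) ** 2) / (2 * s2**2) - 0.5

kl = normal_kl(
    torch.tensor(mu1), torch.tensor(2 * np.log(s1)),  # log-variance = 2 * log(stddev)
    torch.tensor(mu2), torch.tensor(2 * np.log(s2)),
)
assert abs(kl.item() - expected) < 1e-6
```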
15 | # 16 | ############################################################################ 17 | # Modified from motion-diffusion-model 18 | # Copyright (c) 2022 Guy Tevet 19 | # 20 | # See https://github.com/GuyTevet/motion-diffusion-model/blob/main/LICENSE for details 21 | ############################################################################ 22 | # 23 | import torch 24 | import torch.nn as nn 25 | 26 | 27 | class WeightedSum(nn.Module): 28 | def __init__(self, num_rows): 29 | super(WeightedSum, self).__init__() 30 | # Initialize learnable weights 31 | self.weights = nn.Parameter(torch.randn(num_rows)) 32 | 33 | def forward(self, x): 34 | # Ensure weights are normalized (optional) 35 | normalized_weights = self.weights / self.weights.sum() # torch.softmax(self.weights, dim=0) 36 | # Compute the weighted sum of the rows 37 | weighted_sum = torch.matmul(normalized_weights, x) 38 | return weighted_sum 39 | 40 | 41 | def wrapped_getattr(self, name, default=None, wrapped_member_name="model"): 42 | """ 43 | Should be called from wrappers of model classes such as ClassifierFreeSampleModel 44 | """ 45 | 46 | if isinstance(self, torch.nn.Module): 47 | # for descendants of nn.Module, name may be in self.__dict__[_parameters/_buffers/_modules] 48 | # so we activate nn.Module.__getattr__ first. 49 | # Otherwise, we might encounter an infinite loop 50 | try: 51 | attr = torch.nn.Module.__getattr__(self, name) 52 | except AttributeError: 53 | wrapped_member = torch.nn.Module.__getattr__(self, wrapped_member_name) 54 | attr = getattr(wrapped_member, name, default) 55 | else: 56 | # the easy case, where self is not derived from nn.Module 57 | wrapped_member = getattr(self, wrapped_member_name) 58 | attr = getattr(wrapped_member, name, default) 59 | return attr 60 | 61 | 62 | def to_numpy(tensor): 63 | if torch.is_tensor(tensor): 64 | return tensor.cpu().numpy() 65 | elif type(tensor).__module__ != "numpy": 66 | raise ValueError("Cannot convert {} to numpy array".format(type(tensor))) 67 | return tensor 68 | 69 | 70 | def to_torch(ndarray): 71 | if type(ndarray).__module__ == "numpy": 72 | return torch.from_numpy(ndarray) 73 | elif not torch.is_tensor(ndarray): 74 | raise ValueError("Cannot convert {} to torch tensor".format(type(ndarray))) 75 | return ndarray 76 | 77 | 78 | def cleanexit(): 79 | import sys 80 | import os 81 | 82 | try: 83 | sys.exit(0) 84 | except SystemExit: 85 | os._exit(0) 86 | 87 | 88 | def load_model_wo_clip(model, state_dict): 89 | missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) 90 | assert len(unexpected_keys) == 0 91 | assert all([k.startswith("clip_model.") for k in missing_keys]) 92 | 93 | 94 | def freeze_joints(x, joints_to_freeze): 95 | # Freezes selected joint *rotations* as they appear in the first frame 96 | # x [bs, [root+n_joints], joint_dim(6), seqlen] 97 | frozen = x.detach().clone() 98 | frozen[:, joints_to_freeze, :, :] = frozen[:, joints_to_freeze, :, :1] 99 | return frozen 100 | -------------------------------------------------------------------------------- /audio2face/config_base/config_preproc_base.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
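A shape-level demo of `WeightedSum` above (the module learns one mixing weight per row):

```python
import torch

from audio2face.deps.motion_diffusion_model.utils.misc import WeightedSum

torch.manual_seed(0)
ws = WeightedSum(num_rows=4)

x = torch.randn(4, 16)  # 4 rows of 16 features
y = ws(x)               # rows mixed by the normalized learnable weights
print(y.shape)          # torch.Size([16])

# The weights are an nn.Parameter, so they receive gradients like any other layer
y.pow(2).sum().backward()
print(ws.weights.grad.shape)  # torch.Size([4])
```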
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # The name of the Preprocessing run (use different names for different hyper-parameters or datasets) 17 | RUN_NAME = "default" 18 | 19 | # Additional information describing the Preprocessing run, will be saved to /configs/info.txt 20 | RUN_INFO = "" 21 | 22 | ######################################################################################################################## 23 | # Skin preproc params 24 | ######################################################################################################################## 25 | 26 | # Path to the skin cache directory with a different resolution, organized the same way as the data in SKIN_CACHE_ROOT 27 | # A lower-resolution cache can be used during Preprocessing pruning to reduce memory usage and computation time 28 | # Shot list and shot lengths should match the data in SKIN_CACHE_ROOT 29 | # If the value for an actor is missing -> Preprocessing pruning will use the original data from SKIN_CACHE_ROOT 30 | SKIN_PRUNE_CACHE_ROOT = {} 31 | 32 | # Path to the prune_mesh_mask.npy file with a subset of skin mesh vertices (mask), covering only moving parts 33 | # This mask can be used during Preprocessing pruning to reduce memory usage and computation time 34 | # The mask is a 1D numpy array containing indices of the skin mesh vertices corresponding to the mask 35 | # Dimensions: [num_mask_vertices], data type: int 36 | # If SKIN_PRUNE_CACHE_ROOT is used, the same mesh resolution / topology should be used for SKIN_PRUNE_MESH_MASK 37 | # If the value for an actor is missing -> Preprocessing pruning will use all skin mesh vertices (no mask) 38 | SKIN_PRUNE_MESH_MASK_FPATH = {} 39 | 40 | # If the value for an actor is missing -> Use all shots in the directory 41 | SKIN_CACHE_SHOTS = {} 42 | 43 | # If the value for an actor is missing -> Use all shots in the directory 44 | SKIN_PRUNE_CACHE_SHOTS = {} 45 | 46 | # If the value for an actor is missing -> Use variance threshold to automatically infer the number of components 47 | SKIN_FORCE_COMPONENTS = {} 48 | 49 | SKIN_PRUNE_SIM_DIST = 4.0 50 | SKIN_SELECT_DISTINCT_MAX_ITER = 787 51 | SKIN_PCA_VARIANCE_THRESHOLD = 0.9995 52 | 53 | ######################################################################################################################## 54 | # Tongue preproc params 55 | ######################################################################################################################## 56 | 57 | # If the value for an actor is missing -> Use all shots in the directory 58 | TONGUE_CACHE_SHOTS = {} 59 | 60 | # If the value for an actor is missing -> Use variance threshold to automatically infer the number of components 61 | TONGUE_FORCE_COMPONENTS = {} 62 | 63 | TONGUE_PCA_VARIANCE_THRESHOLD = 0.9995 64 | 65 | ######################################################################################################################## 66 | # Default preproc artifact dimensions 67 | ######################################################################################################################## 68 | 69 | # Default jaw keypoints shape for
neutral jaw if no data is provided 70 | DEFAULT_JAW_KEYPOINTS_SHAPE = (5, 3) 71 | 72 | # Default eye blink keys if no data is provided 73 | DEFAULT_EYE_BLINK_KEYS_SHAPE = (10,) 74 | 75 | # Default eye saccade rotations if no data is provided 76 | DEFAULT_EYE_SACCADE_ROT_SHAPE = (5000, 2) 77 | 78 | # Default tongue PCA shape if no data is provided 79 | DEFAULT_TONGUE_PCA_SHAPE = (10, 520, 3) 80 | 81 | ######################################################################################################################## 82 | # Misc 83 | ######################################################################################################################## 84 | 85 | # This location is used to write output Preprocessing artifacts 86 | PREPROC_OUTPUT_ROOT = "/workspace/output_preproc" 87 | 88 | VERBOSE = False 89 | -------------------------------------------------------------------------------- /audio2face/audio/audio_track.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import math 17 | import numpy as np 18 | import scipy.signal 19 | 20 | 21 | class AudioTrack: 22 | def __init__(self, data: np.ndarray | None = None, samplerate: int = 48000) -> None: 23 | self.data = data.astype(np.float32) if data is not None else np.zeros(0, dtype=np.float32) 24 | self.samplerate = samplerate 25 | self.norm_factor = 1.0 26 | assert self.data.ndim == 1 27 | assert self.samplerate > 0 28 | 29 | def get_length(self) -> float: 30 | return float(self.data.size) / float(self.samplerate) 31 | 32 | def get_num_samples(self) -> int: 33 | return self.data.size 34 | 35 | def sec_to_sample(self, sec: float) -> int: 36 | return int(round(sec * self.samplerate)) 37 | 38 | def sample_to_sec(self, sample: int) -> float: 39 | return float(sample) / float(self.samplerate) 40 | 41 | def get_padded_buffer(self, ofs: int, length: int) -> np.ndarray: 42 | if ofs >= 0 and ofs + length <= self.data.size: 43 | return self.data[ofs : ofs + length] 44 | res = np.zeros(length, dtype=self.data.dtype) 45 | begin = max(0, -ofs) 46 | end = min(length, self.data.size - ofs) 47 | if begin < end: 48 | res[begin:end] = self.data[ofs + begin : ofs + end] 49 | return res 50 | 51 | def get_resampled_padded_buffer( 52 | self, input_buffer_pos: int, resampled_ofs: int, resampled_len: int, new_samplerate: int 53 | ) -> np.ndarray: 54 | if self.samplerate == new_samplerate: 55 | ofs = input_buffer_pos - resampled_ofs 56 | return self.get_padded_buffer(ofs, resampled_len) 57 | resample_ratio = float(new_samplerate) / self.samplerate 58 | resample_up = max(int(round(min(resample_ratio, 1) * 1000)), 1) 59 | resample_down = max(int(round(resample_up / resample_ratio)), 1) 60 | input_buffer_len = int(math.ceil(float(resampled_len) * resample_down / resample_up)) 61 | 
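        # The up/down pair above is a rational approximation of the resampling ratio for
        # polyphase resampling (the same scheme as resample() below). For example, resampling
        # 48000 Hz -> 16000 Hz gives resample_ratio = 1/3, so resample_up = round((1/3) * 1000) = 333
        # and resample_down = round(333 / (1/3)) = 999, i.e. an exact 1:3 ratio.
        # input_buffer_len is the number of source samples needed to produce at least resampled_len outputs.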
input_buffer_ofs = int(round(float(resampled_ofs) * resample_down / resample_up)) 62 | ofs = input_buffer_pos - input_buffer_ofs 63 | buffer_track = AudioTrack(self.get_padded_buffer(ofs, input_buffer_len), self.samplerate) 64 | buffer_track.resample(new_samplerate) 65 | return buffer_track.get_padded_buffer(0, resampled_len) 66 | 67 | def pad(self, pad_sec: float) -> None: 68 | padding = int(pad_sec * self.samplerate) 69 | self.data = np.concatenate((np.zeros((padding), np.float32), self.data)) 70 | 71 | def resample(self, new_samplerate: int) -> None: 72 | if self.samplerate == new_samplerate: 73 | return 74 | resample_ratio = float(new_samplerate) / self.samplerate 75 | resample_up = max(int(round(min(resample_ratio, 1) * 1000)), 1) 76 | resample_down = max(int(round(resample_up / resample_ratio)), 1) 77 | self.data = scipy.signal.resample_poly(self.data.astype(np.float32), resample_up, resample_down).astype( 78 | np.float32 79 | ) 80 | self.samplerate = new_samplerate 81 | 82 | def normalize(self, threshold: float = 0.01) -> None: 83 | maxabs = np.max(np.abs(self.data)) 84 | if maxabs > threshold: 85 | self.data /= max(maxabs, 1.0e-8) 86 | 87 | def update_norm_factor(self, threshold: float = 0.01) -> None: 88 | maxabs = np.max(np.abs(self.data)) 89 | if maxabs > threshold: 90 | self.norm_factor = max(maxabs, 1.0e-8) 91 | else: 92 | self.norm_factor = 1.0 # We should not normalize if maxabs is "small" 93 | 94 | 95 | def read_data(data: np.ndarray, samplerate: int, pad: int = 0) -> AudioTrack: 96 | data = data.astype(np.float32) 97 | # Convert to mono. 98 | if len(data.shape) > 1: 99 | assert len(data.shape) == 2 100 | data = np.average(data, axis=1) 101 | # Normalize volume. 102 | data /= max(np.max(abs(data)), 1.0e-8) 103 | if pad > 0: 104 | data = np.concatenate((np.zeros((pad), np.float32), data)) 105 | return AudioTrack(data, samplerate) 106 | -------------------------------------------------------------------------------- /audio2face/phoneme.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | import logging 17 | import math 18 | from collections import OrderedDict 19 | import numpy as np 20 | 21 | from audio2face import utils 22 | 23 | CHARSIU_PHONEME_FPS = 100 # 100 phonemes per second 24 | 25 | CHARSIU_LANG_MAP = { 26 | "en": "charsiu/en_w2v2_fc_10ms", 27 | "zh": "charsiu/zh_xlsr_fc_10ms", 28 | } 29 | 30 | 31 | class Phonemes: 32 | def __init__(self, data: np.ndarray, samplerate: float, lang: str) -> None: 33 | self.data = data 34 | self.samplerate = samplerate # phonemes per second 35 | self.lang = lang 36 | 37 | def sec_to_sample(self, sec: float) -> int: 38 | true_samplerate = len(self.data) / (len(self.data) / self.samplerate + 0.02) # due to edge effects 39 | return int(math.floor(sec * true_samplerate)) 40 | 41 | def get_padded_buffer(self, ofs: int, length: int, pad_token_idx: int) -> np.ndarray: 42 | if ofs >= 0 and ofs + length <= len(self.data): 43 | return self.data[ofs : ofs + length] 44 | res = np.zeros((length, *self.data.shape[1:]), dtype=self.data.dtype) 45 | res[..., pad_token_idx] = 1.0 46 | begin = max(0, -ofs) 47 | end = min(length, len(self.data) - ofs) 48 | if begin < end: 49 | res[begin:end] = self.data[ofs + begin : ofs + end] 50 | return res 51 | 52 | # TODO Currently works only if new_samplerate is a factor of self.samplerate 53 | def resample(self, new_samplerate: float) -> None: 54 | if self.samplerate == new_samplerate: 55 | return 56 | skip = int(round(self.samplerate / new_samplerate)) 57 | self.data = self.data[::skip] 58 | self.samplerate = new_samplerate 59 | 60 | 61 | class PhonemeDetector: 62 | def __init__(self, langs: list[str], temperature: float = 1.0, torch_cache_root: str | None = None) -> None: 63 | self.langs = langs 64 | 65 | if len(self.langs) == 0: 66 | self.models = OrderedDict() 67 | self.num_phonemes: OrderedDict[str, int] = OrderedDict() 68 | self.max_num_phonemes = 0 69 | return 70 | 71 | logging.info(f"Using languages {self.langs} to initialize Phoneme Detector...") 72 | self.prepare_for_charsiu_import(torch_cache_root) 73 | from audio2face.deps.charsiu.src.Charsiu import charsiu_predictive_aligner 74 | 75 | self.models: OrderedDict[str, charsiu_predictive_aligner] = OrderedDict() 76 | self.num_phonemes: OrderedDict[str, int] = OrderedDict() 77 | for lang in self.langs: 78 | if lang not in CHARSIU_LANG_MAP.keys(): 79 | raise ValueError(f'Unsupported phoneme lang: "{lang}". Should be in: {list(CHARSIU_LANG_MAP.keys())}') 80 | self.models[lang] = charsiu_predictive_aligner(CHARSIU_LANG_MAP[lang], temperature=temperature, lang=lang) 81 | self.num_phonemes[lang] = self.models[lang].charsiu_processor.processor.tokenizer.vocab_size 82 | self.max_num_phonemes = max(self.num_phonemes.values()) 83 | 84 | def prepare_for_charsiu_import(self, torch_cache_root: str | None = None) -> None: 85 | utils.suppress_transformers_warnings() 86 | if torch_cache_root is not None: 87 | utils.change_huggingface_hub_cache_root(torch_cache_root) 88 | 89 | def lang_is_ready(self, lang: str) -> bool: 90 | return lang in self.models.keys() 91 | 92 | def validate_lang(self, lang: str) -> None: 93 | if not self.lang_is_ready(lang): 94 | raise ValueError(f'Unsupported phoneme lang: "{lang}". 
Should be in: {self.langs}') 95 | 96 | def get_sil_token_idx(self, lang: str) -> int: 97 | self.validate_lang(lang) 98 | return self.models[lang].charsiu_processor.processor.tokenizer.convert_tokens_to_ids("[SIL]") 99 | 100 | def gen_phonemes(self, audio_fpath: str, lang: str, new_samplerate: float | None = None) -> Phonemes: 101 | self.validate_lang(lang) 102 | phoneme_data = self.models[lang].align_probs(audio=audio_fpath) 103 | phonemes = Phonemes(phoneme_data, CHARSIU_PHONEME_FPS, lang) 104 | if new_samplerate is not None: 105 | phonemes.resample(new_samplerate) 106 | return phonemes 107 | -------------------------------------------------------------------------------- /audio2face/preproc/preproc.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import os 17 | import datetime 18 | import shutil 19 | import logging 20 | 21 | from audio2face import utils 22 | from audio2face.config_base import config_preproc_base, config_dataset_base 23 | from audio2face.preproc import preproc_skin, preproc_tongue, preproc_deploy 24 | 25 | FRAMEWORK_ROOT_DIR = utils.get_framework_root_dir() 26 | 27 | 28 | def export_meta_data( 29 | actor_name: str, 30 | configs_dir: str, 31 | cfg_preproc: utils.EasyDict, 32 | cfg_dataset: utils.EasyDict, 33 | cfg_preproc_mod: dict | None = None, 34 | cfg_dataset_mod: dict | None = None, 35 | ) -> None: 36 | shutil.copy(os.path.join(FRAMEWORK_ROOT_DIR, "VERSION.md"), configs_dir) 37 | with open(os.path.join(configs_dir, "info.txt"), "w") as f: 38 | f.write(cfg_preproc.RUN_INFO) 39 | if cfg_preproc_mod is not None: 40 | utils.json_dump_pretty(cfg_preproc_mod, os.path.join(configs_dir, "config_preproc_modifier.json")) 41 | if cfg_dataset_mod is not None: 42 | utils.json_dump_pretty(cfg_dataset_mod, os.path.join(configs_dir, "config_dataset_modifier.json")) 43 | utils.json_dump_pretty({"actor_name": actor_name}, os.path.join(configs_dir, "actor_info.json")) 44 | 45 | if not utils.is_partial_exposure(): 46 | utils.json_dump_pretty(cfg_preproc, os.path.join(configs_dir, "config_preproc_full.json")) 47 | utils.json_dump_pretty(cfg_dataset, os.path.join(configs_dir, "config_dataset_full.json")) 48 | utils.json_dump_pretty(utils.get_state_info(FRAMEWORK_ROOT_DIR), os.path.join(configs_dir, "state.json")) 49 | 50 | 51 | def run( 52 | actor_name: str, 53 | cfg_preproc_mod: dict | None = None, 54 | cfg_dataset_mod: dict | None = None, 55 | ) -> dict: 56 | run_name = utils.get_module_var("RUN_NAME", config_preproc_base, cfg_preproc_mod) 57 | utils.validate_identifier_or_raise(run_name, "Preproc RUN_NAME") 58 | run_name_full = datetime.datetime.today().strftime("%y%m%d_%H%M%S_") + run_name 59 | 60 | preproc_output_root = utils.get_module_var("PREPROC_OUTPUT_ROOT", config_preproc_base, cfg_preproc_mod) 61 | 
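    # Each Preprocessing run writes into its own timestamped directory:
    # <PREPROC_OUTPUT_ROOT>/<yymmdd_HHMMSS_RUN_NAME>/, which holds the output artifacts
    # plus the copied configs (configs/) and the log file (log.log) produced below.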
out_dir = os.path.normpath(os.path.join(preproc_output_root, run_name_full)) 62 | configs_dir = os.path.normpath(os.path.join(out_dir, "configs")) 63 | os.makedirs(out_dir, exist_ok=True) 64 | os.makedirs(configs_dir, exist_ok=True) 65 | 66 | utils.setup_logging(os.path.join(out_dir, "log.log")) 67 | logging.info("--------------------------------------------------------------------------------") 68 | logging.info(f"Preprocessing run: {run_name_full}") 69 | logging.info("--------------------------------------------------------------------------------") 70 | 71 | utils.validate_cfg_mod(cfg_preproc_mod, config_preproc_base, "preproc") 72 | utils.validate_cfg_mod(cfg_dataset_mod, config_dataset_base, "dataset") 73 | cfg_preproc = utils.module_to_easy_dict(config_preproc_base, modifier=cfg_preproc_mod) 74 | cfg_dataset = utils.module_to_easy_dict(config_dataset_base, modifier=cfg_dataset_mod) 75 | utils.validate_identifier_or_raise(actor_name, "Actor Name") 76 | 77 | export_meta_data(actor_name, configs_dir, cfg_preproc, cfg_dataset, cfg_preproc_mod, cfg_dataset_mod) 78 | 79 | result_skin = preproc_skin.run(actor_name, run_name_full, cfg_preproc_mod, cfg_dataset_mod) 80 | result_tongue = preproc_tongue.run(actor_name, run_name_full, cfg_preproc_mod, cfg_dataset_mod) 81 | result_deploy = preproc_deploy.run(actor_name, run_name_full, cfg_preproc_mod, cfg_dataset_mod) 82 | 83 | logging.info(f"Mapping to local FS: /framework is {os.getenv('EXTERNAL_A2F_FRAMEWORK_ROOT') or '/framework'}") 84 | logging.info(f"Mapping to local FS: /datasets is {os.getenv('EXTERNAL_A2F_DATASETS_ROOT') or '/datasets'}") 85 | logging.info(f"Mapping to local FS: /workspace is {os.getenv('EXTERNAL_A2F_WORKSPACE_ROOT') or '/workspace'}") 86 | logging.info("--------------------------------------------------------------------------------") 87 | logging.info(f"Preproc Run Name Full: {run_name_full}") 88 | logging.info("--------------------------------------------------------------------------------") 89 | 90 | return { 91 | "run_name_full": run_name_full, 92 | "result_skin": result_skin, 93 | "result_tongue": result_tongue, 94 | "result_deploy": result_deploy, 95 | } 96 | -------------------------------------------------------------------------------- /configs/example-regression/config_dataset.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | ######################################################################################################################## 17 | # Paths to various parts of the Audio2Face dataset (per actor) 18 | # Check the details for each of the parts in Audio2Face-3D-Dataset-v1.0.0-claire/docs/README.html file 19 | # Actor-specific parameters are represented as a dictionary: PARAM = {"actor1": value1, "actor2": value2, ...} 20 | ######################################################################################################################## 21 | 22 | # Audio data 23 | AUDIO_ROOT = { 24 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/audio", 25 | } 26 | 27 | # Skin data 28 | SKIN_CACHE_ROOT = { 29 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/cache/skin", 30 | } 31 | SKIN_NEUTRAL_FPATH = { 32 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/skin/neutral_pose.npy", 33 | } 34 | SKIN_LIP_OPEN_POSE_DELTA_FPATH = { 35 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/skin/lip_open_pose_delta.npy", 36 | } 37 | SKIN_EYE_CLOSE_POSE_DELTA_FPATH = { 38 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/skin/eye_close_pose_delta.npy", 39 | } 40 | SKIN_LIP_DIST_VERTEX_LIST_FPATH = { 41 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/skin/lip_dist_vertex_list.json", 42 | } 43 | SKIN_LIP_SIZE_VERTEX_LIST_FPATH = { 44 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/skin/lip_size_vertex_list.json", 45 | } 46 | SKIN_EYE_DIST_VERTEX_LIST_FPATH = { 47 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/skin/eye_dist_vertex_list.json", 48 | } 49 | 50 | # Tongue data 51 | TONGUE_CACHE_ROOT = { 52 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/cache/tongue", 53 | } 54 | TONGUE_NEUTRAL_FPATH = { 55 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/tongue/neutral_pose.npy", 56 | } 57 | TONGUE_RIGID_VERTEX_LIST_FPATH = { 58 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/tongue/rigid_vertex_list.json", 59 | } 60 | 61 | # Jaw data 62 | JAW_KEYPOINTS_NEUTRAL_FPATH = { 63 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/xform/jaw_keypoints_neutral.npy", 64 | } 65 | JAW_ANIM_DATA_FPATH = { 66 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/xform/jaw_keypoints_cache_all.npz", 67 | } 68 | 69 | # Eye data 70 | EYE_ANIM_DATA_FPATH = { 71 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/xform/eye_rotations_all.npz", 72 | } 73 | EYE_BLINK_KEYS_FPATH = { 74 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/xform/eye_blink_keys.npy", 75 | } 76 | EYE_SACCADE_ROTATIONS_FPATH = { 77 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/xform/eye_saccade_rotations.npy", 78 | } 79 | 80 | # Blendshape data 81 | BLENDSHAPE_SKIN_FPATH = { 82 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/bs_data/bs_skin.npz", 83 | } 84 | BLENDSHAPE_SKIN_CONFIG_FPATH = { 85 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/bs_data/bs_skin_config.json", 86 | } 87 | BLENDSHAPE_TONGUE_FPATH = { 88 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/bs_data/bs_tongue.npz", 89 | } 90 | BLENDSHAPE_TONGUE_CONFIG_FPATH = { 91 | "claire": 
"/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/bs_data/bs_tongue_config.json", 92 | } 93 | 94 | ######################################################################################################################## 95 | # Dataset properties and meta-information 96 | ######################################################################################################################## 97 | 98 | # List of emotions used in the training dataset shots with facial animation performance 99 | SHOT_EMOTION_NAMES = [ 100 | "neutral", 101 | "amazement", 102 | "anger", 103 | "cheekiness", 104 | "disgust", 105 | "fear", 106 | "grief", 107 | "joy", 108 | "outofbreath", 109 | "pain", 110 | "sadness", 111 | ] 112 | 113 | # Frames-Per-Second rate for all the animation caches in the training dataset (per actor) 114 | # This parameter will be used to generate the shot list artifact during Preprocessing and augmented muted shots 115 | CACHE_FPS = { 116 | "claire": 30.0, 117 | } 118 | 119 | # List of the names of the actors performing the animation in the shots 120 | ACTOR_NAMES = [ 121 | "claire", 122 | ] 123 | -------------------------------------------------------------------------------- /configs/example-diffusion/config_dataset.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | ######################################################################################################################## 17 | # Paths to various parts of the Audio2Face dataset (per actor) 18 | # Check the details for each of the parts in Audio2Face-3D-Dataset-v1.0.0-claire/docs/README.html file 19 | # Actor-specific parameters are represented as a dictionary: PARAM = {"actor1": value1, "actor2": value2, ...} 20 | ######################################################################################################################## 21 | 22 | # Audio data 23 | AUDIO_ROOT = { 24 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/audio", 25 | } 26 | 27 | # Skin data 28 | SKIN_CACHE_ROOT = { 29 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/cache/skin", 30 | } 31 | SKIN_NEUTRAL_FPATH = { 32 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/skin/neutral_pose.npy", 33 | } 34 | SKIN_LIP_OPEN_POSE_DELTA_FPATH = { 35 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/skin/lip_open_pose_delta.npy", 36 | } 37 | SKIN_EYE_CLOSE_POSE_DELTA_FPATH = { 38 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/skin/eye_close_pose_delta.npy", 39 | } 40 | SKIN_LIP_DIST_VERTEX_LIST_FPATH = { 41 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/skin/lip_dist_vertex_list.json", 42 | } 43 | SKIN_LIP_SIZE_VERTEX_LIST_FPATH = { 44 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/skin/lip_size_vertex_list.json", 45 | } 46 | SKIN_EYE_DIST_VERTEX_LIST_FPATH = { 47 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/skin/eye_dist_vertex_list.json", 48 | } 49 | 50 | # Tongue data 51 | TONGUE_CACHE_ROOT = { 52 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/cache/tongue", 53 | } 54 | TONGUE_NEUTRAL_FPATH = { 55 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/tongue/neutral_pose.npy", 56 | } 57 | TONGUE_RIGID_VERTEX_LIST_FPATH = { 58 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/geom/tongue/rigid_vertex_list.json", 59 | } 60 | 61 | # Jaw data 62 | JAW_KEYPOINTS_NEUTRAL_FPATH = { 63 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/xform/jaw_keypoints_neutral.npy", 64 | } 65 | JAW_ANIM_DATA_FPATH = { 66 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/xform/jaw_keypoints_cache_all.npz", 67 | } 68 | 69 | # Eye data 70 | EYE_ANIM_DATA_FPATH = { 71 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/xform/eye_rotations_all.npz", 72 | } 73 | EYE_BLINK_KEYS_FPATH = { 74 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/xform/eye_blink_keys.npy", 75 | } 76 | EYE_SACCADE_ROTATIONS_FPATH = { 77 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/xform/eye_saccade_rotations.npy", 78 | } 79 | 80 | # Blendshape data 81 | BLENDSHAPE_SKIN_FPATH = { 82 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/bs_data/bs_skin.npz", 83 | } 84 | BLENDSHAPE_SKIN_CONFIG_FPATH = { 85 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/bs_data/bs_skin_config.json", 86 | } 87 | BLENDSHAPE_TONGUE_FPATH = { 88 | "claire": "/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/bs_data/bs_tongue.npz", 89 | } 90 | BLENDSHAPE_TONGUE_CONFIG_FPATH = { 91 | "claire": 
"/datasets/Audio2Face-3D-Dataset-v1.0.0-claire/data/claire/bs_data/bs_tongue_config.json", 92 | } 93 | 94 | ######################################################################################################################## 95 | # Dataset properties and meta-information 96 | ######################################################################################################################## 97 | 98 | # List of emotions used in the training dataset shots with facial animation performance 99 | SHOT_EMOTION_NAMES = [ 100 | "neutral", 101 | "amazement", 102 | "anger", 103 | "cheekiness", 104 | "disgust", 105 | "fear", 106 | "grief", 107 | "joy", 108 | "outofbreath", 109 | "pain", 110 | "sadness", 111 | ] 112 | 113 | # Frames-Per-Second rate for all the animation caches in the training dataset (per actor) 114 | # This parameter will be used to generate the shot list artifact during Preprocessing and augmented muted shots 115 | CACHE_FPS = { 116 | "claire": 30.0, 117 | } 118 | 119 | # List of the names of the actors performing the animation in the shots 120 | ACTOR_NAMES = [ 121 | "claire", 122 | ] 123 | 124 | # Data transform scale: adjust these values according to your dataset, some data may require increasing scale value 125 | # Format: {"actor_name": {"channel_name": scale_value}} 126 | TRANSFORM_SCALE = { 127 | "claire": { 128 | "skin": 1.0, 129 | "tongue": 1.0, 130 | "jaw": 1.0, 131 | "eye": 1.0, 132 | }, 133 | } 134 | -------------------------------------------------------------------------------- /audio2face/deps/motion_diffusion_model/diffusion/respace.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | ############################################################################ 17 | # Modified from motion-diffusion-model 18 | # Copyright (c) 2022 Guy Tevet 19 | # 20 | # See https://github.com/GuyTevet/motion-diffusion-model/blob/main/LICENSE for details 21 | ############################################################################ 22 | # 23 | # This code is based on https://github.com/openai/guided-diffusion 24 | import numpy as np 25 | import torch as th 26 | 27 | from .gaussian_diffusion import GaussianDiffusion 28 | from ..utils.misc import wrapped_getattr 29 | 30 | 31 | def space_timesteps(num_timesteps, section_counts): 32 | """ 33 | Create a list of timesteps to use from an original diffusion process, 34 | given the number of timesteps we want to take from equally-sized portions 35 | of the original process. 36 | 37 | For example, if there's 300 timesteps and the section counts are [10,15,20] 38 | then the first 100 timesteps are strided to be 10 timesteps, the second 100 39 | are strided to be 15 timesteps, and the final 100 are strided to be 20. 
40 | 41 | If the stride is a string starting with "ddim", then the fixed striding 42 | from the DDIM paper is used, and only one section is allowed. 43 | 44 | :param num_timesteps: the number of diffusion steps in the original 45 | process to divide up. 46 | :param section_counts: either a list of numbers, or a string containing 47 | comma-separated numbers, indicating the step count 48 | per section. As a special case, use "ddimN" where N 49 | is a number of steps to use the striding from the 50 | DDIM paper. 51 | :return: a set of diffusion steps from the original process to use. 52 | """ 53 | if isinstance(section_counts, str): 54 | if section_counts.startswith("ddim"): 55 | desired_count = int(section_counts[len("ddim") :]) 56 | for i in range(1, num_timesteps): 57 | if len(range(0, num_timesteps, i)) == desired_count: 58 | return set(range(0, num_timesteps, i)) 59 | raise ValueError(f"cannot create exactly {desired_count} steps with an integer stride") 60 | section_counts = [int(x) for x in section_counts.split(",")] 61 | size_per = num_timesteps // len(section_counts) 62 | extra = num_timesteps % len(section_counts) 63 | start_idx = 0 64 | all_steps = [] 65 | for i, section_count in enumerate(section_counts): 66 | size = size_per + (1 if i < extra else 0) 67 | if size < section_count: 68 | raise ValueError(f"cannot divide section of {size} steps into {section_count}") 69 | if section_count <= 1: 70 | frac_stride = 1 71 | else: 72 | frac_stride = (size - 1) / (section_count - 1) 73 | cur_idx = 0.0 74 | taken_steps = [] 75 | for _ in range(section_count): 76 | taken_steps.append(start_idx + round(cur_idx)) 77 | cur_idx += frac_stride 78 | all_steps += taken_steps 79 | start_idx += size 80 | return set(all_steps) 81 | 82 | 83 | class SpacedDiffusion(GaussianDiffusion): 84 | """ 85 | A diffusion process which can skip steps in a base diffusion process. 86 | 87 | :param use_timesteps: a collection (sequence or set) of timesteps from the 88 | original diffusion process to retain. 89 | :param kwargs: the kwargs to create the base diffusion process.
90 | """ 91 | 92 | def __init__(self, use_timesteps, **kwargs): 93 | self.use_timesteps = set(use_timesteps) 94 | self.timestep_map = [] 95 | self.original_num_steps = len(kwargs["betas"]) 96 | 97 | base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa 98 | last_alpha_cumprod = 1.0 99 | new_betas = [] 100 | for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): 101 | if i in self.use_timesteps: 102 | new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) 103 | last_alpha_cumprod = alpha_cumprod 104 | self.timestep_map.append(i) 105 | kwargs["betas"] = np.array(new_betas) 106 | super().__init__(**kwargs) 107 | 108 | def p_mean_variance(self, model, *args, **kwargs): # pylint: disable=signature-differs 109 | return super().p_mean_variance(self._wrap_model(model), *args, **kwargs) 110 | 111 | def training_losses(self, model, *args, **kwargs): # pylint: disable=signature-differs 112 | return super().training_losses(self._wrap_model(model), *args, **kwargs) 113 | 114 | def condition_mean(self, cond_fn, *args, **kwargs): 115 | return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs) 116 | 117 | def condition_score(self, cond_fn, *args, **kwargs): 118 | return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) 119 | 120 | def _wrap_model(self, model): 121 | if isinstance(model, _WrappedModel): 122 | return model 123 | return _WrappedModel(model, self.timestep_map, self.rescale_timesteps, self.original_num_steps) 124 | 125 | def _scale_timesteps(self, t): 126 | # Scaling is done by the wrapped model. 127 | return t 128 | 129 | 130 | class _WrappedModel: 131 | def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): 132 | self.model = model 133 | self.timestep_map = timestep_map 134 | self.rescale_timesteps = rescale_timesteps 135 | self.original_num_steps = original_num_steps 136 | 137 | def __call__(self, x, ts, **kwargs): 138 | map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) 139 | new_ts = map_tensor[ts] 140 | if self.rescale_timesteps: 141 | new_ts = new_ts.float() * (1000.0 / self.original_num_steps) 142 | return self.model(x, new_ts, **kwargs) 143 | 144 | def __getattr__(self, name, default=None): 145 | # this method is reached only if name is not in self.__dict__. 146 | return wrapped_getattr(self, name, default) 147 | -------------------------------------------------------------------------------- /audio2face/deps/motion_diffusion_model/diffusion/resample.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | ############################################################################ 17 | # Modified from motion-diffusion-model 18 | # Copyright (c) 2022 Guy Tevet 19 | # 20 | # See https://github.com/GuyTevet/motion-diffusion-model/blob/main/LICENSE for details 21 | ############################################################################ 22 | # 23 | from abc import ABC, abstractmethod 24 | 25 | import numpy as np 26 | import torch as th 27 | import torch.distributed as dist 28 | 29 | 30 | def create_named_schedule_sampler(name, diffusion): 31 | """ 32 | Create a ScheduleSampler from a library of pre-defined samplers. 33 | 34 | :param name: the name of the sampler. 35 | :param diffusion: the diffusion object to sample for. 36 | """ 37 | if name == "uniform": 38 | return UniformSampler(diffusion) 39 | elif name == "loss-second-moment": 40 | return LossSecondMomentResampler(diffusion) 41 | else: 42 | raise NotImplementedError(f"unknown schedule sampler: {name}") 43 | 44 | 45 | class ScheduleSampler(ABC): 46 | """ 47 | A distribution over timesteps in the diffusion process, intended to reduce 48 | variance of the objective. 49 | 50 | By default, samplers perform unbiased importance sampling, in which the 51 | objective's mean is unchanged. 52 | However, subclasses may override sample() to change how the resampled 53 | terms are reweighted, allowing for actual changes in the objective. 54 | """ 55 | 56 | @abstractmethod 57 | def weights(self): 58 | """ 59 | Get a numpy array of weights, one per diffusion step. 60 | 61 | The weights needn't be normalized, but must be positive. 62 | """ 63 | 64 | def sample(self, batch_size, device): 65 | """ 66 | Importance-sample timesteps for a batch. 67 | 68 | :param batch_size: the number of timesteps. 69 | :param device: the torch device to save to. 70 | :return: a tuple (timesteps, weights): 71 | - timesteps: a tensor of timestep indices. 72 | - weights: a tensor of weights to scale the resulting losses. 73 | """ 74 | w = self.weights() 75 | p = w / np.sum(w) 76 | indices_np = np.random.choice(len(p), size=(batch_size,), p=p) 77 | indices = th.from_numpy(indices_np).long().to(device) 78 | weights_np = 1 / (len(p) * p[indices_np]) 79 | weights = th.from_numpy(weights_np).float().to(device) 80 | return indices, weights 81 | 82 | 83 | class UniformSampler(ScheduleSampler): 84 | def __init__(self, diffusion): 85 | self.diffusion = diffusion 86 | self._weights = np.ones([diffusion.num_timesteps]) 87 | 88 | def weights(self): 89 | return self._weights 90 | 91 | 92 | class LossAwareSampler(ScheduleSampler): 93 | def update_with_local_losses(self, local_ts, local_losses): 94 | """ 95 | Update the reweighting using losses from a model. 96 | 97 | Call this method from each rank with a batch of timesteps and the 98 | corresponding losses for each of those timesteps. 99 | This method will perform synchronization to make sure all of the ranks 100 | maintain the exact same reweighting. 101 | 102 | :param local_ts: an integer Tensor of timesteps. 103 | :param local_losses: a 1D Tensor of losses. 104 | """ 105 | batch_sizes = [th.tensor([0], dtype=th.int32, device=local_ts.device) for _ in range(dist.get_world_size())] 106 | dist.all_gather( 107 | batch_sizes, 108 | th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device), 109 | ) 110 | 111 | # Pad all_gather batches to be the maximum batch size. 
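        # torch.distributed.all_gather requires all ranks to contribute tensors of identical
        # shape, so every batch is zero-padded up to the largest gathered batch size and the
        # padding is stripped again below via the y[:bs] slices.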
112 | batch_sizes = [x.item() for x in batch_sizes] 113 | max_bs = max(batch_sizes) 114 | 115 | timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes] 116 | loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes] 117 | dist.all_gather(timestep_batches, local_ts) 118 | dist.all_gather(loss_batches, local_losses) 119 | timesteps = [x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]] 120 | losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]] 121 | self.update_with_all_losses(timesteps, losses) 122 | 123 | @abstractmethod 124 | def update_with_all_losses(self, ts, losses): 125 | """ 126 | Update the reweighting using losses from a model. 127 | 128 | Sub-classes should override this method to update the reweighting 129 | using losses from the model. 130 | 131 | This method directly updates the reweighting without synchronizing 132 | between workers. It is called by update_with_local_losses from all 133 | ranks with identical arguments. Thus, it should have deterministic 134 | behavior to maintain state across workers. 135 | 136 | :param ts: a list of int timesteps. 137 | :param losses: a list of float losses, one per timestep. 138 | """ 139 | 140 | 141 | class LossSecondMomentResampler(LossAwareSampler): 142 | def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001): 143 | self.diffusion = diffusion 144 | self.history_per_term = history_per_term 145 | self.uniform_prob = uniform_prob 146 | self._loss_history = np.zeros([diffusion.num_timesteps, history_per_term], dtype=np.float64) 147 | self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64) 148 | 149 | def weights(self): 150 | if not self._warmed_up(): 151 | return np.ones([self.diffusion.num_timesteps], dtype=np.float64) 152 | weights = np.sqrt(np.mean(self._loss_history**2, axis=-1)) 153 | weights /= np.sum(weights) 154 | weights *= 1 - self.uniform_prob 155 | weights += self.uniform_prob / len(weights) 156 | return weights 157 | 158 | def update_with_all_losses(self, ts, losses): 159 | for t, loss in zip(ts, losses): 160 | if self._loss_counts[t] == self.history_per_term: 161 | # Shift out the oldest loss term. 162 | self._loss_history[t, :-1] = self._loss_history[t, 1:] 163 | self._loss_history[t, -1] = loss 164 | else: 165 | self._loss_history[t, self._loss_counts[t]] = loss 166 | self._loss_counts[t] += 1 167 | 168 | def _warmed_up(self): 169 | return (self._loss_counts == self.history_per_term).all() 170 | -------------------------------------------------------------------------------- /audio2face/emotion.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | # 16 | import pickle 17 | import numpy as np 18 | 19 | from audio2face import utils 20 | from audio2face.dataset import Clip 21 | 22 | 23 | class ImplicitEmotionManager: 24 | def __init__(self, emo_db: np.ndarray | None = None, emo_specs: dict | None = None) -> None: 25 | self.emo_db = emo_db 26 | self.emo_specs = emo_specs 27 | self.compactified = False 28 | 29 | def load(self, fpath: str) -> None: 30 | with open(fpath, "rb") as f: 31 | emo_data = pickle.load(f) 32 | self.emo_db = emo_data["emo_db"] 33 | self.emo_specs = emo_data["emo_specs"] 34 | self.compactified = emo_data["compactified"] 35 | 36 | def save_pkl(self, fpath: str) -> None: 37 | emo_data = { 38 | "emo_db": self.emo_db, 39 | "emo_specs": self.emo_specs, 40 | "compactified": self.compactified, 41 | } 42 | with open(fpath, "wb") as f: 43 | pickle.dump(emo_data, f) 44 | 45 | def load_bin(self, fpath: str) -> None: 46 | with open(fpath, "rb") as f: 47 | data = f.read() 48 | idx = 0 49 | num_shots = utils.bytes2int(data[idx : idx + 4]) 50 | idx += 4 51 | self.emo_specs = {} 52 | for _ in range(num_shots): 53 | shot_name_len = utils.bytes2int(data[idx : idx + 4]) 54 | idx += 4 55 | shot_name = data[idx : idx + shot_name_len].decode() 56 | idx += shot_name_len 57 | first_frame = utils.bytes2int(data[idx : idx + 4]) 58 | idx += 4 59 | num_frames = utils.bytes2int(data[idx : idx + 4]) 60 | idx += 4 61 | self.emo_specs[shot_name] = (first_frame, num_frames) 62 | emo_len = utils.bytes2int(data[idx : idx + 4]) 63 | idx += 4 64 | emo_db_size = utils.bytes2int(data[idx : idx + 4]) 65 | idx += 4 66 | self.emo_db = np.frombuffer(data[idx : idx + emo_db_size * 4], dtype=np.float32).reshape(-1, emo_len) 67 | self.compactified = None # unknown 68 | 69 | def save_bin(self, fpath: str) -> None: 70 | emo_shots = list(self.emo_specs.items()) 71 | data = b"" 72 | data += utils.int2bytes(len(emo_shots)) 73 | for emo_shot in emo_shots: 74 | data += utils.int2bytes(len(emo_shot[0])) 75 | data += emo_shot[0].encode() 76 | data += utils.int2bytes(emo_shot[1][0]) 77 | data += utils.int2bytes(emo_shot[1][1]) 78 | data += utils.int2bytes(self.emo_db.shape[1]) 79 | data += utils.int2bytes(self.emo_db.size) 80 | data += self.emo_db.tobytes() 81 | with open(fpath, "wb") as f: 82 | f.write(data) 83 | 84 | def load_npz(self, fpath: str) -> None: 85 | npz_data = np.load(fpath) 86 | self.emo_db = npz_data["emo_db"] 87 | self.compactified = None # unknown 88 | self.emo_specs = {} 89 | 90 | emo_spec_names = [name.decode("utf-8") for name in npz_data["emo_spec_names"]] 91 | for name, start, size in zip(emo_spec_names, npz_data["emo_spec_start"], npz_data["emo_spec_size"]): 92 | self.emo_specs[name] = (start, size) 93 | 94 | def save_npz(self, fpath: str) -> None: 95 | emo_data = {} 96 | emo_data["emo_db"] = self.emo_db 97 | emo_data["emo_spec_names"] = [] 98 | emo_data["emo_spec_start"] = [] 99 | emo_data["emo_spec_size"] = [] 100 | 101 | emo_specs = self.emo_specs 102 | emo_specs_sorted = sorted([k for k in emo_specs], key=lambda x: emo_specs[x][0]) 103 | for k in emo_specs_sorted: 104 | start, size = emo_specs[k] 105 | emo_data["emo_spec_names"].append(k) 106 | emo_data["emo_spec_start"].append(start) 107 | emo_data["emo_spec_size"].append(size) 108 | 109 | emo_data["emo_spec_names"] = np.array(emo_data["emo_spec_names"], dtype="S") 110 | emo_data["emo_spec_start"] = np.array(emo_data["emo_spec_start"], dtype=np.int32) 111 | emo_data["emo_spec_size"] = np.array(emo_data["emo_spec_size"], dtype=np.int32) 112 | np.savez(fpath, **emo_data) 113 | 114 | def 
emo_spec_to_idx(self, emo_spec: tuple[str, int]) -> int: 115 | if self.emo_specs is None: 116 | raise RuntimeError("ImplicitEmotionManager is not initialized") 117 | emo_shot, emo_frame = emo_spec 118 | first_frame, num_frames = self.emo_specs[emo_shot] 119 | if emo_frame < 0 or emo_frame >= num_frames: 120 | raise RuntimeError( 121 | "Emotion Frame {} is out of range [{}, {}] for Shot {}".format(emo_frame, 0, num_frames - 1, emo_shot) 122 | ) 123 | idx = first_frame + emo_frame 124 | return idx 125 | 126 | def get_emotion_vector(self, emo_spec: tuple[str, int]) -> np.ndarray: 127 | if self.emo_db is None: 128 | raise RuntimeError("ImplicitEmotionManager is not initialized") 129 | global_idx = self.emo_spec_to_idx(emo_spec) 130 | return self.emo_db[global_idx, :] 131 | 132 | def get_shot_matrix(self, shot_id: str) -> np.ndarray: 133 | start, size = self.emo_specs[shot_id] 134 | return self.emo_db[start : start + size] 135 | 136 | def compactify(self, dataset_clips: list[Clip]) -> None: 137 | if self.compactified: 138 | return 139 | compact_emo_db = np.zeros((0, self.emo_db.shape[1]), dtype=self.emo_db.dtype) 140 | compact_emo_specs = {} 141 | sorted_shots = sorted(self.emo_specs.items(), key=lambda spec: spec[1][0]) 142 | compact_shot_start_global = 0 143 | for shot_id, _ in sorted_shots: 144 | shot_ranges = [] 145 | for clip in dataset_clips: 146 | if clip.shot.id == shot_id: 147 | shot_ranges.append((clip.first_frame, clip.last_frame)) 148 | if len(shot_ranges) > 0: # if this shot is covered by any clips 149 | shot_ranges = utils.merge_ranges(shot_ranges) 150 | shot_matrix = self.get_shot_matrix(shot_id) 151 | compact_shot_matrix = utils.get_merged_submatrix_from_ranges(shot_matrix, shot_ranges) 152 | compact_emo_db = np.concatenate((compact_emo_db, compact_shot_matrix), axis=0) 153 | compact_shot_len = len(compact_shot_matrix) 154 | compact_emo_specs[shot_id] = (compact_shot_start_global, compact_shot_len) 155 | compact_shot_start_global += compact_shot_len 156 | self.emo_db = compact_emo_db 157 | self.emo_specs = compact_emo_specs 158 | self.compactified = True 159 | -------------------------------------------------------------------------------- /docs/a2f_introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | ## Audio2Face-3D Models and Integrations 4 | 5 | ### Audio2Face-3D Evolution 6 | 7 | Prior to its release as an open-source project, Audio2Face-3D was shipped with a curated set of pre-trained models. The included package evolved from its initial offering of a single model to a more comprehensive selection of three distinct models, providing users with enhanced flexibility and functionality out of the box. 8 | 9 | audio2face evolution 10 | 11 | Audio2Face-3D (A2F) is designed to be extensible, allowing for the integration of additional user-generated models. Any new model that conforms to the architectural specifications of the existing Training Framework can be seamlessly incorporated. 12 | 13 | ### Animation Inference with Audio2Face-3D 14 | The Audio2Face-3D (A2F) inference engine offers two primary methods for driving facial animation: 15 | 16 | **Direct Vertex Manipulation:** The system can directly calculate and apply animation to the vertex positions of a target facial mesh. This method is ideal for workflows that do not rely on traditional rigging systems. 17 | 18 | **Blend Shape Weights:** Alternatively, the framework can infer and output a series of blend shape weights. 
Traditionally, Audio2Face-3D has adopted the Apple ARKit standard to ensure immediate compatibility with a wide variety of facial systems; more information about ARKit is available on [Apple's developer site](https://developer.apple.com/documentation/arkit/arfaceanchor/blendshapelocation). The output is not limited to ARKit, however: custom blend shape setups are also supported. 19 | 20 | **Mapping to Custom Rig Controls:** The blend shape weight values generated by Audio2Face-3D can serve as a powerful intermediary to drive the controls of any custom character rig, including complex systems. Through a strategic mapping process, you can achieve high-quality facial animation on a wide variety of character configurations. A highly effective method for this is to create a set of blend shapes that directly correspond to the controls of your character's rig. 21 | 22 | This open architecture, combined with the provided Training Framework, empowers developers and artists to create highly customized facial animation models. This enables the final performance to be tailored to the character personality and custom rig controls. 23 | 24 | ### What is an Audio2Face-3D Model? 25 | 26 | Audio2Face-3D has three pre-trained models: Mark, James, and Claire. Each model has its own personality and was trained on different languages, and users can run inference with these models out of the box. 27 | 28 | audio2face models 29 | 30 | In essence, when Audio2Face-3D uses a model for inference, it relies on a model card (a .json file) and a set of files generated by the Training Framework: 31 | 32 | model files 33 | 34 | This document provides a comprehensive overview of the model files used by Audio2Face, detailing their generation process within the Training Framework. Once these model files are created and finalized, they can be deployed for inference across the Audio2Face-3D ecosystem. 35 | 36 | ### Audio2Face-3D Integrations 37 | 38 | The Audio2Face-3D inference engine is accessible through several versatile integration points, allowing you to incorporate the technology into a wide range of production pipelines. 39 | 40 | * **Maya-ACE Plugin for Autodesk Maya:** A dedicated plugin that enables interactive animation inference directly within the Maya environment. It can load and use any compatible Audio2Face-3D model to drive character rigs. 41 | 42 | * **ACE Plugin for Epic's Unreal Engine:** Use our Audio2Face-3D plugin for Unreal Engine 5 alongside a configuration sample to enhance your MetaHuman. Audio2Face-3D 3.0 is now available with on-device Unreal Engine 5 support. 43 | 44 | * **Audio2Face-3D (A2F) SDK:** A C++ Software Development Kit (SDK) designed for developers to natively integrate the inference engine into custom applications, game engines, or other content creation tools for seamless performance. 45 | 46 | * **Audio2Face-3D NIM (NVIDIA Inference Microservice):** A scalable, containerized microservice that exposes the inference functionality through the gRPC protocol. This architecture is ideal for building flexible and high-performance services that can be accessed remotely using various clients, including Python scripts. 47 | 48 |           Maya-ACE with Trained Model in Action 49 | 50 | ## Training Framework | High Level Overview 51 | 52 | The primary goal of the Training Framework is to produce custom deep learning models that the Audio2Face application can use to perform real-time, audio-driven facial animation inference.
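For orientation, here is a minimal sketch of how the preprocessing stage maps onto the framework's Python API, based on the `run()` entry point in `audio2face/preproc/preproc.py`. It is illustrative only: the supported way to launch the stages is through the provided runner scripts and Docker helpers, the import path and the `"my_run"` name are assumptions, and `RUN_NAME` is the only configuration override shown.

```python
# Illustrative sketch, not the official entry point: the runner scripts and Docker
# helpers in this repository are the supported way to launch preprocessing.
from audio2face.preproc import preproc  # assumed import path for preproc.py

result = preproc.run(
    actor_name="claire",                     # an actor defined in the dataset config
    cfg_preproc_mod={"RUN_NAME": "my_run"},  # overrides applied on top of config_preproc_base
    cfg_dataset_mod=None,                    # use the dataset config unchanged
)
# run() returns a dict; "run_name_full" is the timestamped run name,
# e.g. "250101_120000_my_run", which later stages reference.
print(result["run_name_full"])
```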
53 | 54 | From a technical perspective, the Training Framework is designed to run in a Linux environment (**Ubuntu** or **Windows Subsystem for Linux - WSL**) and requires a **CUDA-compatible GPU** for hardware acceleration. This document provides detailed, step-by-step instructions for acquiring all necessary components and correctly configuring the framework. 55 | 56 | The major components of the framework and the end-to-end training workflow are illustrated in the diagram below: 57 | 58 | Training Framework high level overview 59 | 60 | The Training Framework is encapsulated within a **Docker container**, which is built locally using the provided Dockerfile. 61 | 62 | The operation of this container is orchestrated by a suite of Python scripts and configuration files available in a dedicated **GitHub repository**. These scripts serve as the primary interface, bridging the local animation dataset with the training processes running inside the container. 63 | 64 | A critical prerequisite for training is the preparation of a facial animation dataset that adheres to the specific format and directory structure required by the framework. Detailed instructions on how to properly assemble a custom dataset are provided in the [Preparing Animation Data for Training](preparing_animation_data.md) document. 65 | 66 | For the container and its scripts to function correctly, the prepared dataset must be copied into the designated directory on the Ubuntu/WSL host system. To facilitate learning and testing, a complete example dataset is available on **Hugging Face**. This resource is highly recommended for verifying your setup and understanding the expected data structure before using your own data. 67 | 68 | The ultimate output of the framework is a set of deployable model files. These files contain all the necessary data for the Audio2Face application to perform high-quality facial animation inference based on your custom training data. 69 | 70 | The workflow is executed through a series of sequential steps: 71 | 72 | 1. **Dataset Assembly:** The audio and animation data is collected, formatted, and placed into the correct directory structure as specified in this guide. 73 | 74 | 2. **Execution via Scripts:** Using the provided Python scripts and configuration files, the core training pipeline is initiated. This process involves sub-steps: 75 | * **Preprocessing:** The raw dataset is validated and prepared for training. 76 | * **Training:** The neural network is trained on the preprocessed data. 77 | * **Deployment:** The trained model is packaged for further use. 78 | 79 | training workflow 80 | 81 | 3. **TensorRT™ Engine Build:** An NVIDIA TensorRT™ engine needs to be built from the model to optimize it for high-performance, low-latency inference. 82 | 83 | 4. **Inference:** Once the final model files are generated, they can be used for animation inference via any of the supported Audio2Face-3D integration points, such as the **Maya-ACE plugin**, the **A2F SDK**, or custom scripts leveraging the **Audio2Face NIM**. 84 | 85 | *** 86 | 87 | [Back](../README.md) 88 | -------------------------------------------------------------------------------- /audio2face/convert_onnx.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import os 17 | 18 | import torch 19 | import torch.onnx 20 | 21 | from audio2face import utils 22 | from audio2face.networks.base import NetworkBaseDiffusion 23 | from audio2face.infer_diffusion import InferenceEngine 24 | 25 | 26 | class Converter(InferenceEngine): 27 | def __init__( 28 | self, 29 | training_run_name_full: str, 30 | cfg_train_mod: dict | None = None, 31 | cfg_dataset_mod: dict | None = None, 32 | cfg_inference_mod: dict | None = None, 33 | ) -> None: 34 | self.training_run_name_full = training_run_name_full 35 | self.cfg_train_mod = cfg_train_mod 36 | self.cfg_dataset_mod = cfg_dataset_mod 37 | self.cfg_inference_mod = cfg_inference_mod 38 | 39 | def setup(self) -> None: 40 | super().setup() 41 | self.deploy_dir = os.path.normpath(os.path.join(self.training_artifact_dir, "deploy")) 42 | 43 | self.wrap_model = WrapModel( 44 | self.cfg_train, self.cfg_dataset, self.cfg_inference, self.network, self.diffusion, self.sample_fn 45 | ) 46 | self.num_frames = ( 47 | self.cfg_train.STREAMING_CFG[str(int(self.cfg_train.TARGET_FPS))]["block_frame_size"] 48 | + self.cfg_train.STREAMING_CFG[str(int(self.cfg_train.TARGET_FPS))]["left_truncate"] 49 | + self.cfg_train.STREAMING_CFG[str(int(self.cfg_train.TARGET_FPS))]["right_truncate"] 50 | ) 51 | self.total_dim = ( 52 | self.wrap_model.model.skin_dim 53 | + self.wrap_model.model.tongue_dim 54 | + self.wrap_model.model.jaw_dim 55 | + self.wrap_model.model.eye_dim 56 | ) 57 | 58 | def convert_model(self) -> None: 59 | wrapped_model = self.wrap_model 60 | num_diffusion_steps = len(wrapped_model.diffusion.use_timesteps) 61 | batch_size = 3 62 | num_actors = len(self.cfg_dataset.ACTOR_NAMES) 63 | window = torch.randn( 64 | batch_size, 65 | self.cfg_train.STREAMING_CFG[str(int(self.cfg_train.TARGET_FPS))]["window_size"], 66 | dtype=torch.float32, 67 | ).to(torch.device("cuda")) 68 | actor_vec = torch.randn(batch_size, num_actors, dtype=torch.float32).to(torch.device("cuda")) 69 | emo_len = len( 70 | utils.get_network_emotion_names( 71 | self.cfg_dataset.SHOT_EMOTION_NAMES, self.cfg_train.SHOT_EMOTION_NAME_FOR_ALL_ZEROS 72 | ) 73 | ) 74 | if self.cfg_inference.USE_PER_FRAME_EMO_LABEL: 75 | emotion_vec = torch.randn( 76 | batch_size, 77 | self.cfg_train.STREAMING_CFG[str(int(self.cfg_train.TARGET_FPS))]["block_frame_size"], 78 | emo_len, 79 | dtype=torch.float32, 80 | ).to(torch.device("cuda")) 81 | else: 82 | emotion_vec = torch.randn(batch_size, emo_len, dtype=torch.float32).to(torch.device("cuda")) 83 | h_gru_all = torch.randn( 84 | num_diffusion_steps, 85 | self.cfg_train["NETWORK_HYPER_PARAMS"]["num_gru_layers"], 86 | batch_size, 87 | self.cfg_train["NETWORK_HYPER_PARAMS"]["gru_feature_dim"], 88 | dtype=torch.float32, 89 | ).to(torch.device("cuda")) 90 | 91 | if self.cfg_inference.USE_EXTERNAL_NOISE_INPUT: 92 | noise = torch.randn([batch_size, num_diffusion_steps + 1, self.num_frames, self.total_dim], 
device="cuda") 93 | else: 94 | noise = None # use model with internal generated random noise 95 | 96 | onnx_model_fpath = os.path.join(self.deploy_dir, "network.onnx") 97 | torch.onnx.export( 98 | wrapped_model, 99 | (window, actor_vec, emotion_vec, h_gru_all, noise), 100 | onnx_model_fpath, 101 | input_names=["window", "identity", "emotion", "input_latents", "noise"], 102 | output_names=["prediction", "output_latents"], 103 | opset_version=14, 104 | do_constant_folding=False, # TODO Setting to true causes gpu/cpu device mismatch error 105 | dynamic_axes={ 106 | "window": [0], 107 | "identity": [0], # actor_name 108 | "emotion": [0], 109 | "input_latents": [2], 110 | "noise": [0], 111 | "prediction": [0], 112 | "output_latents": [2], 113 | }, 114 | ) 115 | 116 | 117 | class WrapModel(torch.nn.Module): 118 | def __init__( 119 | self, 120 | cfg_train: dict, 121 | cfg_dataset: dict, 122 | cfg_inference: dict, 123 | network: NetworkBaseDiffusion, 124 | diffusion, 125 | sample_fn, 126 | ) -> None: 127 | super(WrapModel, self).__init__() 128 | self.cfg_train = cfg_train 129 | self.cfg_dataset = cfg_dataset 130 | self.cfg_inference = cfg_inference 131 | self.model = network 132 | 133 | self.model.TIMESTEP_RESPACING = cfg_inference.TIMESTEP_RESPACING 134 | if self.cfg_inference.USE_DELTA_OUTPUT: 135 | self.model.set_mode(mode="streaming_stateless_output_delta") 136 | else: 137 | self.model.set_mode(mode="streaming_stateless") 138 | 139 | self.diffusion = diffusion 140 | self.sample_fn = sample_fn 141 | assert not cfg_inference.USE_DDIM # TODO Only support ddpm for now 142 | 143 | def forward( 144 | self, 145 | window: torch.Tensor, 146 | actor_vec: torch.Tensor, 147 | emotion_vec: torch.Tensor, 148 | h_gru_all: torch.Tensor, 149 | noise: torch.Tensor, 150 | ) -> torch.Tensor: 151 | num_frame = ( 152 | self.cfg_train.STREAMING_CFG[str(int(self.cfg_train.TARGET_FPS))]["block_frame_size"] 153 | + self.cfg_train.STREAMING_CFG[str(int(self.cfg_train.TARGET_FPS))]["left_truncate"] 154 | + self.cfg_train.STREAMING_CFG[str(int(self.cfg_train.TARGET_FPS))]["right_truncate"] 155 | ) 156 | prediction, h_gru_all = self.sample_fn( 157 | self.model, 158 | ( 159 | window.shape[0], 160 | num_frame, 161 | self.model.skin_dim + self.model.tongue_dim + self.model.jaw_dim + self.model.eye_dim, 162 | ), 163 | clip_denoised=False, 164 | model_kwargs={ 165 | "audio": window, 166 | "actor_vec": actor_vec, 167 | "emotion_vec": emotion_vec, 168 | "h_gru_all": h_gru_all, 169 | }, 170 | skip_timesteps=self.cfg_inference.SKIP_STEPS, 171 | init_image=None, 172 | progress=False, 173 | dump_steps=None, 174 | noise=noise, 175 | const_noise=False, 176 | device="cuda", 177 | ) 178 | return prediction, h_gru_all 179 | 180 | 181 | def run( 182 | training_run_name_full: str, 183 | cfg_train_mod: dict | None = None, 184 | cfg_dataset_mod: dict | None = None, 185 | cfg_inference_mod: dict | None = None, 186 | ) -> None: 187 | converter = Converter(training_run_name_full, cfg_train_mod, cfg_dataset_mod, cfg_inference_mod) 188 | converter.setup() 189 | converter.convert_model() 190 | -------------------------------------------------------------------------------- /audio2face/deps/motion_diffusion_model/diffusion/nn.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | ############################################################################ 17 | # Modified from motion-diffusion-model 18 | # Copyright (c) 2022 Guy Tevet 19 | # 20 | # See https://github.com/GuyTevet/motion-diffusion-model/blob/main/LICENSE for details 21 | ############################################################################ 22 | # 23 | # This code is based on https://github.com/openai/guided-diffusion 24 | """ 25 | Various utilities for neural networks. 26 | """ 27 | 28 | import math 29 | 30 | import torch as th 31 | import torch.nn as nn 32 | 33 | 34 | # PyTorch 1.7 has SiLU, but we support PyTorch 1.5. 35 | class SiLU(nn.Module): 36 | def forward(self, x): 37 | return x * th.sigmoid(x) 38 | 39 | 40 | class GroupNorm32(nn.GroupNorm): 41 | def forward(self, x): 42 | return super().forward(x.float()).type(x.dtype) 43 | 44 | 45 | def conv_nd(dims, *args, **kwargs): 46 | """ 47 | Create a 1D, 2D, or 3D convolution module. 48 | """ 49 | if dims == 1: 50 | return nn.Conv1d(*args, **kwargs) 51 | elif dims == 2: 52 | return nn.Conv2d(*args, **kwargs) 53 | elif dims == 3: 54 | return nn.Conv3d(*args, **kwargs) 55 | raise ValueError(f"unsupported dimensions: {dims}") 56 | 57 | 58 | def linear(*args, **kwargs): 59 | """ 60 | Create a linear module. 61 | """ 62 | return nn.Linear(*args, **kwargs) 63 | 64 | 65 | def avg_pool_nd(dims, *args, **kwargs): 66 | """ 67 | Create a 1D, 2D, or 3D average pooling module. 68 | """ 69 | if dims == 1: 70 | return nn.AvgPool1d(*args, **kwargs) 71 | elif dims == 2: 72 | return nn.AvgPool2d(*args, **kwargs) 73 | elif dims == 3: 74 | return nn.AvgPool3d(*args, **kwargs) 75 | raise ValueError(f"unsupported dimensions: {dims}") 76 | 77 | 78 | def update_ema(target_params, source_params, rate=0.99): 79 | """ 80 | Update target parameters to be closer to those of source parameters using 81 | an exponential moving average. 82 | 83 | :param target_params: the target parameter sequence. 84 | :param source_params: the source parameter sequence. 85 | :param rate: the EMA rate (closer to 1 means slower). 86 | """ 87 | for targ, src in zip(target_params, source_params): 88 | targ.detach().mul_(rate).add_(src, alpha=1 - rate) 89 | 90 | 91 | def zero_module(module): 92 | """ 93 | Zero out the parameters of a module and return it. 94 | """ 95 | for p in module.parameters(): 96 | p.detach().zero_() 97 | return module 98 | 99 | 100 | def scale_module(module, scale): 101 | """ 102 | Scale the parameters of a module and return it. 103 | """ 104 | for p in module.parameters(): 105 | p.detach().mul_(scale) 106 | return module 107 | 108 | 109 | def mean_flat(tensor): 110 | """ 111 | Take the mean over all non-batch dimensions. 112 | """ 113 | return tensor.mean(dim=list(range(1, len(tensor.shape)))) 114 | 115 | 116 | def sum_flat(tensor): 117 | """ 118 | Take the sum over all non-batch dimensions. 
119 | """ 120 | return tensor.sum(dim=list(range(1, len(tensor.shape)))) 121 | 122 | 123 | def normalization(channels): 124 | """ 125 | Make a standard normalization layer. 126 | 127 | :param channels: number of input channels. 128 | :return: an nn.Module for normalization. 129 | """ 130 | return GroupNorm32(32, channels) 131 | 132 | 133 | def timestep_embedding(timesteps, dim, max_period=10000): 134 | """ 135 | Create sinusoidal timestep embeddings. 136 | 137 | :param timesteps: a 1-D Tensor of N indices, one per batch element. 138 | These may be fractional. 139 | :param dim: the dimension of the output. 140 | :param max_period: controls the minimum frequency of the embeddings. 141 | :return: an [N x dim] Tensor of positional embeddings. 142 | """ 143 | half = dim // 2 144 | freqs = th.exp(-math.log(max_period) * th.arange(start=0, end=half, dtype=th.float32) / half).to( 145 | device=timesteps.device 146 | ) 147 | args = timesteps[:, None].float() * freqs[None] 148 | embedding = th.cat([th.cos(args), th.sin(args)], dim=-1) 149 | if dim % 2: 150 | embedding = th.cat([embedding, th.zeros_like(embedding[:, :1])], dim=-1) 151 | return embedding 152 | 153 | 154 | def checkpoint(func, inputs, params, flag): 155 | """ 156 | Evaluate a function without caching intermediate activations, allowing for 157 | reduced memory at the expense of extra compute in the backward pass. 158 | :param func: the function to evaluate. 159 | :param inputs: the argument sequence to pass to `func`. 160 | :param params: a sequence of parameters `func` depends on but does not 161 | explicitly take as arguments. 162 | :param flag: if False, disable gradient checkpointing. 163 | """ 164 | if flag: 165 | args = tuple(inputs) + tuple(params) 166 | return CheckpointFunction.apply(func, len(inputs), *args) 167 | else: 168 | return func(*inputs) 169 | 170 | 171 | class CheckpointFunction(th.autograd.Function): 172 | @staticmethod 173 | @th.cuda.amp.custom_fwd 174 | def forward(ctx, run_function, length, *args): 175 | ctx.run_function = run_function 176 | ctx.input_length = length 177 | ctx.save_for_backward(*args) 178 | with th.no_grad(): 179 | output_tensors = ctx.run_function(*args[:length]) 180 | return output_tensors 181 | 182 | @staticmethod 183 | @th.cuda.amp.custom_bwd 184 | def backward(ctx, *output_grads): 185 | args = list(ctx.saved_tensors) 186 | 187 | # Filter for inputs that require grad. If none, exit early. 188 | input_indices = [i for (i, x) in enumerate(args) if x.requires_grad] 189 | if not input_indices: 190 | return (None, None) + tuple(None for _ in args) 191 | 192 | with th.enable_grad(): 193 | for i in input_indices: 194 | if i < ctx.input_length: 195 | # Not sure why the OAI code does this little 196 | # dance. It might not be necessary. 197 | args[i] = args[i].detach().requires_grad_() 198 | args[i] = args[i].view_as(args[i]) 199 | output_tensors = ctx.run_function(*args[: ctx.input_length]) 200 | 201 | if isinstance(output_tensors, th.Tensor): 202 | output_tensors = [output_tensors] 203 | 204 | # Filter for outputs that require grad. If none, exit early. 205 | out_and_grads = [(o, g) for (o, g) in zip(output_tensors, output_grads) if o.requires_grad] 206 | if not out_and_grads: 207 | return (None, None) + tuple(None for _ in args) 208 | 209 | # Compute gradients on the filtered tensors. 210 | computed_grads = th.autograd.grad( 211 | [o for (o, g) in out_and_grads], [args[i] for i in input_indices], [g for (o, g) in out_and_grads] 212 | ) 213 | 214 | # Reassemble the complete gradient tuple. 
215 | input_grads = [None for _ in args] 216 | for i, g in zip(input_indices, computed_grads): 217 | input_grads[i] = g 218 | return (None, None) + tuple(input_grads) 219 | -------------------------------------------------------------------------------- /audio2face/deps/charsiu/README.md: -------------------------------------------------------------------------------- 1 | ## Charsiu: A transformer-based phonetic aligner [[arXiv]](https://arxiv.org/abs/2110.03876) 2 | 3 | ### Updates 4 | - 2.10.2022. We release phone- and word-level alignments for 860k utterances from the English subset of Common Voice. Check out [this link](misc/data.md#alignments-for-english-datasets). 5 | - 1.31.2022. We release phone- and word-level alignments for over a million Mandarin utterances. Check out [this link](misc/data.md#alignments-for-mandarin-speech-datasets). 6 | - 1.26.2022. Word alignment functionality has been added to `charsiu_forced_aligner`. 7 | 8 | ### Intro 9 | **Charsiu** is a phonetic alignment tool, which can: 10 | - recognise phonemes in a given audio file 11 | - perform forced alignment using phone transcriptions created in the previous step or provided by the user. 12 | - directly predict the phone-to-audio alignment from audio (text-independent alignment) 13 | 14 | The aligner is under active development. New functions, new languages and detailed documentation will be added soon! Give us a star if you like our project! 15 | **Fun fact**: Char Siu is one of the most representative dishes of Cantonese cuisine 🍲 (see [wiki](https://en.wikipedia.org/wiki/Char_siu)). 16 | 17 | 18 | 19 | ### Table of contents 20 | - [Tutorial](README.md#Tutorial) 21 | - [Usage](README.md#Usage) 22 | - [Pretrained models](README.md#Pretrained-models) 23 | - [Development plan](README.md#Development-plan) 24 | - [Dependencies](README.md#Dependencies) 25 | - [Training](README.md#Training) 26 | - [Attribution and Citation](README.md#attribution-and-citation) 27 | - [References](README.md#References) 28 | - [Disclaimer](README.md#Disclaimer) 29 | - [Support or Contact](README.md#support-or-contact) 30 | 31 | 32 | 33 | 34 | ### Tutorial 35 | **[!NEW]** A step-by-step tutorial for linguists: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lingjzhu/charsiu/blob/development/charsiu_tutorial.ipynb) 36 | 37 | You can directly run our model in the cloud via Google Colab!
38 | - Forced alignment: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lingjzhu/charsiu/blob/development/charsiu_forced_alignment_demo.ipynb) 39 | - Textless alignment: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lingjzhu/charsiu/blob/development/charsiu_textless_demo.ipynb) 40 | 41 | ### Usage 42 | ``` 43 | git clone https://github.com/lingjzhu/charsiu 44 | cd charsiu 45 | ``` 46 | #### Forced alignment 47 | ```Python 48 | from Charsiu import charsiu_forced_aligner 49 | # if there are errors importing, uncomment the following lines and add path to charsiu 50 | # import sys 51 | # sys.path.append('path_to_charsiu/src') 52 | 53 | # initialize model 54 | charsiu = charsiu_forced_aligner(aligner='charsiu/en_w2v2_fc_10ms') 55 | # perform forced alignment 56 | alignment = charsiu.align(audio='./local/SA1.WAV', 57 | text='She had your dark suit in greasy wash water all year.') 58 | # perform forced alignment and save the output as a textgrid file 59 | charsiu.serve(audio='./local/SA1.WAV', 60 | text='She had your dark suit in greasy wash water all year.', 61 | save_to='./local/SA1.TextGrid') 62 | 63 | 64 | # Chinese 65 | charsiu = charsiu_forced_aligner(aligner='charsiu/zh_w2v2_tiny_fc_10ms',lang='zh') 66 | charsiu.align(audio='./local/SSB00050015_16k.wav',text='经广州日报报道后成为了社会热点。') 67 | charsiu.serve(audio='./local/SSB00050015_16k.wav', text='经广州日报报道后成为了社会热点。', 68 | save_to='./local/SSB00050015.TextGrid') 69 | 70 | # A numpy array of the speech signal can also be passed to the model. 71 | import soundfile as sf 72 | y, sr = sf.read('./local/SSB00050015_16k.wav') 73 | charsiu.align(audio=y,text='经广州日报报道后成为了社会热点。') 74 | ``` 75 | 76 | 77 | #### Textless alignment 78 | ```Python 79 | from Charsiu import charsiu_predictive_aligner 80 | # English 81 | # initialize a model 82 | charsiu = charsiu_predictive_aligner(aligner='charsiu/en_w2v2_fc_10ms') 83 | # perform textless alignment 84 | alignment = charsiu.align(audio='./local/SA1.WAV') 85 | # Or 86 | # perform textless alignment and output the results to a textgrid file 87 | charsiu.serve(audio='./local/SA1.WAV', save_to='./local/SA1.TextGrid') 88 | 89 | 90 | # Chinese 91 | charsiu = charsiu_predictive_aligner(aligner='charsiu/zh_xlsr_fc_10ms',lang='zh') 92 | 93 | charsiu.align(audio='./local/SSB16240001_16k.wav') 94 | # Or 95 | charsiu.serve(audio='./local/SSB16240001_16k.wav', save_to='./local/SSB16240001.TextGrid') 96 | ``` 97 | 98 | ### Pretrained models 99 | Pretrained models are available at the 🤗 *HuggingFace* model hub: https://huggingface.co/charsiu. 100 | 101 | 102 | ### Development plan 103 | 104 | - Package 105 | 106 | | Items | Progress | 107 | |:------------------:|:--------:| 108 | | Documentation | Nov 2021 | 109 | | Textgrid support | √ | 110 | | Word Segmentation | √ | 111 | | Model compression | TBD | 112 | | IPA support | TBD | 113 | 114 | - Multilingual support 115 | 116 | | Language | Progress | 117 | |:------------------:|:--------:| 118 | | English (American) | √ | 119 | | Mandarin Chinese | √ | 120 | | German | TBD | 121 | | Spanish | TBD | 122 | | English (British) | TBD | 123 | | Cantonese | TBD | 124 | | AAVE | TBD | 125 | 126 | 127 | 128 | 129 | 130 | ### Dependencies 131 | pytorch 132 | transformers 133 | datasets 134 | librosa 135 | g2p_en 136 | praatio 137 | g2pM 138 | 139 | 140 | ### Training 141 | The training pipeline is coming soon!
142 | 143 | Note: Training code is in `experiments/`. Those are the original research scripts used to train the model; they still need to be reorganized. 144 | 145 | 146 | ### Attribution and Citation 147 | For now, you can cite this tool as: 148 | 149 | ``` 150 | @article{zhu2022charsiu, 151 | title={Phone-to-audio alignment without text: A Semi-supervised Approach}, 152 | author={Zhu, Jian and Zhang, Cong and Jurgens, David}, 153 | journal={IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 154 | year={2022} 155 | } 156 | ``` 157 | Or 158 | 159 | 160 | To share a direct web link: https://github.com/lingjzhu/charsiu/. 161 | 162 | ### References 163 | [Transformers](https://huggingface.co/transformers/) 164 | [s3prl](https://github.com/s3prl/s3prl) 165 | [Montreal Forced Aligner](https://montreal-forced-aligner.readthedocs.io/en/latest/) 166 | 167 | 168 | ### Disclaimer 169 | 170 | This tool is a beta version and is still under active development. It may have bugs and quirks, alongside the difficulties and provisos which are described throughout the documentation. 171 | This tool is distributed under the MIT license. Please see [license](https://github.com/lingjzhu/charsiu/blob/main/LICENSE) for details. 172 | 173 | By using this tool, you acknowledge: 174 | 175 | * That you understand that this tool does not produce perfect camera-ready data, and that all results should be hand-checked for sanity's sake, or at the very least, noise should be taken into account. 176 | 177 | * That you understand that this tool is a work in progress which may contain bugs. Future versions will be released, and bug fixes (and additions) will not necessarily be advertised. 178 | 179 | * That this tool may break with future updates of the various dependencies, and that the authors are not required to repair the package when that happens. 180 | 181 | * That you understand that the authors are not required or necessarily available to fix bugs which are encountered (although you're welcome to submit bug reports to Jian Zhu (lingjzhu@umich.edu), if needed), nor to modify the tool to your needs. 182 | 183 | * That you will acknowledge the authors of the tool if you use, modify, fork, or re-use the code in your future work. 184 | 185 | * That rather than re-distributing this tool to other researchers, you will instead advise them to download the latest version from the website. 186 | 187 | ... and, most importantly: 188 | 189 | * That neither the authors, our collaborators, nor the University of Michigan or any related universities as a whole, are responsible for the results obtained from the proper or improper usage of the tool, and that the tool is provided as-is, as a service to our fellow linguists. 190 | 191 | All that said, thanks for using our tool, and we hope it works wonderfully for you! 192 | 193 | ### Support or Contact 194 | Please contact Jian Zhu ([lingjzhu@umich.edu](lingjzhu@umich.edu)) for technical support. 195 | Contact Cong Zhang ([cong.zhang@ru.nl](cong.zhang@ru.nl)) if you would like to receive more instructions on how to use the package. 196 | 197 | 198 | 199 | --------------------------------------------------------------------------------