├── studio-voice ├── interfaces │ ├── __init__.py │ └── studio_voice │ │ ├── __init__.py │ │ ├── studiovoice_pb2.pyi │ │ ├── studiovoice_pb2.py │ │ └── studiovoice_pb2_grpc.py ├── requirements.txt ├── assets │ ├── studio_voice_16k_input.wav │ └── studio_voice_48k_input.wav ├── protos │ ├── compile_protos.sh │ ├── compile_protos.bat │ └── proto │ │ └── nvidia │ │ └── maxine │ │ └── studiovoice │ │ └── v1 │ │ └── studiovoice.proto ├── README.md └── scripts │ └── studio_voice.py ├── audio2face-2d ├── python │ ├── requirements.txt │ ├── interfaces │ │ ├── __init__.py │ │ ├── audio2face2d_pb2_grpc.py │ │ ├── audio2face2d_pb2.py │ │ └── audio2face2d_pb2.pyi │ └── scripts │ │ └── audio2face-2d.py ├── assets │ ├── sample_audio.wav │ ├── sample_portrait_image.png │ ├── head_translation_animation.csv │ └── head_rotation_animation.csv ├── nodejs │ ├── package.json │ ├── scripts │ │ └── index.html │ └── interfaces │ │ └── audio2face2d_grpc_pb.js ├── protos │ ├── linux │ │ ├── nodejs │ │ │ └── compile_protos.sh │ │ └── python │ │ │ └── compile_protos.sh │ ├── windows │ │ ├── nodejs │ │ │ └── compile_protos.bat │ │ └── python │ │ │ └── compile_protos.bat │ └── proto │ │ └── nvidia │ │ └── maxine │ │ └── audio2face2d │ │ └── v1 │ │ └── audio2face2d.proto └── README.md ├── eye-contact ├── requirements.txt ├── assets │ ├── sample_streamable.mp4 │ └── sample_transactional.mp4 ├── interfaces │ ├── __init__.py │ ├── eyecontact_pb2_grpc.py │ ├── eyecontact_pb2.py │ └── eyecontact_pb2.pyi ├── protos │ ├── windows │ │ └── compile_protos.bat │ ├── linux │ │ └── compile_protos.sh │ └── proto │ │ └── nvidia │ │ └── maxine │ │ └── eyecontact │ │ └── v1 │ │ └── eyecontact.proto ├── scripts │ ├── constants.py │ └── eye-contact.py └── README.md ├── bnr ├── assets │ ├── bnr_16k_input.wav │ └── bnr_48k_input.wav ├── requirements.txt ├── interfaces │ ├── __init__.py │ └── bnr │ │ ├── __init__.py │ │ ├── bnr_pb2.py │ │ └── bnr_pb2_grpc.py ├── protos │ ├── compile_protos.sh │ ├── compile_protos.bat │ └── proto │ │ └── nvidia │ │ └── maxine │ │ └── bnr │ │ └── v1 │ │ └── bnr.proto ├── README.md └── scripts │ └── bnr.py ├── LICENSE.md ├── README.md ├── SECURITY.md ├── .gitignore └── utils └── utils.py /studio-voice/interfaces/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import studio_voice -------------------------------------------------------------------------------- /audio2face-2d/python/requirements.txt: -------------------------------------------------------------------------------- 1 | grpcio==1.67.1 2 | grpcio-tools==1.67.1 3 | -------------------------------------------------------------------------------- /eye-contact/requirements.txt: -------------------------------------------------------------------------------- 1 | grpcio==1.67.1 2 | grpcio-tools==1.67.1 3 | tqdm==4.67.1 4 | 5 | -------------------------------------------------------------------------------- /bnr/assets/bnr_16k_input.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-Maxine/nim-clients/HEAD/bnr/assets/bnr_16k_input.wav -------------------------------------------------------------------------------- /bnr/assets/bnr_48k_input.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-Maxine/nim-clients/HEAD/bnr/assets/bnr_48k_input.wav -------------------------------------------------------------------------------- /studio-voice/requirements.txt: -------------------------------------------------------------------------------- 1 | grpcio==1.67.1 2 | grpcio-tools==1.67.1 3 | soundfile==0.12.1 4 | numpy==1.26.4 5 | -------------------------------------------------------------------------------- /bnr/requirements.txt: -------------------------------------------------------------------------------- 1 | grpcio==1.67.1 2 | grpcio-tools==1.67.1 3 | soundfile==0.12.1 4 | numpy==1.26.4 5 | tqdm==4.67.1 6 | -------------------------------------------------------------------------------- /audio2face-2d/assets/sample_audio.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-Maxine/nim-clients/HEAD/audio2face-2d/assets/sample_audio.wav -------------------------------------------------------------------------------- /eye-contact/assets/sample_streamable.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-Maxine/nim-clients/HEAD/eye-contact/assets/sample_streamable.mp4 -------------------------------------------------------------------------------- /eye-contact/assets/sample_transactional.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-Maxine/nim-clients/HEAD/eye-contact/assets/sample_transactional.mp4 -------------------------------------------------------------------------------- /audio2face-2d/assets/sample_portrait_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-Maxine/nim-clients/HEAD/audio2face-2d/assets/sample_portrait_image.png -------------------------------------------------------------------------------- /studio-voice/assets/studio_voice_16k_input.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-Maxine/nim-clients/HEAD/studio-voice/assets/studio_voice_16k_input.wav -------------------------------------------------------------------------------- /studio-voice/assets/studio_voice_48k_input.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA-Maxine/nim-clients/HEAD/studio-voice/assets/studio_voice_48k_input.wav -------------------------------------------------------------------------------- /audio2face-2d/python/interfaces/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from . import audio2face2d_pb2 4 | pwd = os.path.join(os.path.dirname(__file__)) 5 | sys.path.insert(0, pwd) 6 | from . import audio2face2d_pb2_grpc 7 | sys.path.remove(pwd) 8 | -------------------------------------------------------------------------------- /studio-voice/interfaces/studio_voice/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from . import studiovoice_pb2 4 | pwd = os.path.join(os.path.dirname(__file__)) 5 | sys.path.insert(0, pwd) 6 | from . import studiovoice_pb2_grpc 7 | sys.path.remove(pwd) -------------------------------------------------------------------------------- /audio2face-2d/nodejs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "@grpc/grpc-js": "^1.11.3", 4 | "commander": "^12.1.0", 5 | "csv-parse": "^5.5.5", 6 | "google-protobuf": "^3.21.4", 7 | "memorystream": "^0.3.1", 8 | "wav": "^1.0.2" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /studio-voice/interfaces/studio_voice/studiovoice_pb2.pyi: -------------------------------------------------------------------------------- 1 | from google.protobuf import descriptor as _descriptor 2 | from google.protobuf import message as _message 3 | from typing import ClassVar as _ClassVar, Optional as _Optional 4 | 5 | DESCRIPTOR: _descriptor.FileDescriptor 6 | 7 | class EnhanceAudioRequest(_message.Message): 8 | __slots__ = ("audio_stream_data",) 9 | AUDIO_STREAM_DATA_FIELD_NUMBER: _ClassVar[int] 10 | audio_stream_data: bytes 11 | def __init__(self, audio_stream_data: _Optional[bytes] = ...) -> None: ... 12 | 13 | class EnhanceAudioResponse(_message.Message): 14 | __slots__ = ("audio_stream_data",) 15 | AUDIO_STREAM_DATA_FIELD_NUMBER: _ClassVar[int] 16 | audio_stream_data: bytes 17 | def __init__(self, audio_stream_data: _Optional[bytes] = ...) -> None: ... 18 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /bnr/interfaces/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a 4 | # copy of this software and associated documentation files (the "Software"), 5 | # to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | # and/or sell copies of the Software, and to permit persons to whom the 8 | # Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 20 | from . import bnr -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA Maxine NIM 2 | 3 | NVIDIA Maxine is a suite of high-performance, easy-to-use, NVIDIA Inference Microservices (NIMs) for deploying AI features that enhance audio, video, and augmented reality effects for video conferencing and tele-presence. 4 | 5 | 6 | ## NVIDIA Maxine NIM Clients 7 | 8 | This repository provides sample client applications to interact with Maxine NIMs 9 | 10 | - [`eye-contact`](eye-contact) - NVIDIA Maxine Eye Contact feature estimates the gaze angles of a person in a video and redirects the gaze in the output video to make it frontal. 11 | [[Demo](https://build.nvidia.com/nvidia/eyecontact)] , [[Docs](https://docs.nvidia.com/nim/maxine/eye-contact/latest/index.html)] 12 | 13 | - [`studio-voice`](studio-voice) - NVIDIA Maxine Studio Voice feature enhances the input speech recorded through low quality microphones in noisy and reverberant environments to studio-recorded quality speech. 14 | [[Demo](https://build.nvidia.com/nvidia/studiovoice)] , [[Docs](https://docs.nvidia.com/nim/maxine/studio-voice/latest/index.html)] 15 | 16 | - [`audio2face-2d`](audio2face-2d) - NVIDIA Maxine Audio2Face-2D feature generates facial animations from a portrait photo and audio input, synchronizing mouth movements with speech to create realistic and engaging video outputs. 
17 | [[Demo](https://build.nvidia.com/nvidia/audio2face-2d)] , [[Docs](https://docs.nvidia.com/nim/maxine/audio2face-2d/latest/index.html)] -------------------------------------------------------------------------------- /bnr/interfaces/bnr/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a 4 | # copy of this software and associated documentation files (the "Software"), 5 | # to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | # and/or sell copies of the Software, and to permit persons to whom the 8 | # Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 20 | import os 21 | import sys 22 | from . import bnr_pb2 23 | pwd = os.path.join(os.path.dirname(__file__)) 24 | sys.path.insert(0, pwd) 25 | from . import bnr_pb2_grpc 26 | sys.path.remove(pwd) -------------------------------------------------------------------------------- /eye-contact/interfaces/__init__.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | 23 | import os 24 | import sys 25 | from . import eyecontact_pb2 26 | 27 | pwd = os.path.join(os.path.dirname(__file__)) 28 | sys.path.insert(0, pwd) 29 | from . 
import eyecontact_pb2_grpc 30 | 31 | sys.path.remove(pwd) 32 | -------------------------------------------------------------------------------- /audio2face-2d/nodejs/scripts/index.html: -------------------------------------------------------------------------------- [HTML markup stripped during extraction; only the page title and body heading "Maxine Audio2Face 2D" are recoverable]
28 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | ## Security 3 | 4 | NVIDIA is dedicated to the security and trust of our software products and services, including all source code repositories managed through our organization. 5 | 6 | If you need to report a security issue, please use the appropriate contact points outlined below. **Please do not report security vulnerabilities through GitHub.** 7 | 8 | ## Reporting Potential Security Vulnerability in an NVIDIA Product 9 | 10 | To report a potential security vulnerability in any NVIDIA product: 11 | 12 | - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) 13 | 14 | - E-Mail: psirt@nvidia.com 15 | - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) 16 | - Please include the following information: 17 | - Product/Driver name and version/branch that contains the vulnerability 18 | - Type of vulnerability (code execution, denial of service, buffer overflow, etc.) 19 | - Instructions to reproduce the vulnerability 20 | - Proof-of-concept or exploit code 21 | - Potential impact of the vulnerability, including how an attacker could exploit the vulnerability 22 | 23 | While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information. 24 | 25 | ## NVIDIA Product Security 26 | 27 | For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security 28 | -------------------------------------------------------------------------------- /bnr/protos/compile_protos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 
22 | 23 | 24 | # This script compiles Protocol Buffer (protobuf) definitions for NVIDIA 25 | # Maxine BNR on a Linux Client. 26 | # 27 | # Execute the script using `./compile_protos.sh` 28 | # 29 | # For more details, refer to README.md 30 | 31 | SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" 32 | PROTOS_DIR=$SCRIPT_DIR/proto/nvidia/maxine/bnr/v1 33 | OUT_DIR=$SCRIPT_DIR/../interfaces/bnr 34 | 35 | python3 -m grpc_tools.protoc -I=$PROTOS_DIR \ 36 | --python_out=$OUT_DIR \ 37 | --grpc_python_out=$OUT_DIR \ 38 | $PROTOS_DIR/bnr.proto 39 | -------------------------------------------------------------------------------- /studio-voice/protos/compile_protos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | 23 | 24 | # This script compiles Protocol Buffer (protobuf) definitions for NVIDIA 25 | # Maxine Studio Voice on a Linux Client. 26 | # 27 | # Execute the script using `./compile_protos.sh` 28 | # 29 | # For more details, refer to README.md 30 | 31 | SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" 32 | PROTOS_DIR=$SCRIPT_DIR/proto/nvidia/maxine/studiovoice/v1 33 | OUT_DIR=$SCRIPT_DIR/../interfaces/studio_voice 34 | 35 | python3 -m grpc_tools.protoc -I=$PROTOS_DIR \ 36 | --python_out=$OUT_DIR \ 37 | --pyi_out=$OUT_DIR \ 38 | --grpc_python_out=$OUT_DIR \ 39 | $PROTOS_DIR/studiovoice.proto 40 | -------------------------------------------------------------------------------- /audio2face-2d/protos/linux/nodejs/compile_protos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software.
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | 23 | 24 | # This script compiles Protocol Buffer (protobuf) definitions for NVIDIA Maxine Audio2Face-2D NIM on a Linux Client. 25 | # 26 | # Execute the script using `./compile_protos.sh` 27 | # 28 | # For more details, refer to README.md 29 | 30 | ROOT_DIR="$(dirname "$(readlink -f "$0")")" 31 | PROTOS_DIR=$ROOT_DIR/../../proto/nvidia/maxine/audio2face2d/v1 32 | OUT_DIR=$ROOT_DIR/../../../nodejs/interfaces 33 | 34 | # Install grpc-tools 35 | npm install -g grpc-tools 36 | 37 | # Generate the interface files 38 | grpc_tools_node_protoc --js_out=import_style=commonjs,binary:$OUT_DIR $PROTOS_DIR/audio2face2d.proto --proto_path=$PROTOS_DIR --grpc_out=grpc_js:$OUT_DIR --plugin=protoc-gen-grpc=`which grpc_tools_node_protoc_plugin` 39 | -------------------------------------------------------------------------------- /bnr/protos/compile_protos.bat: -------------------------------------------------------------------------------- 1 | :: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | :: 3 | :: Permission is hereby granted, free of charge, to any person obtaining a 4 | :: copy of this software and associated documentation files (the "Software"), 5 | :: to deal in the Software without restriction, including without limitation 6 | :: the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | :: and/or sell copies of the Software, and to permit persons to whom the 8 | :: Software is furnished to do so, subject to the following conditions: 9 | :: 10 | :: The above copyright notice and this permission notice shall be included in 11 | :: all copies or substantial portions of the Software. 12 | :: 13 | :: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | :: IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | :: FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | :: THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | :: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | :: FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | :: DEALINGS IN THE SOFTWARE. 20 | 21 | 22 | :: This script compiles Protocol Buffer (protobuf) definitions for NVIDIA 23 | :: Maxine BNR on a Windows Client. 24 | :: 25 | :: Execute the script using `compile_protos.bat`.
26 | :: 27 | :: For more details, refer to README.md 28 | 29 | @echo off 30 | setlocal 31 | 32 | :: Define the script directory and other variables 33 | set "SCRIPT_DIR=%~dp0" 34 | set "PROTOS_DIR=%SCRIPT_DIR%proto\nvidia\maxine\bnr\v1" 35 | set "OUT_DIR=%SCRIPT_DIR%..\interfaces\bnr" 36 | 37 | :: Run the grpc_tools.protoc command with the necessary parameters 38 | python -m grpc_tools.protoc -I=%PROTOS_DIR% ^ 39 | --python_out=%OUT_DIR% ^ 40 | --grpc_python_out=%OUT_DIR% ^ 41 | %PROTOS_DIR%\bnr.proto 42 | 43 | endlocal 44 | 45 | -------------------------------------------------------------------------------- /studio-voice/interfaces/studio_voice/studiovoice_pb2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by the protocol buffer compiler. DO NOT EDIT! 3 | # NO CHECKED-IN PROTOBUF GENCODE 4 | # source: studiovoice.proto 5 | # Protobuf Python Version: 5.27.2 6 | """Generated protocol buffer code.""" 7 | from google.protobuf import descriptor as _descriptor 8 | from google.protobuf import descriptor_pool as _descriptor_pool 9 | from google.protobuf import runtime_version as _runtime_version 10 | from google.protobuf import symbol_database as _symbol_database 11 | from google.protobuf.internal import builder as _builder 12 | _runtime_version.ValidateProtobufRuntimeVersion( 13 | _runtime_version.Domain.PUBLIC, 14 | 5, 15 | 27, 16 | 2, 17 | '', 18 | 'studiovoice.proto' 19 | ) 20 | # @@protoc_insertion_point(imports) 21 | 22 | _sym_db = _symbol_database.Default() 23 | 24 | 25 | 26 | 27 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x11studiovoice.proto\x12\x1cnvidia.maxine.studiovoice.v1\"B\n\x13\x45nhanceAudioRequest\x12\x1b\n\x11\x61udio_stream_data\x18\x01 \x01(\x0cH\x00\x42\x0e\n\x0cstream_input\"D\n\x14\x45nhanceAudioResponse\x12\x1b\n\x11\x61udio_stream_data\x18\x01 \x01(\x0cH\x00\x42\x0f\n\rstream_output2\x90\x01\n\x11MaxineStudioVoice\x12{\n\x0c\x45nhanceAudio\x12\x31.nvidia.maxine.studiovoice.v1.EnhanceAudioRequest\x1a\x32.nvidia.maxine.studiovoice.v1.EnhanceAudioResponse\"\x00(\x01\x30\x01\x62\x06proto3') 28 | 29 | _globals = globals() 30 | _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) 31 | _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'studiovoice_pb2', _globals) 32 | if not _descriptor._USE_C_DESCRIPTORS: 33 | DESCRIPTOR._loaded_options = None 34 | _globals['_ENHANCEAUDIOREQUEST']._serialized_start=51 35 | _globals['_ENHANCEAUDIOREQUEST']._serialized_end=117 36 | _globals['_ENHANCEAUDIORESPONSE']._serialized_start=119 37 | _globals['_ENHANCEAUDIORESPONSE']._serialized_end=187 38 | _globals['_MAXINESTUDIOVOICE']._serialized_start=190 39 | _globals['_MAXINESTUDIOVOICE']._serialized_end=334 40 | # @@protoc_insertion_point(module_scope) 41 | -------------------------------------------------------------------------------- /studio-voice/protos/compile_protos.bat: -------------------------------------------------------------------------------- 1 | :: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | :: 3 | :: Permission is hereby granted, free of charge, to any person obtaining a 4 | :: copy of this software and associated documentation files (the "Software"), 5 | :: to deal in the Software without restriction, including without limitation 6 | :: the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | :: and/or sell copies of the Software, and to permit persons to whom the 8 | :: Software is furnished to do so, subject to the following conditions: 9 | :: 10 | :: The above copyright notice and this permission notice shall be included in 11 | :: all copies or substantial portions of the Software. 12 | :: 13 | :: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | :: IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | :: FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | :: THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | :: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | :: FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | :: DEALINGS IN THE SOFTWARE. 20 | 21 | 22 | :: This script compiles Protocol Buffer (protobuf) definitions for NVIDIA 23 | :: Maxine Studio Voice on a Windows Client. 24 | :: 25 | :: Execute the script using `compile_protos.bat`. 26 | :: 27 | :: For more details, refer to README.md 28 | 29 | @echo off 30 | setlocal 31 | 32 | :: Define the script directory and other variables 33 | set "SCRIPT_DIR=%~dp0" 34 | set "PROTOS_DIR=%SCRIPT_DIR%proto\nvidia\maxine\studiovoice\v1" 35 | set "OUT_DIR=%SCRIPT_DIR%..\interfaces\studio_voice" 36 | 37 | :: Run the grpc_tools.protoc command with the necessary parameters 38 | python -m grpc_tools.protoc -I=%PROTOS_DIR% ^ 39 | --python_out=%OUT_DIR% ^ 40 | --pyi_out=%OUT_DIR% ^ 41 | --grpc_python_out=%OUT_DIR% ^ 42 | %PROTOS_DIR%\studiovoice.proto 43 | 44 | endlocal 45 | 46 | -------------------------------------------------------------------------------- /bnr/interfaces/bnr/bnr_pb2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by the protocol buffer compiler. DO NOT EDIT! 
3 | # NO CHECKED-IN PROTOBUF GENCODE 4 | # source: bnr.proto 5 | # Protobuf Python Version: 5.27.2 6 | """Generated protocol buffer code.""" 7 | from google.protobuf import descriptor as _descriptor 8 | from google.protobuf import descriptor_pool as _descriptor_pool 9 | from google.protobuf import runtime_version as _runtime_version 10 | from google.protobuf import symbol_database as _symbol_database 11 | from google.protobuf.internal import builder as _builder 12 | _runtime_version.ValidateProtobufRuntimeVersion( 13 | _runtime_version.Domain.PUBLIC, 14 | 5, 15 | 27, 16 | 2, 17 | '', 18 | 'bnr.proto' 19 | ) 20 | # @@protoc_insertion_point(imports) 21 | 22 | _sym_db = _symbol_database.Default() 23 | 24 | 25 | 26 | 27 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\tbnr.proto\x12\x14nvidia.maxine.bnr.v1\"F\n\x12\x45nhanceAudioConfig\x12\x1c\n\x0fintensity_ratio\x18\x01 \x01(\x02H\x00\x88\x01\x01\x42\x12\n\x10_intensity_ratio\"~\n\x13\x45nhanceAudioRequest\x12\x1b\n\x11\x61udio_stream_data\x18\x01 \x01(\x0cH\x00\x12:\n\x06\x63onfig\x18\x02 \x01(\x0b\x32(.nvidia.maxine.bnr.v1.EnhanceAudioConfigH\x00\x42\x0e\n\x0cstream_input\"\x80\x01\n\x14\x45nhanceAudioResponse\x12\x1b\n\x11\x61udio_stream_data\x18\x01 \x01(\x0cH\x00\x12:\n\x06\x63onfig\x18\x02 \x01(\x0b\x32(.nvidia.maxine.bnr.v1.EnhanceAudioConfigH\x00\x42\x0f\n\rstream_output2x\n\tMaxineBNR\x12k\n\x0c\x45nhanceAudio\x12).nvidia.maxine.bnr.v1.EnhanceAudioRequest\x1a*.nvidia.maxine.bnr.v1.EnhanceAudioResponse\"\x00(\x01\x30\x01\x62\x06proto3') 28 | 29 | _globals = globals() 30 | _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) 31 | _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'bnr_pb2', _globals) 32 | if not _descriptor._USE_C_DESCRIPTORS: 33 | DESCRIPTOR._loaded_options = None 34 | _globals['_ENHANCEAUDIOCONFIG']._serialized_start=35 35 | _globals['_ENHANCEAUDIOCONFIG']._serialized_end=105 36 | _globals['_ENHANCEAUDIOREQUEST']._serialized_start=107 37 | _globals['_ENHANCEAUDIOREQUEST']._serialized_end=233 38 | _globals['_ENHANCEAUDIORESPONSE']._serialized_start=236 39 | _globals['_ENHANCEAUDIORESPONSE']._serialized_end=364 40 | _globals['_MAXINEBNR']._serialized_start=366 41 | _globals['_MAXINEBNR']._serialized_end=486 42 | # @@protoc_insertion_point(module_scope) 43 | -------------------------------------------------------------------------------- /studio-voice/protos/proto/nvidia/maxine/studiovoice/v1/studiovoice.proto: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a 4 | // copy of this software and associated documentation files (the "Software"), 5 | // to deal in the Software without restriction, including without limitation 6 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | // and/or sell copies of the Software, and to permit persons to whom the 8 | // Software is furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 16 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | // DEALINGS IN THE SOFTWARE. 20 | 21 | syntax = "proto3"; 22 | 23 | package nvidia.maxine.studiovoice.v1; 24 | 25 | // The MaxineStudioVoice service provides APIs to run the 26 | // Maxine Studio Voice NIM. 27 | service MaxineStudioVoice { 28 | // EnhanceAudio is a bidirectional streaming RPC to run the 29 | // Maxine Studio Voice NIM on audio files. 30 | // 31 | // The client streams the input audio file in chunks in the input message and 32 | // receives the output audio file in chunks in the output message. 33 | // 34 | // The client should only pass one audio file per RPC invocation. 35 | rpc EnhanceAudio(stream EnhanceAudioRequest) returns (stream EnhanceAudioResponse) {} 36 | } 37 | 38 | // Input message for EnhanceAudio RPC. 39 | // Contains a chunk of input audio file data. 40 | message EnhanceAudioRequest { 41 | oneof stream_input { 42 | bytes audio_stream_data = 1; 43 | } 44 | } 45 | 46 | // Output message for EnhanceAudio RPC. 47 | // Contains a chunk of output audio file data. 48 | message EnhanceAudioResponse { 49 | oneof stream_output { 50 | bytes audio_stream_data = 1; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /eye-contact/protos/windows/compile_protos.bat: -------------------------------------------------------------------------------- 1 | :: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | :: 3 | :: Permission is hereby granted, free of charge, to any person obtaining a 4 | :: copy of this software and associated documentation files (the "Software"), 5 | :: to deal in the Software without restriction, including without limitation 6 | :: the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | :: and/or sell copies of the Software, and to permit persons to whom the 8 | :: Software is furnished to do so, subject to the following conditions: 9 | :: 10 | :: The above copyright notice and this permission notice shall be included in 11 | :: all copies or substantial portions of the Software. 12 | :: 13 | :: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | :: IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | :: FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | :: THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | :: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | :: FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | :: DEALINGS IN THE SOFTWARE. 20 | 21 | 22 | :: This script compiles Protocol Buffer (protobuf) definitions for NVIDIA Maxine Eye-Contact NIM on a Windows Client. 23 | :: 24 | :: Execute the script using `compile_protos.bat`. 25 | :: 26 | :: For more details, refer to README.txt. 
27 | 28 | 29 | @echo off 30 | setlocal 31 | 32 | :: Define the script directory 33 | set "SCRIPT_DIR=%~dp0" 34 | 35 | :: Define the protobufs and output directories 36 | set "PROTOS_DIR=%SCRIPT_DIR%..\proto\nvidia\maxine\eyecontact\v1" 37 | set "OUT_DIR=%SCRIPT_DIR%..\..\interfaces\" 38 | 39 | :: Log the paths for debugging 40 | echo "Using PROTOS_DIR: %PROTOS_DIR%" 41 | echo "Using OUT_DIR: %OUT_DIR%" 42 | 43 | :: Check if Python is installed 44 | where python >nul 2>&1 45 | if errorlevel 1 ( 46 | echo [Error] Python is not installed or not in the PATH. 47 | exit /b 1 48 | ) 49 | 50 | :: Run grpc_tools.protoc to generate Python gRPC code 51 | python -m grpc_tools.protoc -I=%PROTOS_DIR% ^ 52 | --python_out=%OUT_DIR% ^ 53 | --pyi_out=%OUT_DIR% ^ 54 | --grpc_python_out=%OUT_DIR% ^ 55 | %PROTOS_DIR%\eyecontact.proto 56 | if errorlevel 1 ( 57 | echo [Error] Failed to execute grpc_tools.protoc. Please check the paths and dependencies. 58 | exit /b 1 59 | ) 60 | 61 | echo "gRPC files generated successfully." 62 | endlocal 63 | 64 | -------------------------------------------------------------------------------- /audio2face-2d/protos/windows/nodejs/compile_protos.bat: -------------------------------------------------------------------------------- 1 | :: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | :: 3 | :: Permission is hereby granted, free of charge, to any person obtaining a 4 | :: copy of this software and associated documentation files (the "Software"), 5 | :: to deal in the Software without restriction, including without limitation 6 | :: the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | :: and/or sell copies of the Software, and to permit persons to whom the 8 | :: Software is furnished to do so, subject to the following conditions: 9 | :: 10 | :: The above copyright notice and this permission notice shall be included in 11 | :: all copies or substantial portions of the Software. 12 | :: 13 | :: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | :: IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | :: FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | :: THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | :: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | :: FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | :: DEALINGS IN THE SOFTWARE. 20 | 21 | 22 | :: This script compiles Protocol Buffer (protobuf) definitions for NVIDIA Maxine Audio2Face-2D NIM on a Windows Client. 23 | :: 24 | :: Execute the script using `compile_protos.bat`. 25 | :: 26 | :: For more details, refer to README.txt. 
27 | 28 | 29 | @echo off 30 | setlocal 31 | 32 | set "SCRIPT_DIR=%~dp0" 33 | set "PROTOS_DIR=%SCRIPT_DIR%../../proto/nvidia/maxine/audio2face2d/v1" 34 | set "OUT_DIR=%SCRIPT_DIR%../../../nodejs/interfaces" 35 | 36 | :: Install grpc-tools 37 | call npm install -g grpc-tools 38 | 39 | if %errorlevel% neq 0 ( 40 | echo grpc-tools installation failed 41 | exit /b %errorlevel% 42 | ) 43 | 44 | :: Check if running in PowerShell 45 | call powershell -Command "exit $PSVersionTable.PSVersion.Major -ne $null" 46 | if %errorlevel% equ 0 ( 47 | :: Running in PowerShell 48 | powershell -Command "for /f 'delims=' %%i in ('Get-Command grpc_tools_node_protoc_plugin.cmd ^| Select-Object -ExpandProperty Source') do set GRPC_PLUGIN_PATH=%%i" 49 | ) else ( 50 | :: Running in Command Prompt 51 | for /f "delims=" %%i in ('where grpc_tools_node_protoc_plugin.cmd') do set GRPC_PLUGIN_PATH=%%i 52 | ) 53 | 54 | :: Generate the interface files 55 | call grpc_tools_node_protoc --js_out=import_style=commonjs:%OUT_DIR% %PROTOS_DIR%/audio2face2d.proto --proto_path=%PROTOS_DIR% --grpc_out=grpc_js:%OUT_DIR% --plugin=protoc-gen-grpc=%GRPC_PLUGIN_PATH% 56 | endlocal -------------------------------------------------------------------------------- /audio2face-2d/protos/windows/python/compile_protos.bat: -------------------------------------------------------------------------------- 1 | :: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | :: 3 | :: Permission is hereby granted, free of charge, to any person obtaining a 4 | :: copy of this software and associated documentation files (the "Software"), 5 | :: to deal in the Software without restriction, including without limitation 6 | :: the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | :: and/or sell copies of the Software, and to permit persons to whom the 8 | :: Software is furnished to do so, subject to the following conditions: 9 | :: 10 | :: The above copyright notice and this permission notice shall be included in 11 | :: all copies or substantial portions of the Software. 12 | :: 13 | :: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | :: IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | :: FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | :: THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | :: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | :: FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | :: DEALINGS IN THE SOFTWARE. 20 | 21 | 22 | :: This script compiles Protocol Buffer (protobuf) definitions for NVIDIA Maxine Audio2Face-2D NIM on a Windows Client. 23 | :: 24 | :: Execute the script using `compile_protos.bat`. 25 | :: 26 | :: For more details, refer to README.txt. 27 | 28 | 29 | @echo off 30 | setlocal 31 | 32 | :: Define the script directory 33 | set "SCRIPT_DIR=%~dp0" 34 | 35 | :: Define the protobufs and output directories 36 | set "PROTOS_DIR=%SCRIPT_DIR%..\..\proto\nvidia\maxine\audio2face2d\v1" 37 | set "OUT_DIR=%SCRIPT_DIR%..\..\..\python\interfaces\" 38 | 39 | :: Log the paths for debugging 40 | echo "Using PROTOS_DIR: %PROTOS_DIR%" 41 | echo "Using OUT_DIR: %OUT_DIR%" 42 | 43 | :: Check if Python is installed 44 | where python >nul 2>&1 45 | if errorlevel 1 ( 46 | echo [Error] Python is not installed or not in the PATH. 
47 | exit /b 1 48 | ) 49 | 50 | :: Run grpc_tools.protoc to generate Python gRPC code 51 | python -m grpc_tools.protoc -I=%PROTOS_DIR% ^ 52 | --python_out=%OUT_DIR% ^ 53 | --pyi_out=%OUT_DIR% ^ 54 | --grpc_python_out=%OUT_DIR% ^ 55 | %PROTOS_DIR%\audio2face2d.proto 56 | if errorlevel 1 ( 57 | echo [Error] Failed to execute grpc_tools.protoc. Please check the paths and dependencies. 58 | exit /b 1 59 | ) 60 | 61 | echo "gRPC files generated successfully." 62 | endlocal 63 | 64 | -------------------------------------------------------------------------------- /bnr/protos/proto/nvidia/maxine/bnr/v1/bnr.proto: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a 4 | // copy of this software and associated documentation files (the "Software"), 5 | // to deal in the Software without restriction, including without limitation 6 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | // and/or sell copies of the Software, and to permit persons to whom the 8 | // Software is furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | // DEALINGS IN THE SOFTWARE. 20 | 21 | syntax = "proto3"; 22 | 23 | package nvidia.maxine.bnr.v1; 24 | 25 | // The MaxineBNR service provides APIs to run the 26 | // Maxine BNR NIM. 27 | service MaxineBNR { 28 | // EnhanceAudio is a bidirectional streaming RPC to run the 29 | // Maxine BNR NIM on audio files. 30 | // 31 | // The client streams the input audio file in chunks in the input message and 32 | // receives the output audio file in chunks in the output message. 33 | // 34 | // The client should only pass one audio file per RPC invocation. 35 | rpc EnhanceAudio(stream EnhanceAudioRequest) returns (stream EnhanceAudioResponse) {} 36 | } 37 | 38 | // Configuration for EnhanceAudio API. 39 | message EnhanceAudioConfig { 40 | // Intensity ratio between 0.0f to 1.0f. 41 | // Default: 1.0 42 | optional float intensity_ratio = 1; 43 | } 44 | 45 | // Input message for EnhanceAudio RPC. 46 | message EnhanceAudioRequest { 47 | oneof stream_input { 48 | // Contains a chunk of input audio file data. 49 | // 32 bit float audio samples 50 | bytes audio_stream_data = 1; 51 | 52 | // Configuration parameters for the request 53 | EnhanceAudioConfig config = 2; 54 | } 55 | } 56 | 57 | // Output message for EnhanceAudio RPC. 58 | message EnhanceAudioResponse { 59 | oneof stream_output { 60 | // Contains a chunk of output audio file data. 
61 | // 32 bit float audio samples 62 | bytes audio_stream_data = 1; 63 | 64 | // Configuration parameters used 65 | EnhanceAudioConfig config = 2; 66 | } 67 | } -------------------------------------------------------------------------------- /eye-contact/scripts/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a 4 | # copy of this software and associated documentation files (the "Software"), 5 | # to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | # and/or sell copies of the Software, and to permit persons to whom the 8 | # Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 20 | 21 | # Constants for data handling 22 | DATA_CHUNK_SIZE = 64 * 1024 # bytes, we send the mp4 file in 64KB chunks 23 | DEFAULT_BITRATE = 3000000 # bps 24 | DEFAULT_IDR_INTERVAL = 8 # frames 25 | DEFAULT_STREAMABLE_VIDEO_PATH = "../assets/sample_streamable.mp4" 26 | DEFAULT_NON_STREAMABLE_VIDEO_PATH = "../assets/sample_transactional.mp4" 27 | 28 | # Default values from eyecontact.proto 29 | DEFAULT_TEMPORAL = 0xFFFFFFFF 30 | DEFAULT_DETECT_CLOSURE = 0 31 | DEFAULT_EYE_SIZE_SENSITIVITY = 3 32 | DEFAULT_ENABLE_LOOKAWAY = 0 33 | DEFAULT_LOOKAWAY_MAX_OFFSET = 5 34 | DEFAULT_LOOKAWAY_INTERVAL_MIN = 3 35 | DEFAULT_LOOKAWAY_INTERVAL_RANGE = 8 36 | DEFAULT_GAZE_PITCH_THRESHOLD_LOW = 25.0 37 | DEFAULT_GAZE_PITCH_THRESHOLD_HIGH = 30.0 38 | DEFAULT_GAZE_YAW_THRESHOLD_LOW = 20.0 39 | DEFAULT_GAZE_YAW_THRESHOLD_HIGH = 30.0 40 | DEFAULT_HEAD_PITCH_THRESHOLD_LOW = 20.0 41 | DEFAULT_HEAD_PITCH_THRESHOLD_HIGH = 25.0 42 | DEFAULT_HEAD_YAW_THRESHOLD_LOW = 25.0 43 | DEFAULT_HEAD_YAW_THRESHOLD_HIGH = 30.0 44 | 45 | # Parameter validation ranges 46 | PARAM_RANGES = { 47 | "temporal": (0, 0xFFFFFFFF), 48 | "detect_closure": (0, 1), 49 | "eye_size_sensitivity": (2, 6), 50 | "enable_lookaway": (0, 1), 51 | "lookaway_max_offset": (1, 10), 52 | "lookaway_interval_min": (1, 600), 53 | "lookaway_interval_range": (1, 600), 54 | "gaze_pitch_threshold_low": (10.0, 35.0), 55 | "gaze_pitch_threshold_high": (10.0, 35.0), 56 | "gaze_yaw_threshold_low": (10.0, 35.0), 57 | "gaze_yaw_threshold_high": (10.0, 35.0), 58 | "head_pitch_threshold_low": (10.0, 35.0), 59 | "head_pitch_threshold_high": (10.0, 35.0), 60 | "head_yaw_threshold_low": (10.0, 35.0), 61 | "head_yaw_threshold_high": (10.0, 35.0), 62 | } 63 | -------------------------------------------------------------------------------- /eye-contact/protos/linux/compile_protos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2025 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | 23 | 24 | # This script compiles Protocol Buffer (protobuf) definitions for NVIDIA Maxine Eye-Contact NIM on a Linux Client. 25 | # 26 | # Execute the script using `./compile_protos.sh` 27 | # 28 | # For more details, refer to README.md 29 | 30 | 31 | # Get the script directory's parent directory 32 | SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" 33 | echo $SCRIPT_DIR 34 | # Define paths for proto files and output directory 35 | PROTOS_DIR=$(realpath "$SCRIPT_DIR/../proto/nvidia/maxine/eyecontact/v1") 36 | OUT_DIR=$(realpath "$SCRIPT_DIR/../../interfaces/") 37 | 38 | # Check if required directories and files exist 39 | if [ ! -d "$PROTOS_DIR" ]; then 40 | echo "[Error] Protos directory does not exist: $PROTOS_DIR" 41 | exit 1 42 | fi 43 | 44 | if [ ! -f "$PROTOS_DIR/eyecontact.proto" ]; then 45 | echo "[Error] Protobuf file not found: $PROTOS_DIR/eyecontact.proto" 46 | exit 1 47 | fi 48 | 49 | # Check if Python is installed 50 | if ! command -v python3 > /dev/null; then 51 | echo "[Error] Python3 is not installed or not in the PATH." 52 | exit 1 53 | fi 54 | 55 | # Log the paths for debugging 56 | echo "Using PROTOS_DIR: $PROTOS_DIR" 57 | echo "Using OUT_DIR: $OUT_DIR" 58 | 59 | # Run grpc_tools.protoc 60 | python3 -m grpc_tools.protoc -I="$PROTOS_DIR" \ 61 | --python_out="$OUT_DIR" \ 62 | --pyi_out="$OUT_DIR" \ 63 | --grpc_python_out="$OUT_DIR" \ 64 | "$PROTOS_DIR/eyecontact.proto" 65 | 66 | # Check if the command succeeded 67 | if [ $? -ne 0 ]; then 68 | echo "[Error] Failed to execute grpc_tools.protoc." 69 | exit 1 70 | fi 71 | 72 | echo "gRPC files generated successfully." 73 | 74 | -------------------------------------------------------------------------------- /audio2face-2d/protos/linux/python/compile_protos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | 23 | 24 | # This script compiles Protocol Buffer (protobuf) definitions for NVIDIA Maxine Audio2Face-2D NIM on a Linux Client. 25 | # 26 | # Execute the script using `./compile_protos.sh` 27 | # 28 | # For more details, refer to README.md 29 | 30 | 31 | # Get the script directory's parent directory 32 | SCRIPT_DIR="$(dirname "$(dirname "$(readlink -f "$0")")")" 33 | 34 | # Define paths for proto files and output directory 35 | PROTOS_DIR=$(realpath "$SCRIPT_DIR/../proto/nvidia/maxine/audio2face2d/v1") 36 | OUT_DIR=$(realpath "$SCRIPT_DIR/../../python/interfaces/") 37 | 38 | # Check if required directories and files exist 39 | if [ ! -d "$PROTOS_DIR" ]; then 40 | echo "[Error] Protos directory does not exist: $PROTOS_DIR" 41 | exit 1 42 | fi 43 | 44 | if [ ! -f "$PROTOS_DIR/audio2face2d.proto" ]; then 45 | echo "[Error] Protobuf file not found: $PROTOS_DIR/audio2face2d.proto" 46 | exit 1 47 | fi 48 | 49 | # Check if Python is installed 50 | if ! command -v python3 > /dev/null; then 51 | echo "[Error] Python3 is not installed or not in the PATH." 52 | exit 1 53 | fi 54 | 55 | # Log the paths for debugging 56 | echo "Using PROTOS_DIR: $PROTOS_DIR" 57 | echo "Using OUT_DIR: $OUT_DIR" 58 | 59 | # Run grpc_tools.protoc 60 | python3 -m grpc_tools.protoc -I="$PROTOS_DIR" \ 61 | --python_out="$OUT_DIR" \ 62 | --pyi_out="$OUT_DIR" \ 63 | --grpc_python_out="$OUT_DIR" \ 64 | "$PROTOS_DIR/audio2face2d.proto" 65 | 66 | # Check if the command succeeded 67 | if [ $? -ne 0 ]; then 68 | echo "[Error] Failed to execute grpc_tools.protoc." 69 | exit 1 70 | fi 71 | 72 | echo "gRPC files generated successfully." 73 | 74 | -------------------------------------------------------------------------------- /audio2face-2d/nodejs/interfaces/audio2face2d_grpc_pb.js: -------------------------------------------------------------------------------- 1 | // GENERATED CODE -- DO NOT EDIT! 
2 | 3 | 'use strict'; 4 | var grpc = require('@grpc/grpc-js'); 5 | var audio2face2d_pb = require('./audio2face2d_pb.js'); 6 | var google_protobuf_empty_pb = require('google-protobuf/google/protobuf/empty_pb.js'); 7 | 8 | function serialize_nvidia_maxine_audio2face2d_v1_AnimateRequest(arg) { 9 | if (!(arg instanceof audio2face2d_pb.AnimateRequest)) { 10 | throw new Error('Expected argument of type nvidia.maxine.audio2face2d.v1.AnimateRequest'); 11 | } 12 | return Buffer.from(arg.serializeBinary()); 13 | } 14 | 15 | function deserialize_nvidia_maxine_audio2face2d_v1_AnimateRequest(buffer_arg) { 16 | return audio2face2d_pb.AnimateRequest.deserializeBinary(new Uint8Array(buffer_arg)); 17 | } 18 | 19 | function serialize_nvidia_maxine_audio2face2d_v1_AnimateResponse(arg) { 20 | if (!(arg instanceof audio2face2d_pb.AnimateResponse)) { 21 | throw new Error('Expected argument of type nvidia.maxine.audio2face2d.v1.AnimateResponse'); 22 | } 23 | return Buffer.from(arg.serializeBinary()); 24 | } 25 | 26 | function deserialize_nvidia_maxine_audio2face2d_v1_AnimateResponse(buffer_arg) { 27 | return audio2face2d_pb.AnimateResponse.deserializeBinary(new Uint8Array(buffer_arg)); 28 | } 29 | 30 | 31 | // The Audio2Face2DService provides APIs to run the 32 | // Maxine Audio to Face - 2D feature. 33 | var Audio2Face2DServiceService = exports.Audio2Face2DServiceService = { 34 | // Animate is a bidirectional streaming API to run the 35 | // Audio2Face-2D. 36 | // 37 | // The input message can contain AnimateConfig or bytes. 38 | // In the beginning of the stream, a request with AnimateConfig should 39 | // be sent to the server to set the feature's parameters. 40 | // The server will echo back a response with the config to signify that the 41 | // parameters were properly set. It is mandatory to set the portrait_image 42 | // config, other configuration parameters are optional and a default value will 43 | // be used if not set. Any AnimateConfig sent during the middle of the stream 44 | // will be ignored. 45 | // 46 | // After the configuration step, the client streams the input wav file in 47 | // chunks in the input message and receives the output mp4 file in chunks in 48 | // the output message. While the inference is running, the server will periodically 49 | // echo empty message to keep the channel alive. The client should ignore this message. 50 | // 51 | // It is recommended that the client should pass one file per API invocation. 52 | // The configurations are also set per invocation. 53 | animate: { 54 | path: '/nvidia.maxine.audio2face2d.v1.Audio2Face2DService/Animate', 55 | requestStream: true, 56 | responseStream: true, 57 | requestType: audio2face2d_pb.AnimateRequest, 58 | responseType: audio2face2d_pb.AnimateResponse, 59 | requestSerialize: serialize_nvidia_maxine_audio2face2d_v1_AnimateRequest, 60 | requestDeserialize: deserialize_nvidia_maxine_audio2face2d_v1_AnimateRequest, 61 | responseSerialize: serialize_nvidia_maxine_audio2face2d_v1_AnimateResponse, 62 | responseDeserialize: deserialize_nvidia_maxine_audio2face2d_v1_AnimateResponse, 63 | }, 64 | }; 65 | 66 | exports.Audio2Face2DServiceClient = grpc.makeGenericClientConstructor(Audio2Face2DServiceService); 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: MIT 3 | 4 | # Prerequisites 5 | *.d 6 | 7 | # Compiled Object files 8 | *.slo 9 | *.lo 10 | *.o 11 | *.obj 12 | 13 | # Precompiled Headers 14 | *.gch 15 | *.pch 16 | 17 | # Compiled Dynamic libraries 18 | *.so 19 | *.dylib 20 | *.dll 21 | 22 | # Fortran module files 23 | *.mod 24 | *.smod 25 | 26 | # Compiled Static libraries 27 | *.lai 28 | *.la 29 | *.a 30 | *.lib 31 | 32 | # Executables 33 | *.exe 34 | *.out 35 | *.app 36 | 37 | # Models 38 | *.pt 39 | *.savedmodel 40 | install/ 41 | 42 | # Ignore backup files. 43 | *~ 44 | # Ignore Vim swap files. 45 | .*.swp 46 | # Ignore files generated by IDEs. 47 | /.classpath 48 | /.factorypath 49 | /.idea/ 50 | /.ijwb/ 51 | /.project 52 | /.settings 53 | /.vscode/ 54 | # Ignore outputs generated during Bazel bootstrapping. 55 | /output/ 56 | # Ignore jekyll build output. 57 | /production 58 | /.sass-cache 59 | 60 | # Byte-compiled / optimized / DLL files 61 | __pycache__/ 62 | *.py[cod] 63 | *$py.class 64 | 65 | # Distribution / packaging 66 | .Python 67 | build/ 68 | develop-eggs/ 69 | dist/ 70 | downloads/ 71 | eggs/ 72 | .eggs/ 73 | lib/ 74 | lib64/ 75 | parts/ 76 | sdist/ 77 | var/ 78 | wheels/ 79 | pip-wheel-metadata/ 80 | share/python-wheels/ 81 | *.egg-info/ 82 | .installed.cfg 83 | *.egg 84 | MANIFEST 85 | 86 | # PyInstaller 87 | # Usually these files are written by a python script from a template 88 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 89 | *.manifest 90 | *.spec 91 | 92 | # Installer logs 93 | pip-log.txt 94 | pip-delete-this-directory.txt 95 | 96 | # Unit test / coverage reports 97 | htmlcov/ 98 | .tox/ 99 | .nox/ 100 | .coverage 101 | .coverage.* 102 | .cache 103 | nosetests.xml 104 | coverage.xml 105 | *.cover 106 | .hypothesis/ 107 | .pytest_cache/ 108 | 109 | # Translations 110 | *.mo 111 | *.pot 112 | 113 | # Django stuff: 114 | *.log 115 | local_settings.py 116 | db.sqlite3 117 | db.sqlite3-journal 118 | 119 | # Flask stuff: 120 | instance/ 121 | .webassets-cache 122 | 123 | # Scrapy stuff: 124 | .scrapy 125 | 126 | # Sphinx documentation 127 | docs/_build/ 128 | 129 | # PyBuilder 130 | target/ 131 | 132 | # Jupyter Notebook 133 | .ipynb_checkpoints 134 | 135 | # IPython 136 | profile_default/ 137 | ipython_config.py 138 | 139 | # pyenv 140 | .python-version 141 | 142 | # pipenv 143 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 144 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 145 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 146 | # install all needed dependencies. 
147 | #Pipfile.lock 148 | 149 | # celery beat schedule file 150 | celerybeat-schedule 151 | 152 | # SageMath parsed files 153 | *.sage.py 154 | 155 | # Environments 156 | .env 157 | .venv 158 | env/ 159 | venv/ 160 | ENV/ 161 | env.bak/ 162 | venv.bak/ 163 | 164 | # Spyder project settings 165 | .spyderproject 166 | .spyproject 167 | 168 | # Rope project settings 169 | .ropeproject 170 | 171 | # mkdocs documentation 172 | /site 173 | 174 | # mypy 175 | .mypy_cache/ 176 | .dmypy.json 177 | dmypy.json 178 | 179 | # Pyre type checker 180 | .pyre/ 181 | 182 | tests/integration/asr/outputs 183 | tests/integration/nlp/outputs 184 | tests/integration/tts/outputs 185 | -------------------------------------------------------------------------------- /bnr/interfaces/bnr/bnr_pb2_grpc.py: -------------------------------------------------------------------------------- 1 | # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 2 | """Client and server classes corresponding to protobuf-defined services.""" 3 | import grpc 4 | import warnings 5 | 6 | import bnr_pb2 as bnr__pb2 7 | 8 | GRPC_GENERATED_VERSION = '1.67.1' 9 | GRPC_VERSION = grpc.__version__ 10 | _version_not_supported = False 11 | 12 | try: 13 | from grpc._utilities import first_version_is_lower 14 | _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION) 15 | except ImportError: 16 | _version_not_supported = True 17 | 18 | if _version_not_supported: 19 | raise RuntimeError( 20 | f'The grpc package installed is at version {GRPC_VERSION},' 21 | + f' but the generated code in bnr_pb2_grpc.py depends on' 22 | + f' grpcio>={GRPC_GENERATED_VERSION}.' 23 | + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}' 24 | + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.' 25 | ) 26 | 27 | 28 | class MaxineBNRStub(object): 29 | """The MaxineBNR service provides APIs to run the 30 | Maxine BNR NIM. 31 | """ 32 | 33 | def __init__(self, channel): 34 | """Constructor. 35 | 36 | Args: 37 | channel: A grpc.Channel. 38 | """ 39 | self.EnhanceAudio = channel.stream_stream( 40 | '/nvidia.maxine.bnr.v1.MaxineBNR/EnhanceAudio', 41 | request_serializer=bnr__pb2.EnhanceAudioRequest.SerializeToString, 42 | response_deserializer=bnr__pb2.EnhanceAudioResponse.FromString, 43 | _registered_method=True) 44 | 45 | 46 | class MaxineBNRServicer(object): 47 | """The MaxineBNR service provides APIs to run the 48 | Maxine BNR NIM. 49 | """ 50 | 51 | def EnhanceAudio(self, request_iterator, context): 52 | """EnhanceAudio is a bidirectional streaming RPC to run the 53 | Maxine BNR NIM on audio files. 54 | 55 | The client streams the input audio file in chunks in the input message and 56 | receives the output audio file in chunks in the output message. 57 | 58 | The client should only pass one audio file per RPC invocation. 
59 | """ 60 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 61 | context.set_details('Method not implemented!') 62 | raise NotImplementedError('Method not implemented!') 63 | 64 | 65 | def add_MaxineBNRServicer_to_server(servicer, server): 66 | rpc_method_handlers = { 67 | 'EnhanceAudio': grpc.stream_stream_rpc_method_handler( 68 | servicer.EnhanceAudio, 69 | request_deserializer=bnr__pb2.EnhanceAudioRequest.FromString, 70 | response_serializer=bnr__pb2.EnhanceAudioResponse.SerializeToString, 71 | ), 72 | } 73 | generic_handler = grpc.method_handlers_generic_handler( 74 | 'nvidia.maxine.bnr.v1.MaxineBNR', rpc_method_handlers) 75 | server.add_generic_rpc_handlers((generic_handler,)) 76 | server.add_registered_method_handlers('nvidia.maxine.bnr.v1.MaxineBNR', rpc_method_handlers) 77 | 78 | 79 | # This class is part of an EXPERIMENTAL API. 80 | class MaxineBNR(object): 81 | """The MaxineBNR service provides APIs to run the 82 | Maxine BNR NIM. 83 | """ 84 | 85 | @staticmethod 86 | def EnhanceAudio(request_iterator, 87 | target, 88 | options=(), 89 | channel_credentials=None, 90 | call_credentials=None, 91 | insecure=False, 92 | compression=None, 93 | wait_for_ready=None, 94 | timeout=None, 95 | metadata=None): 96 | return grpc.experimental.stream_stream( 97 | request_iterator, 98 | target, 99 | '/nvidia.maxine.bnr.v1.MaxineBNR/EnhanceAudio', 100 | bnr__pb2.EnhanceAudioRequest.SerializeToString, 101 | bnr__pb2.EnhanceAudioResponse.FromString, 102 | options, 103 | channel_credentials, 104 | insecure, 105 | call_credentials, 106 | compression, 107 | wait_for_ready, 108 | timeout, 109 | metadata, 110 | _registered_method=True) 111 | -------------------------------------------------------------------------------- /studio-voice/interfaces/studio_voice/studiovoice_pb2_grpc.py: -------------------------------------------------------------------------------- 1 | # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 2 | """Client and server classes corresponding to protobuf-defined services.""" 3 | import grpc 4 | import warnings 5 | 6 | import studiovoice_pb2 as studiovoice__pb2 7 | 8 | GRPC_GENERATED_VERSION = '1.67.1' 9 | GRPC_VERSION = grpc.__version__ 10 | _version_not_supported = False 11 | 12 | try: 13 | from grpc._utilities import first_version_is_lower 14 | _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION) 15 | except ImportError: 16 | _version_not_supported = True 17 | 18 | if _version_not_supported: 19 | raise RuntimeError( 20 | f'The grpc package installed is at version {GRPC_VERSION},' 21 | + f' but the generated code in studiovoice_pb2_grpc.py depends on' 22 | + f' grpcio>={GRPC_GENERATED_VERSION}.' 23 | + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}' 24 | + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.' 25 | ) 26 | 27 | 28 | class MaxineStudioVoiceStub(object): 29 | """The MaxineStudioVoice service provides APIs to run the 30 | Maxine Studio Voice NIM. 31 | """ 32 | 33 | def __init__(self, channel): 34 | """Constructor. 35 | 36 | Args: 37 | channel: A grpc.Channel. 
38 | """ 39 | self.EnhanceAudio = channel.stream_stream( 40 | '/nvidia.maxine.studiovoice.v1.MaxineStudioVoice/EnhanceAudio', 41 | request_serializer=studiovoice__pb2.EnhanceAudioRequest.SerializeToString, 42 | response_deserializer=studiovoice__pb2.EnhanceAudioResponse.FromString, 43 | _registered_method=True) 44 | 45 | 46 | class MaxineStudioVoiceServicer(object): 47 | """The MaxineStudioVoice service provides APIs to run the 48 | Maxine Studio Voice NIM. 49 | """ 50 | 51 | def EnhanceAudio(self, request_iterator, context): 52 | """EnhanceAudio is a bidirectional streaming RPC to run the 53 | Maxine Studio Voice NIM on audio files. 54 | 55 | The client streams the input audio file in chunks in the input message and 56 | receives the output audio file in chunks in the output message. 57 | 58 | The client should only pass one audio file per RPC invocation. 59 | """ 60 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 61 | context.set_details('Method not implemented!') 62 | raise NotImplementedError('Method not implemented!') 63 | 64 | 65 | def add_MaxineStudioVoiceServicer_to_server(servicer, server): 66 | rpc_method_handlers = { 67 | 'EnhanceAudio': grpc.stream_stream_rpc_method_handler( 68 | servicer.EnhanceAudio, 69 | request_deserializer=studiovoice__pb2.EnhanceAudioRequest.FromString, 70 | response_serializer=studiovoice__pb2.EnhanceAudioResponse.SerializeToString, 71 | ), 72 | } 73 | generic_handler = grpc.method_handlers_generic_handler( 74 | 'nvidia.maxine.studiovoice.v1.MaxineStudioVoice', rpc_method_handlers) 75 | server.add_generic_rpc_handlers((generic_handler,)) 76 | server.add_registered_method_handlers('nvidia.maxine.studiovoice.v1.MaxineStudioVoice', rpc_method_handlers) 77 | 78 | 79 | # This class is part of an EXPERIMENTAL API. 80 | class MaxineStudioVoice(object): 81 | """The MaxineStudioVoice service provides APIs to run the 82 | Maxine Studio Voice NIM. 
83 | """ 84 | 85 | @staticmethod 86 | def EnhanceAudio(request_iterator, 87 | target, 88 | options=(), 89 | channel_credentials=None, 90 | call_credentials=None, 91 | insecure=False, 92 | compression=None, 93 | wait_for_ready=None, 94 | timeout=None, 95 | metadata=None): 96 | return grpc.experimental.stream_stream( 97 | request_iterator, 98 | target, 99 | '/nvidia.maxine.studiovoice.v1.MaxineStudioVoice/EnhanceAudio', 100 | studiovoice__pb2.EnhanceAudioRequest.SerializeToString, 101 | studiovoice__pb2.EnhanceAudioResponse.FromString, 102 | options, 103 | channel_credentials, 104 | insecure, 105 | call_credentials, 106 | compression, 107 | wait_for_ready, 108 | timeout, 109 | metadata, 110 | _registered_method=True) 111 | -------------------------------------------------------------------------------- /audio2face-2d/assets/head_translation_animation.csv: -------------------------------------------------------------------------------- 1 | 0.0000, 0.0000, 1.0000 2 | 0.0030, 0.0000, 1.0000 3 | 0.0070, 0.0000, 1.0000 4 | 0.0100, 0.0000, 1.0000 5 | 0.0130, 0.0000, 1.0000 6 | 0.0170, 0.0000, 1.0000 7 | 0.0200, 0.0000, 1.0000 8 | 0.0230, 0.0000, 1.0000 9 | 0.0270, 0.0000, 1.0000 10 | 0.0300, 0.0000, 1.0000 11 | 0.0330, 0.0000, 1.0000 12 | 0.0370, 0.0000, 1.0000 13 | 0.0400, 0.0000, 1.0000 14 | 0.0430, 0.0000, 1.0000 15 | 0.0470, 0.0000, 1.0000 16 | 0.0470, 0.0000, 1.0000 17 | 0.0430, 0.0000, 1.0000 18 | 0.0400, 0.0000, 1.0000 19 | 0.0370, 0.0000, 1.0000 20 | 0.0330, 0.0000, 1.0000 21 | 0.0300, 0.0000, 1.0000 22 | 0.0270, 0.0000, 1.0000 23 | 0.0230, 0.0000, 1.0000 24 | 0.0200, 0.0000, 1.0000 25 | 0.0170, 0.0000, 1.0000 26 | 0.0130, 0.0000, 1.0000 27 | 0.0100, 0.0000, 1.0000 28 | 0.0070, 0.0000, 1.0000 29 | 0.0030, 0.0000, 1.0000 30 | 0.0000, 0.0000, 1.0000 31 | 0.0000, 0.0000, 1.0000 32 | -0.0030, 0.0000, 1.0000 33 | -0.0070, 0.0000, 1.0000 34 | -0.0100, 0.0000, 1.0000 35 | -0.0130, 0.0000, 1.0000 36 | -0.0170, 0.0000, 1.0000 37 | -0.0200, 0.0000, 1.0000 38 | -0.0230, 0.0000, 1.0000 39 | -0.0270, 0.0000, 1.0000 40 | -0.0300, 0.0000, 1.0000 41 | -0.0330, 0.0000, 1.0000 42 | -0.0370, 0.0000, 1.0000 43 | -0.0400, 0.0000, 1.0000 44 | -0.0430, 0.0000, 1.0000 45 | -0.0470, 0.0000, 1.0000 46 | -0.0470, 0.0000, 1.0000 47 | -0.0430, 0.0000, 1.0000 48 | -0.0400, 0.0000, 1.0000 49 | -0.0370, 0.0000, 1.0000 50 | -0.0330, 0.0000, 1.0000 51 | -0.0300, 0.0000, 1.0000 52 | -0.0270, 0.0000, 1.0000 53 | -0.0230, 0.0000, 1.0000 54 | -0.0200, 0.0000, 1.0000 55 | -0.0170, 0.0000, 1.0000 56 | -0.0130, 0.0000, 1.0000 57 | -0.0100, 0.0000, 1.0000 58 | -0.0070, 0.0000, 1.0000 59 | -0.0030, 0.0000, 1.0000 60 | 0.0000, 0.0000, 1.0000 61 | 0.0000, 0.0000, 1.0000 62 | 0.0000, 0.0030, 1.0000 63 | 0.0000, 0.0070, 1.0000 64 | 0.0000, 0.0100, 1.0000 65 | 0.0000, 0.0130, 1.0000 66 | 0.0000, 0.0170, 1.0000 67 | 0.0000, 0.0200, 1.0000 68 | 0.0000, 0.0230, 1.0000 69 | 0.0000, 0.0270, 1.0000 70 | 0.0000, 0.0300, 1.0000 71 | 0.0000, 0.0330, 1.0000 72 | 0.0000, 0.0370, 1.0000 73 | 0.0000, 0.0400, 1.0000 74 | 0.0000, 0.0430, 1.0000 75 | 0.0000, 0.0470, 1.0000 76 | 0.0000, 0.0470, 1.0000 77 | 0.0000, 0.0430, 1.0000 78 | 0.0000, 0.0400, 1.0000 79 | 0.0000, 0.0370, 1.0000 80 | 0.0000, 0.0330, 1.0000 81 | 0.0000, 0.0300, 1.0000 82 | 0.0000, 0.0270, 1.0000 83 | 0.0000, 0.0230, 1.0000 84 | 0.0000, 0.0200, 1.0000 85 | 0.0000, 0.0170, 1.0000 86 | 0.0000, 0.0130, 1.0000 87 | 0.0000, 0.0100, 1.0000 88 | 0.0000, 0.0070, 1.0000 89 | 0.0000, 0.0030, 1.0000 90 | 0.0000, 0.0000, 1.0000 91 | 0.0000, 0.0000, 1.0000 92 | 0.0000, -0.0030, 1.0000 93 | 
0.0000, -0.0070, 1.0000 94 | 0.0000, -0.0100, 1.0000 95 | 0.0000, -0.0130, 1.0000 96 | 0.0000, -0.0170, 1.0000 97 | 0.0000, -0.0200, 1.0000 98 | 0.0000, -0.0230, 1.0000 99 | 0.0000, -0.0270, 1.0000 100 | 0.0000, -0.0300, 1.0000 101 | 0.0000, -0.0330, 1.0000 102 | 0.0000, -0.0370, 1.0000 103 | 0.0000, -0.0400, 1.0000 104 | 0.0000, -0.0430, 1.0000 105 | 0.0000, -0.0470, 1.0000 106 | 0.0000, -0.0470, 1.0000 107 | 0.0000, -0.0430, 1.0000 108 | 0.0000, -0.0400, 1.0000 109 | 0.0000, -0.0370, 1.0000 110 | 0.0000, -0.0330, 1.0000 111 | 0.0000, -0.0300, 1.0000 112 | 0.0000, -0.0270, 1.0000 113 | 0.0000, -0.0230, 1.0000 114 | 0.0000, -0.0200, 1.0000 115 | 0.0000, -0.0170, 1.0000 116 | 0.0000, -0.0130, 1.0000 117 | 0.0000, -0.0100, 1.0000 118 | 0.0000, -0.0070, 1.0000 119 | 0.0000, -0.0030, 1.0000 120 | 0.0000, 0.0000, 1.0000 121 | 0.0000, 0.0000, 1.0000 122 | 0.0000, 0.0000, 0.9980 123 | 0.0000, 0.0000, 0.9960 124 | 0.0000, 0.0000, 0.9940 125 | 0.0000, 0.0000, 0.9920 126 | 0.0000, 0.0000, 0.9900 127 | 0.0000, 0.0000, 0.9880 128 | 0.0000, 0.0000, 0.9860 129 | 0.0000, 0.0000, 0.9840 130 | 0.0000, 0.0000, 0.9820 131 | 0.0000, 0.0000, 0.9800 132 | 0.0000, 0.0000, 0.9780 133 | 0.0000, 0.0000, 0.9760 134 | 0.0000, 0.0000, 0.9740 135 | 0.0000, 0.0000, 0.9720 136 | 0.0000, 0.0000, 0.9720 137 | 0.0000, 0.0000, 0.9740 138 | 0.0000, 0.0000, 0.9760 139 | 0.0000, 0.0000, 0.9780 140 | 0.0000, 0.0000, 0.9800 141 | 0.0000, 0.0000, 0.9820 142 | 0.0000, 0.0000, 0.9840 143 | 0.0000, 0.0000, 0.9860 144 | 0.0000, 0.0000, 0.9880 145 | 0.0000, 0.0000, 0.9900 146 | 0.0000, 0.0000, 0.9920 147 | 0.0000, 0.0000, 0.9940 148 | 0.0000, 0.0000, 0.9960 149 | 0.0000, 0.0000, 0.9980 150 | 0.0000, 0.0000, 1.0000 151 | 0.0000, 0.0000, 1.0000 152 | 0.0000, 0.0000, 1.0020 153 | 0.0000, 0.0000, 1.0040 154 | 0.0000, 0.0000, 1.0060 155 | 0.0000, 0.0000, 1.0080 156 | 0.0000, 0.0000, 1.0100 157 | 0.0000, 0.0000, 1.0120 158 | 0.0000, 0.0000, 1.0140 159 | 0.0000, 0.0000, 1.0160 160 | 0.0000, 0.0000, 1.0180 161 | 0.0000, 0.0000, 1.0200 162 | 0.0000, 0.0000, 1.0220 163 | 0.0000, 0.0000, 1.0240 164 | 0.0000, 0.0000, 1.0260 165 | 0.0000, 0.0000, 1.0280 166 | 0.0000, 0.0000, 1.0280 167 | 0.0000, 0.0000, 1.0260 168 | 0.0000, 0.0000, 1.0240 169 | 0.0000, 0.0000, 1.0220 170 | 0.0000, 0.0000, 1.0200 171 | 0.0000, 0.0000, 1.0180 172 | 0.0000, 0.0000, 1.0160 173 | 0.0000, 0.0000, 1.0140 174 | 0.0000, 0.0000, 1.0120 175 | 0.0000, 0.0000, 1.0100 176 | 0.0000, 0.0000, 1.0080 177 | 0.0000, 0.0000, 1.0060 178 | 0.0000, 0.0000, 1.0040 179 | 0.0000, 0.0000, 1.0020 180 | 0.0000, 0.0000, 1.0000 181 | -------------------------------------------------------------------------------- /bnr/README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA BNR NIM Client 2 | 3 | This package has a sample client which demonstrates interaction with a BNR NIM. 4 | 5 | ## Getting Started 6 | 7 | NVIDIA Maxine NIM Client packages use gRPC APIs. Instructions below demonstrate usage of BNR NIM using Python gRPC client. 8 | Additionally, access the [Try API](https://build.nvidia.com/nvidia/bnr/api) feature to experience the NVIDIA BNR NIM API without hosting your own servers, as it leverages the NVIDIA Cloud Functions backend. 9 | 10 | ## Pre-requisites 11 | 12 | - Ensure you have Python 3.10 or above installed on your system. 13 | Please refer to the [Python documentation](https://www.python.org/downloads/) for download and installation instructions. 14 | - Access to NVIDIA BNR NIM Container / Service. 
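
## How the Client Talks to the NIM (Sketch)

The packaged client in `scripts/bnr.py` is the supported way to exercise the NIM and is what the Usage guide below describes. For orientation only, the sketch that follows shows the bare `EnhanceAudio` bidirectional-streaming exchange using the generated `bnr_pb2`/`bnr_pb2_grpc` modules. It is an illustration, not the packaged client: the payload field name (`audio_file_data`), the chunk size, the import path, and the helper names are assumptions; verify them against `bnr.proto` and the generated stubs. Configuration options, streaming mode, and the Preview API path are handled by `scripts/bnr.py` and are omitted here.

```python
# Minimal transactional EnhanceAudio sketch (illustrative only).
# Assumes the protos were compiled into ../interfaces/bnr (step 4 of the Usage guide)
# and that the request/response messages carry the audio bytes in a field named
# `audio_file_data`; check bnr.proto for the authoritative field name.
import sys

import grpc

sys.path.append("../interfaces/bnr")  # the generated module imports `bnr_pb2` directly
import bnr_pb2
import bnr_pb2_grpc

CHUNK_SIZE = 64 * 1024  # illustrative chunk size


def request_chunks(path):
    """Yield the input WAV file as a stream of EnhanceAudioRequest messages."""
    with open(path, "rb") as f:
        while True:
            chunk = f.read(CHUNK_SIZE)
            if not chunk:
                break
            yield bnr_pb2.EnhanceAudioRequest(audio_file_data=chunk)  # field name assumed


def main():
    # One audio file per RPC invocation, as required by the service description.
    with grpc.insecure_channel("127.0.0.1:8001") as channel:
        stub = bnr_pb2_grpc.MaxineBNRStub(channel)
        responses = stub.EnhanceAudio(request_chunks("../assets/bnr_48k_input.wav"))
        with open("bnr_48k_output.wav", "wb") as out:
            for response in responses:
                out.write(response.audio_file_data)  # field name assumed


if __name__ == "__main__":
    main()
```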
15 | 
16 | ## Usage guide
17 | 
18 | ### 1. Clone the repository
19 | 
20 | ```bash
21 | git clone https://github.com/nvidia-maxine/nim-clients.git
22 | 
23 | // Go to the 'bnr' folder
24 | cd nim-clients/bnr
25 | ```
26 | 
27 | ### 2. Install Dependencies
28 | 
29 | ```bash
30 | sudo apt-get install python3-pip
31 | pip install -r requirements.txt
32 | ```
33 | 
34 | ### 3. Host the NIM Server
35 | 
36 | Before running the client part of BNR, please set up a server.
37 | The simplest way to do that is to follow the [quick start guide](https://docs.nvidia.com/nim/maxine/bnr/latest/index.html).
38 | This step can be skipped when using [Try API](https://build.nvidia.com/nvidia/bnr/api).
39 | 
40 | 
41 | ### 4. Compile the Protos
42 | 
43 | Before running the Python client, you can optionally compile the protos.
44 | The grpcio version needed for compilation is listed in requirements.txt.
45 | 
46 | To compile protos on Linux, run:
47 | ```bash
48 | // Go to bnr/protos folder
49 | cd bnr/protos
50 | 
51 | chmod +x compile_protos.sh
52 | ./compile_protos.sh
53 | ```
54 | 
55 | To compile protos on Windows, run:
56 | ```bash
57 | // Go to bnr/protos folder
58 | cd bnr/protos
59 | 
60 | compile_protos.bat
61 | ```
62 | 
63 | ### 5. Run the Python Client
64 | 
65 | Go to the scripts directory.
66 | 
67 | ```bash
68 | cd scripts
69 | ```
70 | 
71 | #### Usage for Transactional NIM Request
72 | 
73 | To run the client in transactional mode, set `--sample-rate` in accordance with the server; the default is `48000`. The following example command processes the packaged sample audio file in transactional mode and generates a `bnr_48k_output.wav` file in the current folder.
74 | 
75 | ```bash
76 | python bnr.py --target 127.0.0.1:8001 --input ../assets/bnr_48k_input.wav --output bnr_48k_output.wav --sample-rate 48000
77 | ```
78 | 
79 | #### Usage for Streaming NIM Request
80 | 
81 | To run the client in streaming mode, add `--streaming`. The following example command processes the packaged sample audio file in streaming mode and generates a `bnr_48k_output.wav` file in the current folder.
82 | 
83 | ```bash
84 | python bnr.py --target 127.0.0.1:8001 --input ../assets/bnr_48k_input.wav --output bnr_48k_output.wav --streaming --sample-rate 48000
85 | ```
86 | 
87 | Only WAV files are supported.
88 | 
89 | #### Usage for Preview API Request
90 | 
91 | ```bash
92 | python bnr.py --preview-mode \
93 |     --ssl-mode TLS \
94 |     --target grpc.nvcf.nvidia.com:443 \
95 |     --function-id <function_id> \
96 |     --api-key $API_KEY_REQUIRED_IF_EXECUTING_OUTSIDE_NGC \
97 |     --input <input_file_path> \
98 |     --output <output_file_path>
99 | ```
100 | 
101 | #### Command Line Arguments
102 | 
103 | - `--preview-mode` - Flag to send the request to the preview NVCF server at https://build.nvidia.com/nvidia/bnr/api.
104 | - `--ssl-mode` - Flag to control if SSL MTLS/TLS encryption should be used. When running in preview mode, SSL must be set to TLS. Default value is `None`.
105 | - `--ssl-key` - The path to the SSL private key. Default value is `None`.
106 | - `--ssl-cert` - The path to the SSL certificate chain. Default value is `None`.
107 | - `--ssl-root-cert` - The path to the SSL root certificate. Default value is `None`.
108 | - `--target` - `<ip:port>` of the gRPC service, when hosted locally. Use grpc.nvcf.nvidia.com:443 when hosted on NVCF.
109 | - `--api-key` - NGC API key required for authentication; utilized when using `TRY API`, ignored otherwise.
110 | - `--function-id` - NVCF function ID for the service; utilized when using `TRY API`, ignored otherwise.
111 | - `--input` - The path to the input audio file.
Default value is `../assets/bnr_48k_input.wav`. 112 | - `--output` - The path for the output audio file. Default is current directory (scripts) with name `bnr_48k_output.wav`. 113 | - `--streaming` - Flag to control if streaming mode should be used. Transactional mode will be used by default. 114 | - `--sample-rate` - Sample rate of input audio file in Hz (`16000`, `48000`), default is `48000`. 115 | - `--intensity-ratio` - Intensity ratio value between 0 and 1 to control denoising intensity. Default is 1.0 (maximum denoising). 116 | 117 | Refer the [docs](https://docs.nvidia.com/nim/maxine/bnr/latest/index.html) for more information. 118 | -------------------------------------------------------------------------------- /eye-contact/interfaces/eyecontact_pb2_grpc.py: -------------------------------------------------------------------------------- 1 | # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 2 | """Client and server classes corresponding to protobuf-defined services.""" 3 | import grpc 4 | import warnings 5 | 6 | import eyecontact_pb2 as eyecontact__pb2 7 | 8 | GRPC_GENERATED_VERSION = '1.67.1' 9 | GRPC_VERSION = grpc.__version__ 10 | _version_not_supported = False 11 | 12 | try: 13 | from grpc._utilities import first_version_is_lower 14 | _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION) 15 | except ImportError: 16 | _version_not_supported = True 17 | 18 | if _version_not_supported: 19 | raise RuntimeError( 20 | f'The grpc package installed is at version {GRPC_VERSION},' 21 | + f' but the generated code in eyecontact_pb2_grpc.py depends on' 22 | + f' grpcio>={GRPC_GENERATED_VERSION}.' 23 | + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}' 24 | + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.' 25 | ) 26 | 27 | 28 | class MaxineEyeContactServiceStub(object): 29 | """The MaxineEyeContactService provides APIs to run the 30 | Maxine Eye Contact feature. 31 | """ 32 | 33 | def __init__(self, channel): 34 | """Constructor. 35 | 36 | Args: 37 | channel: A grpc.Channel. 38 | """ 39 | self.RedirectGaze = channel.stream_stream( 40 | '/nvidia.maxine.eyecontact.v1.MaxineEyeContactService/RedirectGaze', 41 | request_serializer=eyecontact__pb2.RedirectGazeRequest.SerializeToString, 42 | response_deserializer=eyecontact__pb2.RedirectGazeResponse.FromString, 43 | _registered_method=True) 44 | 45 | 46 | class MaxineEyeContactServiceServicer(object): 47 | """The MaxineEyeContactService provides APIs to run the 48 | Maxine Eye Contact feature. 49 | """ 50 | 51 | def RedirectGaze(self, request_iterator, context): 52 | """RedirectGaze is a bidirectional streaming API to run the 53 | Maxine Eye Contact feature on mp4 video files. 54 | 55 | The input message can contain GazeRedirectionConfig or bytes. 56 | In the beginning of the stream, a request with GazeRedirectionConfig may 57 | be sent to the server to set the feature's parameter. 58 | The server will echo back a response with the config to signify that the 59 | parameters were properly set. If not configured, default values will be 60 | used for the feature's parameters. Any GazeRedirectionConfig sent during 61 | the middle of the stream will be ignored. 62 | 63 | After the optional configuration, the client streams the input mp4 file in 64 | chunks in the input message and receives the output mp4 file in chunks in 65 | the output message. 
66 | 67 | The client should only pass one video file per API invocation and the 68 | configuration, if set, is applied to only that invocation. 69 | """ 70 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 71 | context.set_details('Method not implemented!') 72 | raise NotImplementedError('Method not implemented!') 73 | 74 | 75 | def add_MaxineEyeContactServiceServicer_to_server(servicer, server): 76 | rpc_method_handlers = { 77 | 'RedirectGaze': grpc.stream_stream_rpc_method_handler( 78 | servicer.RedirectGaze, 79 | request_deserializer=eyecontact__pb2.RedirectGazeRequest.FromString, 80 | response_serializer=eyecontact__pb2.RedirectGazeResponse.SerializeToString, 81 | ), 82 | } 83 | generic_handler = grpc.method_handlers_generic_handler( 84 | 'nvidia.maxine.eyecontact.v1.MaxineEyeContactService', rpc_method_handlers) 85 | server.add_generic_rpc_handlers((generic_handler,)) 86 | server.add_registered_method_handlers('nvidia.maxine.eyecontact.v1.MaxineEyeContactService', rpc_method_handlers) 87 | 88 | 89 | # This class is part of an EXPERIMENTAL API. 90 | class MaxineEyeContactService(object): 91 | """The MaxineEyeContactService provides APIs to run the 92 | Maxine Eye Contact feature. 93 | """ 94 | 95 | @staticmethod 96 | def RedirectGaze(request_iterator, 97 | target, 98 | options=(), 99 | channel_credentials=None, 100 | call_credentials=None, 101 | insecure=False, 102 | compression=None, 103 | wait_for_ready=None, 104 | timeout=None, 105 | metadata=None): 106 | return grpc.experimental.stream_stream( 107 | request_iterator, 108 | target, 109 | '/nvidia.maxine.eyecontact.v1.MaxineEyeContactService/RedirectGaze', 110 | eyecontact__pb2.RedirectGazeRequest.SerializeToString, 111 | eyecontact__pb2.RedirectGazeResponse.FromString, 112 | options, 113 | channel_credentials, 114 | insecure, 115 | call_credentials, 116 | compression, 117 | wait_for_ready, 118 | timeout, 119 | metadata, 120 | _registered_method=True) 121 | -------------------------------------------------------------------------------- /studio-voice/README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA Studio Voice NIM Client 2 | 3 | This package has a sample client which demonstrates interaction with a Studio Voice NIM. 4 | 5 | ## Getting Started 6 | 7 | NVIDIA Maxine NIM Client packages use gRPC APIs. Instructions below demonstrate usage of Studio Voice NIM using Python gRPC client. 8 | Additionally, access the [Try API](https://build.nvidia.com/nvidia/studiovoice/api) feature to experience the NVIDIA Studio Voice NIM API without hosting your own servers, as it leverages the NVIDIA Cloud Functions backend. 9 | 10 | ## Pre-requisites 11 | 12 | - Ensure you have Python 3.10 or above installed on your system. 13 | Please refer to the [Python documentation](https://www.python.org/downloads/) for download and installation instructions. 14 | - Access to NVIDIA Studio Voice NIM Container / Service. 15 | 16 | ## Usage guide 17 | 18 | ### 1. Clone the repository 19 | 20 | ```bash 21 | git clone https://github.com/nvidia-maxine/nim-clients.git 22 | 23 | // Go to the 'studio-voice' folder 24 | cd nim-clients/studio-voice 25 | ``` 26 | 27 | ### 2. Install Dependencies 28 | 29 | ```bash 30 | sudo apt-get install python3-pip 31 | pip install -r requirements.txt 32 | ``` 33 | 34 | ### 3. Host the NIM Server 35 | 36 | Before running client part of Studio Voice, please set up a server. 
37 | The simplest way to do that is to follow the [quick start guide](https://docs.nvidia.com/nim/maxine/studio-voice/latest/index.html).
38 | This step can be skipped when using [Try API](https://build.nvidia.com/nvidia/studiovoice/api).
39 | 
40 | 
41 | ### 4. Compile the Protos
42 | 
43 | Before running the Python client, you can optionally compile the protos.
44 | The grpcio version needed for compilation is listed in requirements.txt.
45 | 
46 | To compile protos on Linux, run:
47 | ```bash
48 | // Go to studio-voice/protos folder
49 | cd studio-voice/protos
50 | 
51 | chmod +x compile_protos.sh
52 | ./compile_protos.sh
53 | ```
54 | 
55 | To compile protos on Windows, run:
56 | ```bash
57 | // Go to studio-voice/protos folder
58 | cd studio-voice/protos
59 | 
60 | compile_protos.bat
61 | ```
62 | 
63 | ### 5. Run the Python Client
64 | 
65 | Go to the scripts directory.
66 | 
67 | ```bash
68 | cd scripts
69 | ```
70 | 
71 | #### Usage for Transactional NIM Request
72 | 
73 | To run the client in transactional mode, set `--model-type` in accordance with the server; the default is `48k-hq`. The following example command processes the packaged sample audio file in transactional mode and generates a `studio_voice_48k_output.wav` file in the current folder.
74 | 
75 | ```bash
76 | python studio_voice.py --target 127.0.0.1:8001 --input ../assets/studio_voice_48k_input.wav --output studio_voice_48k_output.wav --model-type 48k-hq
77 | ```
78 | 
79 | #### Usage for Streaming NIM Request
80 | 
81 | To run the client in streaming mode, add `--streaming`. The following example command processes the packaged sample audio file in streaming mode and generates a `studio_voice_48k_output.wav` file in the current folder.
82 | 
83 | ```bash
84 | python studio_voice.py --target 127.0.0.1:8001 --input ../assets/studio_voice_48k_input.wav --output studio_voice_48k_output.wav --streaming --model-type 48k-ll
85 | ```
86 | 
87 | Only WAV files are supported.
88 | 
89 | #### Usage for Preview API Request
90 | 
91 | ```bash
92 | python studio_voice.py --preview-mode \
93 |     --ssl-mode TLS \
94 |     --target grpc.nvcf.nvidia.com:443 \
95 |     --function-id <function_id> \
96 |     --api-key $API_KEY_REQUIRED_IF_EXECUTING_OUTSIDE_NGC \
97 |     --input <input_file_path> \
98 |     --output <output_file_path>
99 | ```
100 | 
101 | #### Command Line Arguments
102 | 
103 | - `--preview-mode` - Flag to send the request to the preview NVCF server at https://build.nvidia.com/nvidia/studiovoice/api (a channel-setup sketch follows this list).
104 | - `--ssl-mode` - Flag to control if SSL MTLS/TLS encryption should be used. When running in preview mode, SSL must be set to TLS. Default value is `None`.
105 | - `--ssl-key` - The path to the SSL private key. Default value is `None`.
106 | - `--ssl-cert` - The path to the SSL certificate chain. Default value is `None`.
107 | - `--ssl-root-cert` - The path to the SSL root certificate. Default value is `None`.
108 | - `--target` - `<ip:port>` of the gRPC service, when hosted locally. Use grpc.nvcf.nvidia.com:443 when hosted on NVCF.
109 | - `--api-key` - NGC API key required for authentication; utilized when using `TRY API`, ignored otherwise.
110 | - `--function-id` - NVCF function ID for the service; utilized when using `TRY API`, ignored otherwise.
111 | - `--input` - The path to the input audio file. Default value is `../assets/studio_voice_48k_input.wav`.
112 | - `--output` - The path for the output audio file. Default is the current directory (`scripts`) with the name `studio_voice_48k_output.wav`.
113 | - `--streaming` - Flag to control if streaming mode should be used. Transactional mode will be used by default.
114 | - `--model-type` - Studio Voice model type hosted on the server. It can be set to `48k-hq/48k-ll/16k-hq`. Default value is `48k-hq`.
115 | 
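#### Preview API Channel Setup (Sketch)

The Preview API request shown above needs a TLS channel to the NVCF endpoint, with the function ID and API key attached as call metadata. The snippet below sketches one way to set that up using the generated `studiovoice_pb2_grpc` module; it is not the packaged client. The metadata keys (`function-id`, `authorization`), the `sys.path` handling, and the helper name `make_preview_stub` are assumptions to verify against `scripts/studio_voice.py`.

```python
# Sketch: TLS channel and call metadata for the NVCF preview endpoint (illustrative only).
import sys

import grpc

sys.path.append("../interfaces/studio_voice")  # the generated module imports `studiovoice_pb2` directly
import studiovoice_pb2_grpc


def make_preview_stub(api_key: str, function_id: str):
    """Return a MaxineStudioVoiceStub for the preview endpoint plus per-call metadata."""
    channel = grpc.secure_channel(
        "grpc.nvcf.nvidia.com:443", grpc.ssl_channel_credentials()
    )
    stub = studiovoice_pb2_grpc.MaxineStudioVoiceStub(channel)
    metadata = (
        ("function-id", function_id),            # assumed NVCF routing header
        ("authorization", f"Bearer {api_key}"),  # assumed authorization header format
    )
    return stub, metadata


# The metadata is then passed on each call, for example:
#   stub, metadata = make_preview_stub(api_key, function_id)
#   responses = stub.EnhanceAudio(request_iterator, metadata=metadata)
```

For a locally hosted server, the same stub can instead be created over `grpc.insecure_channel("127.0.0.1:8001")` without any call metadata.
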
116 | Refer to the [docs](https://docs.nvidia.com/nim/maxine/studio-voice/latest/index.html) for more information.
117 | 
--------------------------------------------------------------------------------
/audio2face-2d/python/interfaces/audio2face2d_pb2_grpc.py:
--------------------------------------------------------------------------------
1 | # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
2 | """Client and server classes corresponding to protobuf-defined services."""
3 | import grpc
4 | import warnings
5 | 
6 | import audio2face2d_pb2 as audio2face2d__pb2
7 | 
8 | GRPC_GENERATED_VERSION = '1.67.1'
9 | GRPC_VERSION = grpc.__version__
10 | _version_not_supported = False
11 | 
12 | try:
13 |     from grpc._utilities import first_version_is_lower
14 |     _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
15 | except ImportError:
16 |     _version_not_supported = True
17 | 
18 | if _version_not_supported:
19 |     raise RuntimeError(
20 |         f'The grpc package installed is at version {GRPC_VERSION},'
21 |         + f' but the generated code in audio2face2d_pb2_grpc.py depends on'
22 |         + f' grpcio>={GRPC_GENERATED_VERSION}.'
23 |         + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
24 |         + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
25 |     )
26 | 
27 | 
28 | class Audio2Face2DServiceStub(object):
29 |     """The Audio2Face2DService provides APIs to run the
30 |     Maxine Audio to Face - 2D feature.
31 |     """
32 | 
33 |     def __init__(self, channel):
34 |         """Constructor.
35 | 
36 |         Args:
37 |             channel: A grpc.Channel.
38 |         """
39 |         self.Animate = channel.stream_stream(
40 |                 '/nvidia.maxine.audio2face2d.v1.Audio2Face2DService/Animate',
41 |                 request_serializer=audio2face2d__pb2.AnimateRequest.SerializeToString,
42 |                 response_deserializer=audio2face2d__pb2.AnimateResponse.FromString,
43 |                 _registered_method=True)
44 | 
45 | 
46 | class Audio2Face2DServiceServicer(object):
47 |     """The Audio2Face2DService provides APIs to run the
48 |     Maxine Audio to Face - 2D feature.
49 |     """
50 | 
51 |     def Animate(self, request_iterator, context):
52 |         """Animate is a bidirectional streaming API to run the
53 |         Audio2Face-2D.
54 | 
55 |         The input message can contain AnimateConfig or bytes.
56 |         In the beginning of the stream, a request with AnimateConfig should
57 |         be sent to the server to set the feature's parameters.
58 |         The server will echo back a response with the config to signify that the
59 |         parameters were properly set. It is mandatory to set the portrait_image
60 |         config, other configuration parameters are optional and a default value will
61 |         be used if not set. Any AnimateConfig sent during the middle of the stream
62 |         will be ignored.
63 | 
64 |         After the configuration step, the client streams the input wav file in
65 |         chunks in the input message and receives the output mp4 file in chunks in
66 |         the output message. While the inference is running, the server will periodically
67 |         echo empty message to keep the channel alive. The client should ignore this message.
68 | 
69 |         It is recommended that the client should pass one file per API invocation.
70 |         The configurations are also set per invocation.
71 | """ 72 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 73 | context.set_details('Method not implemented!') 74 | raise NotImplementedError('Method not implemented!') 75 | 76 | 77 | def add_Audio2Face2DServiceServicer_to_server(servicer, server): 78 | rpc_method_handlers = { 79 | 'Animate': grpc.stream_stream_rpc_method_handler( 80 | servicer.Animate, 81 | request_deserializer=audio2face2d__pb2.AnimateRequest.FromString, 82 | response_serializer=audio2face2d__pb2.AnimateResponse.SerializeToString, 83 | ), 84 | } 85 | generic_handler = grpc.method_handlers_generic_handler( 86 | 'nvidia.maxine.audio2face2d.v1.Audio2Face2DService', rpc_method_handlers) 87 | server.add_generic_rpc_handlers((generic_handler,)) 88 | server.add_registered_method_handlers('nvidia.maxine.audio2face2d.v1.Audio2Face2DService', rpc_method_handlers) 89 | 90 | 91 | # This class is part of an EXPERIMENTAL API. 92 | class Audio2Face2DService(object): 93 | """The Audio2Face2DService provides APIs to run the 94 | Maxine Audio to Face - 2D feature. 95 | """ 96 | 97 | @staticmethod 98 | def Animate(request_iterator, 99 | target, 100 | options=(), 101 | channel_credentials=None, 102 | call_credentials=None, 103 | insecure=False, 104 | compression=None, 105 | wait_for_ready=None, 106 | timeout=None, 107 | metadata=None): 108 | return grpc.experimental.stream_stream( 109 | request_iterator, 110 | target, 111 | '/nvidia.maxine.audio2face2d.v1.Audio2Face2DService/Animate', 112 | audio2face2d__pb2.AnimateRequest.SerializeToString, 113 | audio2face2d__pb2.AnimateResponse.FromString, 114 | options, 115 | channel_credentials, 116 | insecure, 117 | call_credentials, 118 | compression, 119 | wait_for_ready, 120 | timeout, 121 | metadata, 122 | _registered_method=True) 123 | -------------------------------------------------------------------------------- /eye-contact/interfaces/eyecontact_pb2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by the protocol buffer compiler. DO NOT EDIT! 
3 | # NO CHECKED-IN PROTOBUF GENCODE 4 | # source: eyecontact.proto 5 | # Protobuf Python Version: 5.27.2 6 | """Generated protocol buffer code.""" 7 | from google.protobuf import descriptor as _descriptor 8 | from google.protobuf import descriptor_pool as _descriptor_pool 9 | from google.protobuf import runtime_version as _runtime_version 10 | from google.protobuf import symbol_database as _symbol_database 11 | from google.protobuf.internal import builder as _builder 12 | _runtime_version.ValidateProtobufRuntimeVersion( 13 | _runtime_version.Domain.PUBLIC, 14 | 5, 15 | 27, 16 | 2, 17 | '', 18 | 'eyecontact.proto' 19 | ) 20 | # @@protoc_insertion_point(imports) 21 | 22 | _sym_db = _symbol_database.Default() 23 | 24 | 25 | from google.protobuf import any_pb2 as google_dot_protobuf_dot_any__pb2 26 | from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 27 | 28 | 29 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10\x65yecontact.proto\x12\x1bnvidia.maxine.eyecontact.v1\x1a\x19google/protobuf/any.proto\x1a\x1bgoogle/protobuf/empty.proto\"]\n\rLossyEncoding\x12\x14\n\x07\x62itrate\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x19\n\x0cidr_interval\x18\x02 \x01(\rH\x01\x88\x01\x01\x42\n\n\x08_bitrateB\x0f\n\r_idr_interval\"\xaa\x01\n\x14\x43ustomEncodingParams\x12M\n\x06\x63ustom\x18\x01 \x03(\x0b\x32=.nvidia.maxine.eyecontact.v1.CustomEncodingParams.CustomEntry\x1a\x43\n\x0b\x43ustomEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.google.protobuf.Any:\x02\x38\x01\"\xc5\x01\n\x13OutputVideoEncoding\x12\x12\n\x08lossless\x18\x01 \x01(\x08H\x00\x12;\n\x05lossy\x18\x02 \x01(\x0b\x32*.nvidia.maxine.eyecontact.v1.LossyEncodingH\x00\x12L\n\x0f\x63ustom_encoding\x18\x03 \x01(\x0b\x32\x31.nvidia.maxine.eyecontact.v1.CustomEncodingParamsH\x00\x42\x0f\n\rencoding_type\"\x98\x08\n\x12RedirectGazeConfig\x12\x15\n\x08temporal\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x1b\n\x0e\x64\x65tect_closure\x18\x02 \x01(\rH\x01\x88\x01\x01\x12!\n\x14\x65ye_size_sensitivity\x18\x03 \x01(\rH\x02\x88\x01\x01\x12\x1c\n\x0f\x65nable_lookaway\x18\x04 \x01(\rH\x03\x88\x01\x01\x12 \n\x13lookaway_max_offset\x18\x05 \x01(\rH\x04\x88\x01\x01\x12\"\n\x15lookaway_interval_min\x18\x06 \x01(\rH\x05\x88\x01\x01\x12$\n\x17lookaway_interval_range\x18\x07 \x01(\rH\x06\x88\x01\x01\x12%\n\x18gaze_pitch_threshold_low\x18\x08 \x01(\x02H\x07\x88\x01\x01\x12&\n\x19gaze_pitch_threshold_high\x18\t \x01(\x02H\x08\x88\x01\x01\x12#\n\x16gaze_yaw_threshold_low\x18\n \x01(\x02H\t\x88\x01\x01\x12$\n\x17gaze_yaw_threshold_high\x18\x0b \x01(\x02H\n\x88\x01\x01\x12%\n\x18head_pitch_threshold_low\x18\x0c \x01(\x02H\x0b\x88\x01\x01\x12&\n\x19head_pitch_threshold_high\x18\r \x01(\x02H\x0c\x88\x01\x01\x12#\n\x16head_yaw_threshold_low\x18\x0e \x01(\x02H\r\x88\x01\x01\x12$\n\x17head_yaw_threshold_high\x18\x0f \x01(\x02H\x0e\x88\x01\x01\x12T\n\x15output_video_encoding\x18\x10 
\x01(\x0b\x32\x30.nvidia.maxine.eyecontact.v1.OutputVideoEncodingH\x0f\x88\x01\x01\x42\x0b\n\t_temporalB\x11\n\x0f_detect_closureB\x17\n\x15_eye_size_sensitivityB\x12\n\x10_enable_lookawayB\x16\n\x14_lookaway_max_offsetB\x18\n\x16_lookaway_interval_minB\x1a\n\x18_lookaway_interval_rangeB\x1b\n\x19_gaze_pitch_threshold_lowB\x1c\n\x1a_gaze_pitch_threshold_highB\x19\n\x17_gaze_yaw_threshold_lowB\x1a\n\x18_gaze_yaw_threshold_highB\x1b\n\x19_head_pitch_threshold_lowB\x1c\n\x1a_head_pitch_threshold_highB\x19\n\x17_head_yaw_threshold_lowB\x1a\n\x18_head_yaw_threshold_highB\x18\n\x16_output_video_encoding\"\x83\x01\n\x13RedirectGazeRequest\x12\x41\n\x06\x63onfig\x18\x01 \x01(\x0b\x32/.nvidia.maxine.eyecontact.v1.RedirectGazeConfigH\x00\x12\x19\n\x0fvideo_file_data\x18\x02 \x01(\x0cH\x00\x42\x0e\n\x0cstream_input\"\xb2\x01\n\x14RedirectGazeResponse\x12\x41\n\x06\x63onfig\x18\x01 \x01(\x0b\x32/.nvidia.maxine.eyecontact.v1.RedirectGazeConfigH\x00\x12\x19\n\x0fvideo_file_data\x18\x02 \x01(\x0cH\x00\x12+\n\tkeepalive\x18\x03 \x01(\x0b\x32\x16.google.protobuf.EmptyH\x00\x42\x0f\n\rstream_output2\x94\x01\n\x17MaxineEyeContactService\x12y\n\x0cRedirectGaze\x12\x30.nvidia.maxine.eyecontact.v1.RedirectGazeRequest\x1a\x31.nvidia.maxine.eyecontact.v1.RedirectGazeResponse\"\x00(\x01\x30\x01\x62\x06proto3') 30 | 31 | _globals = globals() 32 | _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) 33 | _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'eyecontact_pb2', _globals) 34 | if not _descriptor._USE_C_DESCRIPTORS: 35 | DESCRIPTOR._loaded_options = None 36 | _globals['_CUSTOMENCODINGPARAMS_CUSTOMENTRY']._loaded_options = None 37 | _globals['_CUSTOMENCODINGPARAMS_CUSTOMENTRY']._serialized_options = b'8\001' 38 | _globals['_LOSSYENCODING']._serialized_start=105 39 | _globals['_LOSSYENCODING']._serialized_end=198 40 | _globals['_CUSTOMENCODINGPARAMS']._serialized_start=201 41 | _globals['_CUSTOMENCODINGPARAMS']._serialized_end=371 42 | _globals['_CUSTOMENCODINGPARAMS_CUSTOMENTRY']._serialized_start=304 43 | _globals['_CUSTOMENCODINGPARAMS_CUSTOMENTRY']._serialized_end=371 44 | _globals['_OUTPUTVIDEOENCODING']._serialized_start=374 45 | _globals['_OUTPUTVIDEOENCODING']._serialized_end=571 46 | _globals['_REDIRECTGAZECONFIG']._serialized_start=574 47 | _globals['_REDIRECTGAZECONFIG']._serialized_end=1622 48 | _globals['_REDIRECTGAZEREQUEST']._serialized_start=1625 49 | _globals['_REDIRECTGAZEREQUEST']._serialized_end=1756 50 | _globals['_REDIRECTGAZERESPONSE']._serialized_start=1759 51 | _globals['_REDIRECTGAZERESPONSE']._serialized_end=1937 52 | _globals['_MAXINEEYECONTACTSERVICE']._serialized_start=1940 53 | _globals['_MAXINEEYECONTACTSERVICE']._serialized_end=2088 54 | # @@protoc_insertion_point(module_scope) 55 | -------------------------------------------------------------------------------- /eye-contact/interfaces/eyecontact_pb2.pyi: -------------------------------------------------------------------------------- 1 | from google.protobuf import any_pb2 as _any_pb2 2 | from google.protobuf import empty_pb2 as _empty_pb2 3 | from google.protobuf.internal import containers as _containers 4 | from google.protobuf import descriptor as _descriptor 5 | from google.protobuf import message as _message 6 | from typing import ClassVar as _ClassVar, Mapping as _Mapping, Optional as _Optional, Union as _Union 7 | 8 | DESCRIPTOR: _descriptor.FileDescriptor 9 | 10 | class LossyEncoding(_message.Message): 11 | __slots__ = ("bitrate", "idr_interval") 12 | BITRATE_FIELD_NUMBER: _ClassVar[int] 13 | 
IDR_INTERVAL_FIELD_NUMBER: _ClassVar[int] 14 | bitrate: int 15 | idr_interval: int 16 | def __init__(self, bitrate: _Optional[int] = ..., idr_interval: _Optional[int] = ...) -> None: ... 17 | 18 | class CustomEncodingParams(_message.Message): 19 | __slots__ = ("custom",) 20 | class CustomEntry(_message.Message): 21 | __slots__ = ("key", "value") 22 | KEY_FIELD_NUMBER: _ClassVar[int] 23 | VALUE_FIELD_NUMBER: _ClassVar[int] 24 | key: str 25 | value: _any_pb2.Any 26 | def __init__(self, key: _Optional[str] = ..., value: _Optional[_Union[_any_pb2.Any, _Mapping]] = ...) -> None: ... 27 | CUSTOM_FIELD_NUMBER: _ClassVar[int] 28 | custom: _containers.MessageMap[str, _any_pb2.Any] 29 | def __init__(self, custom: _Optional[_Mapping[str, _any_pb2.Any]] = ...) -> None: ... 30 | 31 | class OutputVideoEncoding(_message.Message): 32 | __slots__ = ("lossless", "lossy", "custom_encoding") 33 | LOSSLESS_FIELD_NUMBER: _ClassVar[int] 34 | LOSSY_FIELD_NUMBER: _ClassVar[int] 35 | CUSTOM_ENCODING_FIELD_NUMBER: _ClassVar[int] 36 | lossless: bool 37 | lossy: LossyEncoding 38 | custom_encoding: CustomEncodingParams 39 | def __init__(self, lossless: bool = ..., lossy: _Optional[_Union[LossyEncoding, _Mapping]] = ..., custom_encoding: _Optional[_Union[CustomEncodingParams, _Mapping]] = ...) -> None: ... 40 | 41 | class RedirectGazeConfig(_message.Message): 42 | __slots__ = ("temporal", "detect_closure", "eye_size_sensitivity", "enable_lookaway", "lookaway_max_offset", "lookaway_interval_min", "lookaway_interval_range", "gaze_pitch_threshold_low", "gaze_pitch_threshold_high", "gaze_yaw_threshold_low", "gaze_yaw_threshold_high", "head_pitch_threshold_low", "head_pitch_threshold_high", "head_yaw_threshold_low", "head_yaw_threshold_high", "output_video_encoding") 43 | TEMPORAL_FIELD_NUMBER: _ClassVar[int] 44 | DETECT_CLOSURE_FIELD_NUMBER: _ClassVar[int] 45 | EYE_SIZE_SENSITIVITY_FIELD_NUMBER: _ClassVar[int] 46 | ENABLE_LOOKAWAY_FIELD_NUMBER: _ClassVar[int] 47 | LOOKAWAY_MAX_OFFSET_FIELD_NUMBER: _ClassVar[int] 48 | LOOKAWAY_INTERVAL_MIN_FIELD_NUMBER: _ClassVar[int] 49 | LOOKAWAY_INTERVAL_RANGE_FIELD_NUMBER: _ClassVar[int] 50 | GAZE_PITCH_THRESHOLD_LOW_FIELD_NUMBER: _ClassVar[int] 51 | GAZE_PITCH_THRESHOLD_HIGH_FIELD_NUMBER: _ClassVar[int] 52 | GAZE_YAW_THRESHOLD_LOW_FIELD_NUMBER: _ClassVar[int] 53 | GAZE_YAW_THRESHOLD_HIGH_FIELD_NUMBER: _ClassVar[int] 54 | HEAD_PITCH_THRESHOLD_LOW_FIELD_NUMBER: _ClassVar[int] 55 | HEAD_PITCH_THRESHOLD_HIGH_FIELD_NUMBER: _ClassVar[int] 56 | HEAD_YAW_THRESHOLD_LOW_FIELD_NUMBER: _ClassVar[int] 57 | HEAD_YAW_THRESHOLD_HIGH_FIELD_NUMBER: _ClassVar[int] 58 | OUTPUT_VIDEO_ENCODING_FIELD_NUMBER: _ClassVar[int] 59 | temporal: int 60 | detect_closure: int 61 | eye_size_sensitivity: int 62 | enable_lookaway: int 63 | lookaway_max_offset: int 64 | lookaway_interval_min: int 65 | lookaway_interval_range: int 66 | gaze_pitch_threshold_low: float 67 | gaze_pitch_threshold_high: float 68 | gaze_yaw_threshold_low: float 69 | gaze_yaw_threshold_high: float 70 | head_pitch_threshold_low: float 71 | head_pitch_threshold_high: float 72 | head_yaw_threshold_low: float 73 | head_yaw_threshold_high: float 74 | output_video_encoding: OutputVideoEncoding 75 | def __init__(self, temporal: _Optional[int] = ..., detect_closure: _Optional[int] = ..., eye_size_sensitivity: _Optional[int] = ..., enable_lookaway: _Optional[int] = ..., lookaway_max_offset: _Optional[int] = ..., lookaway_interval_min: _Optional[int] = ..., lookaway_interval_range: _Optional[int] = ..., gaze_pitch_threshold_low: _Optional[float] = ..., 
gaze_pitch_threshold_high: _Optional[float] = ..., gaze_yaw_threshold_low: _Optional[float] = ..., gaze_yaw_threshold_high: _Optional[float] = ..., head_pitch_threshold_low: _Optional[float] = ..., head_pitch_threshold_high: _Optional[float] = ..., head_yaw_threshold_low: _Optional[float] = ..., head_yaw_threshold_high: _Optional[float] = ..., output_video_encoding: _Optional[_Union[OutputVideoEncoding, _Mapping]] = ...) -> None: ... 76 | 77 | class RedirectGazeRequest(_message.Message): 78 | __slots__ = ("config", "video_file_data") 79 | CONFIG_FIELD_NUMBER: _ClassVar[int] 80 | VIDEO_FILE_DATA_FIELD_NUMBER: _ClassVar[int] 81 | config: RedirectGazeConfig 82 | video_file_data: bytes 83 | def __init__(self, config: _Optional[_Union[RedirectGazeConfig, _Mapping]] = ..., video_file_data: _Optional[bytes] = ...) -> None: ... 84 | 85 | class RedirectGazeResponse(_message.Message): 86 | __slots__ = ("config", "video_file_data", "keepalive") 87 | CONFIG_FIELD_NUMBER: _ClassVar[int] 88 | VIDEO_FILE_DATA_FIELD_NUMBER: _ClassVar[int] 89 | KEEPALIVE_FIELD_NUMBER: _ClassVar[int] 90 | config: RedirectGazeConfig 91 | video_file_data: bytes 92 | keepalive: _empty_pb2.Empty 93 | def __init__(self, config: _Optional[_Union[RedirectGazeConfig, _Mapping]] = ..., video_file_data: _Optional[bytes] = ..., keepalive: _Optional[_Union[_empty_pb2.Empty, _Mapping]] = ...) -> None: ... 94 | -------------------------------------------------------------------------------- /audio2face-2d/python/interfaces/audio2face2d_pb2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by the protocol buffer compiler. DO NOT EDIT! 3 | # NO CHECKED-IN PROTOBUF GENCODE 4 | # source: audio2face2d.proto 5 | # Protobuf Python Version: 5.27.2 6 | """Generated protocol buffer code.""" 7 | from google.protobuf import descriptor as _descriptor 8 | from google.protobuf import descriptor_pool as _descriptor_pool 9 | from google.protobuf import runtime_version as _runtime_version 10 | from google.protobuf import symbol_database as _symbol_database 11 | from google.protobuf.internal import builder as _builder 12 | _runtime_version.ValidateProtobufRuntimeVersion( 13 | _runtime_version.Domain.PUBLIC, 14 | 5, 15 | 27, 16 | 2, 17 | '', 18 | 'audio2face2d.proto' 19 | ) 20 | # @@protoc_insertion_point(imports) 21 | 22 | _sym_db = _symbol_database.Default() 23 | 24 | 25 | from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 26 | 27 | 28 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x12\x61udio2face2d.proto\x12\x1dnvidia.maxine.audio2face2d.v1\x1a\x1bgoogle/protobuf/empty.proto\"\x83\x08\n\rAnimateConfig\x12\x16\n\x0eportrait_image\x18\x01 \x01(\x0c\x12K\n\x0fmodel_selection\x18\x02 \x01(\x0e\x32-.nvidia.maxine.audio2face2d.v1.ModelSelectionH\x00\x88\x01\x01\x12V\n\x13\x61nimation_crop_mode\x18\x03 \x01(\x0e\x32\x34.nvidia.maxine.audio2face2d.v1.AnimationCroppingModeH\x01\x88\x01\x01\x12H\n\x0ehead_pose_mode\x18\x04 \x01(\x0e\x32+.nvidia.maxine.audio2face2d.v1.HeadPoseModeH\x02\x88\x01\x01\x12\x1c\n\x0f\x65nable_lookaway\x18\x05 \x01(\x08H\x03\x88\x01\x01\x12 \n\x13lookaway_max_offset\x18\x06 \x01(\rH\x04\x88\x01\x01\x12$\n\x17lookaway_interval_range\x18\x07 \x01(\rH\x05\x88\x01\x01\x12\"\n\x15lookaway_interval_min\x18\x08 \x01(\rH\x06\x88\x01\x01\x12\x1c\n\x0f\x62link_frequency\x18\t \x01(\rH\x07\x88\x01\x01\x12\x1b\n\x0e\x62link_duration\x18\n \x01(\rH\x08\x88\x01\x01\x12(\n\x1bmouth_expression_multiplier\x18\x0b 
\x01(\x02H\t\x88\x01\x01\x12!\n\x14head_pose_multiplier\x18\x0c \x01(\x02H\n\x88\x01\x01\x12Q\n\x13input_head_rotation\x18\r \x01(\x0b\x32/.nvidia.maxine.audio2face2d.v1.QuaternionStreamH\x0b\x88\x01\x01\x12R\n\x16input_head_translation\x18\x0e \x01(\x0b\x32-.nvidia.maxine.audio2face2d.v1.Vector3fStreamH\x0c\x88\x01\x01\x42\x12\n\x10_model_selectionB\x16\n\x14_animation_crop_modeB\x11\n\x0f_head_pose_modeB\x12\n\x10_enable_lookawayB\x16\n\x14_lookaway_max_offsetB\x1a\n\x18_lookaway_interval_rangeB\x18\n\x16_lookaway_interval_minB\x12\n\x10_blink_frequencyB\x11\n\x0f_blink_durationB\x1e\n\x1c_mouth_expression_multiplierB\x17\n\x15_head_pose_multiplierB\x16\n\x14_input_head_rotationB\x19\n\x17_input_head_translation\"+\n\x08Vector3f\x12\t\n\x01x\x18\x01 \x01(\x02\x12\t\n\x01y\x18\x02 \x01(\x02\x12\t\n\x01z\x18\x03 \x01(\x02\"I\n\x0eVector3fStream\x12\x37\n\x06values\x18\x01 \x03(\x0b\x32\'.nvidia.maxine.audio2face2d.v1.Vector3f\"8\n\nQuaternion\x12\t\n\x01x\x18\x01 \x01(\x02\x12\t\n\x01y\x18\x02 \x01(\x02\x12\t\n\x01z\x18\x03 \x01(\x02\x12\t\n\x01w\x18\x04 \x01(\x02\"M\n\x10QuaternionStream\x12\x39\n\x06values\x18\x01 \x03(\x0b\x32).nvidia.maxine.audio2face2d.v1.Quaternion\"{\n\x0e\x41nimateRequest\x12>\n\x06\x63onfig\x18\x01 \x01(\x0b\x32,.nvidia.maxine.audio2face2d.v1.AnimateConfigH\x00\x12\x19\n\x0f\x61udio_file_data\x18\x02 \x01(\x0cH\x00\x42\x0e\n\x0cstream_input\"\xab\x01\n\x0f\x41nimateResponse\x12>\n\x06\x63onfig\x18\x01 \x01(\x0b\x32,.nvidia.maxine.audio2face2d.v1.AnimateConfigH\x00\x12\x19\n\x0fvideo_file_data\x18\x02 \x01(\x0cH\x00\x12,\n\nkeep_alive\x18\x03 \x01(\x0b\x32\x16.google.protobuf.EmptyH\x00\x42\x0f\n\rstream_output*h\n\x0eModelSelection\x12\x1f\n\x1bMODEL_SELECTION_UNSPECIFIED\x10\x00\x12\x18\n\x14MODEL_SELECTION_PERF\x10\x01\x12\x1b\n\x17MODEL_SELECTION_QUALITY\x10\x02*\xc4\x01\n\x15\x41nimationCroppingMode\x12\'\n#ANIMATION_CROPPING_MODE_UNSPECIFIED\x10\x00\x12#\n\x1f\x41NIMATION_CROPPING_MODE_FACEBOX\x10\x01\x12\x31\n-ANIMATION_CROPPING_MODE_REGISTRATION_BLENDING\x10\x02\x12*\n&ANIMATION_CROPPING_MODE_INSET_BLENDING\x10\x03*\xb2\x01\n\x0cHeadPoseMode\x12\x1e\n\x1aHEAD_POSE_MODE_UNSPECIFIED\x10\x00\x12-\n)HEAD_POSE_MODE_RETAIN_FROM_PORTRAIT_IMAGE\x10\x01\x12(\n$HEAD_POSE_MODE_PRE_DEFINED_ANIMATION\x10\x02\x12)\n%HEAD_POSE_MODE_USER_DEFINED_ANIMATION\x10\x03\x32\x85\x01\n\x13\x41udio2Face2DService\x12n\n\x07\x41nimate\x12-.nvidia.maxine.audio2face2d.v1.AnimateRequest\x1a..nvidia.maxine.audio2face2d.v1.AnimateResponse\"\x00(\x01\x30\x01\x62\x06proto3') 29 | 30 | _globals = globals() 31 | _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) 32 | _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'audio2face2d_pb2', _globals) 33 | if not _descriptor._USE_C_DESCRIPTORS: 34 | DESCRIPTOR._loaded_options = None 35 | _globals['_MODELSELECTION']._serialized_start=1668 36 | _globals['_MODELSELECTION']._serialized_end=1772 37 | _globals['_ANIMATIONCROPPINGMODE']._serialized_start=1775 38 | _globals['_ANIMATIONCROPPINGMODE']._serialized_end=1971 39 | _globals['_HEADPOSEMODE']._serialized_start=1974 40 | _globals['_HEADPOSEMODE']._serialized_end=2152 41 | _globals['_ANIMATECONFIG']._serialized_start=83 42 | _globals['_ANIMATECONFIG']._serialized_end=1110 43 | _globals['_VECTOR3F']._serialized_start=1112 44 | _globals['_VECTOR3F']._serialized_end=1155 45 | _globals['_VECTOR3FSTREAM']._serialized_start=1157 46 | _globals['_VECTOR3FSTREAM']._serialized_end=1230 47 | _globals['_QUATERNION']._serialized_start=1232 48 | _globals['_QUATERNION']._serialized_end=1288 49 | 
_globals['_QUATERNIONSTREAM']._serialized_start=1290 50 | _globals['_QUATERNIONSTREAM']._serialized_end=1367 51 | _globals['_ANIMATEREQUEST']._serialized_start=1369 52 | _globals['_ANIMATEREQUEST']._serialized_end=1492 53 | _globals['_ANIMATERESPONSE']._serialized_start=1495 54 | _globals['_ANIMATERESPONSE']._serialized_end=1666 55 | _globals['_AUDIO2FACE2DSERVICE']._serialized_start=2155 56 | _globals['_AUDIO2FACE2DSERVICE']._serialized_end=2288 57 | # @@protoc_insertion_point(module_scope) 58 | -------------------------------------------------------------------------------- /audio2face-2d/assets/head_rotation_animation.csv: -------------------------------------------------------------------------------- 1 | 0.0000, 0.0000, 0.0000, 1.0000 2 | 0.0035, 0.0000, 0.0000, 1.0000 3 | 0.0070, 0.0000, 0.0000, 1.0000 4 | 0.0105, 0.0000, 0.0000, 0.9999 5 | 0.0140, 0.0000, 0.0000, 0.9999 6 | 0.0174, 0.0000, 0.0000, 0.9998 7 | 0.0209, 0.0000, 0.0000, 0.9998 8 | 0.0244, 0.0000, 0.0000, 0.9997 9 | 0.0279, 0.0000, 0.0000, 0.9996 10 | 0.0314, 0.0000, 0.0000, 0.9995 11 | 0.0349, 0.0000, 0.0000, 0.9994 12 | 0.0384, 0.0000, 0.0000, 0.9993 13 | 0.0419, 0.0000, 0.0000, 0.9991 14 | 0.0454, 0.0000, 0.0000, 0.9990 15 | 0.0488, 0.0000, 0.0000, 0.9988 16 | 0.0488, 0.0000, 0.0000, 0.9988 17 | 0.0454, 0.0000, 0.0000, 0.9990 18 | 0.0419, 0.0000, 0.0000, 0.9991 19 | 0.0384, 0.0000, 0.0000, 0.9993 20 | 0.0349, 0.0000, 0.0000, 0.9994 21 | 0.0314, 0.0000, 0.0000, 0.9995 22 | 0.0279, 0.0000, 0.0000, 0.9996 23 | 0.0244, 0.0000, 0.0000, 0.9997 24 | 0.0209, 0.0000, 0.0000, 0.9998 25 | 0.0174, 0.0000, 0.0000, 0.9998 26 | 0.0140, 0.0000, 0.0000, 0.9999 27 | 0.0105, 0.0000, 0.0000, 0.9999 28 | 0.0070, 0.0000, 0.0000, 1.0000 29 | 0.0035, 0.0000, 0.0000, 1.0000 30 | 0.0000, 0.0000, 0.0000, 1.0000 31 | 0.0000, 0.0000, 0.0000, 1.0000 32 | -0.0035, 0.0000, 0.0000, 1.0000 33 | -0.0070, 0.0000, 0.0000, 1.0000 34 | -0.0105, 0.0000, 0.0000, 0.9999 35 | -0.0140, 0.0000, 0.0000, 0.9999 36 | -0.0174, 0.0000, 0.0000, 0.9998 37 | -0.0209, 0.0000, 0.0000, 0.9998 38 | -0.0244, 0.0000, 0.0000, 0.9997 39 | -0.0279, 0.0000, 0.0000, 0.9996 40 | -0.0314, 0.0000, 0.0000, 0.9995 41 | -0.0349, 0.0000, 0.0000, 0.9994 42 | -0.0384, 0.0000, 0.0000, 0.9993 43 | -0.0419, 0.0000, 0.0000, 0.9991 44 | -0.0454, 0.0000, 0.0000, 0.9990 45 | -0.0488, 0.0000, 0.0000, 0.9988 46 | -0.0488, 0.0000, 0.0000, 0.9988 47 | -0.0454, 0.0000, 0.0000, 0.9990 48 | -0.0419, 0.0000, 0.0000, 0.9991 49 | -0.0384, 0.0000, 0.0000, 0.9993 50 | -0.0349, 0.0000, 0.0000, 0.9994 51 | -0.0314, 0.0000, 0.0000, 0.9995 52 | -0.0279, 0.0000, 0.0000, 0.9996 53 | -0.0244, 0.0000, 0.0000, 0.9997 54 | -0.0209, 0.0000, 0.0000, 0.9998 55 | -0.0174, 0.0000, 0.0000, 0.9998 56 | -0.0140, 0.0000, 0.0000, 0.9999 57 | -0.0105, 0.0000, 0.0000, 0.9999 58 | -0.0070, 0.0000, 0.0000, 1.0000 59 | -0.0035, 0.0000, 0.0000, 1.0000 60 | 0.0000, 0.0000, 0.0000, 1.0000 61 | 0.0000, 0.0000, 0.0000, 1.0000 62 | 0.0000, 0.0047, 0.0000, 1.0000 63 | 0.0000, 0.0093, 0.0000, 1.0000 64 | 0.0000, 0.0140, 0.0000, 0.9999 65 | 0.0000, 0.0186, 0.0000, 0.9998 66 | 0.0000, 0.0233, 0.0000, 0.9997 67 | 0.0000, 0.0279, 0.0000, 0.9996 68 | 0.0000, 0.0326, 0.0000, 0.9995 69 | 0.0000, 0.0372, 0.0000, 0.9993 70 | 0.0000, 0.0419, 0.0000, 0.9991 71 | 0.0000, 0.0465, 0.0000, 0.9989 72 | 0.0000, 0.0512, 0.0000, 0.9987 73 | 0.0000, 0.0558, 0.0000, 0.9984 74 | 0.0000, 0.0605, 0.0000, 0.9982 75 | 0.0000, 0.0651, 0.0000, 0.9979 76 | 0.0000, 0.0651, 0.0000, 0.9979 77 | 0.0000, 0.0605, 0.0000, 0.9982 78 | 0.0000, 0.0558, 0.0000, 0.9984 79 | 0.0000, 
0.0512, 0.0000, 0.9987 80 | 0.0000, 0.0465, 0.0000, 0.9989 81 | 0.0000, 0.0419, 0.0000, 0.9991 82 | 0.0000, 0.0372, 0.0000, 0.9993 83 | 0.0000, 0.0326, 0.0000, 0.9995 84 | 0.0000, 0.0279, 0.0000, 0.9996 85 | 0.0000, 0.0233, 0.0000, 0.9997 86 | 0.0000, 0.0186, 0.0000, 0.9998 87 | 0.0000, 0.0140, 0.0000, 0.9999 88 | 0.0000, 0.0093, 0.0000, 1.0000 89 | 0.0000, 0.0047, 0.0000, 1.0000 90 | 0.0000, 0.0000, 0.0000, 1.0000 91 | 0.0000, 0.0000, 0.0000, 1.0000 92 | 0.0000, -0.0047, 0.0000, 1.0000 93 | 0.0000, -0.0093, 0.0000, 1.0000 94 | 0.0000, -0.0140, 0.0000, 0.9999 95 | 0.0000, -0.0186, 0.0000, 0.9998 96 | 0.0000, -0.0233, 0.0000, 0.9997 97 | 0.0000, -0.0279, 0.0000, 0.9996 98 | 0.0000, -0.0326, 0.0000, 0.9995 99 | 0.0000, -0.0372, 0.0000, 0.9993 100 | 0.0000, -0.0419, 0.0000, 0.9991 101 | 0.0000, -0.0465, 0.0000, 0.9989 102 | 0.0000, -0.0512, 0.0000, 0.9987 103 | 0.0000, -0.0558, 0.0000, 0.9984 104 | 0.0000, -0.0605, 0.0000, 0.9982 105 | 0.0000, -0.0651, 0.0000, 0.9979 106 | 0.0000, -0.0651, 0.0000, 0.9979 107 | 0.0000, -0.0605, 0.0000, 0.9982 108 | 0.0000, -0.0558, 0.0000, 0.9984 109 | 0.0000, -0.0512, 0.0000, 0.9987 110 | 0.0000, -0.0465, 0.0000, 0.9989 111 | 0.0000, -0.0419, 0.0000, 0.9991 112 | 0.0000, -0.0372, 0.0000, 0.9993 113 | 0.0000, -0.0326, 0.0000, 0.9995 114 | 0.0000, -0.0279, 0.0000, 0.9996 115 | 0.0000, -0.0233, 0.0000, 0.9997 116 | 0.0000, -0.0186, 0.0000, 0.9998 117 | 0.0000, -0.0140, 0.0000, 0.9999 118 | 0.0000, -0.0093, 0.0000, 1.0000 119 | 0.0000, -0.0047, 0.0000, 1.0000 120 | 0.0000, 0.0000, 0.0000, 1.0000 121 | 0.0000, 0.0000, 0.0000, 1.0000 122 | 0.0000, 0.0000, 0.0029, 1.0000 123 | 0.0000, 0.0000, 0.0058, 1.0000 124 | 0.0000, 0.0000, 0.0087, 1.0000 125 | 0.0000, 0.0000, 0.0116, 0.9999 126 | 0.0000, 0.0000, 0.0145, 0.9999 127 | 0.0000, 0.0000, 0.0174, 0.9998 128 | 0.0000, 0.0000, 0.0204, 0.9998 129 | 0.0000, 0.0000, 0.0233, 0.9997 130 | 0.0000, 0.0000, 0.0262, 0.9997 131 | 0.0000, 0.0000, 0.0291, 0.9996 132 | 0.0000, 0.0000, 0.0320, 0.9995 133 | 0.0000, 0.0000, 0.0349, 0.9994 134 | 0.0000, 0.0000, 0.0378, 0.9993 135 | 0.0000, 0.0000, 0.0407, 0.9992 136 | 0.0000, 0.0000, 0.0407, 0.9992 137 | 0.0000, 0.0000, 0.0378, 0.9993 138 | 0.0000, 0.0000, 0.0349, 0.9994 139 | 0.0000, 0.0000, 0.0320, 0.9995 140 | 0.0000, 0.0000, 0.0291, 0.9996 141 | 0.0000, 0.0000, 0.0262, 0.9997 142 | 0.0000, 0.0000, 0.0233, 0.9997 143 | 0.0000, 0.0000, 0.0204, 0.9998 144 | 0.0000, 0.0000, 0.0174, 0.9998 145 | 0.0000, 0.0000, 0.0145, 0.9999 146 | 0.0000, 0.0000, 0.0116, 0.9999 147 | 0.0000, 0.0000, 0.0087, 1.0000 148 | 0.0000, 0.0000, 0.0058, 1.0000 149 | 0.0000, 0.0000, 0.0029, 1.0000 150 | 0.0000, 0.0000, 0.0000, 1.0000 151 | 0.0000, 0.0000, 0.0000, 1.0000 152 | 0.0000, 0.0000, -0.0029, 1.0000 153 | 0.0000, 0.0000, -0.0058, 1.0000 154 | 0.0000, 0.0000, -0.0087, 1.0000 155 | 0.0000, 0.0000, -0.0116, 0.9999 156 | 0.0000, 0.0000, -0.0145, 0.9999 157 | 0.0000, 0.0000, -0.0174, 0.9998 158 | 0.0000, 0.0000, -0.0204, 0.9998 159 | 0.0000, 0.0000, -0.0233, 0.9997 160 | 0.0000, 0.0000, -0.0262, 0.9997 161 | 0.0000, 0.0000, -0.0291, 0.9996 162 | 0.0000, 0.0000, -0.0320, 0.9995 163 | 0.0000, 0.0000, -0.0349, 0.9994 164 | 0.0000, 0.0000, -0.0378, 0.9993 165 | 0.0000, 0.0000, -0.0407, 0.9992 166 | 0.0000, 0.0000, -0.0407, 0.9992 167 | 0.0000, 0.0000, -0.0378, 0.9993 168 | 0.0000, 0.0000, -0.0349, 0.9994 169 | 0.0000, 0.0000, -0.0320, 0.9995 170 | 0.0000, 0.0000, -0.0291, 0.9996 171 | 0.0000, 0.0000, -0.0262, 0.9997 172 | 0.0000, 0.0000, -0.0233, 0.9997 173 | 0.0000, 0.0000, -0.0204, 0.9998 174 | 0.0000, 0.0000, 
-0.0174, 0.9998 175 | 0.0000, 0.0000, -0.0145, 0.9999 176 | 0.0000, 0.0000, -0.0116, 0.9999 177 | 0.0000, 0.0000, -0.0087, 1.0000 178 | 0.0000, 0.0000, -0.0058, 1.0000 179 | 0.0000, 0.0000, -0.0029, 1.0000 180 | 0.0000, 0.0000, 0.0000, 1.0000 181 | -------------------------------------------------------------------------------- /eye-contact/protos/proto/nvidia/maxine/eyecontact/v1/eyecontact.proto: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | // 3 | // NVIDIA CORPORATION and its licensors retain all intellectual property 4 | // and proprietary rights in and to this software, related documentation 5 | // and any modifications thereto. Any use, reproduction, disclosure or 6 | // distribution of this software and related documentation without an express 7 | // license agreement from NVIDIA CORPORATION is strictly prohibited. 8 | 9 | syntax = "proto3"; 10 | 11 | package nvidia.maxine.eyecontact.v1; 12 | 13 | import "google/protobuf/any.proto"; 14 | import "google/protobuf/empty.proto"; 15 | 16 | // The MaxineEyeContactService provides APIs to run the 17 | // Maxine Eye Contact feature. 18 | service MaxineEyeContactService { 19 | 20 | // RedirectGaze is a bidirectional streaming API to run the 21 | // Maxine Eye Contact feature on mp4 video files. 22 | // 23 | // The input message can contain RedirectGazeConfig or bytes. 24 | // In the beginning of the stream, a request with RedirectGazeConfig may 25 | // be sent to the server to set the feature's parameters. 26 | // The server will echo back a response with the config to signify that the 27 | // parameters were properly set. If not configured, default values will be 28 | // used for the feature's parameters. Any RedirectGazeConfig sent during 29 | // the middle of the stream will be ignored. 30 | // 31 | // After the optional configuration, the client streams the input mp4 file in 32 | // chunks in the input message and receives the output mp4 file in chunks in 33 | // the output message. 34 | // 35 | // The client should only pass one video file per API invocation and the 36 | // configuration, if set, is applied to only that invocation. 37 | rpc RedirectGaze(stream RedirectGazeRequest) 38 | returns (stream RedirectGazeResponse) {} 39 | } 40 | 41 | // LossyEncoding specifies parameters for lossy video compression 42 | message LossyEncoding { 43 | // Target bitrate for video encoding in bits per second 44 | optional uint32 bitrate = 1; 45 | // Interval between IDR frames (keyframes) in number of frames 46 | optional uint32 idr_interval = 2; 47 | } 48 | 49 | // CustomEncodingParams allows specifying custom encoding parameters 50 | message CustomEncodingParams { 51 | // Map of string key-value pairs for custom encoding configuration 52 | // Each value can be any protobuf message type using google.protobuf.Any 53 | map<string, google.protobuf.Any> custom = 1; 54 | } 55 | 56 | // OutputVideoEncoding specifies parameters for the output video encoding 57 | message OutputVideoEncoding { 58 | // Only one of these encoding types can be specified 59 | oneof encoding_type { 60 | // If true, use lossless encoding with no compression 61 | bool lossless = 1; 62 | // Use lossy encoding with configurable bitrate and keyframe settings 63 | LossyEncoding lossy = 2; 64 | // Use custom encoding parameters specified as key-value pairs 65 | CustomEncodingParams custom_encoding = 3; 66 | } 67 | } 68 | 69 | // Configuration for Maxine Eye Contact.
70 | message RedirectGazeConfig { 71 | // Flag to control temporal filtering 72 | // Default: 0xffffffff 73 | optional uint32 temporal = 1; 74 | 75 | // Flag to toggle detection of eye closure and occlusion on/off 76 | // Default: 0 | Range: [0, 1] 77 | optional uint32 detect_closure = 2; 78 | 79 | // Eye size sensitivity parameter 80 | // Default: 3 | Range: [2, 6] 81 | optional uint32 eye_size_sensitivity = 3; 82 | 83 | // Flag to toggle look away on/off. 84 | // Default: 0 | Range: [0, 1] 85 | optional uint32 enable_lookaway = 4; 86 | 87 | // Maximum value of gaze offset angle (degrees) during a random look away 88 | // Default: 5 | Range: [1, 10] 89 | optional uint32 lookaway_max_offset = 5; 90 | 91 | // Minimum limit for the number of frames at which random look away occurs 92 | // Default: 100 | Range: [1, 600] 93 | optional uint32 lookaway_interval_min = 6; 94 | 95 | // Range for picking the number of frames at which random look away occurs 96 | // Default: 250 | Range: [1, 600] 97 | optional uint32 lookaway_interval_range = 7; 98 | 99 | // Gaze pitch threshold (degrees) at which the redirection starts 100 | // transitioning 101 | // Default: 20 | Range: [10, 35] 102 | optional float gaze_pitch_threshold_low = 8; 103 | 104 | // Gaze pitch threshold (degrees) at which the redirection is equal to 105 | // estimated gaze 106 | // Default: 30 | Range: [10, 35] 107 | optional float gaze_pitch_threshold_high = 9; 108 | 109 | // Gaze yaw threshold (degrees) at which the redirection starts 110 | // transitioning 111 | // Default: 20 | Range: [10, 35] 112 | optional float gaze_yaw_threshold_low = 10; 113 | 114 | // Gaze yaw threshold (degrees) at which the redirection 115 | // is equal to estimated gaze 116 | // Default: 30 | Range: [10, 35] 117 | optional float gaze_yaw_threshold_high = 11; 118 | 119 | // Head pose pitch threshold (degrees) at which the redirection 120 | // starts transitioning away from camera towards estimated gaze 121 | // Default: 15 | Range: [10, 35] 122 | optional float head_pitch_threshold_low = 12; 123 | 124 | // Head pose pitch threshold (degrees) at which the redirection is 125 | // equal to estimated gaze 126 | // Default: 15 | Range: [10, 35] 127 | optional float head_pitch_threshold_high = 13; 128 | 129 | // Head pose yaw threshold (degrees) at which the redirection starts 130 | // transitioning 131 | // Default: 15 | Range: [10, 35] 132 | optional float head_yaw_threshold_low = 14; 133 | 134 | // Head pose yaw threshold (degrees) at which the redirection is equal 135 | // to estimated gaze 136 | // Default: 15 | Range: [10, 35] 137 | optional float head_yaw_threshold_high = 15; 138 | 139 | // Output video encoding parameters 140 | optional OutputVideoEncoding output_video_encoding = 16; 141 | } 142 | 143 | // Input message for RedirectGaze API. 144 | // May contain feature configuration or a chunk of input mp4 file data. 145 | message RedirectGazeRequest { 146 | oneof stream_input { 147 | // Configuration parameters for the request 148 | RedirectGazeConfig config = 1; 149 | 150 | // mp4 file based video data 151 | bytes video_file_data = 2; 152 | } 153 | } 154 | 155 | // Output message for RedirectGaze API. 156 | // May contain feature configuration, a chunk of output mp4 file data 157 | // or an empty message to keep the connection alive.
158 | message RedirectGazeResponse { 159 | oneof stream_output { 160 | // Configuration parameters used 161 | RedirectGazeConfig config = 1; 162 | 163 | // Output mp4 video stream data 164 | bytes video_file_data = 2; 165 | 166 | // Keep alive signaling flag 167 | google.protobuf.Empty keepalive = 3; 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /audio2face-2d/python/interfaces/audio2face2d_pb2.pyi: -------------------------------------------------------------------------------- 1 | from google.protobuf import empty_pb2 as _empty_pb2 2 | from google.protobuf.internal import containers as _containers 3 | from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper 4 | from google.protobuf import descriptor as _descriptor 5 | from google.protobuf import message as _message 6 | from typing import ClassVar as _ClassVar, Iterable as _Iterable, Mapping as _Mapping, Optional as _Optional, Union as _Union 7 | 8 | DESCRIPTOR: _descriptor.FileDescriptor 9 | 10 | class ModelSelection(int, metaclass=_enum_type_wrapper.EnumTypeWrapper): 11 | __slots__ = () 12 | MODEL_SELECTION_UNSPECIFIED: _ClassVar[ModelSelection] 13 | MODEL_SELECTION_PERF: _ClassVar[ModelSelection] 14 | MODEL_SELECTION_QUALITY: _ClassVar[ModelSelection] 15 | 16 | class AnimationCroppingMode(int, metaclass=_enum_type_wrapper.EnumTypeWrapper): 17 | __slots__ = () 18 | ANIMATION_CROPPING_MODE_UNSPECIFIED: _ClassVar[AnimationCroppingMode] 19 | ANIMATION_CROPPING_MODE_FACEBOX: _ClassVar[AnimationCroppingMode] 20 | ANIMATION_CROPPING_MODE_REGISTRATION_BLENDING: _ClassVar[AnimationCroppingMode] 21 | ANIMATION_CROPPING_MODE_INSET_BLENDING: _ClassVar[AnimationCroppingMode] 22 | 23 | class HeadPoseMode(int, metaclass=_enum_type_wrapper.EnumTypeWrapper): 24 | __slots__ = () 25 | HEAD_POSE_MODE_UNSPECIFIED: _ClassVar[HeadPoseMode] 26 | HEAD_POSE_MODE_RETAIN_FROM_PORTRAIT_IMAGE: _ClassVar[HeadPoseMode] 27 | HEAD_POSE_MODE_PRE_DEFINED_ANIMATION: _ClassVar[HeadPoseMode] 28 | HEAD_POSE_MODE_USER_DEFINED_ANIMATION: _ClassVar[HeadPoseMode] 29 | MODEL_SELECTION_UNSPECIFIED: ModelSelection 30 | MODEL_SELECTION_PERF: ModelSelection 31 | MODEL_SELECTION_QUALITY: ModelSelection 32 | ANIMATION_CROPPING_MODE_UNSPECIFIED: AnimationCroppingMode 33 | ANIMATION_CROPPING_MODE_FACEBOX: AnimationCroppingMode 34 | ANIMATION_CROPPING_MODE_REGISTRATION_BLENDING: AnimationCroppingMode 35 | ANIMATION_CROPPING_MODE_INSET_BLENDING: AnimationCroppingMode 36 | HEAD_POSE_MODE_UNSPECIFIED: HeadPoseMode 37 | HEAD_POSE_MODE_RETAIN_FROM_PORTRAIT_IMAGE: HeadPoseMode 38 | HEAD_POSE_MODE_PRE_DEFINED_ANIMATION: HeadPoseMode 39 | HEAD_POSE_MODE_USER_DEFINED_ANIMATION: HeadPoseMode 40 | 41 | class AnimateConfig(_message.Message): 42 | __slots__ = ("portrait_image", "model_selection", "animation_crop_mode", "head_pose_mode", "enable_lookaway", "lookaway_max_offset", "lookaway_interval_range", "lookaway_interval_min", "blink_frequency", "blink_duration", "mouth_expression_multiplier", "head_pose_multiplier", "input_head_rotation", "input_head_translation") 43 | PORTRAIT_IMAGE_FIELD_NUMBER: _ClassVar[int] 44 | MODEL_SELECTION_FIELD_NUMBER: _ClassVar[int] 45 | ANIMATION_CROP_MODE_FIELD_NUMBER: _ClassVar[int] 46 | HEAD_POSE_MODE_FIELD_NUMBER: _ClassVar[int] 47 | ENABLE_LOOKAWAY_FIELD_NUMBER: _ClassVar[int] 48 | LOOKAWAY_MAX_OFFSET_FIELD_NUMBER: _ClassVar[int] 49 | LOOKAWAY_INTERVAL_RANGE_FIELD_NUMBER: _ClassVar[int] 50 | LOOKAWAY_INTERVAL_MIN_FIELD_NUMBER: _ClassVar[int] 51 | BLINK_FREQUENCY_FIELD_NUMBER: 
_ClassVar[int] 52 | BLINK_DURATION_FIELD_NUMBER: _ClassVar[int] 53 | MOUTH_EXPRESSION_MULTIPLIER_FIELD_NUMBER: _ClassVar[int] 54 | HEAD_POSE_MULTIPLIER_FIELD_NUMBER: _ClassVar[int] 55 | INPUT_HEAD_ROTATION_FIELD_NUMBER: _ClassVar[int] 56 | INPUT_HEAD_TRANSLATION_FIELD_NUMBER: _ClassVar[int] 57 | portrait_image: bytes 58 | model_selection: ModelSelection 59 | animation_crop_mode: AnimationCroppingMode 60 | head_pose_mode: HeadPoseMode 61 | enable_lookaway: bool 62 | lookaway_max_offset: int 63 | lookaway_interval_range: int 64 | lookaway_interval_min: int 65 | blink_frequency: int 66 | blink_duration: int 67 | mouth_expression_multiplier: float 68 | head_pose_multiplier: float 69 | input_head_rotation: QuaternionStream 70 | input_head_translation: Vector3fStream 71 | def __init__(self, portrait_image: _Optional[bytes] = ..., model_selection: _Optional[_Union[ModelSelection, str]] = ..., animation_crop_mode: _Optional[_Union[AnimationCroppingMode, str]] = ..., head_pose_mode: _Optional[_Union[HeadPoseMode, str]] = ..., enable_lookaway: bool = ..., lookaway_max_offset: _Optional[int] = ..., lookaway_interval_range: _Optional[int] = ..., lookaway_interval_min: _Optional[int] = ..., blink_frequency: _Optional[int] = ..., blink_duration: _Optional[int] = ..., mouth_expression_multiplier: _Optional[float] = ..., head_pose_multiplier: _Optional[float] = ..., input_head_rotation: _Optional[_Union[QuaternionStream, _Mapping]] = ..., input_head_translation: _Optional[_Union[Vector3fStream, _Mapping]] = ...) -> None: ... 72 | 73 | class Vector3f(_message.Message): 74 | __slots__ = ("x", "y", "z") 75 | X_FIELD_NUMBER: _ClassVar[int] 76 | Y_FIELD_NUMBER: _ClassVar[int] 77 | Z_FIELD_NUMBER: _ClassVar[int] 78 | x: float 79 | y: float 80 | z: float 81 | def __init__(self, x: _Optional[float] = ..., y: _Optional[float] = ..., z: _Optional[float] = ...) -> None: ... 82 | 83 | class Vector3fStream(_message.Message): 84 | __slots__ = ("values",) 85 | VALUES_FIELD_NUMBER: _ClassVar[int] 86 | values: _containers.RepeatedCompositeFieldContainer[Vector3f] 87 | def __init__(self, values: _Optional[_Iterable[_Union[Vector3f, _Mapping]]] = ...) -> None: ... 88 | 89 | class Quaternion(_message.Message): 90 | __slots__ = ("x", "y", "z", "w") 91 | X_FIELD_NUMBER: _ClassVar[int] 92 | Y_FIELD_NUMBER: _ClassVar[int] 93 | Z_FIELD_NUMBER: _ClassVar[int] 94 | W_FIELD_NUMBER: _ClassVar[int] 95 | x: float 96 | y: float 97 | z: float 98 | w: float 99 | def __init__(self, x: _Optional[float] = ..., y: _Optional[float] = ..., z: _Optional[float] = ..., w: _Optional[float] = ...) -> None: ... 100 | 101 | class QuaternionStream(_message.Message): 102 | __slots__ = ("values",) 103 | VALUES_FIELD_NUMBER: _ClassVar[int] 104 | values: _containers.RepeatedCompositeFieldContainer[Quaternion] 105 | def __init__(self, values: _Optional[_Iterable[_Union[Quaternion, _Mapping]]] = ...) -> None: ... 106 | 107 | class AnimateRequest(_message.Message): 108 | __slots__ = ("config", "audio_file_data") 109 | CONFIG_FIELD_NUMBER: _ClassVar[int] 110 | AUDIO_FILE_DATA_FIELD_NUMBER: _ClassVar[int] 111 | config: AnimateConfig 112 | audio_file_data: bytes 113 | def __init__(self, config: _Optional[_Union[AnimateConfig, _Mapping]] = ..., audio_file_data: _Optional[bytes] = ...) -> None: ... 
114 | 115 | class AnimateResponse(_message.Message): 116 | __slots__ = ("config", "video_file_data", "keep_alive") 117 | CONFIG_FIELD_NUMBER: _ClassVar[int] 118 | VIDEO_FILE_DATA_FIELD_NUMBER: _ClassVar[int] 119 | KEEP_ALIVE_FIELD_NUMBER: _ClassVar[int] 120 | config: AnimateConfig 121 | video_file_data: bytes 122 | keep_alive: _empty_pb2.Empty 123 | def __init__(self, config: _Optional[_Union[AnimateConfig, _Mapping]] = ..., video_file_data: _Optional[bytes] = ..., keep_alive: _Optional[_Union[_empty_pb2.Empty, _Mapping]] = ...) -> None: ... 124 | -------------------------------------------------------------------------------- /audio2face-2d/README.md: -------------------------------------------------------------------------------- 1 | 2 | # NVIDIA Maxine Audio2Face-2D NIM Client 3 | 4 | This package has a sample client which demonstrates interaction with a Maxine Audio2Face-2D NIM. 5 | 6 | ## Getting Started 7 | 8 | NVIDIA Maxine NIM Client packages use gRPC APIs. Instructions below demonstrate usage of Audio2Face-2D NIM using Python and NodeJS gRPC clients. 9 | 10 | ## Pre-requisites 11 | 12 | Access to NVIDIA Maxine Audio2Face-2D NIM Container / Service 13 | 14 | ### Python 15 | - Ensure you have Python 3.10 or above installed on your system. Please refer to the [Python documentation](https://www.python.org/downloads/) for download and installation instructions. 16 | 17 | ### NodeJS 18 | - Ensure you have NodeJS 18 or above installed on your system. Please refer to the [NodeJS documentation](https://nodejs.org/en/download/package-manager) for download and installation instructions. 19 | 20 | ## Usage guide 21 | 22 | ### 1. Clone the repository 23 | 24 | ```bash 25 | git clone https://github.com/nvidia-maxine/nim-clients.git 26 | 27 | # Go to the 'audio2face-2d' folder 28 | cd nim-clients/audio2face-2d/ 29 | ``` 30 | 31 | ### 2. Install dependencies 32 | #### Python 33 | ```bash 34 | # Install all the required packages using requirements.txt file in python directory 35 | pip install -r python/requirements.txt 36 | ``` 37 | 38 | #### NodeJS 39 | ```bash 40 | # Install all the required packages using package.json file in nodejs directory 41 | npm install --prefix nodejs/ 42 | ``` 43 | 44 | ### 3. Compile the Protos (optional) 45 | 46 | If you want to use the client code provided in the github Client repository, you can skip this step. 47 | The proto files are available in the audio2face-2d/protos folder. You can compile them to generate client interfaces in your preferred programming language. For more details, refer to [Supported languages](https://grpc.io/docs/languages/) in the gRPC documentation. 48 | 49 | Here is an example of how to compile the protos for Python and Node.js on Linux and Windows. 50 | 51 | #### Python 52 | 53 | The `grpcio` version needed for compilation can be referred at `requirements.txt` 54 | 55 | To compile protos on Linux, run: 56 | ```bash 57 | # Go to audio2face-2d/protos/linux/python folder 58 | cd audio2face-2d/protos/linux/python 59 | 60 | chmod +x compile_protos.sh 61 | ./compile_protos.sh 62 | ``` 63 | 64 | To compile protos on Windows, run: 65 | ```bash 66 | # Go to audio2face-2d/protos/windows/python folder 67 | cd audio2face-2d/protos/windows/python 68 | 69 | ./compile_protos.bat 70 | ``` 71 | The compiled proto files will be generated in `nim-clients/audio2face-2d/python/interfaces` directory. 72 | 73 | #### NodeJS 74 | Before running the NodeJS client, you can choose to compile the protos. 
75 | 76 | To compile protos on Linux, run: 77 | ```bash 78 | # Go to audio2face-2d/protos/linux/nodejs folder 79 | cd audio2face-2d/protos/linux/nodejs 80 | 81 | chmod +x compile_protos.sh 82 | ./compile_protos.sh 83 | ``` 84 | 85 | To compile protos on Windows, run: 86 | ```bash 87 | # Go to audio2face-2d/protos/windows/nodejs folder 88 | cd audio2face-2d/protos/windows/nodejs 89 | 90 | ./compile_protos.bat 91 | ``` 92 | The compiled proto files will be generated in the `nim-clients/audio2face-2d/nodejs/interfaces` directory. 93 | 94 | ### 4. Host the NIM Server 95 | 96 | Before running the client part of Maxine Audio2Face-2D, please set up a server. 97 | The simplest way to do that is to follow the [quick start guide](https://docs.nvidia.com/nim/maxine/audio2face-2d/latest/getting-started.html). 98 | 99 | ### 5. Run the Client 100 | #### Python 101 | - Go to the scripts directory 102 | 103 | ```bash 104 | cd scripts 105 | ``` 106 | 107 | #### Usage for Hosted NIM request 108 | 109 | ```bash 110 | python audio2face-2d.py \ 111 | --target <server_ip:port> \ 112 | --audio-input <input audio file path> \ 113 | --portrait-input <input portrait image path> \ 114 | --output <output video file path> \ 115 | --head-rotation-animation-filepath <head rotation animation csv path> \ 116 | --head-translation-animation-filepath <head translation animation csv path> \ 117 | --ssl-mode <ssl mode> \ 118 | --ssl-key <ssl key file path> \ 119 | --ssl-cert <ssl cert file path> \ 120 | --ssl-root-cert <ssl root cert file path> 121 | ``` 122 | 123 | To view details of command line arguments, run this command: 124 | ```bash 125 | python audio2face-2d.py -h 126 | ``` 127 | 128 | - Example command to process the packaged sample inputs 129 | 130 | The following command uses the sample audio and portrait files and generates an `out.mp4` file in the current folder: 131 | 132 | ```bash 133 | python audio2face-2d.py --target 127.0.0.1:8001 --audio-input ../assets/sample_audio.wav --portrait-input ../assets/sample_portrait_image.png --output out.mp4 134 | ``` 135 | 136 | #### NodeJS 137 | - Go to the scripts directory 138 | 139 | ```bash 140 | cd scripts 141 | ``` 142 | 143 | #### Usage for Hosted NIM request 144 | 145 | ```bash 146 | node audio2face-2d.js \ 147 | --target <server_ip:port> \ 148 | --audio-input <input audio file path> \ 149 | --portrait-input <input portrait image path> \ 150 | --output <output video file path> \ 151 | --format <audio format: wav or pcm> \ 152 | --head-rotation-animation-filepath <head rotation animation csv path> \ 153 | --head-translation-animation-filepath <head translation animation csv path> \ 154 | --ssl-mode <ssl mode> \ 155 | --ssl-key <ssl key file path> \ 156 | --ssl-cert <ssl cert file path> \ 157 | --ssl-root-cert <ssl root cert file path> 158 | ``` 159 | 160 | - Example command to process the packaged sample inputs 161 | 162 | The following command uses the sample audio and portrait files and generates an `out.mp4` file in the current folder: 163 | 164 | ```bash 165 | node audio2face-2d.js --target 127.0.0.1:8001 --audio-input ../assets/sample_audio.wav --portrait-input ../assets/sample_portrait_image.png --output out.mp4 --format wav 166 | ``` 167 | 168 | The NodeJS client supports both `wav` and `pcm` audio formats. The `--format` option can be used to specify the format. The default format is `wav`. 169 | 170 | The default configuration expected for PCM audio format in the NodeJS client is as follows: 171 | 172 | - Sample rate: 48kHz 173 | - Channels: Mono-channel 174 | - Bit Depth: 16 175 | 176 | If any other config is needed, please change it in the NodeJS client `audio2face-2d/nodejs/scripts/audio2face-2d.js` in the function `sendInputAudioChunks()`. 177 | 178 | #### Note 179 | - The supported audio file formats are `wav` and `pcm`; the supported image formats are `jpg`, `jpeg`, and `png`. 180 | - The supported languages are English, Spanish, Mandarin, and French. A sample file for English is provided in the assets directory.
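For orientation, the following is a minimal, hypothetical Python sketch of the gRPC flow that the packaged `audio2face-2d.py` client wraps, based on the `Audio2Face2DService.Animate` API defined in the proto: the first `AnimateRequest` carries an `AnimateConfig` with the mandatory `portrait_image`, the remaining requests stream the wav file in chunks, and `video_file_data` chunks from the responses are written to disk. The module names assume the generated interfaces in `python/interfaces` are importable; the chunk size, file names, and the insecure local channel are illustrative assumptions, not part of the shipped client.

```python
# Illustrative sketch only; audio2face-2d/python/scripts/audio2face-2d.py is the full client.
import grpc

import audio2face2d_pb2
import audio2face2d_pb2_grpc

CHUNK_SIZE = 1024 * 1024  # illustrative chunk size for streaming the wav file


def request_stream(portrait_path: str, audio_path: str):
    """Yield the mandatory config first, then the audio file in chunks."""
    with open(portrait_path, "rb") as f:
        config = audio2face2d_pb2.AnimateConfig(portrait_image=f.read())
    yield audio2face2d_pb2.AnimateRequest(config=config)
    with open(audio_path, "rb") as f:
        while chunk := f.read(CHUNK_SIZE):
            yield audio2face2d_pb2.AnimateRequest(audio_file_data=chunk)


with grpc.insecure_channel("127.0.0.1:8001") as channel:
    stub = audio2face2d_pb2_grpc.Audio2Face2DServiceStub(channel)
    with open("out.mp4", "wb") as out:
        for response in stub.Animate(request_stream("portrait.png", "audio.wav")):
            if response.HasField("video_file_data"):
                out.write(response.video_file_data)
            # The config echo and keep_alive messages are ignored here.
```

In practice, prefer the packaged client, which also handles the SSL modes, head pose animation inputs, and error reporting described in this README.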
181 | 182 | #### Command line arguments 183 | 184 | - `-h, --help` show this help message and exit 185 | - `--target` defaults to `127.0.0.1:8001` 186 | - `--portrait-input` defaults to `../../assets/sample_portrait_image.png` 187 | - `--audio-input` defaults to `../../assets/sample_audio.wav` 188 | - `--output` defaults to the current directory, where the output file will be generated with the name `output.mp4` 189 | - `--head-rotation-animation-filepath` defaults to `../../assets/head_rotation_animation.csv`. Used only if head_pose_mode is `HeadPoseMode.HEAD_POSE_MODE_USER_DEFINED_ANIMATION`. 190 | - `--head-translation-animation-filepath` defaults to `../../assets/head_translation_animation.csv`. Used only if head_pose_mode is `HeadPoseMode.HEAD_POSE_MODE_USER_DEFINED_ANIMATION`. 191 | - `--ssl-mode` defaults to DISABLED (no SSL). 192 | - `--ssl-key` defaults to `../ssl_key/ssl_key_client.pem`. Used only if ssl-mode is `MTLS`. 193 | - `--ssl-cert` defaults to `../ssl_key/ssl_cert_client.pem`. Used only if ssl-mode is `MTLS`. 194 | - `--ssl-root-cert` defaults to `../ssl_key/ssl_ca_cert.pem`. Used only if ssl-mode is `MTLS` or `TLS`. 195 | 196 | Only for NodeJS 197 | 198 | - `--format` - The audio format (wav or pcm) 199 | 200 | Refer to the [docs](https://docs.nvidia.com/nim/maxine/audio2face-2d/latest/index.html) for more information. 201 | -------------------------------------------------------------------------------- /audio2face-2d/protos/proto/nvidia/maxine/audio2face2d/v1/audio2face2d.proto: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a 4 | // copy of this software and associated documentation files (the "Software"), 5 | // to deal in the Software without restriction, including without limitation 6 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | // and/or sell copies of the Software, and to permit persons to whom the 8 | // Software is furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | // DEALINGS IN THE SOFTWARE. 20 | 21 | 22 | syntax = "proto3"; 23 | 24 | package nvidia.maxine.audio2face2d.v1; 25 | 26 | import "google/protobuf/empty.proto"; 27 | 28 | // The Audio2Face2DService provides APIs to run the 29 | // Maxine Audio to Face - 2D feature. 30 | service Audio2Face2DService { 31 | // Animate is a bidirectional streaming API to run the 32 | // Audio2Face-2D. 33 | // 34 | // The input message can contain AnimateConfig or bytes. 35 | // In the beginning of the stream, a request with AnimateConfig should 36 | // be sent to the server to set the feature's parameters. 37 | // The server will echo back a response with the config to signify that the 38 | // parameters were properly set.
It is mandatory to set the portrait_image 39 | // config, other configuration parameters are optional and a default value will 40 | // be used if not set. Any AnimateConfig sent during the middle of the stream 41 | // will be ignored. 42 | // 43 | // After the configuration step, the client streams the input wav file in 44 | // chunks in the input message and receives the output mp4 file in chunks in 45 | // the output message. While the inference is running, the server will periodically 46 | // echo empty message to keep the channel alive. The client should ignore this message. 47 | // 48 | // It is recommended that the client should pass one file per API invocation. 49 | // The configurations are also set per invocation. 50 | rpc Animate(stream AnimateRequest) 51 | returns (stream AnimateResponse) { 52 | } 53 | } 54 | 55 | // Configuration for Animate API. 56 | message AnimateConfig { 57 | // Portrait image (jpg/jpeg/png) 58 | bytes portrait_image = 1; 59 | 60 | // Model selection: 0 - performance or 1 - quality 61 | // Default: quality 62 | optional ModelSelection model_selection = 2; 63 | 64 | // Audio2Face animation cropping mode 65 | // Default: ANIMATION_CROPPING_MODE_REGISTRATION_BLENDING 66 | optional AnimationCroppingMode animation_crop_mode = 3; 67 | 68 | // Head Pose Animation mode 69 | // Default: HEAD_POSE_MODE_RETAIN_FROM_PORTRAIT_IMAGE 70 | optional HeadPoseMode head_pose_mode = 4; 71 | 72 | // Flag to enable Gaze look Away 73 | // Default: false 74 | optional bool enable_lookaway = 5; 75 | 76 | // The maximum integer value of gaze offset when lookaway is enabled 77 | // Default:20 Unit: Degrees 78 | optional uint32 lookaway_max_offset = 6; 79 | 80 | // Range for picking the number of frames at which random look away occurs 81 | // Default: 90 | Range: [1, 600] | Unit: Frames 82 | optional uint32 lookaway_interval_range = 7; 83 | 84 | // Minimum limit for the number of frames at which random look away occurs 85 | // Default: 240 | Range: [1, 600] | Unit: Frames 86 | optional uint32 lookaway_interval_min = 8; 87 | 88 | // The frequency of eye blinks per minute 89 | // Default: 6 | Range: [0, 120] | Unit: Frames 90 | // Note: 0 = disable eye blink 91 | optional uint32 blink_frequency = 9; 92 | 93 | // The duration of an eye blink 94 | // Default: 10 | Range: [2, 150] | Unit: Frames 95 | optional uint32 blink_duration = 10; 96 | 97 | // A multiplier to exaggerate the mouth expression. 98 | // Default: 1.4f (for quality mode), 1.0f (for performance mode) 99 | // Range: [1.0f, 2.0f] 100 | optional float mouth_expression_multiplier = 11; 101 | 102 | // A multiplier to dampen range of Head Pose Animation 103 | // This is applicable only for HEAD_POSE_MODE_PRE_DEFINED_ANIMATION 104 | // Default: 1.0f (quality mode), 0.4f (performance mode) | Range: [0.0f, 1.0f] 105 | optional float head_pose_multiplier = 12; 106 | 107 | // Quaternion that provides the head pose rotation to be applied. 108 | // This is valid only for HEAD_POSE_MODE_USER_DEFINED_ANIMATION 109 | optional QuaternionStream input_head_rotation = 13; 110 | 111 | // Vector3f that provides the head pose rotation to be applied. 
112 | // This is valid only for HEAD_POSE_MODE_USER_DEFINED_ANIMATION 113 | optional Vector3fStream input_head_translation = 14; 114 | } 115 | 116 | // Model selection option 117 | enum ModelSelection { 118 | MODEL_SELECTION_UNSPECIFIED = 0; 119 | // Performance model 120 | MODEL_SELECTION_PERF = 1; 121 | // Quality model 122 | MODEL_SELECTION_QUALITY = 2; 123 | } 124 | 125 | // Animation cropping mode which controls output video resolution 126 | enum AnimationCroppingMode { 127 | ANIMATION_CROPPING_MODE_UNSPECIFIED = 0; 128 | 129 | // Produces fixed resolution of 512x512 animation output 130 | // Face crop will be extracted from the portrait image provided 131 | ANIMATION_CROPPING_MODE_FACEBOX = 1; 132 | 133 | // The animated face crop will be registered and blended back into the portrait photo. 134 | // The output image includes both the animated 135 | // face crop and the surrounding area, with the same resolution as the portrait photo 136 | ANIMATION_CROPPING_MODE_REGISTRATION_BLENDING = 2; 137 | 138 | // Light weight and faster version of mode 2, without registration. 139 | // Preferred over mode 3 if quality is the primary concern 140 | ANIMATION_CROPPING_MODE_INSET_BLENDING = 3; 141 | } 142 | 143 | // Head Pose mode 144 | enum HeadPoseMode{ 145 | HEAD_POSE_MODE_UNSPECIFIED = 0; 146 | // retains the head pose from input portrait image 147 | HEAD_POSE_MODE_RETAIN_FROM_PORTRAIT_IMAGE = 1; 148 | // NIM generates a pre-defined animation for the head pose 149 | HEAD_POSE_MODE_PRE_DEFINED_ANIMATION = 2; 150 | // NIM generates headpose animation based on headpose_inputs provided by user 151 | HEAD_POSE_MODE_USER_DEFINED_ANIMATION = 3; 152 | } 153 | 154 | // Generic 3D float vector 155 | message Vector3f { 156 | // x-coordinate 157 | float x = 1; 158 | // y-coordinate 159 | float y = 2; 160 | // z-coordinate 161 | float z = 3; 162 | } 163 | 164 | // Stream of 3D-Vectors 165 | message Vector3fStream{ 166 | repeated Vector3f values = 1; 167 | } 168 | 169 | // Generic Quaternion 170 | message Quaternion { 171 | // x-coordinate 172 | float x = 1; 173 | // y-coordinate 174 | float y = 2; 175 | // z-coordinate 176 | float z = 3; 177 | // w-coordinate 178 | float w = 4; 179 | } 180 | 181 | // Stream of Quaternions 182 | message QuaternionStream{ 183 | repeated Quaternion values = 1; 184 | } 185 | 186 | // Input message for Animate API. 187 | // May contain feature configuration or a chunk of input wav file data. 188 | message AnimateRequest { 189 | oneof stream_input { 190 | // Configuration parameters for the request 191 | AnimateConfig config = 1; 192 | 193 | // .wav file based audio data 194 | bytes audio_file_data = 2; 195 | } 196 | } 197 | 198 | // Output message for Animate API. 199 | // May contain feature configuration, a chunk of output mp4 file data 200 | // or an empty message to keep the connection alive. 201 | message AnimateResponse { 202 | oneof stream_output { 203 | // Configuration parameters used 204 | AnimateConfig config = 1; 205 | 206 | // Output .mp4 video stream data 207 | bytes video_file_data = 2; 208 | 209 | // Keep alive signaling flag 210 | google.protobuf.Empty keep_alive = 3; 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /eye-contact/scripts/eye-contact.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a 4 | # copy of this software and associated documentation files (the "Software"), 5 | # to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | # and/or sell copies of the Software, and to permit persons to whom the 8 | # Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 20 | 21 | """Main script for running Eye Contact inference with video files. 22 | 23 | This script provides functionality to: 24 | - Parse command line arguments for configuring Eye Contact 25 | - Set up gRPC communication with the Eye Contact service 26 | - Send video data to the service with streaming support 27 | - Process responses and write output video files 28 | 29 | The script supports different SSL modes for secure communication and handles 30 | various input/output formats and configurations. 31 | """ 32 | 33 | # Standard library imports 34 | import os 35 | import sys 36 | import time 37 | from typing import Iterator 38 | import pathlib 39 | 40 | # Third-party imports 41 | import grpc 42 | from tqdm import tqdm 43 | 44 | # Setup paths for local imports 45 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) 46 | SCRIPT_PATH = str(pathlib.Path(__file__).parent.resolve()) 47 | sys.path.append(os.path.join(SCRIPT_PATH, "../interfaces")) 48 | 49 | # Local imports 50 | from config import EyeContactConfig, parse_args # noqa: E402 51 | from constants import DATA_CHUNK_SIZE # noqa: E402 52 | from utils.utils import ( # noqa: E402 53 | create_channel_credentials, 54 | validate_ssl_args, 55 | validate_preview_args, 56 | create_request_metadata, 57 | ) 58 | import eyecontact_pb2 # noqa: E402 59 | import eyecontact_pb2_grpc # noqa: E402 60 | 61 | 62 | def generate_request_for_inference( 63 | eyecontact_config: EyeContactConfig, 64 | ) -> Iterator[eyecontact_pb2.RedirectGazeRequest]: 65 | """Generate a stream of RedirectGazeRequest messages for the Eye Contact service. 
66 | 67 | Args: 68 | eyecontact_config: Configuration object containing all Eye Contact 69 | parameters 70 | 71 | Yields: 72 | RedirectGazeRequest messages containing either configuration or chunks 73 | of input data 74 | 75 | Raises: 76 | RuntimeError: If there are errors reading input files 77 | """ 78 | print("Generating request for inference") 79 | 80 | # Get configuration parameters 81 | params = eyecontact_config.get_config_params() 82 | 83 | print("Sending data for inference") 84 | 85 | # Send config first 86 | yield eyecontact_pb2.RedirectGazeRequest(config=eyecontact_pb2.RedirectGazeConfig(**params)) 87 | 88 | # Send video data in chunks 89 | video_chunk_counter = 0 90 | 91 | try: 92 | with open(eyecontact_config.video_filepath, "rb") as video_file: 93 | while True: 94 | video_buffer = video_file.read(DATA_CHUNK_SIZE) 95 | if video_buffer == b"": 96 | break 97 | video_chunk_counter += 1 98 | yield eyecontact_pb2.RedirectGazeRequest(video_file_data=video_buffer) 99 | except IOError as e: 100 | print(f"Error reading video chunk {video_chunk_counter}: {e}") 101 | raise RuntimeError(f"Failed to read video file: {e}") 102 | 103 | print("Data sending completed\n") 104 | 105 | 106 | def write_output_file_from_response( 107 | response_iter: Iterator[eyecontact_pb2.RedirectGazeResponse], 108 | output_filepath: os.PathLike = "output.mp4", 109 | ) -> None: 110 | """Function to write the output file from the incoming gRPC data stream. 111 | 112 | Args: 113 | response_iter: Responses from the server to write into output file 114 | output_filepath: Path to output file 115 | """ 116 | print(f"Writing output in {output_filepath}") 117 | sys.stdout.flush() # Ensure output is flushed before starting progress bar 118 | 119 | # Initialize progress bar for streaming data reception 120 | chunk_count = 0 121 | total_bytes = 0 122 | 123 | with open(output_filepath, "wb") as fd: 124 | # Create progress bar that shows streaming progress 125 | # Use leave=False to clean up the progress bar when done 126 | pbar = tqdm( 127 | desc="Receiving video chunks", 128 | unit="chunks", 129 | unit_scale=False, 130 | dynamic_ncols=True, 131 | leave=False, 132 | bar_format="{desc}: {n} chunks | {rate_fmt} | {postfix}", 133 | ) 134 | 135 | try: 136 | for response in response_iter: 137 | if response.HasField("video_file_data"): 138 | chunk_data = response.video_file_data 139 | fd.write(chunk_data) 140 | 141 | # Update progress tracking 142 | chunk_count += 1 143 | total_bytes += len(chunk_data) 144 | 145 | # Update progress bar 146 | pbar.update(1) 147 | pbar.set_postfix_str(f"{total_bytes / (1024*1024):.1f} MB received") 148 | finally: 149 | pbar.close() 150 | 151 | print( 152 | f"Completed: Received {chunk_count} chunks " f"({total_bytes / (1024*1024):.1f} MB total)" 153 | ) 154 | 155 | 156 | def process_request( 157 | channel: grpc.Channel, 158 | eyecontact_config: EyeContactConfig, 159 | request_metadata: tuple = None, 160 | ) -> None: 161 | """Process gRPC request and handle responses. 
162 | 163 | Args: 164 | channel: gRPC channel for server client communication 165 | eyecontact_config: Configuration for the Eye Contact service 166 | request_metadata: Credentials to process preview request 167 | 168 | Raises: 169 | Exception: If any errors occur during processing 170 | """ 171 | try: 172 | stub = eyecontact_pb2_grpc.MaxineEyeContactServiceStub(channel) 173 | start_time = time.time() 174 | 175 | responses = stub.RedirectGaze( 176 | generate_request_for_inference(eyecontact_config=eyecontact_config), 177 | metadata=request_metadata, 178 | ) 179 | 180 | # Skip the echo response if configuration was sent 181 | next(responses) 182 | 183 | write_output_file_from_response( 184 | response_iter=responses, output_filepath=eyecontact_config.output_filepath 185 | ) 186 | end_time = time.time() 187 | print(f"Function invocation completed in {end_time-start_time:.2f}s") 188 | except Exception as e: 189 | print(f"An error occurred: {e}") 190 | 191 | 192 | def main(): 193 | """Main entry point for the Eye Contact client. 194 | 195 | Handles: 196 | 1. Argument parsing 197 | 2. Configuration validation 198 | 3. Channel setup (secure/insecure) 199 | 4. Request processing 200 | """ 201 | args = parse_args() 202 | eyecontact_config = EyeContactConfig.from_args(args) 203 | 204 | try: 205 | eyecontact_config.validate_eyecontact_config() 206 | validate_ssl_args(args) 207 | validate_preview_args(args) 208 | except Exception as e: 209 | print(f"Invalid configuration: {e}") 210 | return 211 | 212 | print(eyecontact_config) 213 | 214 | # Prepare request metadata for preview mode 215 | request_metadata = create_request_metadata(args) 216 | 217 | # Check ssl-mode and create channel_credentials for that mode 218 | if args.ssl_mode != "DISABLED": 219 | channel_credentials = create_channel_credentials(args) 220 | # Establish secure channel when ssl-mode is MTLS/TLS 221 | with grpc.secure_channel(target=args.target, credentials=channel_credentials) as channel: 222 | process_request( 223 | channel=channel, 224 | eyecontact_config=eyecontact_config, 225 | request_metadata=request_metadata, 226 | ) 227 | elif args.preview_mode: 228 | # Establish secure channel when sending request to NVCF server 229 | with grpc.secure_channel( 230 | target=args.target, credentials=grpc.ssl_channel_credentials() 231 | ) as channel: 232 | process_request( 233 | channel=channel, 234 | eyecontact_config=eyecontact_config, 235 | request_metadata=request_metadata, 236 | ) 237 | else: 238 | # Establish insecure channel when ssl-mode is DISABLED 239 | print(f"Establishing insecure channel to {args.target}") 240 | with grpc.insecure_channel(target=args.target) as channel: 241 | process_request( 242 | channel=channel, 243 | eyecontact_config=eyecontact_config, 244 | request_metadata=request_metadata, 245 | ) 246 | 247 | 248 | if __name__ == "__main__": 249 | main() 250 | -------------------------------------------------------------------------------- /eye-contact/README.md: -------------------------------------------------------------------------------- 1 | 2 | # NVIDIA Maxine Eye Contact NIM Client 3 | 4 | This package has a sample client which demonstrates interaction with a Maxine Eye Contact NIM 5 | 6 | ## Getting Started 7 | 8 | NVIDIA Maxine NIM Client packages use gRPC APIs. Instructions below demonstrate usage of Eye contact NIM using Python gRPC client. 
9 | To experience the NVIDIA Maxine Eye Contact NIM API without having to host your own servers, use the [Try API](https://build.nvidia.com/nvidia/eyecontact/api) feature, which uses the NVIDIA Cloud Function backend. 10 | 11 | ## Pre-requisites 12 | 13 | - Ensure you have Python 3.10 or above installed on your system. 14 | For download and installation instructions, refer to the [Python documentation](https://www.python.org/downloads/). 15 | - Access to NVIDIA Maxine Eye Contact NIM container and service. 16 | - The input must be an MP4 file with H.264 video codec (audio optional); videos with Variable Frame Rate (VFR) are not supported. 17 | 18 | ## Usage guide 19 | 20 | ### 1. Clone the repository 21 | 22 | ```bash 23 | git clone https://github.com/nvidia-maxine/nim-clients.git 24 | 25 | # Go to the 'eye-contact' folder 26 | cd nim-clients/eye-contact 27 | ``` 28 | 29 | ### 2. Install dependencies 30 | 31 | ```bash 32 | sudo apt-get install python3-pip 33 | pip install -r requirements.txt 34 | ``` 35 | 36 | ### 3. Compile the Protos (optional) 37 | 38 | If you want to use the client code provided in the GitHub client repository, you can skip this step. 39 | The proto files are available in the eye-contact/protos folder. You can compile them to generate client interfaces in your preferred programming language. For more details, refer to [Supported languages](https://grpc.io/docs/languages/) in the gRPC documentation. 40 | 41 | Here is an example of how to compile the protos for Python on Linux and Windows. 42 | 43 | #### Python 44 | 45 | The `grpcio` version needed for compilation is listed in `requirements.txt`. 46 | 47 | To compile protos on Linux, run: 48 | ```bash 49 | # Go to eye-contact/protos/linux folder 50 | cd eye-contact/protos/linux/ 51 | 52 | chmod +x compile_protos.sh 53 | ./compile_protos.sh 54 | ``` 55 | 56 | To compile protos on Windows, run: 57 | ```bash 58 | # Go to eye-contact/protos/windows folder 59 | cd eye-contact/protos/windows/ 60 | 61 | ./compile_protos.bat 62 | ``` 63 | The compiled proto files will be generated in the `nim-clients/eye-contact/interfaces` directory. 64 | 65 | ### 4. Host the NIM Server 66 | 67 | Before running the client part of Maxine Eye Contact, please set up a server. 68 | The simplest way to do that is to follow the [quick start guide](https://docs.nvidia.com/nim/maxine/eye-contact/latest/index.html). 69 | This step can be skipped when using [Try API](https://build.nvidia.com/nvidia/eyecontact/api). 70 | 71 | ### 5. Run the Python Client 72 | 73 | - Go to the scripts directory 74 | 75 | ```bash 76 | cd scripts 77 | ``` 78 | 79 | #### Usage for Hosted NIM request 80 | 81 | ```bash 82 | python eye-contact.py \ 83 | --target <server_ip:port> \ 84 | --input <input video file path> \ 85 | --output <output video file path> \ 86 | --ssl-mode <ssl mode> \ 87 | --ssl-key <ssl key file path> \ 88 | --ssl-cert <ssl cert file path> \ 89 | --ssl-root-cert <ssl root cert file path> 90 | ``` 91 | 92 | The following command uses the sample video file and generates an `output.mp4` file in the current folder: 93 | 94 | ```bash 95 | python eye-contact.py --target 127.0.0.1:8001 --input ../assets/sample_transactional.mp4 --output output.mp4 96 | ``` 97 | 98 | The following command uses streaming mode (for streamable video files): 99 | 100 | ```bash 101 | python eye-contact.py --target 127.0.0.1:8001 --input ../assets/sample_streamable.mp4 --output output.mp4 --streaming 102 | ``` 103 | 104 | > **Note:** The supported file type is MP4.
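If you are unsure whether an input file meets the codec constraints listed under Pre-requisites (H.264 video, no variable frame rate), you can optionally inspect it with `ffprobe` (part of FFmpeg, assumed to be installed separately); this check is not part of the client:

```bash
# Print the video codec and frame-rate information of the first video stream.
# For a constant-frame-rate H.264 input, codec_name should be h264 and
# r_frame_rate will typically match avg_frame_rate.
ffprobe -v error -select_streams v:0 \
    -show_entries stream=codec_name,r_frame_rate,avg_frame_rate \
    -of default=noprint_wrappers=1 input.mp4
```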
105 | 106 | #### Usage for Preview API request 107 | 108 | ```bash 109 | python eye-contact.py --preview-mode \ 110 | --target grpc.nvcf.nvidia.com:443 \ 111 | --function-id 15c6f1a0-3843-4cde-b5bc-803a4966fbb6 \ 112 | --api-key $API_KEY_REQUIRED_IF_EXECUTING_OUTSIDE_NGC \ 113 | --input <input video file path> \ 114 | --output <output video file path> 115 | ``` 116 | 117 | #### Command line arguments 118 | 119 | - `-h, --help` show this help message and exit 120 | - `--preview-mode` Flag to send request to preview NVCF NIM server on https://build.nvidia.com/nvidia/eyecontact/api. 121 | - `--ssl-mode` {DISABLED,MTLS,TLS} Flag to set SSL mode, default is DISABLED 122 | - `--ssl-key SSL_KEY` The path to ssl private key. 123 | - `--ssl-cert SSL_CERT` The path to ssl certificate chain. 124 | - `--ssl-root-cert` The path to ssl root certificate. 125 | - `--target` IP:port of gRPC service, when hosted locally. Use grpc.nvcf.nvidia.com:443 when hosted on NVCF. 126 | - `--input` The path to the input video file. 127 | - `--output` The path for the output video file. 128 | - `--streaming` Flag to enable gRPC streaming mode. Required for streamable video input. 129 | - `--api-key` NGC API key required for authentication, utilized when using TRY API, ignored otherwise 130 | - `--function-id` NVCF function ID for the service, utilized when using TRY API, ignored otherwise 131 | 132 | #### Advanced Configuration Parameters 133 | 134 | The Eye Contact client supports extensive parameter customization for fine-tuning behavior: 135 | 136 | **Video Encoding Parameters** 137 | 138 | - `lossless`: Enables lossless video encoding. This setting overrides any bitrate configuration to ensure maximum quality output, although it results in larger file sizes. Use this mode when quality is the top priority. 139 | ```bash 140 | python eye-contact.py --target 127.0.0.1:8001 --lossless 141 | ``` 142 | 143 | - `bitrate`: Sets the target bitrate for video encoding in bits per second (bps). Higher bitrates result in better video quality but larger file sizes. This parameter allows balancing quality and file size by controlling the video bitrate. The default is 3,000,000 bps (3 Mbps). For example, setting `--bitrate 5000000` targets 5 Mbps encoding. 144 | ```bash 145 | python eye-contact.py --target 127.0.0.1:8001 --bitrate 5000000 146 | ``` 147 | 148 | - `idr-interval`: Sets the interval between instantaneous decoding refresh (IDR) frames in the encoded video. IDR frames are special I-frames that clear all reference buffers, allowing the video to be decoded from that point without needing previous frames. Lower values improve seeking accuracy, random access, and overall encoding quality but increase file size; higher values reduce file size but may impact seeking performance and quality. The default is 8 frames. 149 | ```bash 150 | python eye-contact.py --target 127.0.0.1:8001 --idr-interval 10 151 | ``` 152 | 153 | - `custom-encoding-params`: Passes custom encoding parameters as a JSON string, which provides fine-grained control for expert users via JSON configuration. These parameters are used to configure properties of the GStreamer nvvideo4linux2 encoder plugin, allowing direct control over the underlying hardware encoder settings. 154 | ```bash 155 | python eye-contact.py --custom-encoding-params '{"idrinterval": 20, "maxbitrate": 3000000}' 156 | ``` 157 | 158 | **Note:** Custom encoding parameters are for expert users who need fine-grained control over video encoding. Incorrect values can cause encoding failures or poor-quality output.
To configure the nvenc encoder, refer to [Gst properties of the Gst-nvvideo4linux2 encoder plugin](https://docs.nvidia.com/metropolis/deepstream/dev-guide/text/DS_plugin_gst-nvvideo4linux2.html#:~:text=The%20following%20table%20summarizes%20the%20Gst%20properties%20of%20the%20Gst%2Dnvvideo4linux2%20encoder%20plugin). 159 | 160 | **Eye Contact Behavior Parameters** 161 | - `--temporal` Flag to control temporal filtering (default: 4294967295). 162 | - `--detect-closure` Flag to toggle detection of eye closure and occlusion (default: 0, choices: [0, 1]). 163 | - `--eye-size-sensitivity` Eye size sensitivity parameter (default: 3, range: [2, 6]). 164 | - `--enable-lookaway` Flag to toggle look away (default: 0, choices: [0, 1]). 165 | - `--lookaway-max-offset` Maximum value of gaze offset angle (degrees) during a random look away (default: 5, range: [1, 10]). 166 | - `--lookaway-interval-min` Minimum number of frames at which random look away occurs (default: 100, range: [1, 600]). 167 | - `--lookaway-interval-range` Range for picking the number of frames at which random look away occurs (default: 250, range: [1, 600]). 168 | 169 | **Gaze Threshold Parameters** 170 | - `--gaze-pitch-threshold-low` Gaze pitch threshold (degrees) at which the redirection starts transitioning (default: 20.0, range: [10, 35]). 171 | - `--gaze-pitch-threshold-high` Gaze pitch threshold (degrees) at which the redirection is equal to estimated gaze (default: 30.0, range: [10, 35]). 172 | - `--gaze-yaw-threshold-low` Gaze yaw threshold (degrees) at which the redirection starts transitioning (default: 20.0, range: [10, 35]). 173 | - `--gaze-yaw-threshold-high` Gaze yaw threshold (degrees) at which the redirection is equal to estimated gaze (default: 30.0, range: [10, 35]). 174 | 175 | **Head Pose Threshold Parameters** 176 | - `--head-pitch-threshold-low` Head pose pitch threshold (degrees) at which the redirection starts transitioning away from camera toward estimated gaze (default: 15.0, range: [10, 35]). 177 | - `--head-pitch-threshold-high` Head pose pitch threshold (degrees) at which the redirection is equal to estimated gaze (default: 15.0, range: [10, 35]). 178 | - `--head-yaw-threshold-low` Head pose yaw threshold (degrees) at which the redirection starts transitioning (default: 15.0, range: [10, 35]). 179 | - `--head-yaw-threshold-high` Head pose yaw threshold (degrees) at which the redirection is equal to estimated gaze (default: 15.0, range: [10, 35]). 180 | 181 | #### Important Notes about Streaming Mode 182 | 183 | Streaming mode (`--streaming`) is required when processing videos that are optimized for streaming (that is, they have the 'moov' atom at the beginning). 184 | 185 | If you encounter an error when processing non-streamable video files, you can convert your video to be streamable using the following command: 186 | ```bash 187 | ffmpeg -i input.mp4 -movflags +faststart output_streamable.mp4 188 | ``` 189 | The client automatically validates video compatibility with the selected mode and provides helpful error messages. 190 | 191 | When using SSL mode, the default path for the credentials is `../ssl_key/<filename>.pem` (for example, `../ssl_key/ssl_key_client.pem`, `../ssl_key/ssl_cert_client.pem`, and `../ssl_key/ssl_ca_cert.pem`). 192 | 193 | For more information, refer to [Basic Inference](https://docs.nvidia.com/nim/maxine/eye-contact/latest/basic-inference.html) in the Eye Contact NIM documentation.
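As a concrete illustration of the SSL flags described above, the following hypothetical invocation connects to a locally hosted NIM over mTLS, assuming the certificates have already been generated and placed at the default `../ssl_key/` paths:

```bash
python eye-contact.py --target 127.0.0.1:8001 \
    --input ../assets/sample_transactional.mp4 --output output.mp4 \
    --ssl-mode MTLS \
    --ssl-key ../ssl_key/ssl_key_client.pem \
    --ssl-cert ../ssl_key/ssl_cert_client.pem \
    --ssl-root-cert ../ssl_key/ssl_ca_cert.pem
```

For TLS (server authentication only), only `--ssl-mode TLS` and `--ssl-root-cert` are required.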
194 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a 4 | # copy of this software and associated documentation files (the "Software"), 5 | # to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | # and/or sell copies of the Software, and to permit persons to whom the 8 | # Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 20 | 21 | import os 22 | import csv 23 | import itertools 24 | from typing import Iterator, List, Union 25 | import argparse 26 | import grpc 27 | from google.protobuf import any_pb2, wrappers_pb2 28 | 29 | 30 | def add_ssl_arguments(parser: argparse.ArgumentParser) -> None: 31 | """Add SSL-related arguments to an argument parser. 32 | 33 | Args: 34 | parser: The argument parser to add SSL arguments to 35 | """ 36 | # SSL and connection arguments 37 | parser.add_argument( 38 | "--ssl-mode", 39 | type=str, 40 | help="Flag to set SSL mode, default is DISABLED", 41 | default="DISABLED", 42 | choices=["DISABLED", "MTLS", "TLS"], 43 | ) 44 | parser.add_argument( 45 | "--ssl-key", 46 | type=str, 47 | default="../ssl_key/ssl_key_client.pem", 48 | help="The path to ssl private key.", 49 | ) 50 | parser.add_argument( 51 | "--ssl-cert", 52 | type=str, 53 | default="../ssl_key/ssl_cert_client.pem", 54 | help="The path to ssl certificate chain.", 55 | ) 56 | parser.add_argument( 57 | "--ssl-root-cert", 58 | type=str, 59 | default="../ssl_key/ssl_ca_cert.pem", 60 | help="The path to ssl root certificate.", 61 | ) 62 | parser.add_argument( 63 | "--target", 64 | type=str, 65 | default="127.0.0.1:8001", 66 | help="IP:port of gRPC service, when hosted locally. Use " 67 | "grpc.nvcf.nvidia.com:443 when hosted on NVCF.", 68 | ) 69 | 70 | 71 | def add_preview_arguments(parser: argparse.ArgumentParser) -> None: 72 | """Add preview mode related arguments to an argument parser. 73 | 74 | Args: 75 | parser: The argument parser to add preview arguments to 76 | """ 77 | # Preview mode and NVCF arguments 78 | parser.add_argument( 79 | "--preview-mode", 80 | action="store_true", 81 | help="Flag to send request to preview NVCF NIM server on " 82 | "https://build.nvidia.com/nvidia/eyecontact/api. 
", 83 | ) 84 | parser.add_argument( 85 | "--api-key", 86 | type=str, 87 | help="NGC API key required for authentication, utilized when using " 88 | "TRY API ignored otherwise", 89 | ) 90 | parser.add_argument( 91 | "--function-id", 92 | type=str, 93 | help="NVCF function ID for the service, utilized when using TRY API " "ignored otherwise", 94 | ) 95 | 96 | 97 | def validate_ssl_args(args: argparse.Namespace) -> None: 98 | """Validate SSL-related arguments. 99 | 100 | Args: 101 | args: Parsed command line arguments 102 | 103 | Raises: 104 | RuntimeError: If SSL configuration is invalid 105 | """ 106 | if args.ssl_mode == "MTLS": 107 | if not (args.ssl_key and args.ssl_cert and args.ssl_root_cert): 108 | raise RuntimeError( 109 | "If --ssl-mode is MTLS, --ssl-key, --ssl-cert and " "--ssl-root-cert are required." 110 | ) 111 | elif args.ssl_mode == "TLS": 112 | if not args.ssl_root_cert: 113 | raise RuntimeError("If --ssl-mode is TLS, --ssl-root-cert is required.") 114 | 115 | 116 | def validate_preview_args(args: argparse.Namespace) -> None: 117 | """Validate preview mode related arguments. 118 | 119 | Args: 120 | args: Parsed command line arguments 121 | 122 | Raises: 123 | RuntimeError: If preview configuration is invalid 124 | """ 125 | if args.preview_mode: 126 | if not args.api_key or not args.function_id: 127 | raise RuntimeError( 128 | "If --preview-mode is specified, both --api-key and " "--function-id are required." 129 | ) 130 | 131 | 132 | def create_request_metadata(args: argparse.Namespace) -> tuple | None: 133 | """Create request metadata for preview mode. 134 | 135 | Args: 136 | args: Parsed command line arguments 137 | 138 | Returns: 139 | Request metadata tuple or None 140 | """ 141 | if args.preview_mode: 142 | return ( 143 | ("authorization", "Bearer {}".format(args.api_key)), 144 | ("function-id", args.function_id), 145 | ) 146 | return None 147 | 148 | 149 | def is_file_available(file_path: os.PathLike, file_types: List[str]) -> bool: 150 | """Check if the file exists. 151 | 152 | Args: 153 | file_path: Path to input file 154 | """ 155 | if not os.path.isfile(file_path): 156 | raise FileNotFoundError(f"File '{file_path}' not found") 157 | for file_type in file_types: 158 | if os.path.splitext(file_path)[1].lower() == f".{file_type}": 159 | return True 160 | return False 161 | 162 | 163 | def read_file_content(file_path: os.PathLike) -> bytes: 164 | """Read file content as bytes. 165 | 166 | Args: 167 | file_path: Path to input file 168 | 169 | Returns: 170 | File contents as bytes 171 | """ 172 | with open(file_path, "rb") as file: 173 | return file.read() 174 | 175 | 176 | def roi_csv_reader(reader: csv.reader, row_count: int) -> Iterator[list]: 177 | """Read CSV data as multiple rows . 178 | 179 | Args: 180 | reader: CSV reader object to read from 181 | row_count: Number of rows to include in each batch 182 | 183 | Yields: 184 | List of CSV rows as multiple rows of the specified row count 185 | """ 186 | while True: 187 | rows = list(itertools.islice(reader, row_count)) 188 | if not rows: 189 | break 190 | yield rows 191 | 192 | 193 | def check_streamable(file_path: os.PathLike) -> bool: 194 | """ 195 | Checks if the video is streamable by checking if the moov atom follows 196 | immediately after the ftyp atom in an MP4 file. 197 | 198 | For streamable MP4s, the moov atom must come immediately after: 199 | [4 bytes: size][4 bytes: "ftyp"][... ftyp data ...][4 bytes: size] 200 | [4 bytes: "moov"][... moov data ...] 
201 | 
202 |     For non-streamable MP4s, other atoms like mdat may come between ftyp and
203 |     moov:
204 |     [4 bytes: size][4 bytes: "ftyp"][... ftyp data ...][4 bytes: size]
205 |     [4 bytes: "mdat"][... mdat data ...][moov atom]
206 | 
207 |     Args:
208 |         file_path: Path to the MP4 file. The first 40 bytes of the file are
209 |         read to perform the check.
210 | 
211 |     Returns:
212 |         True if the MP4 is streamable, False otherwise.
213 |     """
214 |     # Read first 40 bytes of the file
215 |     with open(file_path, "rb") as f:
216 |         mp4_header_data = f.read(40)
217 |     if len(mp4_header_data) < 40:
218 |         raise RuntimeError("MP4 file is too small to check if it is streamable")
219 | 
220 |     # Read the first atom size
221 |     ftyp_size = int.from_bytes(mp4_header_data[0:4], byteorder="big")
222 | 
223 |     # Check if it's a ftyp atom
224 |     if mp4_header_data[4:8] != b"ftyp":
225 |         return False
226 | 
227 |     next_atom_type = bytes(mp4_header_data[ftyp_size + 4 : ftyp_size + 8])
228 | 
229 |     # Check if the next atom is a moov atom
230 |     if next_atom_type == b"moov":
231 |         return True
232 |     else:
233 |         return False
234 | 
235 | 
236 | def create_channel_credentials(args: argparse.Namespace) -> grpc.ChannelCredentials:
237 |     """Create channel credentials based on SSL mode.
238 | 
239 |     Args:
240 |         args: Command line arguments containing SSL configuration
241 | 
242 |     Returns:
243 |         Configured channel credentials
244 | 
245 |     Raises:
246 |         RuntimeError: If required SSL files are missing
247 |     """
248 |     channel_credentials = None
249 |     if args.ssl_mode == "MTLS":
250 |         if not (args.ssl_key and args.ssl_cert and args.ssl_root_cert):
251 |             raise RuntimeError(
252 |                 "If --ssl-mode is MTLS, --ssl-key, --ssl-cert and " "--ssl-root-cert are required."
253 |             )
254 |         private_key = read_file_content(args.ssl_key)
255 |         certificate_chain = read_file_content(args.ssl_cert)
256 |         root_certificates = read_file_content(args.ssl_root_cert)
257 |         channel_credentials = grpc.ssl_channel_credentials(
258 |             root_certificates=root_certificates,
259 |             private_key=private_key,
260 |             certificate_chain=certificate_chain,
261 |         )
262 |     else:
263 |         if not (args.ssl_root_cert):
264 |             raise RuntimeError("If --ssl-mode is TLS, --ssl-root-cert is required.")
265 |         root_certificates = read_file_content(args.ssl_root_cert)
266 |         channel_credentials = grpc.ssl_channel_credentials(root_certificates=root_certificates)
267 |     return channel_credentials
268 | 
269 | 
270 | def create_protobuf_any_value(value: Union[bool, int, float, str]) -> any_pb2.Any:
271 |     """Create a google.protobuf.Any message from a Python value.
272 | 273 | Args: 274 | value: The value to convert (bool, int, float, or str) 275 | 276 | Returns: 277 | google.protobuf.Any message 278 | """ 279 | any_message = any_pb2.Any() 280 | 281 | if isinstance(value, bool): 282 | wrapper = wrappers_pb2.BoolValue(value=value) 283 | any_message.Pack(wrapper) 284 | elif isinstance(value, int): 285 | if value > 2147483647 or value < -2147483648: # int32 range 286 | wrapper = wrappers_pb2.Int64Value(value=value) 287 | else: 288 | wrapper = wrappers_pb2.Int32Value(value=value) 289 | any_message.Pack(wrapper) 290 | elif isinstance(value, float): 291 | wrapper = wrappers_pb2.FloatValue(value=value) 292 | any_message.Pack(wrapper) 293 | elif isinstance(value, str): 294 | wrapper = wrappers_pb2.StringValue(value=value) 295 | any_message.Pack(wrapper) 296 | else: 297 | raise ValueError(f"Unsupported type: {type(value)}") 298 | 299 | return any_message 300 | -------------------------------------------------------------------------------- /audio2face-2d/python/scripts/audio2face-2d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a 4 | # copy of this software and associated documentation files (the "Software"), 5 | # to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | # and/or sell copies of the Software, and to permit persons to whom the 8 | # Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 20 | 21 | import argparse 22 | import os 23 | import sys 24 | import time 25 | import io 26 | import grpc 27 | 28 | sys.path.append(os.path.join(os.getcwd(), "../interfaces")) 29 | # Importing gRPC compiler auto-generated maxine audio2face-2d library 30 | import audio2face2d_pb2 # noqa: E402 31 | import audio2face2d_pb2_grpc # noqa: E402 32 | from audio2face2d_pb2 import ( # noqa: E402 33 | QuaternionStream, 34 | Quaternion, 35 | Vector3fStream, 36 | Vector3f, 37 | ModelSelection, 38 | AnimationCroppingMode, 39 | HeadPoseMode, 40 | ) 41 | 42 | 43 | def parse_args() -> None: 44 | """ 45 | Parse command-line arguments using argparse. 46 | """ 47 | # Set up argument parsing 48 | parser = argparse.ArgumentParser( 49 | description="Process input audio and portrait files and apply audio2face-2d effect." 
50 | ) 51 | parser.add_argument( 52 | "--ssl-mode", 53 | type=str, 54 | help="Flag to set SSL mode, default is None", 55 | default="DISABLED", 56 | choices=["DISABLED", "MTLS", "TLS"], 57 | ) 58 | parser.add_argument( 59 | "--ssl-key", 60 | type=str, 61 | default="../ssl_key/ssl_key_client.pem", 62 | help="The path to ssl private key.", 63 | ) 64 | parser.add_argument( 65 | "--ssl-cert", 66 | type=str, 67 | default="../ssl_key/ssl_cert_client.pem", 68 | help="The path to ssl certificate chain.", 69 | ) 70 | parser.add_argument( 71 | "--ssl-root-cert", 72 | type=str, 73 | default="../ssl_key/ssl_ca_cert.pem", 74 | help="The path to ssl root certificate.", 75 | ) 76 | parser.add_argument( 77 | "--target", 78 | type=str, 79 | default="127.0.0.1:8001", 80 | help="IP:port of gRPC service, when hosted locally.", 81 | ) 82 | parser.add_argument( 83 | "--audio-input", 84 | type=str, 85 | default="../../assets/sample_audio.wav", 86 | help="The path to the input audio file.", 87 | ) 88 | parser.add_argument( 89 | "--portrait-input", 90 | type=str, 91 | default="../../assets/sample_portrait_image.png", 92 | help="The path to the input portrait file.", 93 | ) 94 | parser.add_argument( 95 | "--output", 96 | type=str, 97 | default="output.mp4", 98 | help="The path for the output video file.", 99 | ) 100 | parser.add_argument( 101 | "--head-rotation-animation-filepath", 102 | type=str, 103 | default="../../assets/head_rotation_animation.csv", 104 | help="The path for the head_rotation_animation.csv file. " 105 | "Only required for HEAD_POSE_MODE_USER_DEFINED_ANIMATION", 106 | ) 107 | parser.add_argument( 108 | "--head-translation-animation-filepath", 109 | type=str, 110 | default="../../assets/head_translation_animation.csv", 111 | help="The path for the head_translation_animation.csv file. " 112 | "Only required for HEAD_POSE_MODE_USER_DEFINED_ANIMATION", 113 | ) 114 | return parser.parse_args() 115 | 116 | 117 | def read_file_content(file_path: os.PathLike) -> None: 118 | """Function to read file content as bytes. 119 | 120 | Args: 121 | file_path: Path to input file 122 | """ 123 | with open(file_path, "rb") as file: 124 | return file.read() 125 | 126 | 127 | def generate_request_for_inference(audio_filepath: str, params: dict): 128 | """Generator to produce the request data stream 129 | 130 | Args: 131 | audio_filepath: Path to input file 132 | params: Parameters for the feature 133 | """ 134 | yield audio2face2d_pb2.AnimateRequest(config=audio2face2d_pb2.AnimateConfig(**params)) 135 | file = open(audio_filepath, "rb") 136 | while True: 137 | buffer = file.read(1024 * 1024) 138 | if buffer == b"": 139 | break 140 | yield audio2face2d_pb2.AnimateRequest(audio_file_data=buffer) 141 | print("Data sending done") 142 | 143 | 144 | def process_head_pose_data(head_rotation_path, head_translation_path): 145 | """ 146 | Process head rotation and translation data. 147 | 148 | Args: 149 | head_rotation_path (str): Path to the head rotation animation file. 150 | head_translation_path (str): Path to the head translation animation file. 151 | 152 | Returns: 153 | Tuple[QuaternionStream, Vector3fStream]: Processed rotation and translation data streams. 
154 | """ 155 | # Read the head rotation data 156 | with io.StringIO(open(head_rotation_path, "rb").read().decode("utf-8")) as file: 157 | head_rotation_data = [] 158 | for line in file: 159 | values = line.strip().split(",") 160 | if len(values) == 4: 161 | head_rotation_data.append([float(val) for val in values]) 162 | 163 | # Validate the data 164 | assert len(head_rotation_data) > 0, "Head rotation data is empty" 165 | assert all(len(row) == 4 for row in head_rotation_data), "Each row must have 4 values" 166 | 167 | # Create the QuaternionStream 168 | rotation_data_stream = QuaternionStream() 169 | for x in head_rotation_data: 170 | q = Quaternion() 171 | q.x, q.y, q.z, q.w = x 172 | rotation_data_stream.values.append(q) 173 | 174 | # Read the head translation data 175 | with io.StringIO(open(head_translation_path, "rb").read().decode("utf-8")) as file: 176 | head_translation_data = [] 177 | for line in file: 178 | values = line.strip().split(",") 179 | if len(values) == 3: 180 | head_translation_data.append([float(val) for val in values]) 181 | 182 | # Validate the data 183 | assert len(head_translation_data) > 0, "Head translation data is empty" 184 | assert all(len(row) == 3 for row in head_translation_data), "Each row must have 3 values" 185 | 186 | # Create the Vector3fStream 187 | translation_data_stream = Vector3fStream() 188 | for x in head_translation_data: 189 | v = Vector3f() 190 | v.x, v.y, v.z = x 191 | translation_data_stream.values.append(v) 192 | 193 | return rotation_data_stream, translation_data_stream 194 | 195 | 196 | def process_request( 197 | channel: any, 198 | audio_filepath: os.PathLike, 199 | params: dict, 200 | output_filepath: os.PathLike, 201 | ) -> None: 202 | """Function to process gRPC request 203 | 204 | Args: 205 | channel: gRPC channel for server client communication 206 | input_filepath: Path to input file 207 | params: Parameters to control the feature 208 | output_filepath: Path to output file 209 | request_metadata: Credentials to process preview request 210 | """ 211 | try: 212 | stub = audio2face2d_pb2_grpc.Audio2Face2DServiceStub(channel) 213 | start_time = time.time() 214 | responses = stub.Animate( 215 | generate_request_for_inference(audio_filepath=audio_filepath, params=params) 216 | ) 217 | next(responses) 218 | file = open(output_filepath, "wb") 219 | print(f"Writing output in {output_filepath}") 220 | for response in responses: 221 | if response.HasField("video_file_data"): 222 | file.write(response.video_file_data) 223 | end_time = time.time() 224 | print( 225 | f"Function invocation completed in {end_time-start_time:.2f}s, " 226 | f"{output_filepath} file is generated." 227 | ) 228 | except Exception as e: 229 | print(f"An error occurred: {e}") 230 | 231 | 232 | def main(): 233 | """ 234 | Main client function 235 | """ 236 | args = parse_args() 237 | portrait_filepath = args.portrait_input 238 | audio_filepath = args.audio_input 239 | output_filepath = args.output 240 | 241 | # Check file path 242 | if os.path.isfile(portrait_filepath): 243 | print(f"The image file '{portrait_filepath}' exists. Checking for audio file.") 244 | else: 245 | raise FileNotFoundError(f"The image file '{portrait_filepath}' does not exist. Exiting.") 246 | if os.path.isfile(audio_filepath): 247 | print(f"The audio file '{audio_filepath}' exists. Proceeding with processing.") 248 | else: 249 | raise FileNotFoundError(f"The audio file '{audio_filepath}' does not exist. 
Exiting.") 250 | 251 | portrait_image_encoded = open(portrait_filepath, "rb").read() 252 | 253 | # Configure head pose mode 254 | head_pose_mode = HeadPoseMode.HEAD_POSE_MODE_RETAIN_FROM_PORTRAIT_IMAGE 255 | 256 | # Provide head pose animation values for head pose mode HEAD_POSE_MODE_USER_DEFINED_ANIMATION 257 | if head_pose_mode == HeadPoseMode.HEAD_POSE_MODE_USER_DEFINED_ANIMATION: 258 | rotation_data_stream, translation_data_stream = process_head_pose_data( 259 | args.head_rotation_animation_filepath, 260 | args.head_translation_animation_filepath, 261 | ) 262 | 263 | # Supply params as shown below, refer to the docs for more info. 264 | feature_params = { 265 | "portrait_image": portrait_image_encoded, 266 | "model_selection": ModelSelection.MODEL_SELECTION_QUALITY, 267 | "animation_crop_mode": AnimationCroppingMode.ANIMATION_CROPPING_MODE_REGISTRATION_BLENDING, 268 | "enable_lookaway": 1, # can be 0 or 1 269 | "lookaway_max_offset": 20, # value in [5, 25] 270 | "lookaway_interval_min": 240, # value in [1, 600] 271 | "lookaway_interval_range": 90, # value in [1, 600] 272 | "blink_frequency": 15, # value in [0, 120] 273 | "blink_duration": 6, # value in [2, 150] 274 | "mouth_expression_multiplier": 1.4, # value in [1.0, 2.0] 275 | "head_pose_mode": head_pose_mode, 276 | "head_pose_multiplier": 1.0, # value in [0.0, 1.0] 277 | # "input_head_rotation": rotation_data_stream, # HEAD_POSE_MODE_USER_DEFINED_ANIMATION 278 | # "input_head_translation": translation_data_stream, # HEAD_POSE_MODE_USER_DEFINED_ANIMATION 279 | } 280 | 281 | # Check ssl-mode and create channel_credentials for that mode 282 | if args.ssl_mode != "DISABLED": 283 | channel_credentials = "" 284 | if args.ssl_mode == "MTLS": 285 | if not (args.ssl_key and args.ssl_cert and args.ssl_root_cert): 286 | raise RuntimeError( 287 | "If --ssl-mode is MTLS, --ssl-key, --ssl-cert and --ssl-root-cert are required." 288 | ) 289 | private_key = read_file_content(args.ssl_key) 290 | certificate_chain = read_file_content(args.ssl_cert) 291 | root_certificates = read_file_content(args.ssl_root_cert) 292 | channel_credentials = grpc.ssl_channel_credentials( 293 | root_certificates=root_certificates, 294 | private_key=private_key, 295 | certificate_chain=certificate_chain, 296 | ) 297 | else: 298 | if not (args.ssl_root_cert): 299 | raise RuntimeError("If --ssl-mode is TLS, --ssl-root-cert is required.") 300 | root_certificates = read_file_content(args.ssl_root_cert) 301 | channel_credentials = grpc.ssl_channel_credentials(root_certificates=root_certificates) 302 | 303 | # Establish secure channel when ssl-mode is MTLS/TLS 304 | with grpc.secure_channel(target=args.target, credentials=channel_credentials) as channel: 305 | process_request( 306 | channel=channel, 307 | audio_filepath=audio_filepath, 308 | params=feature_params, 309 | output_filepath=output_filepath, 310 | ) 311 | else: 312 | # Establish insecure channel when ssl-mode is DISABLED 313 | with grpc.insecure_channel(target=args.target) as channel: 314 | process_request( 315 | channel=channel, 316 | audio_filepath=audio_filepath, 317 | params=feature_params, 318 | output_filepath=output_filepath, 319 | ) 320 | 321 | 322 | if __name__ == "__main__": 323 | main() 324 | -------------------------------------------------------------------------------- /studio-voice/scripts/studio_voice.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a 4 | # copy of this software and associated documentation files (the "Software"), 5 | # to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | # and/or sell copies of the Software, and to permit persons to whom the 8 | # Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 20 | 21 | import argparse 22 | import os 23 | import sys 24 | import grpc 25 | import time 26 | import soundfile as sf 27 | import numpy as np 28 | from typing import Iterator 29 | 30 | sys.path.append(os.path.join(os.getcwd(), "../interfaces/studio_voice")) 31 | # Importing gRPC compiler auto-generated maxine studiovoice library 32 | import studiovoice_pb2 # noqa: E402 33 | import studiovoice_pb2_grpc # noqa: E402 34 | 35 | 36 | def read_file_content(file_path: os.PathLike) -> None: 37 | """Function to read file content as bytes. 38 | 39 | Args: 40 | file_path: Path to input file 41 | """ 42 | if not os.path.isfile(file_path): 43 | raise FileNotFoundError(f"The file '{file_path}' does not exist. 
Exiting.") 44 | 45 | with open(file_path, "rb") as file: 46 | return file.read() 47 | 48 | 49 | def generate_request_for_inference( 50 | input_filepath: os.PathLike, model_type: str, sample_rate: int, streaming: bool 51 | ) -> None: 52 | """Generator to produce the request data stream 53 | 54 | Args: 55 | input_filepath: Path to input file 56 | model_type: Studio Voice model type to infer 57 | sample_rate: Input audio sample rate 58 | streaming: Enables grpc streaming mode 59 | """ 60 | if streaming: 61 | """ 62 | Input audio chunk is generated based on model type and sample rate, 63 | 1) High quality models require 6sec input 64 | 2) Low latency models require 10ms input chunk 65 | """ 66 | input_audio, sample_rate_file = sf.read(input_filepath) 67 | input_audio = input_audio.astype(np.float32) # Convert to float32 68 | input_size_in_ms = 10 if (model_type == "48k-ll") else 6000 69 | samples_per_ms = sample_rate // 1000 70 | input_float_size = int(input_size_in_ms * samples_per_ms) 71 | 72 | pad_length = input_float_size - len(input_audio) % input_float_size 73 | input_audio = np.pad(input_audio, (0, pad_length), "constant") 74 | 75 | print( 76 | f"Len {len(input_audio)}, chunk_size {input_float_size}, audio {input_audio}, " 77 | "type {input_audio.dtype}" 78 | ) 79 | for i in range(0, len(input_audio), input_float_size): 80 | data = input_audio[i : i + input_float_size] 81 | yield studiovoice_pb2.EnhanceAudioRequest(audio_stream_data=data.tobytes()) 82 | else: 83 | DATA_CHUNKS = 64 * 1024 # bytes, we send the wav file in 64KB chunks 84 | with open(input_filepath, "rb") as fd: 85 | while True: 86 | buffer = fd.read(DATA_CHUNKS) 87 | if buffer == b"": 88 | break 89 | yield studiovoice_pb2.EnhanceAudioRequest(audio_stream_data=buffer) 90 | 91 | 92 | def write_output_file_from_response( 93 | response_iter: Iterator[studiovoice_pb2.EnhanceAudioResponse], 94 | output_filepath: os.PathLike, 95 | sample_rate: int, 96 | streaming: bool, 97 | ) -> None: 98 | """Function to write the output file from the incoming gRPC data stream. 99 | 100 | Args: 101 | response_iter: Responses from the server to write into output file 102 | output_filepath: Path to output file 103 | sample_rate: Input audio sample rate 104 | streaming: Enables grpc streaming mode 105 | """ 106 | if streaming: 107 | output_audio = [] 108 | response_count = 0 109 | for response in response_iter: 110 | response_count += 1 111 | output_audio.append(np.frombuffer(response.audio_stream_data, np.float32)) 112 | 113 | sf.write(output_filepath, np.hstack(output_audio), sample_rate) 114 | return response_count 115 | else: 116 | with open(output_filepath, "wb") as fd: 117 | for response in response_iter: 118 | if response.HasField("audio_stream_data"): 119 | fd.write(response.audio_stream_data) 120 | 121 | 122 | def parse_args() -> None: 123 | """ 124 | Parse command-line arguments using argparse. 125 | """ 126 | # Set up argument parsing 127 | parser = argparse.ArgumentParser( 128 | description="Process wav audio files using gRPC and apply studio-voice." 129 | ) 130 | parser.add_argument( 131 | "--preview-mode", 132 | action="store_true", 133 | help="Flag to send request to preview NVCF NIM server on " 134 | "https://build.nvidia.com/nvidia/studiovoice/api. 
", 135 | ) 136 | parser.add_argument( 137 | "--ssl-mode", 138 | type=str, 139 | help="Flag to set SSL mode, default is None", 140 | default=None, 141 | choices=["MTLS", "TLS"], 142 | ) 143 | parser.add_argument( 144 | "--ssl-key", 145 | type=str, 146 | default=None, 147 | help="The path to ssl private key.", 148 | ) 149 | parser.add_argument( 150 | "--ssl-cert", 151 | type=str, 152 | default=None, 153 | help="The path to ssl certificate chain.", 154 | ) 155 | parser.add_argument( 156 | "--ssl-root-cert", 157 | type=str, 158 | default=None, 159 | help="The path to ssl root certificate.", 160 | ) 161 | parser.add_argument( 162 | "--target", 163 | type=str, 164 | default="127.0.0.1:8001", 165 | help="IP:port of gRPC service, when hosted locally. " 166 | "Use grpc.nvcf.nvidia.com:443 when hosted on NVCF.", 167 | ) 168 | parser.add_argument( 169 | "--input", 170 | type=str, 171 | default="../assets/studio_voice_48k_input.wav", 172 | help="The path to the input audio file.", 173 | ) 174 | parser.add_argument( 175 | "--output", 176 | type=str, 177 | default="studio_voice_48k_output.wav", 178 | help="The path for the output audio file.", 179 | ) 180 | parser.add_argument( 181 | "--api-key", 182 | type=str, 183 | help="NGC API key required for authentication, " 184 | "utilized when using TRY API ignored otherwise", 185 | ) 186 | parser.add_argument( 187 | "--function-id", 188 | type=str, 189 | help="NVCF function ID for the service, utilized when using TRY API ignored otherwise", 190 | ) 191 | parser.add_argument( 192 | "--streaming", 193 | action="store_true", 194 | help="Flag to enable grpc streaming mode. ", 195 | ) 196 | parser.add_argument( 197 | "--model-type", 198 | type=str, 199 | help="Studio Voice model type, default is 48k-hq. ", 200 | default="48k-hq", 201 | choices=["48k-hq", "48k-ll", "16k-hq"], 202 | ) 203 | return parser.parse_args() 204 | 205 | 206 | def process_request( 207 | channel: any, 208 | input_filepath: os.PathLike, 209 | output_filepath: os.PathLike, 210 | model_type: str, 211 | sample_rate: int, 212 | streaming: bool, 213 | request_metadata: dict = None, 214 | ) -> None: 215 | """Function to process gRPC request 216 | 217 | Args: 218 | channel: gRPC channel for server client communication 219 | input_filepath: Path to input file 220 | output_filepath: Path to output file 221 | model_type: Studio Voice model type to infer 222 | sample_rate: Input audio sample rate 223 | streaming: Enables grpc streaming mode 224 | request_metadata: Credentials to process request 225 | """ 226 | try: 227 | stub = studiovoice_pb2_grpc.MaxineStudioVoiceStub(channel) 228 | start_time = time.time() 229 | 230 | responses = stub.EnhanceAudio( 231 | generate_request_for_inference( 232 | input_filepath=input_filepath, 233 | model_type=model_type, 234 | sample_rate=sample_rate, 235 | streaming=streaming, 236 | ), 237 | metadata=request_metadata, 238 | ) 239 | 240 | response_count = write_output_file_from_response( 241 | response_iter=responses, 242 | output_filepath=output_filepath, 243 | sample_rate=sample_rate, 244 | streaming=streaming, 245 | ) 246 | 247 | end_time = time.time() 248 | if streaming: 249 | avg_latency = (end_time - start_time) / response_count 250 | print(f"Average latency per request: {avg_latency*1000:.2f}ms") 251 | print(f"Processed {response_count} chunks.") 252 | 253 | print( 254 | f"Function invocation completed in {end_time-start_time:.2f}s, " 255 | "the output file is generated." 
256 | ) 257 | except BaseException as e: 258 | print(e) 259 | 260 | 261 | def main(): 262 | """ 263 | Main client function 264 | """ 265 | args = parse_args() 266 | streaming = args.streaming 267 | model_type = args.model_type 268 | print(f"Streaming mode set to {streaming}") 269 | sample_rate = 48000 270 | if model_type == "16k-hq": 271 | sample_rate = 16000 272 | print(f"Sample Rate: {sample_rate}") 273 | input_filepath = args.input 274 | output_filepath = args.output 275 | 276 | # Check if input file path exists 277 | if os.path.isfile(input_filepath): 278 | print(f"The file '{input_filepath}' exists. Proceeding with processing.") 279 | else: 280 | raise FileNotFoundError(f"The file '{input_filepath}' does not exist. Exiting.") 281 | 282 | # Check the sample rate of the input audio file 283 | input_info = sf.info(input_filepath) 284 | input_sample_rate = input_info.samplerate 285 | print(f"Input file sample rate: {input_sample_rate}") 286 | 287 | # Check if the input file's sample rate matches the expected sample rate 288 | if input_sample_rate != sample_rate: 289 | raise ValueError(f"Sample rate mismatch: expected {sample_rate}, got {input_sample_rate}.") 290 | 291 | if args.preview_mode: 292 | if args.ssl_mode != "TLS": 293 | # Preview mode only supports TLS mode 294 | args.ssl_mode = "TLS" 295 | print("--ssl-mode is set as TLS, since preview_mode is enabled.") 296 | if args.ssl_root_cert: 297 | raise RuntimeError("Preview mode does not support custom root certificate.") 298 | 299 | if args.ssl_mode is not None: 300 | request_metadata = None 301 | root_certificates = None 302 | if args.ssl_mode == "MTLS": 303 | if not (args.ssl_key and args.ssl_cert and args.ssl_root_cert): 304 | raise RuntimeError( 305 | "If --ssl-mode is MTLS, --ssl-key, --ssl-cert and --ssl-root-cert are required." 
306 | ) 307 | 308 | private_key = read_file_content(args.ssl_key) 309 | certificate_chain = read_file_content(args.ssl_cert) 310 | root_certificates = read_file_content(args.ssl_root_cert) 311 | channel_credentials = grpc.ssl_channel_credentials( 312 | root_certificates=root_certificates, 313 | private_key=private_key, 314 | certificate_chain=certificate_chain, 315 | ) 316 | else: 317 | # Running with NVCF 318 | if args.preview_mode: 319 | request_metadata = ( 320 | ("authorization", "Bearer {}".format(args.api_key)), 321 | ("function-id", args.function_id), 322 | ) 323 | channel_credentials = grpc.ssl_channel_credentials() 324 | # Running TLS mode, without NVCF 325 | else: 326 | if not (args.ssl_root_cert): 327 | raise RuntimeError("If --ssl-mode is TLS, --ssl-root-cert is required.") 328 | root_certificates = read_file_content(args.ssl_root_cert) 329 | channel_credentials = grpc.ssl_channel_credentials( 330 | root_certificates=root_certificates 331 | ) 332 | 333 | with grpc.secure_channel(target=args.target, credentials=channel_credentials) as channel: 334 | process_request( 335 | channel=channel, 336 | input_filepath=input_filepath, 337 | output_filepath=output_filepath, 338 | model_type=model_type, 339 | sample_rate=sample_rate, 340 | streaming=streaming, 341 | request_metadata=request_metadata, 342 | ) 343 | else: 344 | with grpc.insecure_channel(target=args.target) as channel: 345 | process_request( 346 | channel=channel, 347 | input_filepath=input_filepath, 348 | output_filepath=output_filepath, 349 | model_type=model_type, 350 | sample_rate=sample_rate, 351 | streaming=streaming, 352 | ) 353 | 354 | 355 | if __name__ == "__main__": 356 | main() 357 | -------------------------------------------------------------------------------- /bnr/scripts/bnr.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a 4 | # copy of this software and associated documentation files (the "Software"), 5 | # to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | # and/or sell copies of the Software, and to permit persons to whom the 8 | # Software is furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 
20 | 21 | import argparse 22 | import os 23 | import sys 24 | import grpc 25 | import time 26 | import soundfile as sf 27 | import numpy as np 28 | from tqdm import tqdm 29 | from typing import Iterator, Optional 30 | 31 | sys.path.append(os.path.join(os.getcwd(), "../interfaces/bnr")) 32 | # Importing gRPC compiler auto-generated maxine bnr library 33 | import bnr_pb2 # noqa: E402 34 | import bnr_pb2_grpc # noqa: E402 35 | 36 | # Sample rate constants 37 | CONST_SAMPLE_48KHZ = 48000 38 | CONST_SAMPLE_16KHZ = 16000 39 | 40 | 41 | def read_file_content(file_path: os.PathLike) -> None: 42 | """Function to read file content as bytes. 43 | 44 | Args: 45 | file_path: Path to input file 46 | """ 47 | if not os.path.isfile(file_path): 48 | raise FileNotFoundError(f"The file '{file_path}' does not exist. Exiting.") 49 | 50 | with open(file_path, "rb") as file: 51 | return file.read() 52 | 53 | 54 | def generate_request_for_inference( 55 | input_filepath: os.PathLike, 56 | sample_rate: int, 57 | streaming: bool, 58 | intensity_ratio: float = None, 59 | progress_bar: Optional[tqdm] = None, 60 | ) -> None: 61 | """Generator to produce the request data stream 62 | 63 | Args: 64 | input_filepath: Path to input file 65 | sample_rate: Input audio sample rate 66 | streaming: Enables grpc streaming mode 67 | intensity_ratio: Controls denoising intensity (0.0 to 1.0), only works with v1 models 68 | progress_bar: (Optional) Progress bar instance (streaming mode only) 69 | """ 70 | # First send the config if intensity_ratio is specified for v1 models 71 | if intensity_ratio is not None: 72 | config_request = bnr_pb2.EnhanceAudioRequest( 73 | config=bnr_pb2.EnhanceAudioConfig(intensity_ratio=intensity_ratio) 74 | ) 75 | config_request.config.intensity_ratio = intensity_ratio 76 | yield config_request 77 | 78 | if streaming: 79 | """ 80 | Input audio chunk is generated based on sample rate and input size 10ms, 81 | """ 82 | input_audio, sample_rate_file = sf.read(input_filepath) 83 | input_audio = input_audio.astype(np.float32) # Convert to float32 84 | input_size_in_ms = 10 85 | samples_per_ms = sample_rate // 1000 86 | input_float_size = int(input_size_in_ms * samples_per_ms) 87 | 88 | pad_length = input_float_size - len(input_audio) % input_float_size 89 | input_audio = np.pad(input_audio, (0, pad_length), "constant") 90 | 91 | if progress_bar is not None: 92 | progress_bar.total = len(input_audio) // input_float_size 93 | 94 | print( 95 | f"Len {len(input_audio)}, chunk_size {input_float_size}, audio {input_audio}, " 96 | f"type {input_audio.dtype}" 97 | ) 98 | 99 | print( 100 | f"Will process {len(input_audio)//sample_rate} seconds of input audio in " 101 | f"{input_size_in_ms} ms chunks" 102 | ) 103 | for i in range(0, len(input_audio), input_float_size): 104 | data = input_audio[i : i + input_float_size] 105 | yield bnr_pb2.EnhanceAudioRequest(audio_stream_data=data.tobytes()) 106 | else: 107 | DATA_CHUNKS = 64 * 1024 # bytes, we send the wav file in 64KB chunks 108 | with open(input_filepath, "rb") as fd: 109 | while True: 110 | buffer = fd.read(DATA_CHUNKS) 111 | if buffer == b"": 112 | break 113 | yield bnr_pb2.EnhanceAudioRequest(audio_stream_data=buffer) 114 | 115 | 116 | def write_output_file_from_response( 117 | response_iter: Iterator[bnr_pb2.EnhanceAudioResponse], 118 | output_filepath: os.PathLike, 119 | sample_rate: int, 120 | streaming: bool, 121 | progress_bar: Optional[tqdm], 122 | ) -> None: 123 | """Function to write the output file from the incoming gRPC data stream. 
124 | 125 | Args: 126 | response_iter: Responses from the server to write into output file 127 | output_filepath: Path to output file 128 | sample_rate: Input audio sample rate 129 | streaming: Enables grpc streaming mode 130 | progress_bar: (Optional) Progress bar instance (streaming mode only) 131 | """ 132 | if streaming: 133 | output_audio = [] 134 | response_count = 0 135 | for response in response_iter: 136 | if response.HasField("audio_stream_data"): 137 | response_count += 1 138 | if progress_bar is not None: 139 | progress_bar.update(1) 140 | output_audio.append(np.frombuffer(response.audio_stream_data, np.float32)) 141 | 142 | sf.write(output_filepath, np.hstack(output_audio), sample_rate) 143 | if progress_bar: 144 | progress_bar.close() 145 | return response_count 146 | else: 147 | with open(output_filepath, "wb") as fd: 148 | for response in response_iter: 149 | if response.HasField("audio_stream_data"): 150 | fd.write(response.audio_stream_data) 151 | 152 | 153 | def parse_args() -> None: 154 | """ 155 | Parse command-line arguments using argparse. 156 | """ 157 | # Set up argument parsing 158 | parser = argparse.ArgumentParser( 159 | description="Process wav audio files using gRPC and apply bnr." 160 | ) 161 | parser.add_argument( 162 | "--preview-mode", 163 | action="store_true", 164 | help="Flag to send request to preview NVCF NIM server on " 165 | "https://build.nvidia.com/nvidia/bnr/api", 166 | ) 167 | parser.add_argument( 168 | "--ssl-mode", 169 | type=str, 170 | help="Flag to set SSL mode, default is None", 171 | default=None, 172 | choices=["MTLS", "TLS"], 173 | ) 174 | parser.add_argument( 175 | "--ssl-key", 176 | type=str, 177 | default=None, 178 | help="The path to ssl private key.", 179 | ) 180 | parser.add_argument( 181 | "--ssl-cert", 182 | type=str, 183 | default=None, 184 | help="The path to ssl certificate chain.", 185 | ) 186 | parser.add_argument( 187 | "--ssl-root-cert", 188 | type=str, 189 | default=None, 190 | help="The path to ssl root certificate.", 191 | ) 192 | parser.add_argument( 193 | "--target", 194 | type=str, 195 | default="127.0.0.1:8001", 196 | help="IP:port of gRPC service, when hosted locally. " 197 | "Use grpc.nvcf.nvidia.com:443 when hosted on NVCF.", 198 | ) 199 | parser.add_argument( 200 | "--input", 201 | type=str, 202 | default="../assets/bnr_48k_input.wav", 203 | help="The path to the input audio file.", 204 | ) 205 | parser.add_argument( 206 | "--output", 207 | type=str, 208 | default="bnr_48k_output.wav", 209 | help="The path for the output audio file.", 210 | ) 211 | parser.add_argument( 212 | "--api-key", 213 | type=str, 214 | help="NGC API key required for authentication, " 215 | "utilized when using TRY API ignored otherwise", 216 | ) 217 | parser.add_argument( 218 | "--function-id", 219 | type=str, 220 | help="NVCF function ID for the service, utilized when using TRY API ignored otherwise", 221 | ) 222 | parser.add_argument( 223 | "--streaming", 224 | action="store_true", 225 | help="Flag to enable grpc streaming mode. ", 226 | ) 227 | parser.add_argument( 228 | "--sample-rate", 229 | type=int, 230 | help="Sample rate of input audio file in Hz, default is 48000.", 231 | default=CONST_SAMPLE_48KHZ, 232 | choices=[CONST_SAMPLE_48KHZ, CONST_SAMPLE_16KHZ], 233 | ) 234 | 235 | parser.add_argument( 236 | "--intensity-ratio", 237 | type=float, 238 | help=( 239 | "Intensity ratio value between 0 and 1 to control denoising intensity. " 240 | "Default is 1.0 (maximum denoising)." 
241 | ), 242 | default=None, 243 | ) 244 | args = parser.parse_args() 245 | 246 | # Validate intensity_ratio value 247 | if args.intensity_ratio is not None and ( 248 | args.intensity_ratio < 0.0 or args.intensity_ratio > 1.0 249 | ): 250 | parser.error("Intensity ratio value must be between 0.0 and 1.0") 251 | 252 | return args 253 | 254 | 255 | def process_request( 256 | channel: any, 257 | input_filepath: os.PathLike, 258 | output_filepath: os.PathLike, 259 | sample_rate: int, 260 | streaming: bool, 261 | request_metadata: dict = None, 262 | intensity_ratio: float = None, 263 | ) -> None: 264 | """Function to process gRPC request 265 | 266 | Args: 267 | channel: gRPC channel for server client communication 268 | input_filepath: Path to input file 269 | output_filepath: Path to output file 270 | sample_rate: Input audio sample rate 271 | streaming: Enables grpc streaming mode 272 | request_metadata: Credentials to process request 273 | intensity_ratio: Controls denoising intensity (0.0 to 1.0) 274 | """ 275 | try: 276 | stub = bnr_pb2_grpc.MaxineBNRStub(channel) 277 | start_time = time.time() 278 | 279 | progress_bar = None 280 | if streaming: 281 | progress_bar = tqdm() 282 | 283 | responses = stub.EnhanceAudio( 284 | generate_request_for_inference( 285 | input_filepath=input_filepath, 286 | sample_rate=sample_rate, 287 | streaming=streaming, 288 | intensity_ratio=intensity_ratio, 289 | progress_bar=progress_bar, 290 | ), 291 | metadata=request_metadata, 292 | ) 293 | 294 | response_count = write_output_file_from_response( 295 | response_iter=responses, 296 | output_filepath=output_filepath, 297 | sample_rate=sample_rate, 298 | streaming=streaming, 299 | progress_bar=progress_bar, 300 | ) 301 | 302 | end_time = time.time() 303 | if streaming: 304 | avg_latency = (end_time - start_time) / response_count 305 | print(f"Average latency per request: {avg_latency*1000:.2f}ms") 306 | print(f"Processed {response_count} chunks.") 307 | 308 | print( 309 | f"Function invocation completed in {end_time-start_time:.2f}s, " 310 | "the output file is generated." 311 | ) 312 | except BaseException as e: 313 | print(e) 314 | 315 | 316 | def main(): 317 | """ 318 | Main client function 319 | """ 320 | args = parse_args() 321 | streaming = args.streaming 322 | print(f"Streaming mode set to {streaming}") 323 | sample_rate = CONST_SAMPLE_48KHZ 324 | if args.sample_rate == CONST_SAMPLE_16KHZ: 325 | sample_rate = CONST_SAMPLE_16KHZ 326 | print(f"Sample Rate: {sample_rate}") 327 | input_filepath = args.input 328 | output_filepath = args.output 329 | 330 | # Check if input file path exists 331 | if os.path.isfile(input_filepath): 332 | print(f"The file '{input_filepath}' exists. Proceeding with processing.") 333 | else: 334 | raise FileNotFoundError(f"The file '{input_filepath}' does not exist. 
Exiting.") 335 | 336 | # Check the sample rate of the input audio file 337 | input_info = sf.info(input_filepath) 338 | input_sample_rate = input_info.samplerate 339 | print(f"Input file sample rate: {input_sample_rate}") 340 | 341 | # Check if the input file's sample rate matches the expected sample rate 342 | if input_sample_rate != sample_rate: 343 | raise ValueError(f"Sample rate mismatch: expected {sample_rate}, got {input_sample_rate}.") 344 | 345 | if args.preview_mode: 346 | if args.ssl_mode != "TLS": 347 | # Preview mode only supports TLS mode 348 | args.ssl_mode = "TLS" 349 | print("--ssl-mode is set as TLS, since preview_mode is enabled.") 350 | if args.ssl_root_cert: 351 | raise RuntimeError("Preview mode does not support custom root certificate.") 352 | 353 | if args.ssl_mode is not None: 354 | request_metadata = None 355 | root_certificates = None 356 | if args.ssl_mode == "MTLS": 357 | if not (args.ssl_key and args.ssl_cert and args.ssl_root_cert): 358 | raise RuntimeError( 359 | "If --ssl-mode is MTLS, --ssl-key, --ssl-cert and " 360 | "--ssl-root-cert are required." 361 | ) 362 | 363 | private_key = read_file_content(args.ssl_key) 364 | certificate_chain = read_file_content(args.ssl_cert) 365 | root_certificates = read_file_content(args.ssl_root_cert) 366 | channel_credentials = grpc.ssl_channel_credentials( 367 | root_certificates=root_certificates, 368 | private_key=private_key, 369 | certificate_chain=certificate_chain, 370 | ) 371 | else: 372 | # Running with NVCF 373 | if args.preview_mode: 374 | request_metadata = ( 375 | ("authorization", "Bearer {}".format(args.api_key)), 376 | ("function-id", args.function_id), 377 | ) 378 | channel_credentials = grpc.ssl_channel_credentials() 379 | # Running TLS mode, without NVCF 380 | else: 381 | if not (args.ssl_root_cert): 382 | raise RuntimeError("If --ssl-mode is TLS, --ssl-root-cert is required.") 383 | root_certificates = read_file_content(args.ssl_root_cert) 384 | channel_credentials = grpc.ssl_channel_credentials( 385 | root_certificates=root_certificates 386 | ) 387 | 388 | with grpc.secure_channel(target=args.target, credentials=channel_credentials) as channel: 389 | process_request( 390 | channel=channel, 391 | input_filepath=input_filepath, 392 | output_filepath=output_filepath, 393 | sample_rate=sample_rate, 394 | streaming=streaming, 395 | request_metadata=request_metadata, 396 | intensity_ratio=args.intensity_ratio, 397 | ) 398 | else: 399 | with grpc.insecure_channel(target=args.target) as channel: 400 | process_request( 401 | channel=channel, 402 | input_filepath=input_filepath, 403 | output_filepath=output_filepath, 404 | sample_rate=sample_rate, 405 | streaming=streaming, 406 | intensity_ratio=args.intensity_ratio, 407 | ) 408 | 409 | 410 | if __name__ == "__main__": 411 | main() 412 | --------------------------------------------------------------------------------