├── vvembed
    ├── __init__.py
    ├── modular
    │   ├── __init__.py
    │   ├── modular_vibevoice_text_tokenizer.py
    │   ├── modular_vibevoice_diffusion_head.py
    │   ├── streamer.py
    │   ├── configuration_vibevoice.py
    │   └── modeling_vibevoice.py
    ├── schedule
    │   ├── __init__.py
    │   └── timestep_sampler.py
    ├── scripts
    │   ├── __init__.py
    │   └── convert_nnscaler_checkpoint_to_transformers.py
    ├── processor
    │   ├── __init__.py
    │   └── vibevoice_tokenizer_processor.py
    ├── README.md
    └── LICENSE
├── requirements.txt
├── node_list.json
├── nodes
    ├── __init__.py
    ├── free_memory_node.py
    ├── load_text_node.py
    ├── lora_node.py
    ├── single_speaker_node.py
    └── multi_speaker_node.py
├── pyproject.toml
├── LICENSE
├── __init__.py
├── examples
    ├── Single-Speaker.json
    ├── Pause-Tag.json
    ├── Multiple-Speaker.json
    └── VibeVoice-Unload-Memory.json
└── README.md
/vvembed/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/vvembed/modular/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/vvembed/schedule/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/vvembed/scripts/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/vvembed/processor/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | accelerate>=1.6.0
 2 | transformers>=4.51.3
 3 | diffusers
 4 | tqdm
 5 | scipy
 6 | ml-collections
 7 | torch>=2.0.0
 8 | torchaudio>=2.0.0
 9 | numpy>=1.20.0
10 | librosa>=0.9.0
11 | soundfile>=0.12.0
12 | av>=14.3.0
13 | peft>=0.17.0
14 | huggingface_hub>=0.25.1
15 | absl-py
16 | aiortc
17 | bitsandbytes>=0.48.1
18 | protobuf
--------------------------------------------------------------------------------
/node_list.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "VibeVoice Load Text From File": "Load .txt from ComfyUI input/output/temp",
 3 |   "VibeVoice Single Speaker": "Single-speaker TTS with optional voice cloning",
 4 |   "VibeVoice Multiple Speakers": "Multi-speaker TTS ([1]..[4]) with optional clones",
 5 |   "VibeVoice Free Memory": "Frees loaded VibeVoice models; passthrough audio",
 6 |   "VibeVoice LoRA": "Configure LoRA adapters for fine-tuned VibeVoice models"
 7 | }
--------------------------------------------------------------------------------
/nodes/__init__.py:
--------------------------------------------------------------------------------
 1 | # Created by Fabio Sarracino
 2 | # Nodes module for VibeVoiceWrapper
 3 | """
 4 | This module contains all the ComfyUI nodes for VibeVoice integration.
5 | """ 6 | 7 | from .load_text_node import LoadTextFromFileNode 8 | from .single_speaker_node import VibeVoiceSingleSpeakerNode 9 | from .multi_speaker_node import VibeVoiceMultipleSpeakersNode 10 | from .free_memory_node import VibeVoiceFreeMemoryNode 11 | 12 | __all__ = [ 13 | 'LoadTextFromFileNode', 14 | 'VibeVoiceSingleSpeakerNode', 15 | 'VibeVoiceMultipleSpeakersNode', 16 | 'VibeVoiceFreeMemoryNode' 17 | ] -------------------------------------------------------------------------------- /vvembed/schedule/timestep_sampler.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | 5 | class UniformSampler: 6 | def __init__(self, timesteps = 1000): 7 | self.timesteps = timesteps 8 | def sample(self, batch_size, device): 9 | return torch.randint(0, self.timesteps, (batch_size,), device=device) 10 | 11 | class LogitNormalSampler: 12 | def __init__(self, timesteps = 1000, m = 0, s = 1): 13 | self.timesteps = timesteps 14 | timesteps = torch.linspace(0, 1, timesteps) 15 | logit = torch.log(timesteps / (1 - timesteps)) 16 | self.prob = torch.exp(-0.5 * (logit - m) ** 2 / s ** 2) / (s * math.sqrt(2 * math.pi)) 17 | def sample(self, batch_size, device): 18 | return torch.multinomial(self.prob, batch_size, replacement=True).to(device) 19 | -------------------------------------------------------------------------------- /vvembed/README.md: -------------------------------------------------------------------------------- 1 | # Embedded VibeVoice 2 | 3 | This folder contains the embedded VibeVoice code from Microsoft. 4 | 5 | ## Why Embedded? 6 | 7 | The original VibeVoice repository (https://github.com/microsoft/VibeVoice) has been removed from GitHub. Since VibeVoice is licensed under MIT, we have embedded the code here to ensure continued functionality of the ComfyUI wrapper. 8 | 9 | ## License 10 | 11 | The code in this folder is licensed under the MIT License (see LICENSE file). Original copyright belongs to Microsoft Corporation. 12 | 13 | ## Modifications 14 | 15 | The only modifications made to the original code are: 16 | - Changed absolute imports from `vibevoice` to relative imports 17 | - No functional changes to the core logic 18 | 19 | ## Note 20 | 21 | This is a preservation copy to ensure the continued availability of VibeVoice for the ComfyUI community. 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "VibeVoice-ComfyUI" 3 | version = "1.8.1" 4 | description = "ComfyUI wrapper for Microsoft VibeVoice TTS model. 
Supports single speaker, multi-speaker, and text file loading" 5 | license = {file = "LICENSE"} 6 | authors = [{name = "Fabio Sarracino"}] 7 | dependencies = ["accelerate>=1.6.0", "transformers>=4.51.3", "diffusers", "tqdm", "scipy", "ml-collections", "torch>=2.0.0", "torchaudio>=2.0.0", "numpy>=1.20.0", "librosa>=0.9.0", "soundfile>=0.12.0", "av>=14.3.0", "peft>=0.17.0", "huggingface_hub>=0.25.1", "absl-py", "aiortc", "bitsandbytes>=0.48.1", "protobuf"] 8 | 9 | [project.urls] 10 | Repository = "https://github.com/Enemyx-net/VibeVoice-ComfyUI" 11 | "Bug Tracker" = "https://github.com/Enemyx-net/VibeVoice-ComfyUI/issues" 12 | 13 | [tool.comfy] 14 | PublisherId = "enemyx" 15 | DisplayName = "VibeVoice ComfyUI" 16 | Icon = "" 17 | includes = [] 18 | 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Fabio Sarracino - enemyx.net 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /vvembed/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | --- 24 | 25 | This is the original VibeVoice code from Microsoft, embedded here as the 26 | repository has been removed from GitHub. The code is used under the MIT license. -------------------------------------------------------------------------------- /nodes/free_memory_node.py: -------------------------------------------------------------------------------- 1 | # Created by Fabio Sarracino 2 | # Node to free VibeVoice model memory 3 | 4 | import logging 5 | import torch 6 | import gc 7 | from typing import Any 8 | 9 | # Setup logging 10 | logger = logging.getLogger("VibeVoice") 11 | 12 | class VibeVoiceFreeMemoryNode: 13 | """Node to explicitly free VibeVoice model memory""" 14 | 15 | # Class variables to store node instances 16 | _single_speaker_instances = [] 17 | _multi_speaker_instances = [] 18 | 19 | @classmethod 20 | def INPUT_TYPES(cls): 21 | return { 22 | "required": { 23 | "audio": ("AUDIO", {"tooltip": "Audio input that triggers memory cleanup and gets passed through"}), 24 | } 25 | } 26 | 27 | RETURN_TYPES = ("AUDIO",) 28 | RETURN_NAMES = ("audio",) 29 | FUNCTION = "free_vibevoice_memory" 30 | CATEGORY = "VibeVoiceWrapper" 31 | DESCRIPTION = "Free all loaded VibeVoice models from memory when audio passes through" 32 | 33 | @classmethod 34 | def register_single_speaker(cls, node_instance): 35 | """Register a single speaker node instance""" 36 | if node_instance not in cls._single_speaker_instances: 37 | cls._single_speaker_instances.append(node_instance) 38 | 39 | @classmethod 40 | def register_multi_speaker(cls, node_instance): 41 | """Register a multi speaker node instance""" 42 | if node_instance not in cls._multi_speaker_instances: 43 | cls._multi_speaker_instances.append(node_instance) 44 | 45 | def free_vibevoice_memory(self, audio): 46 | """Free memory from all VibeVoice nodes and pass through the audio""" 47 | 48 | try: 49 | freed_count = 0 50 | 51 | # Try to access and free memory from globally cached instances 52 | # ComfyUI might cache node instances 53 | try: 54 | import sys 55 | from .base_vibevoice import BaseVibeVoiceNode 56 | 57 | # Search in all modules for BaseVibeVoiceNode instances 58 | for module_name, module in sys.modules.items(): 59 | if module and 'vibevoice' in module_name.lower(): 60 | for attr_name in dir(module): 61 | if not attr_name.startswith('_'): 62 | try: 63 | attr = getattr(module, attr_name) 64 | if isinstance(attr, type) and issubclass(attr, BaseVibeVoiceNode): 65 | # Check if the class has any cached instances 66 | for instance_attr in dir(attr): 67 | instance = getattr(attr, instance_attr) 68 | if isinstance(instance, BaseVibeVoiceNode) and hasattr(instance, 'free_memory'): 69 | instance.free_memory() 70 | freed_count += 1 71 | except: 72 | pass 73 | except: 74 | pass 75 | 76 | # Free from registered single speaker instances 77 | for node in self._single_speaker_instances: 78 | if hasattr(node, 'free_memory'): 79 | node.free_memory() 80 | freed_count += 1 81 | 82 | # Free from registered multi speaker instances 83 | for node in self._multi_speaker_instances: 84 | if hasattr(node, 'free_memory'): 85 | node.free_memory() 86 | freed_count += 1 87 | 88 | # Force garbage collection 89 | gc.collect() 90 | 91 | # Clear CUDA cache if available 92 | if torch.cuda.is_available(): 93 | torch.cuda.empty_cache() 94 | torch.cuda.synchronize() 95 | logger.info(f"Freed VibeVoice memory from {freed_count} nodes and cleared CUDA cache") 96 | else: 97 | logger.info(f"Freed VibeVoice memory from {freed_count} nodes") 98 | 99 | # Pass through the 
audio unchanged 100 | return (audio,) 101 | 102 | except Exception as e: 103 | logger.error(f"Error freeing VibeVoice memory: {str(e)}") 104 | # Still pass through audio even if error occurs 105 | return (audio,) 106 | 107 | @classmethod 108 | def IS_CHANGED(cls, **kwargs): 109 | """Always execute this node""" 110 | return float("nan") # Forces re-execution every time -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Created by Fabio Sarracino 2 | __version__ = "1.8.1" 3 | __author__ = "Fabio Sarracino" 4 | __title__ = "VibeVoice ComfyUI" 5 | 6 | import logging 7 | import os 8 | import sys 9 | import subprocess 10 | 11 | # Setup logging 12 | logger = logging.getLogger("VibeVoice") 13 | logger.propagate = False 14 | 15 | if not logger.handlers: 16 | handler = logging.StreamHandler() 17 | formatter = logging.Formatter('[VibeVoice] %(message)s') 18 | handler.setFormatter(formatter) 19 | logger.addHandler(handler) 20 | logger.setLevel(logging.INFO) 21 | 22 | def apply_timm_compatibility_patches(): 23 | """Apply compatibility patches for timm package conflicts""" 24 | try: 25 | import timm.data 26 | 27 | # Patch missing functions that cause import errors 28 | patches = { 29 | 'ImageNetInfo': lambda: type('ImageNetInfo', (), {'__init__': lambda self: None})(), 30 | 'infer_imagenet_subset': lambda class_to_idx: 'imagenet', 31 | 'get_imagenet_subset_labels': lambda *args, **kwargs: [], 32 | 'get_imagenet_subset_info': lambda *args, **kwargs: {}, 33 | 'resolve_data_config': lambda *args, **kwargs: {} 34 | } 35 | 36 | for attr_name, patch_func in patches.items(): 37 | if not hasattr(timm.data, attr_name): 38 | if attr_name == 'ImageNetInfo': 39 | setattr(timm.data, attr_name, type('ImageNetInfo', (), {'__init__': lambda self: None})) 40 | else: 41 | setattr(timm.data, attr_name, patch_func) 42 | 43 | return True 44 | except Exception as e: 45 | return False 46 | 47 | def check_embedded_vibevoice(): 48 | """Check if embedded VibeVoice is available""" 49 | vvembed_path = os.path.join(os.path.dirname(__file__), 'vvembed') 50 | if not os.path.exists(vvembed_path): 51 | logger.error(f"Embedded VibeVoice not found at {vvembed_path}") 52 | return False 53 | 54 | # Add vvembed to path if not already there 55 | if vvembed_path not in sys.path: 56 | sys.path.insert(0, vvembed_path) 57 | 58 | logger.info("Using embedded VibeVoice (MIT licensed)") 59 | return True 60 | 61 | def ensure_dependencies(): 62 | """Ensure required dependencies are installed""" 63 | try: 64 | import transformers 65 | from packaging import version 66 | if version.parse(transformers.__version__) < version.parse("4.44.0"): 67 | logger.warning("Transformers version < 4.44.0, some features may not work correctly") 68 | except ImportError: 69 | logger.warning("Transformers not installed. 
Please install: pip install transformers>=4.44.0") 70 | return False 71 | 72 | # Apply timm patches if needed 73 | apply_timm_compatibility_patches() 74 | 75 | return True 76 | 77 | # Initialize node mappings 78 | NODE_CLASS_MAPPINGS = {} 79 | NODE_DISPLAY_NAME_MAPPINGS = {} 80 | 81 | # Register text loading node (always available) 82 | try: 83 | from .nodes.load_text_node import LoadTextFromFileNode 84 | NODE_CLASS_MAPPINGS["LoadTextFromFileNode"] = LoadTextFromFileNode 85 | NODE_DISPLAY_NAME_MAPPINGS["LoadTextFromFileNode"] = "VibeVoice Load Text From File" 86 | except Exception as e: 87 | logger.error(f"Failed to register LoadTextFromFile node: {e}") 88 | 89 | # Register VibeVoice nodes (using embedded VibeVoice) 90 | if check_embedded_vibevoice() and ensure_dependencies(): 91 | try: 92 | from .nodes.single_speaker_node import VibeVoiceSingleSpeakerNode 93 | from .nodes.multi_speaker_node import VibeVoiceMultipleSpeakersNode 94 | from .nodes.free_memory_node import VibeVoiceFreeMemoryNode 95 | from .nodes.lora_node import VibeVoiceLoRANode 96 | 97 | # Single speaker node 98 | NODE_CLASS_MAPPINGS["VibeVoiceSingleSpeakerNode"] = VibeVoiceSingleSpeakerNode 99 | NODE_DISPLAY_NAME_MAPPINGS["VibeVoiceSingleSpeakerNode"] = "VibeVoice Single Speaker" 100 | 101 | # Multi speaker node 102 | NODE_CLASS_MAPPINGS["VibeVoiceMultipleSpeakersNode"] = VibeVoiceMultipleSpeakersNode 103 | NODE_DISPLAY_NAME_MAPPINGS["VibeVoiceMultipleSpeakersNode"] = "VibeVoice Multiple Speakers" 104 | 105 | # Free memory node 106 | NODE_CLASS_MAPPINGS["VibeVoiceFreeMemoryNode"] = VibeVoiceFreeMemoryNode 107 | NODE_DISPLAY_NAME_MAPPINGS["VibeVoiceFreeMemoryNode"] = "VibeVoice Free Memory" 108 | 109 | # LoRA configuration node 110 | NODE_CLASS_MAPPINGS["VibeVoiceLoRANode"] = VibeVoiceLoRANode 111 | NODE_DISPLAY_NAME_MAPPINGS["VibeVoiceLoRANode"] = "VibeVoice LoRA" 112 | 113 | logger.info("VibeVoice nodes registered successfully") 114 | 115 | except Exception as e: 116 | logger.error(f"Failed to register VibeVoice nodes: {e}") 117 | logger.info("Please ensure transformers>=4.44.0 is installed") 118 | else: 119 | logger.warning("VibeVoice nodes unavailable - check embedded module and dependencies") 120 | 121 | __all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS', '__version__'] -------------------------------------------------------------------------------- /examples/Single-Speaker.json: -------------------------------------------------------------------------------- 1 | {"id":"c6ef8963-032c-45f6-954f-b5f6b354343b","revision":0,"last_node_id":44,"last_link_id":61,"nodes":[{"id":15,"type":"LoadAudio","pos":[15.256911277770996,126.44892883300781],"size":[270,136],"flags":{},"order":3,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"COMBO","widget":{"name":"audio"},"link":null},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null},{"localized_name":"upload","name":"upload","type":"AUDIOUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"AUDIO","name":"AUDIO","type":"AUDIO","links":[60]}],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"LoadAudio"},"widgets_values":["Voice.mp3",null,null],"color":"#2a363b","bgcolor":"#3f5159"},{"id":21,"type":"Note","pos":[-83.88814544677734,580.3738403320312],"size":[415,88],"flags":{},"order":4,"mode":0,"inputs":[],"outputs":[],"title":"Load Text From File","properties":{},"widgets_values":["Use Load Text From File if you want to use a .txt file instead of text-area. 
You can load .txt files from ComfyUI/input, ComfyUI/output or ComfyUI/temp directories."],"color":"#432","bgcolor":"#653"},{"id":40,"type":"Note","pos":[377.95758056640625,593.4078979492188],"size":[415,88],"flags":{},"order":5,"mode":0,"inputs":[],"outputs":[],"title":"Voice Speed Factor","properties":{},"widgets_values":["The voice speed factor influences the original source audio to attempt to achieve a slower or faster final speech. 1.0 is the normal speed. It is recommended not to exceed values between 0.95 and 1.05. The effect is best when you provide a sample audio of at least 20 seconds."],"color":"#432","bgcolor":"#653"},{"id":16,"type":"PreviewAudio","pos":[894.1837768554688,126.69258117675781],"size":[270,88],"flags":{},"order":7,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","link":61},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null}],"outputs":[],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"PreviewAudio"},"widgets_values":[],"color":"#323","bgcolor":"#535"},{"id":44,"type":"VibeVoiceSingleSpeakerNode","pos":[388.8460693359375,126.70189666748047],"size":[400,420],"flags":{},"order":6,"mode":0,"inputs":[{"localized_name":"voice_to_clone","name":"voice_to_clone","shape":7,"type":"AUDIO","link":60},{"localized_name":"lora","name":"lora","shape":7,"type":"LORA_CONFIG","link":null},{"localized_name":"text","name":"text","type":"STRING","widget":{"name":"text"},"link":null},{"localized_name":"model","name":"model","type":"COMBO","widget":{"name":"model"},"link":null},{"localized_name":"attention_type","name":"attention_type","type":"COMBO","widget":{"name":"attention_type"},"link":null},{"localized_name":"quantize_llm","name":"quantize_llm","type":"COMBO","widget":{"name":"quantize_llm"},"link":null},{"localized_name":"free_memory_after_generate","name":"free_memory_after_generate","type":"BOOLEAN","widget":{"name":"free_memory_after_generate"},"link":null},{"localized_name":"diffusion_steps","name":"diffusion_steps","type":"INT","widget":{"name":"diffusion_steps"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null},{"localized_name":"cfg_scale","name":"cfg_scale","type":"FLOAT","widget":{"name":"cfg_scale"},"link":null},{"localized_name":"use_sampling","name":"use_sampling","type":"BOOLEAN","widget":{"name":"use_sampling"},"link":null},{"localized_name":"temperature","name":"temperature","shape":7,"type":"FLOAT","widget":{"name":"temperature"},"link":null},{"localized_name":"top_p","name":"top_p","shape":7,"type":"FLOAT","widget":{"name":"top_p"},"link":null},{"localized_name":"max_words_per_chunk","name":"max_words_per_chunk","shape":7,"type":"INT","widget":{"name":"max_words_per_chunk"},"link":null},{"localized_name":"voice_speed_factor","name":"voice_speed_factor","shape":7,"type":"FLOAT","widget":{"name":"voice_speed_factor"},"link":null}],"outputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","links":[61]}],"properties":{"Node name for S&R":"VibeVoiceSingleSpeakerNode"},"widgets_values":["Hello, this is a test of the VibeVoice text-to-speech system.","VibeVoice-1.5B","auto","full 
precision",true,20,42,"fixed",1.3,false,0.95,0.95,250,1],"color":"#223","bgcolor":"#335"},{"id":28,"type":"LoadTextFromFileNode","pos":[-11.502296447753906,465.8179626464844],"size":[289.5152282714844,58],"flags":{},"order":0,"mode":4,"inputs":[{"localized_name":"file","name":"file","type":"COMBO","widget":{"name":"file"},"link":null}],"outputs":[{"localized_name":"text","name":"text","type":"STRING","links":null}],"properties":{"Node name for S&R":"LoadTextFromFileNode","cnr_id":"VibeVoice-ComfyUI","ver":"5a24489a7b0bf0c406d291dd51e82a085d338d44"},"widgets_values":["No text files found in any directory"],"color":"#323","bgcolor":"#535"},{"id":22,"type":"Note","pos":[-539.2780151367188,186.78372192382812],"size":[408.66363525390625,236.39089965820312],"flags":{},"order":1,"mode":0,"inputs":[],"outputs":[],"title":"1) Download Models","properties":{},"widgets_values":["You have to manually download the models you would like to use and put them into: ComfyUI/models/vibevoice/\n\nMake a directory for each model and put all the files inside them.\n\nVibeVoice-1.5B model (~ 5.4 GB):\nhttps://huggingface.co/microsoft/VibeVoice-1.5B/tree/main\n\nVibeVoice-Large model (~ 18.7 GB):\nhttps://huggingface.co/aoi-ot/VibeVoice-Large/tree/main\n\nVibeVoice-Large-Q-8bit model (~ 11.6 GB):\nhttps://huggingface.co/FabioSarracino/VibeVoice-Large-Q8/tree/main\n\nVibeVoice-Large-Q-4bit model (~ 6.6 GB):\nhttps://huggingface.co/DevParker/VibeVoice7b-low-vram/tree/main/4bit"],"color":"#432","bgcolor":"#653"},{"id":42,"type":"Note","pos":[-538.9786987304688,486.52374267578125],"size":[407.2561950683594,155.19009399414062],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[],"title":"2) Download Tokenizer","properties":{},"widgets_values":["You have to manually download the Qwen2.5 Tokenizer files and put them into: ComfyUI/models/vibevoice/tokenizer/\n\nhttps://huggingface.co/Qwen/Qwen2.5-1.5B/tree/main\n\nRequired files: tokenizer_config.json, vocab.json, merges.txt, tokenizer.json (~11MB)\n\nPut the files directly inside tokenizer directory without make another directory inside."],"color":"#432","bgcolor":"#653"}],"links":[[60,15,0,44,0,"AUDIO"],[61,44,0,16,0,"AUDIO"]],"groups":[{"id":2,"title":"Instructions before use:","bounding":[-562.1800537109375,89.24514770507812,453.3775939941406,595.2697143554688],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"ds":{"scale":0.9090909090909091,"offset":[795.8030854327329,-23.374334793282447]}},"version":0.4} -------------------------------------------------------------------------------- /nodes/load_text_node.py: -------------------------------------------------------------------------------- 1 | # Created by Fabio Sarracino 2 | 3 | import os 4 | import logging 5 | import hashlib 6 | import folder_paths 7 | 8 | # Setup logging 9 | logger = logging.getLogger("VibeVoice") 10 | 11 | class LoadTextFromFileNode: 12 | @classmethod 13 | def INPUT_TYPES(cls): 14 | # Get all text files from all directories 15 | all_files = [] 16 | 17 | # Add files from each directory with prefix 18 | for dir_name in ["input", "output", "temp"]: 19 | files = cls.get_files_for_directory(dir_name) 20 | for f in files: 21 | if f != "No text files found": 22 | all_files.append(f"{dir_name}/{f}") 23 | 24 | if not all_files: 25 | all_files = ["No text files found in any directory"] 26 | 27 | return { 28 | "required": { 29 | "file": (sorted(all_files), { 30 | "tooltip": "Select a text file to load (format: directory/filename)" 31 | }), 32 | } 33 | } 34 | 35 | @classmethod 36 | def 
get_files_for_directory(cls, source_dir): 37 | """Get list of text files for the selected directory""" 38 | # Get the appropriate directory path 39 | if source_dir == "input": 40 | dir_path = folder_paths.get_input_directory() 41 | elif source_dir == "output": 42 | dir_path = folder_paths.get_output_directory() 43 | elif source_dir == "temp": 44 | dir_path = folder_paths.get_temp_directory() 45 | else: 46 | return [] 47 | 48 | files = [] 49 | try: 50 | for f in os.listdir(dir_path): 51 | if os.path.isfile(os.path.join(dir_path, f)): 52 | # Check for text file extensions 53 | if f.lower().endswith(('.txt')): 54 | files.append(f) 55 | except Exception as e: 56 | logger.warning(f"Error listing files in {source_dir}: {e}") 57 | 58 | return files 59 | 60 | RETURN_TYPES = ("STRING",) 61 | RETURN_NAMES = ("text",) 62 | FUNCTION = "load_text" 63 | CATEGORY = "VibeVoiceWrapper" 64 | DESCRIPTION = "Load text content from a .txt file" 65 | 66 | def load_text(self, file: str): 67 | """Load text content from file""" 68 | 69 | try: 70 | # Check if no file selected 71 | if not file or file == "No text files found in any directory": 72 | raise Exception("Please select a valid text file.") 73 | 74 | # Parse directory and filename from the combined string 75 | if "/" not in file: 76 | raise Exception(f"Invalid file format: {file}") 77 | 78 | source_dir, filename = file.split("/", 1) 79 | 80 | # Get the appropriate directory path 81 | if source_dir == "input": 82 | dir_path = folder_paths.get_input_directory() 83 | elif source_dir == "output": 84 | dir_path = folder_paths.get_output_directory() 85 | elif source_dir == "temp": 86 | dir_path = folder_paths.get_temp_directory() 87 | else: 88 | raise Exception(f"Invalid source directory: {source_dir}") 89 | 90 | # Build full file path 91 | file_path = os.path.join(dir_path, filename) 92 | 93 | if not os.path.exists(file_path): 94 | raise Exception(f"File not found: {file_path}") 95 | 96 | # Read file with UTF-8 encoding (most common) 97 | with open(file_path, 'r', encoding='utf-8') as f: 98 | text_content = f.read() 99 | 100 | if not text_content.strip(): 101 | raise Exception("File is empty or contains only whitespace") 102 | 103 | return (text_content,) 104 | 105 | except UnicodeDecodeError as e: 106 | raise Exception(f"Encoding error reading file: {str(e)}. 
File may not be UTF-8 encoded.") 107 | except Exception as e: 108 | logger.error(f"Failed to load text file: {str(e)}") 109 | raise Exception(f"Error loading text file: {str(e)}") 110 | 111 | @classmethod 112 | def IS_CHANGED(cls, file): 113 | """Cache key for ComfyUI""" 114 | if not file or file == "No text files found in any directory": 115 | return "no_file" 116 | 117 | # Parse directory and filename 118 | if "/" not in file: 119 | return f"{file}_invalid" 120 | 121 | source_dir, filename = file.split("/", 1) 122 | 123 | # Get the appropriate directory path 124 | if source_dir == "input": 125 | dir_path = folder_paths.get_input_directory() 126 | elif source_dir == "output": 127 | dir_path = folder_paths.get_output_directory() 128 | elif source_dir == "temp": 129 | dir_path = folder_paths.get_temp_directory() 130 | else: 131 | return f"{file}_invalid_dir" 132 | 133 | file_path = os.path.join(dir_path, filename) 134 | 135 | if not os.path.exists(file_path): 136 | return f"{file}_not_found" 137 | 138 | # Use file hash for cache invalidation 139 | try: 140 | m = hashlib.sha256() 141 | with open(file_path, 'rb') as f: 142 | m.update(f.read()) 143 | return m.digest().hex() 144 | except: 145 | return f"{file}_error" 146 | 147 | @classmethod 148 | def VALIDATE_INPUTS(cls, file, **kwargs): 149 | """Validate that the file exists""" 150 | if not file or file == "No text files found in any directory": 151 | return "No valid text file selected" 152 | 153 | # Parse directory and filename 154 | if "/" not in file: 155 | return f"Invalid file format: {file}" 156 | 157 | source_dir, filename = file.split("/", 1) 158 | 159 | # Get the appropriate directory path 160 | if source_dir == "input": 161 | dir_path = folder_paths.get_input_directory() 162 | elif source_dir == "output": 163 | dir_path = folder_paths.get_output_directory() 164 | elif source_dir == "temp": 165 | dir_path = folder_paths.get_temp_directory() 166 | else: 167 | return f"Invalid source directory: {source_dir}" 168 | 169 | file_path = os.path.join(dir_path, filename) 170 | if not os.path.exists(file_path): 171 | return f"File not found: {filename} in {source_dir}" 172 | 173 | return True -------------------------------------------------------------------------------- /examples/Pause-Tag.json: -------------------------------------------------------------------------------- 1 | {"id":"b70cf6f7-8531-4faa-9843-9c963a4ba577","revision":0,"last_node_id":47,"last_link_id":58,"nodes":[{"id":28,"type":"LoadTextFromFileNode","pos":[-51.13530731201172,497.1748352050781],"size":[289.5152282714844,58],"flags":{},"order":0,"mode":4,"inputs":[{"localized_name":"file","name":"file","type":"COMBO","widget":{"name":"file"},"link":null}],"outputs":[{"localized_name":"text","name":"text","type":"STRING","links":null}],"properties":{"Node name for S&R":"LoadTextFromFileNode","cnr_id":"VibeVoice-ComfyUI","ver":"5a24489a7b0bf0c406d291dd51e82a085d338d44"},"widgets_values":["No text files found in any directory"],"color":"#323","bgcolor":"#535"},{"id":38,"type":"Note","pos":[775.2548828125,307.8158874511719],"size":[415,88],"flags":{},"order":1,"mode":0,"inputs":[],"outputs":[],"title":"Pause System","properties":{},"widgets_values":["[pause]: add 1 second of silence.\n[pause:{number}] add {number}ms of pause\nWARNING: the pause tag forces the text to be split into chunks. This may worsen the model’s ability to understand the context. 
The model’s context is represented ONLY by its own chunk."],"color":"#432","bgcolor":"#653"},{"id":15,"type":"LoadAudio","pos":[-52.503074645996094,163.9591064453125],"size":[270,136],"flags":{},"order":2,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"COMBO","widget":{"name":"audio"},"link":null},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null},{"localized_name":"upload","name":"upload","type":"AUDIOUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"AUDIO","name":"AUDIO","type":"AUDIO","links":[57]}],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"LoadAudio"},"widgets_values":["Voice.mp3",null,null],"color":"#2a363b","bgcolor":"#3f5159"},{"id":21,"type":"Note","pos":[-119.67156219482422,637.6148071289062],"size":[415,88],"flags":{},"order":3,"mode":0,"inputs":[],"outputs":[],"title":"Load Text From File","properties":{},"widgets_values":["Use Load Text From File if you want to use a .txt file instead of text-area. You can load .txt files from ComfyUI/input, ComfyUI/output or ComfyUI/temp directories."],"color":"#432","bgcolor":"#653"},{"id":16,"type":"PreviewAudio","pos":[845.1698608398438,163.10276794433594],"size":[270,88],"flags":{},"order":8,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","link":58},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null}],"outputs":[],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"PreviewAudio"},"widgets_values":[],"color":"#323","bgcolor":"#535"},{"id":40,"type":"Note","pos":[325.02294921875,636.903564453125],"size":[415,88],"flags":{},"order":4,"mode":0,"inputs":[],"outputs":[],"title":"Voice Speed Factor","properties":{},"widgets_values":["The voice speed factor influences the original source audio to attempt to achieve a slower or faster final speech. 1.0 is the normal speed. It is recommended not to exceed values between 0.95 and 1.05. 
The effect is best when you provide a sample audio of at least 20 seconds."],"color":"#432","bgcolor":"#653"},{"id":45,"type":"VibeVoiceSingleSpeakerNode","pos":[327.48126220703125,164.61436462402344],"size":[400,420],"flags":{},"order":7,"mode":0,"inputs":[{"localized_name":"voice_to_clone","name":"voice_to_clone","shape":7,"type":"AUDIO","link":57},{"localized_name":"lora","name":"lora","shape":7,"type":"LORA_CONFIG","link":null},{"localized_name":"text","name":"text","type":"STRING","widget":{"name":"text"},"link":null},{"localized_name":"model","name":"model","type":"COMBO","widget":{"name":"model"},"link":null},{"localized_name":"attention_type","name":"attention_type","type":"COMBO","widget":{"name":"attention_type"},"link":null},{"localized_name":"quantize_llm","name":"quantize_llm","type":"COMBO","widget":{"name":"quantize_llm"},"link":null},{"localized_name":"free_memory_after_generate","name":"free_memory_after_generate","type":"BOOLEAN","widget":{"name":"free_memory_after_generate"},"link":null},{"localized_name":"diffusion_steps","name":"diffusion_steps","type":"INT","widget":{"name":"diffusion_steps"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null},{"localized_name":"cfg_scale","name":"cfg_scale","type":"FLOAT","widget":{"name":"cfg_scale"},"link":null},{"localized_name":"use_sampling","name":"use_sampling","type":"BOOLEAN","widget":{"name":"use_sampling"},"link":null},{"localized_name":"temperature","name":"temperature","shape":7,"type":"FLOAT","widget":{"name":"temperature"},"link":null},{"localized_name":"top_p","name":"top_p","shape":7,"type":"FLOAT","widget":{"name":"top_p"},"link":null},{"localized_name":"max_words_per_chunk","name":"max_words_per_chunk","shape":7,"type":"INT","widget":{"name":"max_words_per_chunk"},"link":null},{"localized_name":"voice_speed_factor","name":"voice_speed_factor","shape":7,"type":"FLOAT","widget":{"name":"voice_speed_factor"},"link":null}],"outputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","links":[58]}],"properties":{"Node name for S&R":"VibeVoiceSingleSpeakerNode"},"widgets_values":["Hello, this is a test of the VibeVoice text-to-speech system. [pause] Do you like my voice? 
[pause:500] What's your name?","VibeVoice-1.5B","auto","full precision",true,20,42,"fixed",1.3,false,0.95,0.95,250,1],"color":"#223","bgcolor":"#335"},{"id":46,"type":"Note","pos":[-576.477294921875,222.4726104736328],"size":[408.66363525390625,236.39089965820312],"flags":{},"order":5,"mode":0,"inputs":[],"outputs":[],"title":"1) Download Models","properties":{},"widgets_values":["You have to manually download the models you would like to use and put them into: ComfyUI/models/vibevoice/\n\nMake a directory for each model and put all the files inside them.\n\nVibeVoice-1.5B model (~ 5.4 GB):\nhttps://huggingface.co/microsoft/VibeVoice-1.5B/tree/main\n\nVibeVoice-Large model (~ 18.7 GB):\nhttps://huggingface.co/aoi-ot/VibeVoice-Large/tree/main\n\nVibeVoice-Large-Q-8bit model (~ 11.6 GB):\nhttps://huggingface.co/FabioSarracino/VibeVoice-Large-Q8/tree/main\n\nVibeVoice-Large-Q-4bit model (~ 6.6 GB):\nhttps://huggingface.co/DevParker/VibeVoice7b-low-vram/tree/main/4bit"],"color":"#432","bgcolor":"#653"},{"id":47,"type":"Note","pos":[-576.177978515625,522.212646484375],"size":[407.2561950683594,155.19009399414062],"flags":{},"order":6,"mode":0,"inputs":[],"outputs":[],"title":"2) Download Tokenizer","properties":{},"widgets_values":["You have to manually download the Qwen2.5 Tokenizer files and put them into: ComfyUI/models/vibevoice/tokenizer/\n\nhttps://huggingface.co/Qwen/Qwen2.5-1.5B/tree/main\n\nRequired files: tokenizer_config.json, vocab.json, merges.txt, tokenizer.json (~11MB)\n\nPut the files directly inside tokenizer directory without make another directory inside."],"color":"#432","bgcolor":"#653"}],"links":[[57,15,0,45,0,"AUDIO"],[58,45,0,16,0,"AUDIO"]],"groups":[{"id":2,"title":"Instructions before use:","bounding":[-599.3793334960938,124.93412017822266,453.3775939941406,595.2697143554688],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"ds":{"scale":0.8264462809917354,"offset":[815.9689977237014,-22.084207406969263]}},"version":0.4} -------------------------------------------------------------------------------- /vvembed/scripts/convert_nnscaler_checkpoint_to_transformers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import argparse 5 | import json 6 | import os 7 | from pathlib import Path 8 | import re 9 | import torch 10 | from typing import Dict, List, Tuple 11 | 12 | from modular.configuration_vibevoice import ( 13 | VibeVoiceConfig 14 | ) 15 | from modular.modeling_vibevoice import VibeVoiceForConditionalGeneration 16 | from transformers.utils import logging 17 | 18 | logger = logging.get_logger(__name__) 19 | 20 | def convert_vibevoice_nnscaler_checkpoint_to_hf( 21 | checkpoint_path: str, 22 | pytorch_dump_folder_path: str, 23 | config_path: str = None, 24 | ): 25 | """ 26 | Convert a nnscaler VibeVoice checkpoint to HuggingFace format. 27 | Supports both regular checkpoints and tensor parallel checkpoints. 
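    Illustrative call (editor's sketch; the checkpoint and output paths below are
    placeholders, not files shipped with this repository):

        convert_vibevoice_nnscaler_checkpoint_to_hf(
            checkpoint_path="/path/to/nnscaler_checkpoint.pt",   # nnscaler .pt checkpoint
            pytorch_dump_folder_path="./vibevoice_hf",           # output dir for HF config/weights
            config_path=None,                                    # optional JSON config override
        )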
28 | """ 29 | 30 | # Load regular checkpoint 31 | logger.info(f"Loading regular checkpoint from {checkpoint_path}") 32 | checkpoint = torch.load(checkpoint_path, map_location="cpu") # ['model', 'optimizer', 'lr_scheduler', 'train_status', 'train_args', 'rng_states', 'nnscaler', 'dataloader'] 33 | 34 | # config = checkpoint['train_args'] 35 | init_config_name = checkpoint['train_args']['vars']['model_args']['config_path']['relative_path'] 36 | pretrained_name = checkpoint['train_args']['vars']['data_args']['tokenizer_path'] 37 | 38 | init_config_path = Path(__file__).parent.parent / 'configs' / init_config_name.split('/')[-1] 39 | if init_config_path.exists(): 40 | logger.info(f"Loading initial config from {init_config_path}") 41 | with open(init_config_path, 'r') as f: 42 | init_config = json.load(f) 43 | else: 44 | raise FileNotFoundError(f"Initial config file {init_config_path} not found. Please provide a valid path.") 45 | 46 | tie_word_embeddings = init_config['decoder_config'].get('tie_word_embeddings', True) 47 | logger.info(f"Tie word embeddings: {tie_word_embeddings}") 48 | 49 | init_config['decoder_config']['use_cache'] = True 50 | config = VibeVoiceConfig(**init_config, tie_word_embeddings=tie_word_embeddings) 51 | 52 | # # Extract the model state dict 53 | model_state_dict = {k.replace('model.model.', 'model.'): v for k, v in checkpoint["model"].items() if k.startswith('model.model.')} 54 | if not tie_word_embeddings and 'model.lm_head.weight' in checkpoint["model"].keys(): 55 | # If not tying weights, we need to add the lm_head weight separately 56 | model_state_dict['lm_head.weight'] = checkpoint["model"]['model.lm_head.weight'] 57 | 58 | # Override with provided config if available 59 | if config_path: 60 | logger.info(f"Loading config from {config_path}") 61 | with open(config_path, 'r') as f: 62 | config_dict = json.load(f) 63 | config = VibeVoiceConfig.from_dict(config_dict) 64 | 65 | # Set the default dtype to bfloat16 before creating the model 66 | original_dtype = torch.get_default_dtype() 67 | torch.set_default_dtype(torch.bfloat16) 68 | 69 | # Create the HuggingFace model 70 | logger.info("Creating HuggingFace VibeVoiceForConditionalGeneration model") 71 | model = VibeVoiceForConditionalGeneration(config) 72 | 73 | # Restore original dtype 74 | torch.set_default_dtype(original_dtype) 75 | 76 | # Load the state dict 77 | logger.info("Loading weights into model") 78 | missing_keys, unexpected_keys = model.load_state_dict(model_state_dict, strict=False) 79 | 80 | if missing_keys: 81 | logger.warning(f"Missing keys: {missing_keys}") 82 | if unexpected_keys: 83 | logger.warning(f"Unexpected keys: {unexpected_keys}") 84 | 85 | # Create output directory 86 | os.makedirs(pytorch_dump_folder_path, exist_ok=True) 87 | 88 | # Save the model and config 89 | logger.info(f"Saving model to {pytorch_dump_folder_path}") 90 | 91 | # Save config 92 | config.save_pretrained(pytorch_dump_folder_path) 93 | 94 | # Save VibeVoiceProcessor configuration 95 | logger.info("Saving VibeVoiceProcessor configuration") 96 | processor_config = { 97 | "processor_class": "VibeVoiceProcessor", 98 | "speech_tok_compress_ratio": 3200, 99 | "db_normalize": True, 100 | # Audio processor configuration 101 | "audio_processor": { 102 | "feature_extractor_type": "VibeVoiceTokenizerProcessor", 103 | "sampling_rate": 24000, 104 | "normalize_audio": True, 105 | "target_dB_FS": -25, 106 | "eps": 1e-6, 107 | }, 108 | "language_model_pretrained_name": pretrained_name, 109 | } 110 | 111 | processor_config_path = 
os.path.join(pytorch_dump_folder_path, "preprocessor_config.json") 112 | with open(processor_config_path, 'w') as f: 113 | json.dump(processor_config, f, indent=2) 114 | logger.info(f"Saved processor config to {processor_config_path}") 115 | 116 | # Save model with sharding 117 | # save_pretrained handles tied weights automatically 118 | logger.info("Saving model weights with sharding...") 119 | model.save_pretrained( 120 | pytorch_dump_folder_path, 121 | max_shard_size="2GB", # Set maximum size for each shard 122 | safe_serialization=True # Ensure saving in .safetensors format 123 | ) 124 | logger.info(f"Model weights saved to {pytorch_dump_folder_path}") 125 | 126 | logger.info("Conversion complete!") 127 | 128 | # Verify the saved model can be loaded 129 | logger.info("Verifying saved model...") 130 | loaded_model = VibeVoiceForConditionalGeneration.from_pretrained(pytorch_dump_folder_path) 131 | logger.info("Model successfully loaded from saved checkpoint!") 132 | 133 | def main(): 134 | parser = argparse.ArgumentParser() 135 | parser.add_argument( 136 | "--nnscaler_checkpoint_path", 137 | type=str, 138 | required=True, 139 | help="Path to the fairseq checkpoint (.pt file). For tensor parallel checkpoints, " 140 | "provide any one of the part files (e.g., checkpoint_1_5000-model_part-0.pt), " 141 | "and the script will automatically detect and merge all parts.", 142 | ) 143 | parser.add_argument( 144 | "--pytorch_dump_folder_path", 145 | type=str, 146 | required=True, 147 | help="Path to the output PyTorch model directory", 148 | ) 149 | parser.add_argument( 150 | "--config_path", 151 | type=str, 152 | default=None, 153 | help="Optional path to a config JSON file to override extracted config", 154 | ) 155 | 156 | args = parser.parse_args() 157 | 158 | convert_vibevoice_nnscaler_checkpoint_to_hf( 159 | args.nnscaler_checkpoint_path, 160 | args.pytorch_dump_folder_path, 161 | args.config_path, 162 | ) 163 | 164 | 165 | if __name__ == "__main__": 166 | main() -------------------------------------------------------------------------------- /examples/Multiple-Speaker.json: -------------------------------------------------------------------------------- 1 | {"id":"e5ca15c5-18b5-4d37-8852-795692a14b29","revision":0,"last_node_id":38,"last_link_id":57,"nodes":[{"id":19,"type":"LoadTextFromFileNode","pos":[9.889446258544922,621.1560668945312],"size":[270,58],"flags":{},"order":0,"mode":4,"inputs":[{"localized_name":"file","name":"file","type":"COMBO","widget":{"name":"file"},"link":null}],"outputs":[{"localized_name":"text","name":"text","type":"STRING","links":null}],"properties":{"Node name for S&R":"LoadTextFromFileNode","cnr_id":"VibeVoice-ComfyUI","ver":"5a24489a7b0bf0c406d291dd51e82a085d338d44"},"widgets_values":["No text files found in any directory"],"color":"#323","bgcolor":"#535"},{"id":31,"type":"Note","pos":[379.3583984375,725.9093627929688],"size":[415,88],"flags":{},"order":1,"mode":0,"inputs":[],"outputs":[],"title":"Voice Speed Factor","properties":{},"widgets_values":["The voice speed factor influences the original source audio to attempt to achieve a slower or faster final speech. 1.0 is the normal speed. It is recommended not to exceed values between 0.95 and 1.05. 
The effect is best when you provide a sample audio of at least 20 seconds."],"color":"#432","bgcolor":"#653"},{"id":16,"type":"PreviewAudio","pos":[896.3719482421875,189.1308135986328],"size":[270,88],"flags":{},"order":8,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","link":57},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null}],"outputs":[],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"PreviewAudio"},"widgets_values":[],"color":"#323","bgcolor":"#535"},{"id":20,"type":"Note","pos":[-55.931907653808594,726.6131591796875],"size":[415,88],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[],"title":"Load Text From File","properties":{},"widgets_values":["Use Load Text From File if you want to use a .txt file instead of text-area. You can load .txt files from ComfyUI/input, ComfyUI/output or ComfyUI/temp directories."],"color":"#432","bgcolor":"#653"},{"id":15,"type":"LoadAudio","pos":[-12.263749122619629,190.64144897460938],"size":[270,136],"flags":{},"order":3,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"COMBO","widget":{"name":"audio"},"link":null},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null},{"localized_name":"upload","name":"upload","type":"AUDIOUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"AUDIO","name":"AUDIO","type":"AUDIO","links":[55]}],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"LoadAudio"},"widgets_values":["Voice1.mp3",null,null],"color":"#2a363b","bgcolor":"#3f5159"},{"id":17,"type":"LoadAudio","pos":[-11.774602890014648,403.2247009277344],"size":[270,136],"flags":{},"order":4,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"COMBO","widget":{"name":"audio"},"link":null},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null},{"localized_name":"upload","name":"upload","type":"AUDIOUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"AUDIO","name":"AUDIO","type":"AUDIO","links":[56]}],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for 
S&R":"LoadAudio"},"widgets_values":["Voice2.mp3",null,null],"color":"#2a363b","bgcolor":"#3f5159"},{"id":36,"type":"VibeVoiceMultipleSpeakersNode","pos":[393.1620178222656,189.6568145751953],"size":[400,456],"flags":{},"order":7,"mode":0,"inputs":[{"localized_name":"speaker1_voice","name":"speaker1_voice","shape":7,"type":"AUDIO","link":55},{"localized_name":"speaker2_voice","name":"speaker2_voice","shape":7,"type":"AUDIO","link":56},{"localized_name":"speaker3_voice","name":"speaker3_voice","shape":7,"type":"AUDIO","link":null},{"localized_name":"speaker4_voice","name":"speaker4_voice","shape":7,"type":"AUDIO","link":null},{"localized_name":"lora","name":"lora","shape":7,"type":"LORA_CONFIG","link":null},{"localized_name":"text","name":"text","type":"STRING","widget":{"name":"text"},"link":null},{"localized_name":"model","name":"model","type":"COMBO","widget":{"name":"model"},"link":null},{"localized_name":"attention_type","name":"attention_type","type":"COMBO","widget":{"name":"attention_type"},"link":null},{"localized_name":"quantize_llm","name":"quantize_llm","type":"COMBO","widget":{"name":"quantize_llm"},"link":null},{"localized_name":"free_memory_after_generate","name":"free_memory_after_generate","type":"BOOLEAN","widget":{"name":"free_memory_after_generate"},"link":null},{"localized_name":"diffusion_steps","name":"diffusion_steps","type":"INT","widget":{"name":"diffusion_steps"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null},{"localized_name":"cfg_scale","name":"cfg_scale","type":"FLOAT","widget":{"name":"cfg_scale"},"link":null},{"localized_name":"use_sampling","name":"use_sampling","type":"BOOLEAN","widget":{"name":"use_sampling"},"link":null},{"localized_name":"temperature","name":"temperature","shape":7,"type":"FLOAT","widget":{"name":"temperature"},"link":null},{"localized_name":"top_p","name":"top_p","shape":7,"type":"FLOAT","widget":{"name":"top_p"},"link":null},{"localized_name":"voice_speed_factor","name":"voice_speed_factor","shape":7,"type":"FLOAT","widget":{"name":"voice_speed_factor"},"link":null}],"outputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","links":[57]}],"properties":{"Node name for S&R":"VibeVoiceMultipleSpeakersNode"},"widgets_values":["[1]: Hello, this is the first speaker.\n[2]: Hi there, I'm the second speaker.\n[1]: Nice to meet you!\n[2]: Nice to meet you too!","VibeVoice-Large","auto","4bit",true,20,42,"fixed",1.3,false,0.95,0.95,1],"color":"#223","bgcolor":"#335"},{"id":37,"type":"Note","pos":[-530.1146850585938,279.4844055175781],"size":[408.66363525390625,236.39089965820312],"flags":{},"order":5,"mode":0,"inputs":[],"outputs":[],"title":"1) Download Models","properties":{},"widgets_values":["You have to manually download the models you would like to use and put them into: ComfyUI/models/vibevoice/\n\nMake a directory for each model and put all the files inside them.\n\nVibeVoice-1.5B model (~ 5.4 GB):\nhttps://huggingface.co/microsoft/VibeVoice-1.5B/tree/main\n\nVibeVoice-Large model (~ 18.7 GB):\nhttps://huggingface.co/aoi-ot/VibeVoice-Large/tree/main\n\nVibeVoice-Large-Q-8bit model (~ 11.6 GB):\nhttps://huggingface.co/FabioSarracino/VibeVoice-Large-Q8/tree/main\n\nVibeVoice-Large-Q-4bit model (~ 6.6 
GB):\nhttps://huggingface.co/DevParker/VibeVoice7b-low-vram/tree/main/4bit"],"color":"#432","bgcolor":"#653"},{"id":38,"type":"Note","pos":[-529.8153686523438,579.2252807617188],"size":[407.2561950683594,155.19009399414062],"flags":{},"order":6,"mode":0,"inputs":[],"outputs":[],"title":"2) Download Tokenizer","properties":{},"widgets_values":["You have to manually download the Qwen2.5 Tokenizer files and put them into: ComfyUI/models/vibevoice/tokenizer/\n\nhttps://huggingface.co/Qwen/Qwen2.5-1.5B/tree/main\n\nRequired files: tokenizer_config.json, vocab.json, merges.txt, tokenizer.json (~11MB)\n\nPut the files directly inside tokenizer directory without make another directory inside."],"color":"#432","bgcolor":"#653"}],"links":[[55,15,0,36,0,"AUDIO"],[56,17,0,36,1,"AUDIO"],[57,36,0,16,0,"AUDIO"]],"groups":[{"id":1,"title":"Instructions before use:","bounding":[-553.0167846679688,181.94606018066406,453.3775939941406,595.2697143554688],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"ds":{"scale":0.9090909090909097,"offset":[944.6168885013626,-55.446182500052494]}},"version":0.4} -------------------------------------------------------------------------------- /examples/VibeVoice-Unload-Memory.json: -------------------------------------------------------------------------------- 1 | {"id":"fc471b7e-ccef-427f-be3f-29dec93a90ea","revision":0,"last_node_id":45,"last_link_id":56,"nodes":[{"id":34,"type":"VibeVoiceFreeMemoryNode","pos":[913.2552490234375,127.35599517822266],"size":[189.03964233398438,26],"flags":{},"order":8,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","link":56}],"outputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","links":[42]}],"properties":{"Node name for S&R":"VibeVoiceFreeMemoryNode","cnr_id":"VibeVoice-ComfyUI","ver":"5a24489a7b0bf0c406d291dd51e82a085d338d44"},"widgets_values":[],"color":"#322","bgcolor":"#533"},{"id":16,"type":"PreviewAudio","pos":[1273.2957763671875,127.3007583618164],"size":[270,88],"flags":{},"order":9,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","link":42},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null}],"outputs":[],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"PreviewAudio"},"widgets_values":[],"color":"#323","bgcolor":"#535"},{"id":35,"type":"Note","pos":[809.6192016601562,208.98324584960938],"size":[432.1000061035156,126.30000305175781],"flags":{},"order":1,"mode":0,"inputs":[],"outputs":[],"title":"Free Memory Node","properties":{},"widgets_values":["The VibeVoice Free Memory node releases memory as soon as it receives the audio input (acting as a passthrough for the audio itself). In this specific use case, however, it’s redundant, since it would be enough to enable the “free_memory_after_generate” parameter of the previous node. 
The ideal use case is, for example, when you have a loop generating multiple audio clips, and only after the final generation you pass the last audio and free the memory."],"color":"#432","bgcolor":"#653"},{"id":28,"type":"LoadTextFromFileNode","pos":[-30.95530128479004,453.30511474609375],"size":[289.5152282714844,58],"flags":{},"order":2,"mode":4,"inputs":[{"localized_name":"file","name":"file","type":"COMBO","widget":{"name":"file"},"link":null}],"outputs":[{"localized_name":"text","name":"text","type":"STRING","links":null}],"properties":{"Node name for S&R":"LoadTextFromFileNode","cnr_id":"VibeVoice-ComfyUI","ver":"5a24489a7b0bf0c406d291dd51e82a085d338d44"},"widgets_values":["No text files found in any directory"],"color":"#323","bgcolor":"#535"},{"id":40,"type":"Note","pos":[367.98895263671875,597.8056640625],"size":[415,88],"flags":{},"order":3,"mode":0,"inputs":[],"outputs":[],"title":"Voice Speed Factor","properties":{},"widgets_values":["The voice speed factor influences the original source audio to attempt to achieve a slower or faster final speech. 1.0 is the normal speed. It is recommended not to exceed values between 0.95 and 1.05. The effect is best when you provide a sample audio of at least 20 seconds."],"color":"#432","bgcolor":"#653"},{"id":15,"type":"LoadAudio","pos":[-21.549091339111328,127.7799301147461],"size":[270,136],"flags":{},"order":4,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"COMBO","widget":{"name":"audio"},"link":null},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null},{"localized_name":"upload","name":"upload","type":"AUDIOUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"AUDIO","name":"AUDIO","type":"AUDIO","links":[55]}],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for 
S&R":"LoadAudio"},"widgets_values":["Voice.mp3",null,null],"color":"#2a363b","bgcolor":"#3f5159"},{"id":43,"type":"VibeVoiceSingleSpeakerNode","pos":[373.596435546875,128.40489196777344],"size":[400,420],"flags":{},"order":7,"mode":0,"inputs":[{"localized_name":"voice_to_clone","name":"voice_to_clone","shape":7,"type":"AUDIO","link":55},{"localized_name":"lora","name":"lora","shape":7,"type":"LORA_CONFIG","link":null},{"localized_name":"text","name":"text","type":"STRING","widget":{"name":"text"},"link":null},{"localized_name":"model","name":"model","type":"COMBO","widget":{"name":"model"},"link":null},{"localized_name":"attention_type","name":"attention_type","type":"COMBO","widget":{"name":"attention_type"},"link":null},{"localized_name":"quantize_llm","name":"quantize_llm","type":"COMBO","widget":{"name":"quantize_llm"},"link":null},{"localized_name":"free_memory_after_generate","name":"free_memory_after_generate","type":"BOOLEAN","widget":{"name":"free_memory_after_generate"},"link":null},{"localized_name":"diffusion_steps","name":"diffusion_steps","type":"INT","widget":{"name":"diffusion_steps"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null},{"localized_name":"cfg_scale","name":"cfg_scale","type":"FLOAT","widget":{"name":"cfg_scale"},"link":null},{"localized_name":"use_sampling","name":"use_sampling","type":"BOOLEAN","widget":{"name":"use_sampling"},"link":null},{"localized_name":"temperature","name":"temperature","shape":7,"type":"FLOAT","widget":{"name":"temperature"},"link":null},{"localized_name":"top_p","name":"top_p","shape":7,"type":"FLOAT","widget":{"name":"top_p"},"link":null},{"localized_name":"max_words_per_chunk","name":"max_words_per_chunk","shape":7,"type":"INT","widget":{"name":"max_words_per_chunk"},"link":null},{"localized_name":"voice_speed_factor","name":"voice_speed_factor","shape":7,"type":"FLOAT","widget":{"name":"voice_speed_factor"},"link":null}],"outputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","links":[56]}],"properties":{"Node name for S&R":"VibeVoiceSingleSpeakerNode"},"widgets_values":["Hello, this is a test of the VibeVoice text-to-speech system.","VibeVoice-1.5B","auto","full precision",true,20,42,"fixed",1.3,false,0.95,0.95,250,1],"color":"#223","bgcolor":"#335"},{"id":44,"type":"Note","pos":[-546.2021484375,184.94338989257812],"size":[408.66363525390625,236.39089965820312],"flags":{},"order":5,"mode":0,"inputs":[],"outputs":[],"title":"1) Download Models","properties":{},"widgets_values":["You have to manually download the models you would like to use and put them into: ComfyUI/models/vibevoice/\n\nMake a directory for each model and put all the files inside them.\n\nVibeVoice-1.5B model (~ 5.4 GB):\nhttps://huggingface.co/microsoft/VibeVoice-1.5B/tree/main\n\nVibeVoice-Large model (~ 18.7 GB):\nhttps://huggingface.co/aoi-ot/VibeVoice-Large/tree/main\n\nVibeVoice-Large-Q-8bit model (~ 11.6 GB):\nhttps://huggingface.co/FabioSarracino/VibeVoice-Large-Q8/tree/main\n\nVibeVoice-Large-Q-4bit model (~ 6.6 GB):\nhttps://huggingface.co/DevParker/VibeVoice7b-low-vram/tree/main/4bit"],"color":"#432","bgcolor":"#653"},{"id":45,"type":"Note","pos":[-545.90283203125,484.68328857421875],"size":[407.2561950683594,155.19009399414062],"flags":{},"order":6,"mode":0,"inputs":[],"outputs":[],"title":"2) Download Tokenizer","properties":{},"widgets_values":["You have to manually download the Qwen2.5 Tokenizer files and put them into: 
ComfyUI/models/vibevoice/tokenizer/\n\nhttps://huggingface.co/Qwen/Qwen2.5-1.5B/tree/main\n\nRequired files: tokenizer_config.json, vocab.json, merges.txt, tokenizer.json (~11MB)\n\nPut the files directly inside tokenizer directory without make another directory inside."],"color":"#432","bgcolor":"#653"},{"id":21,"type":"Note","pos":[-84.54156494140625,599.46435546875],"size":[415,88],"flags":{},"order":0,"mode":0,"inputs":[],"outputs":[],"title":"Load Text From File","properties":{},"widgets_values":["Use Load Text From File if you want to use a .txt file instead of text-area. You can load .txt files from ComfyUI/input, ComfyUI/output or ComfyUI/temp directories."],"color":"#432","bgcolor":"#653"}],"links":[[42,34,0,16,0,"AUDIO"],[55,15,0,43,0,"AUDIO"],[56,43,0,34,0,"AUDIO"]],"groups":[{"id":2,"title":"Instructions before use:","bounding":[-569.1041870117188,87.40498352050781,453.3775939941406,595.2697143554688],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"ds":{"scale":0.9090909090909091,"offset":[570.2036733851843,-33.504933709055805]}},"version":0.4} -------------------------------------------------------------------------------- /vvembed/modular/modular_vibevoice_text_tokenizer.py: -------------------------------------------------------------------------------- 1 | """Tokenization classes for vibevoice.""" 2 | 3 | from typing import List, Optional, Union 4 | 5 | from transformers.utils import logging 6 | from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer 7 | from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast 8 | 9 | logger = logging.get_logger(__name__) 10 | 11 | 12 | class VibeVoiceTextTokenizer(Qwen2Tokenizer): 13 | """ 14 | Construct a VibeVoice tokenizer. Based on the Qwen2 tokenizer with additional special tokens for speech. 15 | 16 | Args: 17 | vocab_file (`str`): 18 | Path to the vocabulary file. 19 | merges_file (`str`): 20 | Path to the merges file. 21 | errors (`str`, *optional*, defaults to `"replace"`): 22 | Paradigm to follow when decoding bytes to UTF-8. 23 | unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 24 | The unknown token. 25 | bos_token (`str`, *optional*): 26 | The beginning of sequence token. Not used for vibevoice. 27 | eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 28 | The end of sequence token. 29 | pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 30 | The token used for padding. 31 | add_special_tokens (`bool`, *optional*, defaults to `True`): 32 | Whether or not to add special tokens when encoding. 
33 | """ 34 | 35 | model_input_names = ["input_ids", "attention_mask"] 36 | 37 | def __init__( 38 | self, 39 | vocab_file, 40 | merges_file, 41 | errors="replace", 42 | unk_token="<|endoftext|>", 43 | bos_token=None, 44 | eos_token="<|endoftext|>", 45 | pad_token="<|endoftext|>", 46 | add_prefix_space=False, 47 | add_special_tokens=True, 48 | **kwargs, 49 | ): 50 | super().__init__( 51 | vocab_file=vocab_file, 52 | merges_file=merges_file, 53 | errors=errors, 54 | unk_token=unk_token, 55 | bos_token=bos_token, 56 | eos_token=eos_token, 57 | pad_token=pad_token, 58 | add_prefix_space=add_prefix_space, 59 | add_special_tokens=add_special_tokens, 60 | **kwargs, 61 | ) 62 | 63 | # Add VibeVoice-specific special tokens 64 | self._add_vibevoice_special_tokens() 65 | 66 | def _add_vibevoice_special_tokens(self): 67 | """Add VibeVoice-specific special tokens.""" 68 | special_tokens = { 69 | "additional_special_tokens": [ 70 | "<|vision_start|>", # Speech start (reusing vision tokens) 71 | "<|vision_end|>", # Speech end 72 | "<|vision_pad|>", # Speech diffusion pad 73 | ] 74 | } 75 | num_added = self.add_special_tokens(special_tokens) 76 | 77 | # Cache special token IDs 78 | self._speech_start_id = self.convert_tokens_to_ids("<|vision_start|>") 79 | self._speech_end_id = self.convert_tokens_to_ids("<|vision_end|>") 80 | self._speech_diffusion_id = self.convert_tokens_to_ids("<|vision_pad|>") 81 | 82 | self._eos_id = self.convert_tokens_to_ids('<|endoftext|>') 83 | 84 | return num_added 85 | 86 | @property 87 | def eos_id(self) -> int: 88 | """Id of the end of sequence token.""" 89 | return self._eos_id 90 | 91 | @property 92 | def speech_start_id(self) -> int: 93 | """Id of the speech start token.""" 94 | return self._speech_start_id 95 | 96 | @property 97 | def speech_end_id(self) -> int: 98 | """Id of the speech end token.""" 99 | return self._speech_end_id 100 | 101 | @property 102 | def speech_diffusion_id(self) -> int: 103 | """Id of the speech diffusion token.""" 104 | return self._speech_diffusion_id 105 | 106 | @property 107 | def pad_id(self) -> int: 108 | """Id used for padding (returns -100 for loss masking).""" 109 | return -100 110 | 111 | 112 | class VibeVoiceTextTokenizerFast(Qwen2TokenizerFast): 113 | """ 114 | Construct a "fast" VibeVoice tokenizer (backed by HuggingFace's *tokenizers* library). 115 | Based on the Qwen2 tokenizer with additional special tokens for speech. 116 | 117 | Args: 118 | vocab_file (`str`, *optional*): 119 | Path to the vocabulary file. 120 | merges_file (`str`, *optional*): 121 | Path to the merges file. 122 | tokenizer_file (`str`, *optional*): 123 | Path to [tokenizers](https://github.com/huggingface/tokenizers) file. 124 | unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 125 | The unknown token. 126 | bos_token (`str`, *optional*): 127 | The beginning of sequence token. Not used for vibevoice. 128 | eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 129 | The end of sequence token. 130 | pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 131 | The token used for padding. 
132 | """ 133 | 134 | model_input_names = ["input_ids", "attention_mask"] 135 | 136 | def __init__( 137 | self, 138 | vocab_file=None, 139 | merges_file=None, 140 | tokenizer_file=None, 141 | unk_token="<|endoftext|>", 142 | bos_token=None, 143 | eos_token="<|endoftext|>", 144 | pad_token="<|endoftext|>", 145 | add_prefix_space=False, 146 | **kwargs, 147 | ): 148 | super().__init__( 149 | vocab_file=vocab_file, 150 | merges_file=merges_file, 151 | tokenizer_file=tokenizer_file, 152 | unk_token=unk_token, 153 | bos_token=bos_token, 154 | eos_token=eos_token, 155 | pad_token=pad_token, 156 | add_prefix_space=add_prefix_space, 157 | **kwargs, 158 | ) 159 | 160 | # Add VibeVoice-specific special tokens 161 | self._add_vibevoice_special_tokens() 162 | 163 | def _add_vibevoice_special_tokens(self): 164 | """Add VibeVoice-specific special tokens.""" 165 | special_tokens = { 166 | "additional_special_tokens": [ 167 | "<|vision_start|>", # Speech start (reusing vision tokens) 168 | "<|vision_end|>", # Speech end 169 | "<|vision_pad|>", # Speech diffusion pad 170 | ] 171 | } 172 | num_added = self.add_special_tokens(special_tokens) 173 | 174 | # Cache special token IDs 175 | self._speech_start_id = self.convert_tokens_to_ids("<|vision_start|>") 176 | self._speech_end_id = self.convert_tokens_to_ids("<|vision_end|>") 177 | self._speech_diffusion_id = self.convert_tokens_to_ids("<|vision_pad|>") 178 | 179 | # self._eos_id = self.convert_tokens_to_ids('<|endoftext|>') 180 | self._eos_id = self.eos_token_id # qwen2 / qwen3 181 | self._pad_id = self.convert_tokens_to_ids('<|image_pad|>') 182 | 183 | return num_added 184 | 185 | @property 186 | def eos_id(self) -> int: 187 | """Id of the end of sequence token.""" 188 | return self._eos_id 189 | 190 | @property 191 | def speech_start_id(self) -> int: 192 | """Id of the speech start token.""" 193 | return self._speech_start_id 194 | 195 | @property 196 | def speech_end_id(self) -> int: 197 | """Id of the speech end token.""" 198 | return self._speech_end_id 199 | 200 | @property 201 | def speech_diffusion_id(self) -> int: 202 | """Id of the speech diffusion token.""" 203 | return self._speech_diffusion_id 204 | 205 | @property 206 | def pad_id(self) -> int: 207 | """Id used for padding (returns -100 for loss masking).""" 208 | return self._pad_id 209 | 210 | 211 | __all__ = [ 212 | "VibeVoiceTextTokenizer", 213 | "VibeVoiceTextTokenizerFast", 214 | ] -------------------------------------------------------------------------------- /nodes/lora_node.py: -------------------------------------------------------------------------------- 1 | # Created by Fabio Sarracino 2 | # Original LoRa code implementation by jpgallegoar-vpai user via PR #127 3 | # LoRA configuration node for VibeVoice 4 | 5 | import logging 6 | import os 7 | from typing import Dict, Any, List 8 | 9 | # Setup logging 10 | logger = logging.getLogger("VibeVoice") 11 | 12 | # Cache for LoRA scanning to avoid repeated logs 13 | _lora_cache = { 14 | "first_load_logged": False 15 | } 16 | 17 | def get_available_loras() -> List[str]: 18 | """Get list of available LoRA folders in ComfyUI/models/vibevoice/loras""" 19 | try: 20 | import folder_paths 21 | 22 | # Get the ComfyUI models directory 23 | models_dir = folder_paths.get_folder_paths("checkpoints")[0] 24 | # Navigate to vibevoice/loras directory 25 | loras_dir = os.path.join(os.path.dirname(models_dir), "vibevoice", "loras") 26 | 27 | # Create directory if it doesn't exist 28 | os.makedirs(loras_dir, exist_ok=True) 29 | 30 | # List all directories 
in the loras folder 31 | lora_folders = [] 32 | if os.path.exists(loras_dir): 33 | for item in os.listdir(loras_dir): 34 | item_path = os.path.join(loras_dir, item) 35 | if os.path.isdir(item_path): 36 | # Check if it contains LoRA files 37 | adapter_config = os.path.join(item_path, "adapter_config.json") 38 | adapter_model_st = os.path.join(item_path, "adapter_model.safetensors") 39 | adapter_model_bin = os.path.join(item_path, "adapter_model.bin") 40 | 41 | # Consider it a valid LoRA if it has config or model files 42 | if os.path.exists(adapter_config) or os.path.exists(adapter_model_st) or os.path.exists(adapter_model_bin): 43 | lora_folders.append(item) 44 | 45 | # Only log on first scan to avoid spam 46 | if not _lora_cache["first_load_logged"]: 47 | if not lora_folders: 48 | logger.info("No LoRA adapters found in ComfyUI/models/vibevoice/loras") 49 | _lora_cache["first_load_logged"] = True 50 | 51 | # Always include "None" option to disable LoRA 52 | if not lora_folders: 53 | return ["None"] 54 | 55 | # Sort alphabetically and add None option at the beginning 56 | lora_folders.sort() 57 | return ["None"] + lora_folders 58 | 59 | except Exception as e: 60 | logger.error(f"Error listing LoRA folders: {e}") 61 | return ["None"] 62 | 63 | class VibeVoiceLoRANode: 64 | """Node for configuring LoRA adapters for VibeVoice models""" 65 | 66 | def __init__(self): 67 | pass 68 | 69 | @classmethod 70 | def INPUT_TYPES(cls): 71 | # Get available LoRA folders dynamically 72 | available_loras = get_available_loras() 73 | 74 | return { 75 | "required": { 76 | "lora_name": (available_loras, { 77 | "default": "None", 78 | "tooltip": "Select a LoRA adapter from ComfyUI/models/vibevoice/loras folder" 79 | }), 80 | "llm_strength": ("FLOAT", { 81 | "default": 1.0, 82 | "min": 0.0, 83 | "max": 2.0, 84 | "step": 0.05, 85 | "tooltip": "Strength of the LLM LoRA adapter. Controls how much the LoRA affects the language model" 86 | }), 87 | "use_llm": ("BOOLEAN", { 88 | "default": True, 89 | "tooltip": "Apply LLM (language model) LoRA component when available" 90 | }), 91 | "use_diffusion_head": ("BOOLEAN", { 92 | "default": True, 93 | "tooltip": "Apply diffusion head LoRA/replacement when available" 94 | }), 95 | "use_acoustic_connector": ("BOOLEAN", { 96 | "default": True, 97 | "tooltip": "Apply acoustic connector LoRA component when available" 98 | }), 99 | "use_semantic_connector": ("BOOLEAN", { 100 | "default": True, 101 | "tooltip": "Apply semantic connector LoRA component when available" 102 | }), 103 | } 104 | } 105 | 106 | RETURN_TYPES = ("LORA_CONFIG",) 107 | RETURN_NAMES = ("lora",) 108 | FUNCTION = "configure_lora" 109 | CATEGORY = "VibeVoiceWrapper" 110 | DESCRIPTION = "Configure LoRA adapters for fine-tuned VibeVoice models. 
Place LoRA folders in ComfyUI/models/vibevoice/loras/" 111 | 112 | def configure_lora(self, lora_name: str = "None", llm_strength: float = 1.0, 113 | use_llm: bool = True, use_diffusion_head: bool = True, 114 | use_acoustic_connector: bool = True, use_semantic_connector: bool = True): 115 | """Configure LoRA settings and validate the path""" 116 | 117 | # Handle "None" selection 118 | if lora_name == "None": 119 | logger.info("No LoRA selected, using base model") 120 | return ({ 121 | "path": None, 122 | "llm_strength": llm_strength, 123 | "use_llm": use_llm, 124 | "use_diffusion_head": use_diffusion_head, 125 | "use_acoustic_connector": use_acoustic_connector, 126 | "use_semantic_connector": use_semantic_connector 127 | },) 128 | 129 | try: 130 | import folder_paths 131 | 132 | # Build full path to the LoRA folder 133 | models_dir = folder_paths.get_folder_paths("checkpoints")[0] 134 | loras_dir = os.path.join(os.path.dirname(models_dir), "vibevoice", "loras") 135 | lora_path = os.path.join(loras_dir, lora_name) 136 | 137 | # Validate the path exists 138 | if not os.path.exists(lora_path): 139 | logger.error(f"LoRA path does not exist: {lora_path}") 140 | raise Exception(f"LoRA folder not found: {lora_name}") 141 | 142 | if not os.path.isdir(lora_path): 143 | logger.error(f"LoRA path is not a directory: {lora_path}") 144 | raise Exception(f"LoRA path must be a directory: {lora_name}") 145 | 146 | # Check for required files 147 | adapter_config = os.path.join(lora_path, "adapter_config.json") 148 | adapter_model_st = os.path.join(lora_path, "adapter_model.safetensors") 149 | adapter_model_bin = os.path.join(lora_path, "adapter_model.bin") 150 | 151 | if not os.path.exists(adapter_config): 152 | logger.warning(f"adapter_config.json not found in {lora_name}") 153 | 154 | if not os.path.exists(adapter_model_st) and not os.path.exists(adapter_model_bin): 155 | logger.warning(f"No adapter model file found in {lora_name}") 156 | logger.warning("Expected: adapter_model.safetensors or adapter_model.bin") 157 | 158 | logger.info(f"LoRA configured: {lora_name} ({lora_path})") 159 | 160 | # Check for optional components 161 | components_found = [] 162 | diffusion_head_path = os.path.join(lora_path, "diffusion_head") 163 | acoustic_connector_path = os.path.join(lora_path, "acoustic_connector") 164 | semantic_connector_path = os.path.join(lora_path, "semantic_connector") 165 | 166 | if os.path.exists(diffusion_head_path): 167 | components_found.append("diffusion_head") 168 | if os.path.exists(acoustic_connector_path): 169 | components_found.append("acoustic_connector") 170 | if os.path.exists(semantic_connector_path): 171 | components_found.append("semantic_connector") 172 | 173 | if components_found: 174 | logger.info(f"Additional LoRA components found: {', '.join(components_found)}") 175 | 176 | # Create configuration dictionary 177 | lora_config = { 178 | "path": lora_path, 179 | "llm_strength": llm_strength, 180 | "use_llm": use_llm, 181 | "use_diffusion_head": use_diffusion_head, 182 | "use_acoustic_connector": use_acoustic_connector, 183 | "use_semantic_connector": use_semantic_connector 184 | } 185 | 186 | # Log configuration 187 | enabled_components = [] 188 | if use_llm: 189 | enabled_components.append(f"LLM (strength: {llm_strength})") 190 | if use_diffusion_head: 191 | enabled_components.append("Diffusion Head") 192 | if use_acoustic_connector: 193 | enabled_components.append("Acoustic Connector") 194 | if use_semantic_connector: 195 | enabled_components.append("Semantic Connector") 196 | 
197 | if enabled_components: 198 | logger.info(f"LoRA components enabled: {', '.join(enabled_components)}") 199 | else: 200 | logger.warning("All LoRA components are disabled") 201 | 202 | return (lora_config,) 203 | 204 | except ImportError: 205 | logger.error("Could not import folder_paths from ComfyUI") 206 | raise Exception("Failed to access ComfyUI folders") 207 | except Exception as e: 208 | logger.error(f"Error configuring LoRA: {e}") 209 | raise 210 | 211 | @classmethod 212 | def IS_CHANGED(cls, lora_name: str = "None", **kwargs): 213 | """Cache key for ComfyUI - includes all parameters""" 214 | return f"{lora_name}_{kwargs.get('llm_strength', 1.0)}_{kwargs.get('use_llm', True)}_{kwargs.get('use_diffusion_head', True)}_{kwargs.get('use_acoustic_connector', True)}_{kwargs.get('use_semantic_connector', True)}" -------------------------------------------------------------------------------- /vvembed/modular/modular_vibevoice_diffusion_head.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Optional, Tuple, Union 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from transformers.models.auto import AutoModel 9 | from transformers.modeling_utils import PreTrainedModel 10 | # from transformers.modeling_layers import GradientCheckpointingLayer 11 | from transformers.activations import ACT2FN 12 | from transformers.utils import logging 13 | 14 | from .configuration_vibevoice import VibeVoiceDiffusionHeadConfig 15 | 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | class RMSNorm(nn.Module): 21 | def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine=True, memory_efficient=False): 22 | super().__init__() 23 | self.dim = dim 24 | self.eps = eps 25 | self.elementwise_affine = elementwise_affine 26 | if self.elementwise_affine: 27 | self.weight = nn.Parameter(torch.ones(dim)) 28 | else: 29 | self.register_parameter('weight', None) 30 | 31 | def _norm(self, x): 32 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 33 | 34 | def forward(self, x): 35 | output = self._norm(x.float()).type_as(x) 36 | if self.weight is not None: 37 | output = output * self.weight 38 | return output 39 | 40 | def extra_repr(self) -> str: 41 | return f'dim={self.dim}, eps={self.eps}, elementwise_affine={self.elementwise_affine}' 42 | 43 | def modulate(x, shift, scale): 44 | """Apply modulation to input tensor.""" 45 | return x * (1 + scale) + shift 46 | 47 | 48 | class TimestepEmbedder(nn.Module): 49 | """ 50 | Embeds scalar timesteps into vector representations. 51 | 52 | Args: 53 | hidden_size (`int`): Size of the output embedding 54 | frequency_embedding_size (`int`, optional): Size of the intermediate frequency embedding 55 | """ 56 | def __init__(self, hidden_size, frequency_embedding_size=256): 57 | super().__init__() 58 | self.mlp = nn.Sequential( 59 | nn.Linear(frequency_embedding_size, hidden_size, bias=False), 60 | # nn.SiLU(), 61 | ACT2FN['silu'], 62 | nn.Linear(hidden_size, hidden_size, bias=False), 63 | ) 64 | self.frequency_embedding_size = frequency_embedding_size 65 | 66 | @staticmethod 67 | def timestep_embedding(t, dim, max_period=10000): 68 | """ 69 | Create sinusoidal timestep embeddings. 70 | 71 | Args: 72 | t (`torch.Tensor`): A 1-D Tensor of N indices, one per batch element. 73 | These may be fractional. 74 | dim (`int`): The dimension of the output. 75 | max_period (`int`, optional): Controls the minimum frequency of the embeddings. 
76 | 77 | Returns: 78 | `torch.Tensor`: An [N, D] Tensor of positional embeddings. 79 | """ 80 | half = dim // 2 81 | freqs = torch.exp( 82 | -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half 83 | ).to(t.device) 84 | args = t[:, None].float() * freqs[None] 85 | embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) 86 | if dim % 2: 87 | embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) 88 | return embedding.to(t.dtype) 89 | 90 | def forward(self, t): 91 | t_freq = self.timestep_embedding(t, self.frequency_embedding_size) 92 | t_emb = self.mlp(t_freq) 93 | return t_emb 94 | 95 | 96 | class FeedForwardNetwork(nn.Module): 97 | """ 98 | Standard feed-forward network with SwiGLU activation. 99 | 100 | Args: 101 | embed_dim (`int`): Input dimension 102 | ffn_dim (`int`): Hidden dimension 103 | """ 104 | def __init__( 105 | self, 106 | embed_dim, 107 | ffn_dim, 108 | ): 109 | super().__init__() 110 | self.embed_dim = embed_dim 111 | self.gate_proj = nn.Linear(self.embed_dim, ffn_dim, bias=False) 112 | self.up_proj = nn.Linear(self.embed_dim, ffn_dim, bias=False) 113 | self.down_proj = nn.Linear(ffn_dim, self.embed_dim, bias=False) 114 | self.act_fn = ACT2FN['silu'] # Using SiLU as the activation function 115 | 116 | def forward(self, x): 117 | gate = self.gate_proj(x) 118 | up = self.up_proj(x) 119 | 120 | # SwiGLU activation 121 | # gate = F.silu(gate) 122 | gate = self.act_fn(gate) 123 | return self.down_proj(gate * up) 124 | 125 | 126 | class HeadLayer(nn.Module): 127 | """ 128 | A layer in the diffusion head. 129 | 130 | Args: 131 | embed_dim (`int`): Input dimension 132 | ffn_dim (`int`): Hidden dimension 133 | cond_dim (`int`): Condition embedding dimension 134 | norm_eps (`float`, optional): Epsilon for normalization 135 | """ 136 | def __init__( 137 | self, 138 | embed_dim, 139 | ffn_dim, 140 | cond_dim, 141 | norm_eps=1e-5, 142 | ): 143 | super().__init__() 144 | self.embed_dim = embed_dim 145 | self.cond_dim = cond_dim 146 | self.ffn_dim = ffn_dim 147 | self.ffn = FeedForwardNetwork( 148 | self.embed_dim, 149 | self.ffn_dim, 150 | ) 151 | self.norm = RMSNorm(self.embed_dim, eps=norm_eps) 152 | self.adaLN_modulation = nn.Sequential( 153 | # nn.SiLU(), 154 | ACT2FN['silu'], 155 | nn.Linear(cond_dim, 3 * self.embed_dim, bias=False) 156 | ) 157 | 158 | def forward(self, x, c): 159 | shift_ffn, scale_ffn, gate_ffn = self.adaLN_modulation(c).chunk(3, dim=-1) 160 | x = x + gate_ffn * self.ffn(modulate(self.norm(x), shift_ffn, scale_ffn)) 161 | return x 162 | 163 | 164 | class FinalLayer(nn.Module): 165 | """ 166 | Final layer in the diffusion head. 
167 | 168 | Args: 169 | hidden_size (`int`): Input dimension 170 | output_size (`int`): Output dimension 171 | cond_size (`int`): Condition embedding dimension 172 | norm_eps (`float`, optional): Epsilon for normalization 173 | """ 174 | def __init__(self, hidden_size, output_size, cond_size, norm_eps=1e-5): 175 | super().__init__() 176 | self.norm_final = RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=False) 177 | self.linear = nn.Linear(hidden_size, output_size, bias=False) 178 | self.adaLN_modulation = nn.Sequential( 179 | # nn.SiLU(), 180 | ACT2FN['silu'], 181 | nn.Linear(cond_size, 2 * hidden_size, bias=False) 182 | ) 183 | 184 | def forward(self, x, c): 185 | shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1) 186 | x = modulate(self.norm_final(x), shift, scale) 187 | x = self.linear(x) 188 | return x 189 | 190 | 191 | class VibeVoiceDiffusionHead(PreTrainedModel): 192 | """ 193 | Diffusion head model for vibevoice. 194 | 195 | Args: 196 | config (`VibeVoiceDiffusionHeadConfig`): Model configuration 197 | latent_size (`int`, optional): Size of the latent space. If not provided, uses `config.latent_size`. 198 | """ 199 | config_class = VibeVoiceDiffusionHeadConfig 200 | supports_gradient_checkpointing = True 201 | _supports_flash_attn_2 = True 202 | _supports_sdpa = True 203 | 204 | def __init__( 205 | self, 206 | config, 207 | ): 208 | super().__init__(config) 209 | self.config = config 210 | self.cond_dim = config.hidden_size 211 | latent_size = config.latent_size 212 | 213 | self.noisy_images_proj = nn.Linear(latent_size, config.hidden_size, bias=False) 214 | self.cond_proj = nn.Linear(config.hidden_size, self.cond_dim, bias=False) 215 | self.t_embedder = TimestepEmbedder(self.cond_dim) 216 | 217 | ffn_dim = int(config.hidden_size * config.head_ffn_ratio) 218 | 219 | # Create the intermediate layers 220 | self.layers = nn.ModuleList([ 221 | HeadLayer( 222 | embed_dim=config.hidden_size, 223 | ffn_dim=ffn_dim, 224 | cond_dim=self.cond_dim, 225 | norm_eps=config.rms_norm_eps 226 | ) 227 | for _ in range(config.head_layers) 228 | ]) 229 | 230 | # Final layer for output 231 | self.final_layer = FinalLayer( 232 | hidden_size=config.hidden_size, 233 | output_size=latent_size, 234 | cond_size=self.cond_dim, 235 | norm_eps=config.rms_norm_eps 236 | ) 237 | 238 | self.initialize_weights() 239 | 240 | def initialize_weights(self): 241 | """Initialize the weights of the model.""" 242 | # Initialize timestep embedder 243 | nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02) 244 | nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02) 245 | 246 | # Zero-out adaLN modulation layers 247 | for layer in self.layers: 248 | nn.init.constant_(layer.adaLN_modulation[-1].weight, 0) 249 | 250 | # Zero-out output layers 251 | nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0) 252 | nn.init.constant_(self.final_layer.linear.weight, 0) 253 | 254 | def forward( 255 | self, 256 | noisy_images, 257 | timesteps, 258 | condition, 259 | ): 260 | """ 261 | Forward pass of the prediction head. 
262 | 263 | Args: 264 | noisy_images (`torch.Tensor`): Noisy images/latents to denoise 265 | timesteps (`torch.Tensor`): Timesteps for diffusion 266 | condition (`torch.Tensor`): Conditioning information 267 | 268 | Returns: 269 | `torch.Tensor`: The predicted noise/velocity 270 | """ 271 | x = self.noisy_images_proj(noisy_images) 272 | t = self.t_embedder(timesteps) 273 | condition = self.cond_proj(condition) 274 | c = condition + t 275 | 276 | for layer in self.layers: 277 | x = layer(x, c) 278 | 279 | x = self.final_layer(x, c) 280 | return x 281 | 282 | 283 | AutoModel.register(VibeVoiceDiffusionHeadConfig, VibeVoiceDiffusionHead) 284 | 285 | __all__ = [ 286 | "VibeVoiceDiffusionHead", 287 | ] -------------------------------------------------------------------------------- /vvembed/modular/streamer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import torch 4 | 5 | import asyncio 6 | from queue import Queue 7 | from typing import TYPE_CHECKING, Optional 8 | 9 | 10 | from transformers.generation import BaseStreamer 11 | 12 | 13 | class AudioStreamer(BaseStreamer): 14 | """ 15 | Audio streamer that stores audio chunks in queues for each sample in the batch. 16 | This allows streaming audio generation for multiple samples simultaneously. 17 | 18 | Parameters: 19 | batch_size (`int`): 20 | The batch size for generation 21 | stop_signal (`any`, *optional*): 22 | The signal to put in the queue when generation ends. Defaults to None. 23 | timeout (`float`, *optional*): 24 | The timeout for the audio queue. If `None`, the queue will block indefinitely. 25 | """ 26 | 27 | def __init__( 28 | self, 29 | batch_size: int, 30 | stop_signal: Optional[any] = None, 31 | timeout: Optional[float] = None, 32 | ): 33 | self.batch_size = batch_size 34 | self.stop_signal = stop_signal 35 | self.timeout = timeout 36 | 37 | # Create a queue for each sample in the batch 38 | self.audio_queues = [Queue() for _ in range(batch_size)] 39 | self.finished_flags = [False for _ in range(batch_size)] 40 | self.sample_indices_map = {} # Maps from sample index to queue index 41 | 42 | def put(self, audio_chunks: torch.Tensor, sample_indices: torch.Tensor): 43 | """ 44 | Receives audio chunks and puts them in the appropriate queues. 45 | 46 | Args: 47 | audio_chunks: Tensor of shape (num_samples, ...) containing audio chunks 48 | sample_indices: Tensor indicating which samples these chunks belong to 49 | """ 50 | for i, sample_idx in enumerate(sample_indices): 51 | idx = sample_idx.item() 52 | if idx < self.batch_size and not self.finished_flags[idx]: 53 | # Convert to numpy or keep as tensor based on preference 54 | audio_chunk = audio_chunks[i].detach().cpu() 55 | self.audio_queues[idx].put(audio_chunk, timeout=self.timeout) 56 | 57 | def end(self, sample_indices: Optional[torch.Tensor] = None): 58 | """ 59 | Signals the end of generation for specified samples or all samples. 60 | 61 | Args: 62 | sample_indices: Optional tensor of sample indices to end. If None, ends all. 
63 | """ 64 | if sample_indices is None: 65 | # End all samples 66 | for idx in range(self.batch_size): 67 | if not self.finished_flags[idx]: 68 | self.audio_queues[idx].put(self.stop_signal, timeout=self.timeout) 69 | self.finished_flags[idx] = True 70 | else: 71 | # End specific samples 72 | for sample_idx in sample_indices: 73 | idx = sample_idx.item() if torch.is_tensor(sample_idx) else sample_idx 74 | if idx < self.batch_size and not self.finished_flags[idx]: 75 | self.audio_queues[idx].put(self.stop_signal, timeout=self.timeout) 76 | self.finished_flags[idx] = True 77 | 78 | def __iter__(self): 79 | """Returns an iterator over the batch of audio streams.""" 80 | return AudioBatchIterator(self) 81 | 82 | def get_stream(self, sample_idx: int): 83 | """Get the audio stream for a specific sample.""" 84 | if sample_idx >= self.batch_size: 85 | raise ValueError(f"Sample index {sample_idx} exceeds batch size {self.batch_size}") 86 | return AudioSampleIterator(self, sample_idx) 87 | 88 | 89 | class AudioSampleIterator: 90 | """Iterator for a single audio stream from the batch.""" 91 | 92 | def __init__(self, streamer: AudioStreamer, sample_idx: int): 93 | self.streamer = streamer 94 | self.sample_idx = sample_idx 95 | 96 | def __iter__(self): 97 | return self 98 | 99 | def __next__(self): 100 | value = self.streamer.audio_queues[self.sample_idx].get(timeout=self.streamer.timeout) 101 | if value == self.streamer.stop_signal: 102 | raise StopIteration() 103 | return value 104 | 105 | 106 | class AudioBatchIterator: 107 | """Iterator that yields audio chunks for all samples in the batch.""" 108 | 109 | def __init__(self, streamer: AudioStreamer): 110 | self.streamer = streamer 111 | self.active_samples = set(range(streamer.batch_size)) 112 | 113 | def __iter__(self): 114 | return self 115 | 116 | def __next__(self): 117 | if not self.active_samples: 118 | raise StopIteration() 119 | 120 | batch_chunks = {} 121 | samples_to_remove = set() 122 | 123 | # Try to get chunks from all active samples 124 | for idx in self.active_samples: 125 | try: 126 | value = self.streamer.audio_queues[idx].get(block=False) 127 | if value == self.streamer.stop_signal: 128 | samples_to_remove.add(idx) 129 | else: 130 | batch_chunks[idx] = value 131 | except: 132 | # Queue is empty for this sample, skip it this iteration 133 | pass 134 | 135 | # Remove finished samples 136 | self.active_samples -= samples_to_remove 137 | 138 | if batch_chunks: 139 | return batch_chunks 140 | elif self.active_samples: 141 | # If no chunks were ready but we still have active samples, 142 | # wait a bit and try again 143 | import time 144 | time.sleep(0.01) 145 | return self.__next__() 146 | else: 147 | raise StopIteration() 148 | 149 | 150 | class AsyncAudioStreamer(AudioStreamer): 151 | """ 152 | Async version of AudioStreamer for use in async contexts. 
153 | """ 154 | 155 | def __init__( 156 | self, 157 | batch_size: int, 158 | stop_signal: Optional[any] = None, 159 | timeout: Optional[float] = None, 160 | ): 161 | super().__init__(batch_size, stop_signal, timeout) 162 | # Replace regular queues with async queues 163 | self.audio_queues = [asyncio.Queue() for _ in range(batch_size)] 164 | self.loop = asyncio.get_running_loop() 165 | 166 | def put(self, audio_chunks: torch.Tensor, sample_indices: torch.Tensor): 167 | """Put audio chunks in the appropriate async queues.""" 168 | for i, sample_idx in enumerate(sample_indices): 169 | idx = sample_idx.item() 170 | if idx < self.batch_size and not self.finished_flags[idx]: 171 | audio_chunk = audio_chunks[i].detach().cpu() 172 | self.loop.call_soon_threadsafe( 173 | self.audio_queues[idx].put_nowait, audio_chunk 174 | ) 175 | 176 | def end(self, sample_indices: Optional[torch.Tensor] = None): 177 | """Signal the end of generation for specified samples.""" 178 | if sample_indices is None: 179 | indices_to_end = range(self.batch_size) 180 | else: 181 | indices_to_end = [s.item() if torch.is_tensor(s) else s for s in sample_indices] 182 | 183 | for idx in indices_to_end: 184 | if idx < self.batch_size and not self.finished_flags[idx]: 185 | self.loop.call_soon_threadsafe( 186 | self.audio_queues[idx].put_nowait, self.stop_signal 187 | ) 188 | self.finished_flags[idx] = True 189 | 190 | async def get_stream(self, sample_idx: int): 191 | """Get async iterator for a specific sample's audio stream.""" 192 | if sample_idx >= self.batch_size: 193 | raise ValueError(f"Sample index {sample_idx} exceeds batch size {self.batch_size}") 194 | 195 | while True: 196 | value = await self.audio_queues[sample_idx].get() 197 | if value == self.stop_signal: 198 | break 199 | yield value 200 | 201 | def __aiter__(self): 202 | """Returns an async iterator over all audio streams.""" 203 | return AsyncAudioBatchIterator(self) 204 | 205 | 206 | class AsyncAudioBatchIterator: 207 | """Async iterator for batch audio streaming.""" 208 | 209 | def __init__(self, streamer: AsyncAudioStreamer): 210 | self.streamer = streamer 211 | self.active_samples = set(range(streamer.batch_size)) 212 | 213 | def __aiter__(self): 214 | return self 215 | 216 | async def __anext__(self): 217 | if not self.active_samples: 218 | raise StopAsyncIteration() 219 | 220 | batch_chunks = {} 221 | samples_to_remove = set() 222 | 223 | # Create tasks for all active samples 224 | tasks = { 225 | idx: asyncio.create_task(self._get_chunk(idx)) 226 | for idx in self.active_samples 227 | } 228 | 229 | # Wait for at least one chunk to be ready 230 | done, pending = await asyncio.wait( 231 | tasks.values(), 232 | return_when=asyncio.FIRST_COMPLETED, 233 | timeout=self.streamer.timeout 234 | ) 235 | 236 | # Cancel pending tasks 237 | for task in pending: 238 | task.cancel() 239 | 240 | # Process completed tasks 241 | for idx, task in tasks.items(): 242 | if task in done: 243 | try: 244 | value = await task 245 | if value == self.streamer.stop_signal: 246 | samples_to_remove.add(idx) 247 | else: 248 | batch_chunks[idx] = value 249 | except asyncio.CancelledError: 250 | pass 251 | 252 | self.active_samples -= samples_to_remove 253 | 254 | if batch_chunks: 255 | return batch_chunks 256 | elif self.active_samples: 257 | # Try again if we still have active samples 258 | return await self.__anext__() 259 | else: 260 | raise StopAsyncIteration() 261 | 262 | async def _get_chunk(self, idx): 263 | """Helper to get a chunk from a specific queue.""" 264 | return await 
self.streamer.audio_queues[idx].get() -------------------------------------------------------------------------------- /vvembed/modular/configuration_vibevoice.py: -------------------------------------------------------------------------------- 1 | # Original code by Microsoft 2 | # updated by Fabio Sarracino - Enemyx-net 3 | 4 | """ VibeVoice_AcousticTokenizer model configuration""" 5 | 6 | from typing import Dict, List, Optional, Tuple 7 | 8 | from transformers.configuration_utils import PretrainedConfig 9 | from transformers.utils import logging 10 | 11 | from transformers.models.qwen2.configuration_qwen2 import Qwen2Config 12 | 13 | logger = logging.get_logger(__name__) 14 | 15 | # to be improved... 16 | 17 | 18 | class VibeVoiceAcousticTokenizerConfig(PretrainedConfig): 19 | model_type = "vibevoice_acoustic_tokenizer" 20 | 21 | def __init__( 22 | self, 23 | channels: int = 1, 24 | corpus_normalize: float = 0.0, 25 | causal: bool = True, 26 | vae_dim: int = 64, 27 | fix_std: float = 0.5, 28 | std_dist_type: str = 'gaussian', 29 | # common 30 | mixer_layer: str = 'depthwise_conv', 31 | conv_norm: str = 'none', 32 | pad_mode: str = 'constant', 33 | disable_last_norm: bool = True, 34 | layernorm: str = 'RMSNorm', 35 | layernorm_eps: float = 1e-5, 36 | layernorm_elementwise_affine: bool = True, 37 | conv_bias: bool = True, 38 | layer_scale_init_value: float = 1e-6, 39 | weight_init_value: float = 1e-2, 40 | # encoder specific 41 | encoder_n_filters: int = 32, 42 | encoder_ratios: Optional[List[int]] = [8,5,5,4,2,2], 43 | encoder_depths: str = "3-3-3-3-3-3-8", 44 | # decoder specific 45 | decoder_n_filters: int = 32, 46 | decoder_ratios: Optional[List[int]] = None, # if None, same as encoder 47 | decoder_depths: Optional[str] = None, 48 | **kwargs 49 | ): 50 | super().__init__(**kwargs) 51 | self.channels = channels 52 | self.corpus_normalize = corpus_normalize 53 | self.causal = causal 54 | self.vae_dim = vae_dim 55 | self.fix_std = fix_std 56 | self.std_dist_type = std_dist_type 57 | 58 | # common parameters 59 | self.conv_norm = conv_norm 60 | self.pad_mode = pad_mode 61 | self.layernorm_eps = layernorm_eps 62 | self.disable_last_norm = disable_last_norm 63 | self.layernorm = layernorm 64 | self.layernorm_elementwise_affine = layernorm_elementwise_affine 65 | self.conv_bias = conv_bias 66 | self.layer_scale_init_value = layer_scale_init_value 67 | self.weight_init_value = weight_init_value 68 | self.mixer_layer = mixer_layer 69 | 70 | # encoder specific parameters 71 | self.encoder_n_filters = encoder_n_filters 72 | self.encoder_ratios = encoder_ratios 73 | self.encoder_depths = encoder_depths 74 | 75 | # decoder specific parameters 76 | self.decoder_ratios = decoder_ratios if decoder_ratios is not None else encoder_ratios 77 | self.decoder_n_filters = decoder_n_filters 78 | self.decoder_depths = decoder_depths 79 | 80 | 81 | class VibeVoiceSemanticTokenizerConfig(PretrainedConfig): 82 | model_type = "vibevoice_semantic_tokenizer" 83 | 84 | def __init__( 85 | self, 86 | channels: int = 1, 87 | corpus_normalize: float = 0.0, 88 | causal: bool = True, 89 | vae_dim: int = 64, 90 | fix_std: float = 0, 91 | std_dist_type: str = 'none', 92 | # common 93 | mixer_layer: str = 'depthwise_conv', 94 | conv_norm: str = 'none', 95 | pad_mode: str = 'constant', 96 | disable_last_norm: bool = True, 97 | layernorm: str = 'RMSNorm', 98 | layernorm_eps: float = 1e-5, 99 | layernorm_elementwise_affine: bool = True, 100 | conv_bias: bool = True, 101 | layer_scale_init_value: float = 1e-6, 102 | 
weight_init_value: float = 1e-2, 103 | # encoder specific 104 | encoder_n_filters: int = 32, 105 | encoder_ratios: Optional[List[int]] = [8,5,5,4,2,2], 106 | encoder_depths: str = "3-3-3-3-3-3-8", 107 | **kwargs 108 | ): 109 | super().__init__(**kwargs) 110 | self.channels = channels 111 | self.corpus_normalize = corpus_normalize 112 | self.causal = causal 113 | self.vae_dim = vae_dim 114 | self.fix_std = fix_std 115 | self.std_dist_type = std_dist_type 116 | 117 | # common parameters 118 | self.conv_norm = conv_norm 119 | self.pad_mode = pad_mode 120 | self.layernorm_eps = layernorm_eps 121 | self.disable_last_norm = disable_last_norm 122 | self.layernorm = layernorm 123 | self.layernorm_elementwise_affine = layernorm_elementwise_affine 124 | self.conv_bias = conv_bias 125 | self.layer_scale_init_value = layer_scale_init_value 126 | self.weight_init_value = weight_init_value 127 | self.mixer_layer = mixer_layer 128 | 129 | # encoder specific parameters 130 | self.encoder_n_filters = encoder_n_filters 131 | self.encoder_ratios = encoder_ratios 132 | self.encoder_depths = encoder_depths 133 | 134 | 135 | class VibeVoiceDiffusionHeadConfig(PretrainedConfig): 136 | model_type = "vibevoice_diffusion_head" 137 | 138 | def __init__( 139 | self, 140 | hidden_size=768, 141 | head_layers=4, 142 | head_ffn_ratio=3.0, 143 | rms_norm_eps=1e-5, 144 | latent_size=64, 145 | speech_vae_dim=None, 146 | prediction_type="v_prediction", 147 | diffusion_type="ddpm", 148 | ddpm_num_steps=1000, 149 | ddpm_num_inference_steps=20, 150 | ddpm_beta_schedule="cosine", 151 | ddpm_batch_mul=4, 152 | **kwargs 153 | ): 154 | self.hidden_size = hidden_size 155 | self.head_layers = head_layers 156 | self.head_ffn_ratio = head_ffn_ratio 157 | self.rms_norm_eps = rms_norm_eps 158 | self.latent_size = latent_size 159 | self.speech_vae_dim = speech_vae_dim 160 | self.prediction_type = prediction_type 161 | self.diffusion_type = diffusion_type 162 | self.ddpm_num_steps = ddpm_num_steps 163 | self.ddpm_num_inference_steps = ddpm_num_inference_steps 164 | self.ddpm_beta_schedule = ddpm_beta_schedule 165 | self.ddpm_batch_mul = ddpm_batch_mul 166 | 167 | super().__init__(**kwargs) 168 | 169 | class VibeVoiceConfig(PretrainedConfig): 170 | model_type = "vibevoice" 171 | is_composition = True 172 | sub_configs = { 173 | "acoustic_tokenizer_config": VibeVoiceAcousticTokenizerConfig, 174 | "semantic_tokenizer_config": VibeVoiceSemanticTokenizerConfig, 175 | "decoder_config": Qwen2Config, 176 | "diffusion_head_config": VibeVoiceDiffusionHeadConfig, 177 | } 178 | # keys_to_ignore_at_inference = ["past_key_values"] 179 | # Default tensor parallel plan for base model `Qwen2` 180 | base_model_tp_plan = { 181 | "layers.*.self_attn.q_proj": "colwise", 182 | "layers.*.self_attn.k_proj": "colwise", 183 | "layers.*.self_attn.v_proj": "colwise", 184 | "layers.*.self_attn.o_proj": "rowwise", 185 | "layers.*.mlp.gate_proj": "colwise", 186 | "layers.*.mlp.up_proj": "colwise", 187 | "layers.*.mlp.down_proj": "rowwise", 188 | } 189 | 190 | def __init__( 191 | self, 192 | acoustic_tokenizer_config=None, 193 | semantic_tokenizer_config=None, 194 | decoder_config=None, 195 | diffusion_head_config=None, 196 | **kwargs 197 | ): 198 | 199 | # kwargs["_attn_implementation"] = "flash_attention_2" 200 | kwargs["_attn_implementation_autoset"] = False 201 | 202 | if acoustic_tokenizer_config is None: 203 | self.acoustic_tokenizer_config = self.sub_configs["acoustic_tokenizer_config"]() 204 | elif isinstance(acoustic_tokenizer_config, dict): 205 | 
acoustic_tokenizer_config["model_type"] = "vibevoice_acoustic_tokenizer" 206 | self.acoustic_tokenizer_config = self.sub_configs["acoustic_tokenizer_config"](**acoustic_tokenizer_config) 207 | elif isinstance(acoustic_tokenizer_config, VibeVoiceAcousticTokenizerConfig): 208 | # If an instance of the config class is provided 209 | self.acoustic_tokenizer_config = acoustic_tokenizer_config 210 | 211 | if semantic_tokenizer_config is None: 212 | self.semantic_tokenizer_config = self.sub_configs["semantic_tokenizer_config"]() 213 | elif isinstance(semantic_tokenizer_config, dict): 214 | semantic_tokenizer_config["model_type"] = "vibevoice_semantic_tokenizer" 215 | self.semantic_tokenizer_config = self.sub_configs["semantic_tokenizer_config"](**semantic_tokenizer_config) 216 | elif isinstance(semantic_tokenizer_config, VibeVoiceSemanticTokenizerConfig): 217 | # If an instance of the config class is provided 218 | self.semantic_tokenizer_config = semantic_tokenizer_config 219 | 220 | if decoder_config is None: 221 | self.decoder_config = self.sub_configs["decoder_config"]() 222 | elif isinstance(decoder_config, dict): 223 | # If a dictionary is provided, instantiate the config class with it 224 | # self.decoder_config = self.sub_configs["decoder_config"](**decoder_config) 225 | if decoder_config.get("model_type", '') == "qwen2": 226 | self.decoder_config = Qwen2Config(**decoder_config) 227 | else: 228 | raise ValueError(f"Unsupported decoder model type: {decoder_config.get('model_type', '')}") 229 | elif isinstance(decoder_config, (Qwen2Config,)): 230 | # If an instance of the config class is provided 231 | self.decoder_config = decoder_config 232 | 233 | if diffusion_head_config is None: 234 | self.diffusion_head_config = self.sub_configs["diffusion_head_config"]() 235 | elif isinstance(diffusion_head_config, dict): 236 | diffusion_head_config["model_type"] = "vibevoice_diffusion_head" 237 | self.diffusion_head_config = self.sub_configs["diffusion_head_config"](**diffusion_head_config) 238 | elif isinstance(diffusion_head_config, VibeVoiceDiffusionHeadConfig): 239 | # If an instance of the config class is provided 240 | self.diffusion_head_config = diffusion_head_config 241 | 242 | # other parameters 243 | self.acoustic_vae_dim = getattr(self.acoustic_tokenizer_config, 'vae_dim', 64) 244 | self.semantic_vae_dim = getattr(self.semantic_tokenizer_config, 'vae_dim', 128) 245 | 246 | # Add attributes required by newer transformers versions from decoder_config 247 | # These are used by GenerationMixin in newer versions 248 | if hasattr(self.decoder_config, 'num_hidden_layers'): 249 | self.num_hidden_layers = self.decoder_config.num_hidden_layers 250 | if hasattr(self.decoder_config, 'vocab_size'): 251 | self.vocab_size = self.decoder_config.vocab_size 252 | if hasattr(self.decoder_config, 'hidden_size'): 253 | self.hidden_size = self.decoder_config.hidden_size 254 | if hasattr(self.decoder_config, 'num_attention_heads'): 255 | self.num_attention_heads = self.decoder_config.num_attention_heads 256 | if hasattr(self.decoder_config, 'num_key_value_heads'): 257 | self.num_key_value_heads = self.decoder_config.num_key_value_heads 258 | if hasattr(self.decoder_config, 'intermediate_size'): 259 | self.intermediate_size = self.decoder_config.intermediate_size 260 | if hasattr(self.decoder_config, 'max_position_embeddings'): 261 | self.max_position_embeddings = self.decoder_config.max_position_embeddings 262 | 263 | super().__init__(**kwargs) 264 | 265 | __all__ = [ 266 | "VibeVoiceAcousticTokenizerConfig", 
267 | "VibeVoiceSemanticTokenizerConfig", 268 | "VibeVoiceDiffusionHeadConfig", 269 | "VibeVoiceConfig" 270 | ] -------------------------------------------------------------------------------- /nodes/single_speaker_node.py: -------------------------------------------------------------------------------- 1 | # Created by Fabio Sarracino 2 | 3 | import logging 4 | import os 5 | import tempfile 6 | import torch 7 | import numpy as np 8 | import re 9 | from typing import List, Optional 10 | 11 | from .base_vibevoice import BaseVibeVoiceNode, get_available_models 12 | 13 | # Setup logging 14 | logger = logging.getLogger("VibeVoice") 15 | 16 | class VibeVoiceSingleSpeakerNode(BaseVibeVoiceNode): 17 | def __init__(self): 18 | super().__init__() 19 | # Register this instance for memory management 20 | try: 21 | from .free_memory_node import VibeVoiceFreeMemoryNode 22 | VibeVoiceFreeMemoryNode.register_single_speaker(self) 23 | except: 24 | pass 25 | 26 | @classmethod 27 | def INPUT_TYPES(cls): 28 | # Get available models dynamically 29 | available_models = get_available_models() 30 | model_choices = [display_name for _, display_name in available_models] 31 | default_model = model_choices[0] if model_choices else "No models found" 32 | 33 | return { 34 | "required": { 35 | "text": ("STRING", { 36 | "multiline": True, 37 | "default": "Hello, this is a test of the VibeVoice text-to-speech system.", 38 | "tooltip": "Text to convert to speech. Gets disabled when connected to another node.", 39 | "forceInput": False, 40 | "dynamicPrompts": True 41 | }), 42 | "model": (model_choices if model_choices else ["No models found"], { 43 | "default": default_model, 44 | "tooltip": "Select a model from ComfyUI/models/vibevoice/ folder" 45 | }), 46 | "attention_type": (["auto", "eager", "sdpa", "flash_attention_2", "sage"], { 47 | "default": "auto", 48 | "tooltip": "Attention implementation. Auto selects the best available, eager is standard, sdpa is optimized PyTorch, flash_attention_2 requires compatible GPU, sage uses quantized attention for speedup (CUDA only)" 49 | }), 50 | "quantize_llm": (["full precision", "4bit", "8bit"], { 51 | "default": "full precision", 52 | "tooltip": "Dynamically quantize only the LLM component for non-quantized models. 4bit: major VRAM savings with minimal quality loss. 8bit: good balance of quality and memory usage. Full precision: original quality. Note: ignored for pre-quantized models. Requires CUDA GPU." 53 | }), 54 | "free_memory_after_generate": ("BOOLEAN", {"default": True, "tooltip": "Free model from memory after generation to save VRAM/RAM. Disable to keep model loaded for faster subsequent generations"}), 55 | "diffusion_steps": ("INT", {"default": 20, "min": 1, "max": 100, "step": 1, "tooltip": "Number of denoising steps. More steps = theoretically better quality but slower. Default: 20"}), 56 | "seed": ("INT", {"default": 42, "min": 0, "max": 2**32-1, "tooltip": "Random seed for generation. Default 42 is used in official examples"}), 57 | "cfg_scale": ("FLOAT", {"default": 1.3, "min": 0.5, "max": 3.5, "step": 0.05, "tooltip": "Classifier-free guidance scale (official default: 1.3)"}), 58 | "use_sampling": ("BOOLEAN", {"default": False, "tooltip": "Enable sampling mode. When False (default), uses deterministic generation like official examples"}), 59 | }, 60 | "optional": { 61 | "voice_to_clone": ("AUDIO", {"tooltip": "Optional: Reference voice to clone. 
If not provided, synthetic voice will be used."}), 62 | "lora": ("LORA_CONFIG", {"tooltip": "Optional: LoRA configuration from VibeVoice LoRA node"}), 63 | "temperature": ("FLOAT", {"default": 0.95, "min": 0.1, "max": 2.0, "step": 0.05, "tooltip": "Only used when sampling is enabled"}), 64 | "top_p": ("FLOAT", {"default": 0.95, "min": 0.1, "max": 1.0, "step": 0.05, "tooltip": "Only used when sampling is enabled"}), 65 | "max_words_per_chunk": ("INT", {"default": 250, "min": 100, "max": 500, "step": 50, "tooltip": "Maximum words per chunk for long texts. Lower values prevent speed issues but create more chunks."}), 66 | "voice_speed_factor": ("FLOAT", { 67 | "default": 1.0, 68 | "min": 0.8, 69 | "max": 1.2, 70 | "step": 0.01, 71 | "tooltip": "1.0 = normal speed, <1.0 = slower speed, >1.0 = faster speed" 72 | }), 73 | } 74 | } 75 | 76 | RETURN_TYPES = ("AUDIO",) 77 | RETURN_NAMES = ("audio",) 78 | FUNCTION = "generate_speech" 79 | CATEGORY = "VibeVoiceWrapper" 80 | DESCRIPTION = "Generate speech from text using Microsoft VibeVoice with optional voice cloning" 81 | 82 | def _prepare_voice_samples(self, speakers: list, voice_to_clone, voice_speed_factor: float = 1.0) -> List[np.ndarray]: 83 | """Prepare voice samples from input audio or create synthetic ones""" 84 | 85 | if voice_to_clone is not None: 86 | # Use the base class method to prepare audio with speed adjustment 87 | audio_np = self._prepare_audio_from_comfyui(voice_to_clone, speed_factor=voice_speed_factor) 88 | if audio_np is not None: 89 | return [audio_np] 90 | 91 | # Create synthetic voice samples for speakers 92 | voice_samples = [] 93 | for i, speaker in enumerate(speakers): 94 | voice_sample = self._create_synthetic_voice_sample(i) 95 | voice_samples.append(voice_sample) 96 | 97 | return voice_samples 98 | 99 | def generate_speech(self, text: str = "", model: str = "VibeVoice-1.5B", 100 | attention_type: str = "auto", quantize_llm: str = "full precision", 101 | free_memory_after_generate: bool = True, 102 | diffusion_steps: int = 20, seed: int = 42, cfg_scale: float = 1.3, 103 | use_sampling: bool = False, voice_to_clone=None, lora=None, 104 | temperature: float = 0.95, top_p: float = 0.95, 105 | max_words_per_chunk: int = 250, voice_speed_factor: float = 1.0): 106 | """Generate speech from text using VibeVoice""" 107 | 108 | try: 109 | # Use text directly (it now serves as both manual input and connection input) 110 | if text and text.strip(): 111 | final_text = text 112 | else: 113 | raise Exception("No text provided. 
Please enter text or connect from LoadTextFromFile node.") 114 | 115 | # Get the actual folder path for the selected model 116 | available_models = get_available_models() 117 | model_path = None 118 | for folder, display_name in available_models: 119 | if display_name == model: 120 | model_path = folder 121 | break 122 | 123 | if not model_path: 124 | raise Exception(f"Model '{model}' not found in models/vibevoice/") 125 | 126 | # Extract LoRA configuration if provided 127 | lora_path = None 128 | llm_lora_strength = 1.0 129 | if lora and isinstance(lora, dict): 130 | lora_path = lora.get("path", None) 131 | llm_lora_strength = lora.get("llm_strength", 1.0) 132 | 133 | # Set LoRA component flags based on configuration 134 | self.use_llm_lora = lora.get("use_llm", True) 135 | self.use_diffusion_head_lora = lora.get("use_diffusion_head", True) 136 | self.use_acoustic_connector_lora = lora.get("use_acoustic_connector", True) 137 | self.use_semantic_connector_lora = lora.get("use_semantic_connector", True) 138 | 139 | if lora_path: 140 | logger.info(f"Using LoRA from: {lora_path}") 141 | 142 | # Load model with optional LoRA 143 | self.load_model(model, model_path, attention_type, quantize_llm=quantize_llm, lora_path=lora_path) 144 | 145 | # For single speaker, we just use ["Speaker 1"] 146 | speakers = ["Speaker 1"] 147 | 148 | # Parse pause keywords from text 149 | segments = self._parse_pause_keywords(final_text) 150 | 151 | # Process segments 152 | all_audio_segments = [] 153 | voice_samples = None # Will be created on first text segment 154 | sample_rate = 24000 # VibeVoice uses 24kHz 155 | 156 | for seg_idx, (seg_type, seg_content) in enumerate(segments): 157 | if seg_type == 'pause': 158 | # Generate silence for pause 159 | duration_ms = seg_content 160 | logger.info(f"Adding {duration_ms}ms pause") 161 | silence_audio = self._generate_silence(duration_ms, sample_rate) 162 | all_audio_segments.append(silence_audio) 163 | 164 | elif seg_type == 'text': 165 | # Process text segment (with chunking if needed) 166 | word_count = len(seg_content.split()) 167 | 168 | if word_count > max_words_per_chunk: 169 | # Split long text into chunks 170 | logger.info(f"Text segment {seg_idx+1} has {word_count} words, splitting into chunks...") 171 | text_chunks = self._split_text_into_chunks(seg_content, max_words_per_chunk) 172 | 173 | for chunk_idx, chunk in enumerate(text_chunks): 174 | logger.info(f"Processing chunk {chunk_idx+1}/{len(text_chunks)} of segment {seg_idx+1}...") 175 | 176 | # Format chunk for VibeVoice 177 | formatted_text = self._format_text_for_vibevoice(chunk, speakers) 178 | 179 | # Create voice samples on first text segment 180 | if voice_samples is None: 181 | voice_samples = self._prepare_voice_samples(speakers, voice_to_clone, voice_speed_factor) 182 | 183 | # Generate audio for this chunk 184 | chunk_audio = self._generate_with_vibevoice( 185 | formatted_text, voice_samples, cfg_scale, 186 | seed, # Use same seed for voice consistency 187 | diffusion_steps, use_sampling, temperature, top_p, 188 | llm_lora_strength=llm_lora_strength 189 | ) 190 | 191 | all_audio_segments.append(chunk_audio) 192 | else: 193 | # Process as single chunk 194 | logger.info(f"Processing text segment {seg_idx+1} ({word_count} words)") 195 | 196 | # Format text for VibeVoice 197 | formatted_text = self._format_text_for_vibevoice(seg_content, speakers) 198 | 199 | # Create voice samples on first text segment 200 | if voice_samples is None: 201 | voice_samples = self._prepare_voice_samples(speakers, 
voice_to_clone, voice_speed_factor) 202 | 203 | # Generate audio 204 | segment_audio = self._generate_with_vibevoice( 205 | formatted_text, voice_samples, cfg_scale, seed, diffusion_steps, 206 | use_sampling, temperature, top_p, llm_lora_strength=llm_lora_strength 207 | ) 208 | 209 | all_audio_segments.append(segment_audio) 210 | 211 | # Concatenate all audio segments (including pauses) 212 | if all_audio_segments: 213 | logger.info(f"Concatenating {len(all_audio_segments)} audio segments (including pauses)...") 214 | 215 | # Extract waveforms from all segments 216 | waveforms = [] 217 | for audio_segment in all_audio_segments: 218 | if isinstance(audio_segment, dict) and "waveform" in audio_segment: 219 | waveforms.append(audio_segment["waveform"]) 220 | 221 | if waveforms: 222 | # Filter out None values if any 223 | valid_waveforms = [w for w in waveforms if w is not None] 224 | 225 | if valid_waveforms: 226 | # Concatenate along the time dimension (last dimension) 227 | combined_waveform = torch.cat(valid_waveforms, dim=-1) 228 | 229 | # Create final audio dict 230 | audio_dict = { 231 | "waveform": combined_waveform, 232 | "sample_rate": sample_rate 233 | } 234 | logger.info(f"Successfully generated audio with {len(segments)} segments") 235 | else: 236 | raise Exception("No valid audio waveforms generated") 237 | else: 238 | raise Exception("Failed to extract waveforms from audio segments") 239 | else: 240 | raise Exception("No audio segments generated") 241 | 242 | # Free memory if requested 243 | if free_memory_after_generate: 244 | self.free_memory() 245 | 246 | return (audio_dict,) 247 | 248 | except Exception as e: 249 | # Check if this is an interruption by the user 250 | import comfy.model_management as mm 251 | if isinstance(e, mm.InterruptProcessingException): 252 | # User interrupted - just log it and re-raise to stop the workflow 253 | logger.info("Generation interrupted by user") 254 | raise # Propagate the interruption to stop the workflow 255 | else: 256 | # Real error - show it 257 | logger.error(f"Single speaker speech generation failed: {str(e)}") 258 | raise Exception(f"Error generating speech: {str(e)}") 259 | 260 | @classmethod 261 | def IS_CHANGED(cls, text="", model="VibeVoice-1.5B", voice_to_clone=None, lora=None, **kwargs): 262 | """Cache key for ComfyUI""" 263 | voice_hash = hash(str(voice_to_clone)) if voice_to_clone else 0 264 | lora_hash = hash(str(lora)) if lora else 0 265 | return f"{hash(text)}_{model}_{voice_hash}_{lora_hash}_{kwargs.get('cfg_scale', 1.3)}_{kwargs.get('seed', 0)}" -------------------------------------------------------------------------------- /vvembed/processor/vibevoice_tokenizer_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Processor class for VibeVoice models. 3 | """ 4 | 5 | import os 6 | import json 7 | import warnings 8 | from typing import List, Optional, Union, Dict, Any 9 | 10 | import numpy as np 11 | import torch 12 | 13 | from transformers.feature_extraction_utils import FeatureExtractionMixin 14 | from transformers.utils import logging 15 | 16 | logger = logging.get_logger(__name__) 17 | 18 | 19 | class AudioNormalizer: 20 | """ 21 | Audio normalization class for VibeVoice tokenizer. 22 | 23 | This class provides audio normalization to ensure consistent input levels 24 | for the VibeVoice tokenizer while maintaining audio quality. 25 | """ 26 | 27 | def __init__(self, target_dB_FS: float = -25, eps: float = 1e-6): 28 | """ 29 | Initialize the audio normalizer. 
30 | 31 | Args: 32 | target_dB_FS (float): Target dB FS level for the audio. Default: -25 33 | eps (float): Small value to avoid division by zero. Default: 1e-6 34 | """ 35 | self.target_dB_FS = target_dB_FS 36 | self.eps = eps 37 | 38 | def tailor_dB_FS(self, audio: np.ndarray) -> tuple: 39 | """ 40 | Adjust the audio to the target dB FS level. 41 | 42 | Args: 43 | audio (np.ndarray): Input audio signal 44 | 45 | Returns: 46 | tuple: (normalized_audio, rms, scalar) 47 | """ 48 | rms = np.sqrt(np.mean(audio**2)) 49 | scalar = 10 ** (self.target_dB_FS / 20) / (rms + self.eps) 50 | normalized_audio = audio * scalar 51 | return normalized_audio, rms, scalar 52 | 53 | def avoid_clipping(self, audio: np.ndarray, scalar: Optional[float] = None) -> tuple: 54 | """ 55 | Avoid clipping by scaling down if necessary. 56 | 57 | Args: 58 | audio (np.ndarray): Input audio signal 59 | scalar (float, optional): Explicit scaling factor 60 | 61 | Returns: 62 | tuple: (normalized_audio, scalar) 63 | """ 64 | if scalar is None: 65 | max_val = np.max(np.abs(audio)) 66 | if max_val > 1.0: 67 | scalar = max_val + self.eps 68 | else: 69 | scalar = 1.0 70 | 71 | return audio / scalar, scalar 72 | 73 | def __call__(self, audio: np.ndarray) -> np.ndarray: 74 | """ 75 | Normalize the audio by adjusting to target dB FS and avoiding clipping. 76 | 77 | Args: 78 | audio (np.ndarray): Input audio signal 79 | 80 | Returns: 81 | np.ndarray: Normalized audio signal 82 | """ 83 | # First adjust to target dB FS 84 | audio, _, _ = self.tailor_dB_FS(audio) 85 | # Then avoid clipping 86 | audio, _ = self.avoid_clipping(audio) 87 | return audio 88 | 89 | 90 | # Change from ProcessorMixin to FeatureExtractionMixin which is designed for single components 91 | class VibeVoiceTokenizerProcessor(FeatureExtractionMixin): 92 | """ 93 | Processor for VibeVoice acoustic tokenizer models. 94 | 95 | This processor handles audio preprocessing for VibeVoice models, including: 96 | - Audio format conversion (stereo to mono) 97 | - Optional audio normalization 98 | - Streaming support for infinite-length audio 99 | 100 | Args: 101 | sampling_rate (int, optional): Expected sampling rate. Defaults to 24000. 102 | normalize_audio (bool, optional): Whether to normalize audio. Defaults to True. 103 | target_dB_FS (float, optional): Target dB FS for normalization. Defaults to -25. 104 | eps (float, optional): Small value for numerical stability. Defaults to 1e-6. 105 | """ 106 | model_input_names = ["input_features"] 107 | 108 | def __init__( 109 | self, 110 | sampling_rate: int = 24000, 111 | normalize_audio: bool = True, 112 | target_dB_FS: float = -25, 113 | eps: float = 1e-6, 114 | **kwargs, 115 | ): 116 | super().__init__(**kwargs) 117 | 118 | self.sampling_rate = sampling_rate 119 | self.normalize_audio = normalize_audio 120 | 121 | # Initialize audio normalizer if needed 122 | if self.normalize_audio: 123 | self.normalizer = AudioNormalizer(target_dB_FS=target_dB_FS, eps=eps) 124 | else: 125 | self.normalizer = None 126 | 127 | # Save config 128 | self.feature_extractor_dict = { 129 | "sampling_rate": sampling_rate, 130 | "normalize_audio": normalize_audio, 131 | "target_dB_FS": target_dB_FS, 132 | "eps": eps, 133 | } 134 | 135 | def _ensure_mono(self, audio: np.ndarray) -> np.ndarray: 136 | """ 137 | Convert stereo audio to mono if needed. 
138 | 139 | Args: 140 | audio (np.ndarray): Input audio array 141 | 142 | Returns: 143 | np.ndarray: Mono audio array 144 | """ 145 | if len(audio.shape) == 1: 146 | return audio 147 | elif len(audio.shape) == 2: 148 | if audio.shape[0] == 2: # (2, time) 149 | return np.mean(audio, axis=0) 150 | elif audio.shape[1] == 2: # (time, 2) 151 | return np.mean(audio, axis=1) 152 | else: 153 | # If one dimension is 1, squeeze it 154 | if audio.shape[0] == 1: 155 | return audio.squeeze(0) 156 | elif audio.shape[1] == 1: 157 | return audio.squeeze(1) 158 | else: 159 | raise ValueError(f"Unexpected audio shape: {audio.shape}") 160 | else: 161 | raise ValueError(f"Audio should be 1D or 2D, got shape: {audio.shape}") 162 | 163 | def _process_single_audio(self, audio: Union[np.ndarray, List[float]]) -> np.ndarray: 164 | """ 165 | Process a single audio array. 166 | 167 | Args: 168 | audio: Single audio input 169 | 170 | Returns: 171 | np.ndarray: Processed audio 172 | """ 173 | # Convert to numpy array 174 | if not isinstance(audio, np.ndarray): 175 | audio = np.array(audio, dtype=np.float32) 176 | else: 177 | audio = audio.astype(np.float32) 178 | 179 | # Ensure mono 180 | audio = self._ensure_mono(audio) 181 | 182 | # Normalize if requested 183 | if self.normalize_audio and self.normalizer is not None: 184 | audio = self.normalizer(audio) 185 | 186 | return audio 187 | 188 | def __call__( 189 | self, 190 | audio: Union[str, np.ndarray, List[float], List[np.ndarray], List[List[float]], List[str]] = None, 191 | sampling_rate: Optional[int] = None, 192 | return_tensors: Optional[str] = None, 193 | **kwargs, 194 | ): 195 | """ 196 | Process audio for VibeVoice models. 197 | 198 | Args: 199 | audio: Audio input(s) to process. Can be: 200 | - str: Path to audio file 201 | - np.ndarray: Audio array 202 | - List[float]: Audio as list of floats 203 | - List[np.ndarray]: Batch of audio arrays 204 | - List[str]: Batch of audio file paths 205 | sampling_rate (int, optional): Sampling rate of the input audio 206 | return_tensors (str, optional): Return format ('pt' for PyTorch, 'np' for NumPy) 207 | 208 | Returns: 209 | dict: Processed audio inputs with keys: 210 | - input_features: Audio tensor(s) ready for the model 211 | """ 212 | if audio is None: 213 | raise ValueError("Audio input is required") 214 | 215 | # Validate sampling rate 216 | if sampling_rate is not None and sampling_rate != self.sampling_rate: 217 | logger.warning( 218 | f"Input sampling rate ({sampling_rate}) differs from expected " 219 | f"sampling rate ({self.sampling_rate}). Please resample your audio." 
220 | ) 221 | 222 | # Handle different input types 223 | if isinstance(audio, str): 224 | # Single audio file path 225 | audio = self._load_audio_from_path(audio) 226 | is_batched = False 227 | elif isinstance(audio, list): 228 | if len(audio) == 0: 229 | raise ValueError("Empty audio list provided") 230 | 231 | # Check if it's a list of file paths 232 | if all(isinstance(item, str) for item in audio): 233 | # Batch of audio file paths 234 | audio = [self._load_audio_from_path(path) for path in audio] 235 | is_batched = True 236 | else: 237 | # Check if it's batched audio arrays 238 | is_batched = isinstance(audio[0], (np.ndarray, list)) 239 | else: 240 | # Single audio array or list 241 | is_batched = False 242 | 243 | # Process audio 244 | if is_batched: 245 | processed_audio = [self._process_single_audio(a) for a in audio] 246 | else: 247 | processed_audio = [self._process_single_audio(audio)] 248 | 249 | # Convert to tensors if requested 250 | if return_tensors == "pt": 251 | if len(processed_audio) == 1: 252 | # Create a proper batch dimension (B, T) 253 | input_features = torch.from_numpy(processed_audio[0]).unsqueeze(0).unsqueeze(1) 254 | else: 255 | # For batched input with different lengths, create a batch properly 256 | input_features = torch.stack([torch.from_numpy(a) for a in processed_audio]).unsqueeze(1) 257 | elif return_tensors == "np": 258 | if len(processed_audio) == 1: 259 | input_features = processed_audio[0][np.newaxis, np.newaxis, :] 260 | else: 261 | input_features = np.stack(processed_audio)[:, np.newaxis, :] 262 | else: 263 | input_features = processed_audio[0] if len(processed_audio) == 1 else processed_audio 264 | 265 | outputs = { 266 | "audio": input_features, # Use "audio" instead of "input_features" 267 | } 268 | 269 | return outputs 270 | 271 | def _load_audio_from_path(self, audio_path: str) -> np.ndarray: 272 | """ 273 | Load audio from file path. 274 | 275 | Args: 276 | audio_path (str): Path to audio file 277 | 278 | Returns: 279 | np.ndarray: Loaded audio array 280 | """ 281 | # Get file extension to determine loading method 282 | file_ext = os.path.splitext(audio_path)[1].lower() 283 | 284 | if file_ext in ['.wav', '.mp3', '.flac', '.m4a', '.ogg']: 285 | # Audio file - use librosa 286 | import librosa 287 | audio_array, sr = librosa.load( 288 | audio_path, 289 | sr=self.sampling_rate, 290 | mono=True 291 | ) 292 | return audio_array 293 | elif file_ext == '.pt': 294 | # PyTorch tensor file 295 | audio_tensor = torch.load(audio_path, map_location='cpu').squeeze() 296 | if isinstance(audio_tensor, torch.Tensor): 297 | audio_array = audio_tensor.numpy() 298 | else: 299 | audio_array = np.array(audio_tensor) 300 | return audio_array.astype(np.float32) 301 | elif file_ext == '.npy': 302 | # NumPy file 303 | audio_array = np.load(audio_path) 304 | return audio_array.astype(np.float32) 305 | else: 306 | raise ValueError( 307 | f"Unsupported file format: {file_ext}. " 308 | f"Supported formats: .wav, .mp3, .flac, .m4a, .ogg, .pt, .npy, .npz" 309 | ) 310 | 311 | def preprocess_audio( 312 | self, 313 | audio_path_or_array: Union[str, np.ndarray], 314 | normalize: Optional[bool] = None, 315 | ) -> np.ndarray: 316 | """ 317 | Convenience method to preprocess audio from file path or array. 318 | This method is kept for backward compatibility but __call__ is recommended. 
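        Example (illustrative path and arguments):
            processor = VibeVoiceTokenizerProcessor()
            audio = processor.preprocess_audio("path/to/voice.wav", normalize=True)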
319 | 320 | Args: 321 | audio_path_or_array: Path to audio file or numpy array 322 | normalize: Whether to normalize (overrides default setting) 323 | 324 | Returns: 325 | np.ndarray: Preprocessed audio array 326 | """ 327 | if isinstance(audio_path_or_array, str): 328 | audio_array = self._load_audio_from_path(audio_path_or_array) 329 | else: 330 | audio_array = np.array(audio_path_or_array, dtype=np.float32) 331 | 332 | # Override normalization setting if specified 333 | original_normalize = self.normalize_audio 334 | if normalize is not None: 335 | self.normalize_audio = normalize 336 | 337 | try: 338 | processed = self._process_single_audio(audio_array) 339 | finally: 340 | # Restore original setting 341 | self.normalize_audio = original_normalize 342 | 343 | return processed 344 | 345 | # Override to_dict method for configuration saving 346 | def to_dict(self) -> Dict[str, Any]: 347 | """ 348 | Convert the object to a dict containing all attributes needed for serialization. 349 | """ 350 | return self.feature_extractor_dict 351 | 352 | def save_audio( 353 | self, 354 | audio: Union[torch.Tensor, np.ndarray, List[Union[torch.Tensor, np.ndarray]]], 355 | output_path: str = "output.wav", 356 | sampling_rate: Optional[int] = None, 357 | normalize: bool = False, 358 | batch_prefix: str = "audio_", 359 | ): 360 | """ 361 | Save audio data to WAV file(s). 362 | 363 | Args: 364 | audio: Audio data to save. Can be: 365 | - torch.Tensor: PyTorch tensor with shape (B, C, T) or (B, T) or (T) 366 | - np.ndarray: NumPy array with shape (B, C, T) or (B, T) or (T) 367 | - List of tensors or arrays 368 | output_path: Path where to save the audio. If saving multiple files, 369 | this is treated as a directory and individual files will be saved inside. 370 | sampling_rate: Sampling rate for the saved audio. Defaults to the processor's rate. 371 | normalize: Whether to normalize audio before saving. 372 | batch_prefix: Prefix for batch files when saving multiple audios. 373 | 374 | Returns: 375 | List[str]: Paths to the saved audio files. 376 | """ 377 | if sampling_rate is None: 378 | sampling_rate = self.sampling_rate 379 | 380 | try: 381 | import soundfile as sf 382 | except ImportError: 383 | raise ImportError( 384 | "soundfile is required to save audio files. 
" 385 | "Install it with: pip install soundfile" 386 | ) 387 | 388 | # Ensure audio is in the right format 389 | if isinstance(audio, torch.Tensor): 390 | # Convert PyTorch tensor to numpy 391 | audio_np = audio.float().detach().cpu().numpy() 392 | elif isinstance(audio, np.ndarray): 393 | audio_np = audio 394 | elif isinstance(audio, list): 395 | # Handle list of tensors or arrays 396 | if all(isinstance(a, torch.Tensor) for a in audio): 397 | audio_np = [a.float().detach().cpu().numpy() for a in audio] 398 | else: 399 | audio_np = audio 400 | else: 401 | raise ValueError(f"Unsupported audio type: {type(audio)}") 402 | 403 | saved_paths = [] 404 | 405 | # Handle based on shape or type 406 | if isinstance(audio_np, list): 407 | # Multiple separate audios to save 408 | output_dir = output_path 409 | 410 | # Ensure output directory exists 411 | os.makedirs(output_dir, exist_ok=True) 412 | 413 | # Save each audio 414 | for i, audio_item in enumerate(audio_np): 415 | audio_item = self._prepare_audio_for_save(audio_item, normalize) 416 | file_path = os.path.join(output_dir, f"{batch_prefix}{i}.wav") 417 | sf.write(file_path, audio_item, sampling_rate) 418 | saved_paths.append(file_path) 419 | 420 | else: 421 | # Handle different dimensions 422 | if len(audio_np.shape) >= 3: # (B, C, T) or similar 423 | # Get batch size 424 | batch_size = audio_np.shape[0] 425 | 426 | if batch_size > 1: 427 | # Multiple audios in a batch 428 | output_dir = output_path 429 | 430 | # Ensure output directory exists 431 | os.makedirs(output_dir, exist_ok=True) 432 | 433 | # Save each audio in the batch 434 | for i in range(batch_size): 435 | # Extract single audio and remove channel dim if present 436 | single_audio = audio_np[i] 437 | if len(single_audio.shape) > 1: 438 | if single_audio.shape[0] == 1: # (1, T) 439 | single_audio = single_audio.squeeze(0) 440 | 441 | single_audio = self._prepare_audio_for_save(single_audio, normalize) 442 | file_path = os.path.join(output_dir, f"{batch_prefix}{i}.wav") 443 | sf.write(file_path, single_audio, sampling_rate) 444 | saved_paths.append(file_path) 445 | else: 446 | # Single audio with batch and channel dims 447 | audio_item = audio_np.squeeze() # Remove batch and channel dimensions 448 | audio_item = self._prepare_audio_for_save(audio_item, normalize) 449 | sf.write(output_path, audio_item, sampling_rate) 450 | saved_paths.append(output_path) 451 | else: 452 | # Single audio without batch dimension 453 | audio_item = self._prepare_audio_for_save(audio_np, normalize) 454 | sf.write(output_path, audio_item, sampling_rate) 455 | saved_paths.append(output_path) 456 | 457 | return saved_paths 458 | 459 | def _prepare_audio_for_save(self, audio: np.ndarray, normalize: bool) -> np.ndarray: 460 | """ 461 | Prepare audio for saving by ensuring it's the right shape and optionally normalizing. 
462 | 463 | Args: 464 | audio: Audio data as numpy array 465 | normalize: Whether to normalize audio 466 | 467 | Returns: 468 | np.ndarray: Processed audio ready for saving 469 | """ 470 | # Ensure right dimensionality 471 | if len(audio.shape) > 1 and audio.shape[0] == 1: # (1, T) 472 | audio = audio.squeeze(0) 473 | 474 | # Normalize if requested 475 | if normalize: 476 | max_val = np.abs(audio).max() 477 | if max_val > 0: 478 | audio = audio / max_val 479 | 480 | return audio 481 | 482 | 483 | __all__ = ["VibeVoiceTokenizerProcessor", "AudioNormalizer"] -------------------------------------------------------------------------------- /nodes/multi_speaker_node.py: -------------------------------------------------------------------------------- 1 | # Created by Fabio Sarracino 2 | 3 | import logging 4 | import os 5 | import re 6 | import tempfile 7 | import torch 8 | import numpy as np 9 | from typing import List, Optional 10 | 11 | from .base_vibevoice import BaseVibeVoiceNode, get_available_models 12 | 13 | # Setup logging 14 | logger = logging.getLogger("VibeVoice") 15 | 16 | class VibeVoiceMultipleSpeakersNode(BaseVibeVoiceNode): 17 | def __init__(self): 18 | super().__init__() 19 | # Register this instance for memory management 20 | try: 21 | from .free_memory_node import VibeVoiceFreeMemoryNode 22 | VibeVoiceFreeMemoryNode.register_multi_speaker(self) 23 | except: 24 | pass 25 | 26 | @classmethod 27 | def INPUT_TYPES(cls): 28 | # Get available models dynamically 29 | available_models = get_available_models() 30 | model_choices = [display_name for _, display_name in available_models] 31 | # Try to select Large model by default if available 32 | default_model = "VibeVoice-Large" 33 | if default_model not in model_choices: 34 | default_model = model_choices[0] if model_choices else "No models found" 35 | 36 | return { 37 | "required": { 38 | "text": ("STRING", { 39 | "multiline": True, 40 | "default": "[1]: Hello, this is the first speaker.\n[2]: Hi there, I'm the second speaker.\n[1]: Nice to meet you!\n[2]: Nice to meet you too!", 41 | "tooltip": "Text with speaker labels. Use '[N]:' format where N is 1-4. Gets disabled when connected to another node.", 42 | "forceInput": False, 43 | "dynamicPrompts": True 44 | }), 45 | "model": (model_choices if model_choices else ["No models found"], { 46 | "default": default_model, 47 | "tooltip": "Select a model from ComfyUI/models/vibevoice/ folder. Large is recommended for multi-speaker" 48 | }), 49 | "attention_type": (["auto", "eager", "sdpa", "flash_attention_2", "sage"], { 50 | "default": "auto", 51 | "tooltip": "Attention implementation. Auto selects the best available, eager is standard, sdpa is optimized PyTorch, flash_attention_2 requires compatible GPU, sage uses quantized attention for speedup (CUDA only)" 52 | }), 53 | "quantize_llm": (["full precision", "4bit", "8bit"], { 54 | "default": "full precision", 55 | "tooltip": "Dynamically quantize only the LLM component for non-quantized models. 4bit: major VRAM savings with minimal quality loss. 8bit: good balance of quality and memory usage. Full precision: original quality. Note: ignored for pre-quantized models. Requires CUDA GPU." 56 | }), 57 | "free_memory_after_generate": ("BOOLEAN", {"default": True, "tooltip": "Free model from memory after generation to save VRAM/RAM. Disable to keep model loaded for faster subsequent generations"}), 58 | "diffusion_steps": ("INT", {"default": 20, "min": 1, "max": 100, "step": 1, "tooltip": "Number of denoising steps. 
More steps = theoretically better quality but slower. Default: 20"}), 59 | "seed": ("INT", {"default": 42, "min": 0, "max": 2**32-1, "tooltip": "Random seed for generation. Default 42 is used in official examples"}), 60 | "cfg_scale": ("FLOAT", {"default": 1.3, "min": 0.5, "max": 3.5, "step": 0.05, "tooltip": "Classifier-free guidance scale (official default: 1.3)"}), 61 | "use_sampling": ("BOOLEAN", {"default": False, "tooltip": "Enable sampling mode. When False (default), uses deterministic generation like official examples"}), 62 | }, 63 | "optional": { 64 | "speaker1_voice": ("AUDIO", {"tooltip": "Optional: Voice sample for Speaker 1. If not provided, synthetic voice will be used."}), 65 | "speaker2_voice": ("AUDIO", {"tooltip": "Optional: Voice sample for Speaker 2. If not provided, synthetic voice will be used."}), 66 | "speaker3_voice": ("AUDIO", {"tooltip": "Optional: Voice sample for Speaker 3. If not provided, synthetic voice will be used."}), 67 | "speaker4_voice": ("AUDIO", {"tooltip": "Optional: Voice sample for Speaker 4. If not provided, synthetic voice will be used."}), 68 | "lora": ("LORA_CONFIG", {"tooltip": "Optional: LoRA configuration from VibeVoice LoRA node"}), 69 | "temperature": ("FLOAT", {"default": 0.95, "min": 0.1, "max": 2.0, "step": 0.05, "tooltip": "Only used when sampling is enabled"}), 70 | "top_p": ("FLOAT", {"default": 0.95, "min": 0.1, "max": 1.0, "step": 0.05, "tooltip": "Only used when sampling is enabled"}), 71 | "voice_speed_factor": ("FLOAT", { 72 | "default": 1.0, 73 | "min": 0.8, 74 | "max": 1.2, 75 | "step": 0.01, 76 | "tooltip": "1.0 = normal speed, <1.0 = slower speed, >1.0 = faster speed (applies to all speakers)" 77 | }), 78 | } 79 | } 80 | 81 | RETURN_TYPES = ("AUDIO",) 82 | RETURN_NAMES = ("audio",) 83 | FUNCTION = "generate_speech" 84 | CATEGORY = "VibeVoiceWrapper" 85 | DESCRIPTION = "Generate multi-speaker conversations with up to 4 distinct voices using Microsoft VibeVoice" 86 | 87 | def _prepare_voice_sample(self, voice_audio, speaker_idx: int, voice_speed_factor: float = 1.0) -> Optional[np.ndarray]: 88 | """Prepare a single voice sample from input audio with speed adjustment""" 89 | return self._prepare_audio_from_comfyui(voice_audio, speed_factor=voice_speed_factor) 90 | 91 | def generate_speech(self, text: str = "", model: str = "VibeVoice-7B-Preview", 92 | attention_type: str = "auto", quantize_llm: str = "full precision", 93 | free_memory_after_generate: bool = True, 94 | diffusion_steps: int = 20, seed: int = 42, cfg_scale: float = 1.3, 95 | use_sampling: bool = False, lora=None, 96 | speaker1_voice=None, speaker2_voice=None, 97 | speaker3_voice=None, speaker4_voice=None, 98 | temperature: float = 0.95, top_p: float = 0.95, 99 | voice_speed_factor: float = 1.0): 100 | """Generate multi-speaker speech from text using VibeVoice""" 101 | 102 | try: 103 | # Check text input 104 | if not text or not text.strip(): 105 | raise Exception("No text provided. 
Please enter text with speaker labels (e.g., '[1]: Hello' or '[2]: Hi')") 106 | 107 | # First detect how many speakers are in the text 108 | bracket_pattern = r'\[(\d+)\]\s*:' 109 | speakers_numbers = sorted(list(set([int(m) for m in re.findall(bracket_pattern, text)]))) 110 | 111 | # Limit to 1-4 speakers 112 | if not speakers_numbers: 113 | num_speakers = 1 # Default to 1 if no speaker format found 114 | else: 115 | num_speakers = min(max(speakers_numbers), 4) # Max speaker number, capped at 4 116 | if max(speakers_numbers) > 4: 117 | print(f"[VibeVoice] Warning: Found {max(speakers_numbers)} speakers, limiting to 4") 118 | 119 | # Direct conversion from [N]: to Speaker (N-1): for VibeVoice processor 120 | # This avoids multiple conversion steps 121 | converted_text = text 122 | 123 | # Find all [N]: patterns in the text 124 | speakers_in_text = sorted(list(set([int(m) for m in re.findall(bracket_pattern, text)]))) 125 | 126 | if not speakers_in_text: 127 | # No [N]: format found, try Speaker N: format 128 | speaker_pattern = r'Speaker\s+(\d+)\s*:' 129 | speakers_in_text = sorted(list(set([int(m) for m in re.findall(speaker_pattern, text)]))) 130 | 131 | if speakers_in_text: 132 | # Text already in Speaker N format, convert to 0-based 133 | for speaker_num in sorted(speakers_in_text, reverse=True): 134 | pattern = f'Speaker\\s+{speaker_num}\\s*:' 135 | replacement = f'Speaker {speaker_num - 1}:' 136 | converted_text = re.sub(pattern, replacement, converted_text) 137 | else: 138 | # No speaker format found 139 | speakers_in_text = [1] 140 | 141 | # Parse pause keywords even for single speaker 142 | pause_segments = self._parse_pause_keywords(text) 143 | 144 | # Store speaker segments for pause processing 145 | speaker_segments_with_pauses = [] 146 | segments = [] 147 | 148 | for seg_type, seg_content in pause_segments: 149 | if seg_type == 'pause': 150 | speaker_segments_with_pauses.append(('pause', seg_content, None)) 151 | else: 152 | # Clean up newlines 153 | text_clean = seg_content.replace('\n', ' ').replace('\r', ' ') 154 | text_clean = ' '.join(text_clean.split()) 155 | 156 | if text_clean: 157 | speaker_segments_with_pauses.append(('text', text_clean, 1)) 158 | segments.append(f"Speaker 0: {text_clean}") 159 | 160 | # Join all segments for fallback 161 | converted_text = '\n'.join(segments) if segments else f"Speaker 0: {text}" 162 | else: 163 | # Convert [N]: directly to Speaker (N-1): and handle multi-line text 164 | # Split text to preserve speaker segments while cleaning up newlines within each segment 165 | segments = [] 166 | 167 | # Find all speaker markers with their positions 168 | speaker_matches = list(re.finditer(f'\\[({"|".join(map(str, speakers_in_text))})\\]\\s*:', converted_text)) 169 | 170 | # Store speaker segments for pause processing 171 | speaker_segments_with_pauses = [] 172 | 173 | for i, match in enumerate(speaker_matches): 174 | speaker_num = int(match.group(1)) 175 | start = match.end() 176 | 177 | # Find where this speaker's text ends (at next speaker or end of text) 178 | if i + 1 < len(speaker_matches): 179 | end = speaker_matches[i + 1].start() 180 | else: 181 | end = len(converted_text) 182 | 183 | # Extract the speaker's text (keep pause keywords for now) 184 | speaker_text = converted_text[start:end].strip() 185 | 186 | # Parse pause keywords within this speaker's text 187 | pause_segments = self._parse_pause_keywords(speaker_text) 188 | 189 | # Process each segment (text or pause) for this speaker 190 | for seg_type, seg_content in pause_segments: 
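                        # _parse_pause_keywords yields (type, content) tuples:
                        # ('pause', duration_ms) for [pause]/[pause:ms] tags, or ('text', raw_text).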
191 | if seg_type == 'pause': 192 | # Add pause segment 193 | speaker_segments_with_pauses.append(('pause', seg_content, None)) 194 | else: 195 | # Clean up the text segment 196 | text_clean = seg_content.replace('\n', ' ').replace('\r', ' ') 197 | text_clean = ' '.join(text_clean.split()) 198 | 199 | if text_clean: # Only add non-empty text 200 | # Add text segment with speaker info 201 | speaker_segments_with_pauses.append(('text', text_clean, speaker_num)) 202 | # Also build the traditional segments for fallback 203 | segments.append(f'Speaker {speaker_num - 1}: {text_clean}') 204 | 205 | # Join all segments with newlines (required for multi-speaker format) - for fallback 206 | converted_text = '\n'.join(segments) if segments else "" 207 | 208 | # Build speaker names list - these are just for logging, not used by processor 209 | # The processor uses the speaker labels in the text itself 210 | speakers = [f"Speaker {i}" for i in range(len(speakers_in_text))] 211 | 212 | # Get the actual folder path for the selected model 213 | available_models = get_available_models() 214 | model_path = None 215 | for folder, display_name in available_models: 216 | if display_name == model: 217 | model_path = folder 218 | break 219 | 220 | if not model_path: 221 | raise Exception(f"Model '{model}' not found in models/vibevoice/") 222 | 223 | # Extract LoRA configuration if provided 224 | lora_path = None 225 | llm_lora_strength = 1.0 226 | if lora and isinstance(lora, dict): 227 | lora_path = lora.get("path", None) 228 | llm_lora_strength = lora.get("llm_strength", 1.0) 229 | 230 | # Set LoRA component flags based on configuration 231 | self.use_llm_lora = lora.get("use_llm", True) 232 | self.use_diffusion_head_lora = lora.get("use_diffusion_head", True) 233 | self.use_acoustic_connector_lora = lora.get("use_acoustic_connector", True) 234 | self.use_semantic_connector_lora = lora.get("use_semantic_connector", True) 235 | 236 | if lora_path: 237 | logger.info(f"Using LoRA from: {lora_path}") 238 | 239 | # Load model with optional LoRA 240 | self.load_model(model, model_path, attention_type, quantize_llm=quantize_llm, lora_path=lora_path) 241 | 242 | voice_inputs = [speaker1_voice, speaker2_voice, speaker3_voice, speaker4_voice] 243 | 244 | # Prepare voice samples in order of appearance 245 | voice_samples = [] 246 | for i, speaker_num in enumerate(speakers_in_text): 247 | idx = speaker_num - 1 # Convert to 0-based for voice array 248 | 249 | # Try to use provided voice sample 250 | if idx < len(voice_inputs) and voice_inputs[idx] is not None: 251 | voice_sample = self._prepare_voice_sample(voice_inputs[idx], idx, voice_speed_factor) 252 | if voice_sample is None: 253 | # Use the actual speaker index for consistent synthetic voice 254 | voice_sample = self._create_synthetic_voice_sample(idx) 255 | else: 256 | # Use the actual speaker index for consistent synthetic voice 257 | voice_sample = self._create_synthetic_voice_sample(idx) 258 | 259 | voice_samples.append(voice_sample) 260 | 261 | # Ensure voice_samples count matches detected speakers 262 | if len(voice_samples) != len(speakers_in_text): 263 | logger.error(f"Mismatch: {len(speakers_in_text)} speakers but {len(voice_samples)} voice samples!") 264 | raise Exception(f"Voice sample count mismatch: expected {len(speakers_in_text)}, got {len(voice_samples)}") 265 | 266 | # Check if we have pause segments to process 267 | if 'speaker_segments_with_pauses' in locals() and speaker_segments_with_pauses: 268 | # Process segments with pauses 269 | 
all_audio_segments = [] 270 | sample_rate = 24000 # VibeVoice uses 24kHz 271 | 272 | # Group consecutive text segments from same speaker for efficiency 273 | grouped_segments = [] 274 | current_group = [] 275 | current_speaker = None 276 | 277 | for seg_type, seg_content, speaker_num in speaker_segments_with_pauses: 278 | if seg_type == 'pause': 279 | # Save current group if any 280 | if current_group: 281 | grouped_segments.append(('text_group', current_group, current_speaker)) 282 | current_group = [] 283 | current_speaker = None 284 | # Add pause 285 | grouped_segments.append(('pause', seg_content, None)) 286 | else: 287 | # Text segment 288 | if speaker_num == current_speaker: 289 | # Same speaker, add to current group 290 | current_group.append(seg_content) 291 | else: 292 | # Different speaker, save current group and start new one 293 | if current_group: 294 | grouped_segments.append(('text_group', current_group, current_speaker)) 295 | current_group = [seg_content] 296 | current_speaker = speaker_num 297 | 298 | # Save last group if any 299 | if current_group: 300 | grouped_segments.append(('text_group', current_group, current_speaker)) 301 | 302 | # Process grouped segments 303 | for seg_type, seg_content, speaker_num in grouped_segments: 304 | if seg_type == 'pause': 305 | # Generate silence 306 | duration_ms = seg_content 307 | logger.info(f"Adding {duration_ms}ms pause") 308 | silence_audio = self._generate_silence(duration_ms, sample_rate) 309 | all_audio_segments.append(silence_audio) 310 | else: 311 | # Process text group for a speaker 312 | combined_text = ' '.join(seg_content) 313 | formatted_text = f"Speaker {speaker_num - 1}: {combined_text}" 314 | 315 | # Get voice sample for this speaker 316 | speaker_idx = speakers_in_text.index(speaker_num) 317 | speaker_voice_samples = [voice_samples[speaker_idx]] 318 | 319 | logger.info(f"Generating audio for Speaker {speaker_num}: {len(combined_text.split())} words") 320 | 321 | # Generate audio for this speaker's text 322 | segment_audio = self._generate_with_vibevoice( 323 | formatted_text, speaker_voice_samples, cfg_scale, seed, 324 | diffusion_steps, use_sampling, temperature, top_p, 325 | llm_lora_strength=llm_lora_strength 326 | ) 327 | 328 | all_audio_segments.append(segment_audio) 329 | 330 | # Concatenate all audio segments 331 | if all_audio_segments: 332 | logger.info(f"Concatenating {len(all_audio_segments)} audio segments (including pauses)...") 333 | 334 | # Extract waveforms 335 | waveforms = [] 336 | for audio_segment in all_audio_segments: 337 | if isinstance(audio_segment, dict) and "waveform" in audio_segment: 338 | waveforms.append(audio_segment["waveform"]) 339 | 340 | if waveforms: 341 | # Filter out None values if any 342 | valid_waveforms = [w for w in waveforms if w is not None] 343 | 344 | if valid_waveforms: 345 | # Concatenate along time dimension 346 | combined_waveform = torch.cat(valid_waveforms, dim=-1) 347 | 348 | audio_dict = { 349 | "waveform": combined_waveform, 350 | "sample_rate": sample_rate 351 | } 352 | logger.info(f"Successfully generated multi-speaker audio with pauses") 353 | else: 354 | raise Exception("No valid audio waveforms generated") 355 | else: 356 | raise Exception("Failed to extract waveforms from audio segments") 357 | else: 358 | raise Exception("No audio segments generated") 359 | else: 360 | # Fallback to original method without pause support 361 | logger.info("Processing without pause support (no pause keywords found)") 362 | audio_dict = self._generate_with_vibevoice( 363 
| converted_text, voice_samples, cfg_scale, seed, diffusion_steps, 364 | use_sampling, temperature, top_p, llm_lora_strength=llm_lora_strength 365 | ) 366 | 367 | # Free memory if requested 368 | if free_memory_after_generate: 369 | self.free_memory() 370 | 371 | return (audio_dict,) 372 | 373 | except Exception as e: 374 | # Check if this is an interruption by the user 375 | import comfy.model_management as mm 376 | if isinstance(e, mm.InterruptProcessingException): 377 | # User interrupted - just log it and re-raise to stop the workflow 378 | logger.info("Generation interrupted by user") 379 | raise # Propagate the interruption to stop the workflow 380 | else: 381 | # Real error - show it 382 | logger.error(f"Multi-speaker speech generation failed: {str(e)}") 383 | raise Exception(f"Error generating multi-speaker speech: {str(e)}") 384 | 385 | @classmethod 386 | def IS_CHANGED(cls, text="", model="VibeVoice-7B-Preview", 387 | speaker1_voice=None, speaker2_voice=None, 388 | speaker3_voice=None, speaker4_voice=None, lora=None, **kwargs): 389 | """Cache key for ComfyUI""" 390 | voices_hash = hash(str([speaker1_voice, speaker2_voice, speaker3_voice, speaker4_voice])) 391 | lora_hash = hash(str(lora)) if lora else 0 392 | return f"{hash(text)}_{model}_{voices_hash}_{lora_hash}_{kwargs.get('cfg_scale', 1.3)}_{kwargs.get('seed', 0)}" -------------------------------------------------------------------------------- /vvembed/modular/modeling_vibevoice.py: -------------------------------------------------------------------------------- 1 | # Original code by Microsoft 2 | # updated by Fabio Sarracino - Enemyx-net 3 | 4 | from dataclasses import dataclass 5 | from typing import Dict, List, Optional, Tuple, Union, Callable 6 | from tqdm import tqdm 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torch.distributed as dist 11 | 12 | from transformers.models.auto import AutoModel, AutoModelForCausalLM 13 | 14 | from transformers.activations import ACT2FN 15 | from transformers.modeling_outputs import CausalLMOutput, BaseModelOutputWithPast, ModelOutput 16 | from transformers.models.llama.modeling_llama import LlamaRMSNorm 17 | from transformers import modeling_utils 18 | from transformers.modeling_utils import PreTrainedModel 19 | from transformers.modeling_flash_attention_utils import FlashAttentionKwargs 20 | from transformers.utils import logging 21 | 22 | 23 | from .modular_vibevoice_tokenizer import VibeVoiceTokenizerStreamingCache, VibeVoiceAcousticTokenizerModel, VibeVoiceSemanticTokenizerModel 24 | from .modular_vibevoice_diffusion_head import VibeVoiceDiffusionHead 25 | 26 | # Import schedule module with robust path handling to avoid conflicts with PyPI 'schedule' package 27 | import sys 28 | import os 29 | 30 | # Get the path to vvembed directory 31 | _current_dir = os.path.dirname(os.path.abspath(__file__)) 32 | _vvembed_dir = os.path.dirname(_current_dir) 33 | _schedule_path = os.path.join(_vvembed_dir, 'schedule') 34 | 35 | # Ensure vvembed is at the front of sys.path to prioritize our schedule module 36 | if _vvembed_dir not in sys.path: 37 | sys.path.insert(0, _vvembed_dir) 38 | elif sys.path.index(_vvembed_dir) > 0: 39 | # Move it to the front if it's not already 40 | sys.path.remove(_vvembed_dir) 41 | sys.path.insert(0, _vvembed_dir) 42 | 43 | # Verify the schedule module exists 44 | if not os.path.exists(_schedule_path): 45 | raise ImportError( 46 | f"Cannot find 'schedule' directory in vvembed. 
" 47 | f"Expected at: {_schedule_path}" 48 | ) 49 | 50 | # Import with our schedule module prioritized 51 | try: 52 | from schedule.dpm_solver import DPMSolverMultistepScheduler 53 | except ImportError as e: 54 | raise ImportError( 55 | f"Failed to import DPMSolverMultistepScheduler from {_schedule_path}. " 56 | f"There might be a conflict with another Python package. " 57 | f"Original error: {e}" 58 | ) 59 | 60 | from .configuration_vibevoice import VibeVoiceConfig 61 | 62 | 63 | logger = logging.get_logger(__name__) 64 | 65 | if not hasattr(modeling_utils, "ALL_PARALLEL_STYLES") or modeling_utils.ALL_PARALLEL_STYLES is None: 66 | modeling_utils.ALL_PARALLEL_STYLES = ["tp", "none", "colwise", "rowwise"] 67 | 68 | @dataclass 69 | class VibeVoiceCausalLMOutputWithPast(ModelOutput): 70 | loss: Optional[torch.FloatTensor] = None 71 | diffusion_loss: Optional[torch.FloatTensor] = None 72 | speech_token_num: Optional[int] = None 73 | logits: torch.FloatTensor = None 74 | past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None 75 | hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None 76 | attentions: Optional[Tuple[torch.FloatTensor, ...]] = None 77 | 78 | 79 | @dataclass 80 | class VibeVoiceGenerationOutput(ModelOutput): 81 | """ 82 | Output type for VibeVoice generation. 83 | 84 | Args: 85 | sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`): 86 | The generated sequences. 87 | speech_outputs (`List[torch.FloatTensor]`, *optional*): 88 | List of generated speech waveforms or latents for each speech segment. 89 | """ 90 | sequences: torch.LongTensor = None 91 | speech_outputs: Optional[List[torch.FloatTensor]] = None 92 | 93 | 94 | class SpeechConnector(nn.Module): 95 | def __init__(self, input_dim, output_dim): 96 | super().__init__() 97 | self.fc1 = nn.Linear(input_dim, output_dim) 98 | self.norm = LlamaRMSNorm(output_dim, eps=1e-6) 99 | self.fc2 = nn.Linear(output_dim, output_dim) 100 | 101 | def forward(self, features, **kwargs): 102 | x = self.fc1(features) 103 | x = self.norm(x) 104 | x = self.fc2(x) 105 | return x 106 | 107 | 108 | # @auto_docstring 109 | class VibeVoicePreTrainedModel(PreTrainedModel): 110 | config_class = VibeVoiceConfig 111 | base_model_prefix = "model" 112 | supports_gradient_checkpointing = True 113 | _skip_keys_device_placement = "past_key_values" 114 | _supports_cache_class = True 115 | _supports_flash_attn_2 = True 116 | _supports_sdpa = True 117 | _supports_quantized_cache = True 118 | _supports_static_cache = True 119 | _supports_attention_backend = True 120 | 121 | def _init_weights(self, module): 122 | if isinstance(module, VibeVoiceDiffusionHead): 123 | module.initialize_weights() 124 | return 125 | 126 | # Use the language model's initializer_range if available 127 | if hasattr(self.config, 'language_model_config') and hasattr(self.config.language_model_config, 'initializer_range'): 128 | std = self.config.language_model_config.initializer_range 129 | elif hasattr(self.config, 'decoder_config') and hasattr(self.config.decoder_config, 'initializer_range'): 130 | std = self.config.decoder_config.initializer_range 131 | else: 132 | std = 0.02 # Default value 133 | 134 | if isinstance(module, nn.Linear): 135 | module.weight.data.normal_(mean=0.0, std=std) 136 | if module.bias is not None: 137 | module.bias.data.zero_() 138 | elif isinstance(module, nn.LayerNorm): 139 | module.weight.data.fill_(1.0) 140 | module.bias.data.zero_() 141 | 142 | # @auto_docstring 143 | class VibeVoiceModel(VibeVoicePreTrainedModel): 144 | def 
__init__(self, config): 145 | super().__init__(config) 146 | 147 | if hasattr(config, 'torch_dtype') and config.torch_dtype is not None: 148 | if isinstance(config.torch_dtype, str): 149 | dtype = getattr(torch, config.torch_dtype) 150 | else: 151 | dtype = config.torch_dtype 152 | else: 153 | dtype = torch.float32 154 | 155 | # Initialize Qwen2 model for language modeling 156 | lm_config = config.decoder_config 157 | self.language_model = AutoModel.from_config(lm_config) 158 | 159 | # Initialize speech components if needed 160 | self.acoustic_tokenizer = AutoModel.from_config(config.acoustic_tokenizer_config).to(dtype) 161 | self.semantic_tokenizer = AutoModel.from_config(config.semantic_tokenizer_config).to(dtype) 162 | 163 | self.acoustic_connector = SpeechConnector(config.acoustic_vae_dim, lm_config.hidden_size).to(dtype) 164 | self.semantic_connector = SpeechConnector(config.semantic_vae_dim, lm_config.hidden_size).to(dtype) 165 | 166 | # Register scaling factors as buffers - use 1D tensors for FSDP compatibility 167 | self.register_buffer('speech_scaling_factor', torch.tensor(float('nan'))) 168 | self.register_buffer('speech_bias_factor', torch.tensor(float('nan'))) 169 | 170 | # Initialize prediction head for speech generation 171 | self.prediction_head = AutoModel.from_config(config.diffusion_head_config).to(dtype) 172 | 173 | # Initialize noise scheduler 174 | self.noise_scheduler = DPMSolverMultistepScheduler( 175 | num_train_timesteps=config.diffusion_head_config.ddpm_num_steps, 176 | beta_schedule=config.diffusion_head_config.ddpm_beta_schedule, 177 | prediction_type=config.diffusion_head_config.prediction_type 178 | ) 179 | 180 | def get_input_embeddings(self): 181 | if hasattr(self.language_model, 'embed_tokens'): 182 | # If the language model has an embed_tokens attribute, return it 183 | return self.language_model.embed_tokens 184 | 185 | for name, attr in self.language_model.fullmap.items(): # parallel by nnscaler, the name is changed 186 | if attr.orig_name == 'embed_tokens.weight': 187 | return getattr(self.language_model, name) 188 | assert False, 'should not arrive here' 189 | 190 | def set_input_embeddings(self, value): 191 | self.language_model.embed_tokens = value 192 | 193 | def set_speech_tokenizers(self, acoustic_tokenizer=None, semantic_tokenizer=None): 194 | """Set the speech tokenizers used for encoding and decoding speech.""" 195 | self.acoustic_tokenizer = acoustic_tokenizer 196 | self.semantic_tokenizer = semantic_tokenizer 197 | 198 | # Reset the encoder to evaluation mode 199 | if self.acoustic_tokenizer is not None: 200 | self.acoustic_tokenizer.eval() 201 | 202 | if self.semantic_tokenizer is not None: 203 | self.semantic_tokenizer.eval() 204 | 205 | def forward( 206 | self, 207 | input_ids: torch.LongTensor = None, 208 | attention_mask: Optional[torch.Tensor] = None, 209 | position_ids: Optional[torch.LongTensor] = None, 210 | past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, 211 | inputs_embeds: Optional[torch.FloatTensor] = None, 212 | use_cache: Optional[bool] = None, 213 | output_attentions: Optional[bool] = None, 214 | output_hidden_states: Optional[bool] = None, 215 | return_dict: Optional[bool] = None, 216 | cache_position: Optional[torch.LongTensor] = None, 217 | **kwargs, 218 | ) -> Union[Tuple, BaseModelOutputWithPast]: 219 | 220 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 221 | 222 | # Forward through language model 223 | outputs = self.language_model( 224 | input_ids=input_ids, 
225 | attention_mask=attention_mask, 226 | position_ids=position_ids, 227 | past_key_values=past_key_values, 228 | inputs_embeds=inputs_embeds, 229 | use_cache=use_cache, 230 | output_attentions=output_attentions, 231 | output_hidden_states=output_hidden_states, 232 | return_dict=return_dict, 233 | cache_position=cache_position, 234 | **kwargs, 235 | ) 236 | 237 | if not return_dict: 238 | return outputs 239 | 240 | return BaseModelOutputWithPast( 241 | last_hidden_state=outputs.last_hidden_state, 242 | past_key_values=outputs.past_key_values, 243 | hidden_states=outputs.hidden_states, 244 | attentions=outputs.attentions, 245 | ) 246 | 247 | 248 | class VibeVoiceForConditionalGeneration(VibeVoicePreTrainedModel): 249 | _tied_weights_keys = ["lm_head.weight"] 250 | _tp_plan = {"lm_head": "colwise_rep"} 251 | 252 | def __init__(self, config): 253 | super().__init__(config) 254 | self.model = VibeVoiceModel(config) 255 | self.vocab_size = config.decoder_config.vocab_size 256 | self.lm_head = nn.Linear(config.decoder_config.hidden_size, self.vocab_size, bias=False) 257 | 258 | self.post_init() 259 | 260 | def get_input_embeddings(self): 261 | return self.model.get_input_embeddings() 262 | 263 | def set_input_embeddings(self, value): 264 | self.model.set_input_embeddings(value) 265 | 266 | def get_output_embeddings(self): 267 | return self.lm_head 268 | 269 | def set_decoder(self, decoder): 270 | self.model.language_model = decoder 271 | 272 | def get_decoder(self): 273 | return self.model.language_model 274 | 275 | def tie_weights(self): 276 | """ 277 | Tie the weights between the input embeddings and the output embeddings. 278 | """ 279 | if getattr(self.config.decoder_config, 'tie_word_embeddings', False): 280 | # The standard PreTrainedModel method will handle the tying. 281 | # It typically does a simple parameter object assignment, which is 282 | # CORRECT to do BEFORE FSDP wraps the model. 283 | output_embeddings = self.get_output_embeddings() 284 | input_embeddings = self.get_input_embeddings() 285 | if hasattr(input_embeddings, 'weight'): 286 | output_embeddings.weight = input_embeddings.weight 287 | else: 288 | # maybe returned input_embeddings a tensor directly 289 | output_embeddings.weight = input_embeddings 290 | 291 | if getattr(output_embeddings, "bias", None) is not None: 292 | output_embeddings.bias.data = nn.functional.pad( 293 | output_embeddings.bias.data, 294 | (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0]), 295 | "constant", 296 | 0, 297 | ) 298 | print("✅ Tied input and output embeddings using standard assignment.") 299 | else: 300 | print("ℹ️ tie_word_embeddings is False, not tying weights.") 301 | 302 | # Also, ensure set_output_embeddings is safe, though your implementation looks okay. 303 | # The key is to avoid calling it after accelerator.prepare(). 304 | def set_output_embeddings(self, new_embeddings): 305 | # Your current implementation using data.copy_ is good practice, 306 | # but the best way is to not call this after prepare(). 
307 | self.lm_head = new_embeddings 308 | 309 | def forward_speech_features( 310 | self, 311 | speech_tensors=None, 312 | speech_masks=None, 313 | speech_type="audio", 314 | return_unmask=False 315 | ): 316 | if speech_tensors is None: 317 | # Use config to get vae_dim instead of non-existent self.args 318 | vae_dim = self.config.acoustic_tokenizer_config.vae_dim 319 | audio_features = torch.zeros(1, 1, vae_dim).to(self.get_input_embeddings().weight) 320 | connect_features = self.model.acoustic_connector(audio_features) 321 | return audio_features, connect_features 322 | else: 323 | with torch.no_grad(): 324 | if speech_type == "audio": 325 | with torch.no_grad(): 326 | frames = self.model.acoustic_tokenizer.encode(speech_tensors.unsqueeze(1))[0][0] 327 | audio_tokens = frames.sample(self.model.acoustic_tokenizer.std_dist_type)[0] 328 | 329 | elif speech_type == "vae": 330 | # Use config to get vae_dim instead of non-existent self.args 331 | vae_dim = self.config.acoustic_tokenizer_config.vae_dim 332 | speech_mode = speech_tensors.reshape(speech_tensors.size(0), -1, vae_dim) 333 | 334 | # gaussian sample from the speech_mode 335 | batch_size = speech_mode.size(0) 336 | value = self.model.acoustic_tokenizer.fix_std / 0.8 337 | std = torch.randn(batch_size, dtype=speech_mode.dtype, device=speech_mode.device) * value 338 | std = std.view(-1, *[1] * (speech_mode.dim() - 1)) 339 | audio_tokens = speech_mode + std * torch.randn(speech_mode.shape).to(speech_mode) 340 | else: 341 | raise NotImplementedError(f"Speech type {speech_type} not implemented") 342 | 343 | if torch.isnan(self.model.speech_scaling_factor) or torch.isnan(self.model.speech_bias_factor): 344 | scaling_factor = 1. / audio_tokens[speech_masks].flatten().std() 345 | bias_factor = -audio_tokens[speech_masks].flatten().mean() 346 | 347 | # Only use distributed operations if the process group is initialized 348 | if dist.is_available() and dist.is_initialized(): 349 | dist.all_reduce(scaling_factor, op=dist.ReduceOp.SUM) 350 | dist.all_reduce(bias_factor, op=dist.ReduceOp.SUM) 351 | world_size = dist.get_world_size() 352 | self.model.speech_scaling_factor.copy_(scaling_factor / world_size) 353 | self.model.speech_bias_factor.copy_(bias_factor / world_size) 354 | print(f"Speech scaling factor (distributed): {self.model.speech_scaling_factor}, bias factor: {self.model.speech_bias_factor}", flush=True) 355 | else: 356 | # Single process case 357 | self.model.speech_scaling_factor.copy_(scaling_factor) 358 | self.model.speech_bias_factor.copy_(bias_factor) 359 | print(f"Speech scaling factor (single process): {self.model.speech_scaling_factor}, bias factor: {self.model.speech_bias_factor}", flush=True) 360 | 361 | audio_features = (audio_tokens + self.model.speech_bias_factor) * self.model.speech_scaling_factor 362 | 363 | connect_features = self.model.acoustic_connector(audio_features) 364 | if return_unmask: 365 | return audio_features, connect_features 366 | return audio_features[speech_masks], connect_features[speech_masks] 367 | 368 | def forward( 369 | self, 370 | input_ids: torch.LongTensor = None, 371 | attention_mask: Optional[torch.Tensor] = None, 372 | position_ids: Optional[torch.LongTensor] = None, 373 | past_key_values: Optional[List[torch.FloatTensor]] = None, 374 | inputs_embeds: Optional[torch.FloatTensor] = None, 375 | labels: Optional[torch.LongTensor] = None, 376 | use_cache: Optional[bool] = False, 377 | output_attentions: Optional[bool] = None, 378 | output_hidden_states: Optional[bool] = None, 379 | return_dict: 
Optional[bool] = None, 380 | cache_position: Optional[torch.LongTensor] = None, 381 | # New arguments for speech processing and loss calculation 382 | speech_tensors: Optional[torch.FloatTensor] = None, 383 | speech_masks: Optional[torch.BoolTensor] = None, 384 | speeches_loss_input: Optional[torch.FloatTensor] = None, 385 | speech_semantic_tensors: Optional[torch.FloatTensor] = None, 386 | acoustic_input_mask: Optional[torch.BoolTensor] = None, 387 | acoustic_loss_mask: Optional[torch.BoolTensor] = None, 388 | ddpm_batch_mul: int = 1, 389 | **kwargs: Optional[Dict[str, Union[torch.Tensor, str]]], 390 | ) -> Union[Tuple, VibeVoiceCausalLMOutputWithPast]: 391 | 392 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 393 | 394 | x = self.get_input_embeddings()(input_ids) 395 | 396 | semantic_speech_all_connect_features = self.model.semantic_connector(speech_semantic_tensors) 397 | if speeches_loss_input is not None: 398 | # only part audio need diffuse 399 | speech_all_features, speech_all_connect_features = self.forward_speech_features( 400 | speech_tensors=speech_tensors.type_as(x) if speech_tensors is not None else None, 401 | speech_masks=speech_masks, 402 | speech_type=kwargs.get("speech_type", "audio"), 403 | return_unmask=True 404 | ) 405 | if speech_tensors is not None: 406 | if semantic_speech_all_connect_features is not None: 407 | x[acoustic_input_mask] = speech_all_connect_features[speech_masks] + semantic_speech_all_connect_features[speech_masks] 408 | else: 409 | x[acoustic_input_mask] = speech_all_connect_features[speech_masks] 410 | speech_features = speech_all_features[speeches_loss_input.unsqueeze(-1) & speech_masks] # only part audio need diffuse 411 | speech_connect_features = speech_all_connect_features[speeches_loss_input.unsqueeze(-1) & speech_masks] 412 | else: 413 | speech_features, speech_connect_features = self.forward_speech_features( 414 | speech_tensors=speech_tensors.type_as(x) if speech_tensors is not None else None, 415 | speech_masks=speech_masks, 416 | speech_type=kwargs.get("speech_type", "audio"), 417 | ) 418 | if speech_tensors is not None: 419 | x[acoustic_input_mask] = speech_connect_features 420 | 421 | outputs = self.model( 422 | input_ids=None, 423 | attention_mask=attention_mask, 424 | position_ids=position_ids, 425 | past_key_values=past_key_values, 426 | inputs_embeds=x, 427 | use_cache=use_cache, 428 | output_attentions=output_attentions, 429 | output_hidden_states=False, 430 | return_dict=return_dict, 431 | cache_position=cache_position, 432 | ) 433 | 434 | hidden_states = outputs.last_hidden_state 435 | logits = self.lm_head(hidden_states) 436 | # logits = logits.float() 437 | 438 | loss = None 439 | if labels is not None: 440 | # The custom CE loss with masking is calculated in the training script. 441 | # We leave the standard loss calculation here as None. 442 | pass 443 | 444 | # --- Diffusion Loss Calculation --- 445 | diffusion_loss = None 446 | # This block is executed only if we are in a context that involves speech. 
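        # In the speech branch below: LM hidden states at acoustic_loss_mask positions
        # condition the diffusion head; Gaussian noise is added to the target speech
        # latents with the DPM-Solver scheduler at uniformly sampled timesteps (each
        # latent repeated ddpm_batch_mul times), and the head is trained with an MSE
        # loss against the noise ("epsilon") or velocity ("v_prediction") target,
        # normalized by latent size and ddpm_batch_mul.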
447 | if speech_tensors is not None and acoustic_loss_mask.sum().item() > 0: 448 | condition_features = hidden_states[acoustic_loss_mask] 449 | 450 | speech_len, latent_size = speech_features.shape 451 | 452 | noise = torch.randn( 453 | (speech_len * ddpm_batch_mul, latent_size), 454 | device=hidden_states.device, 455 | dtype=hidden_states.dtype 456 | ) 457 | 458 | timesteps = torch.multinomial( 459 | torch.ones(self.config.diffusion_head_config.ddpm_num_steps), 460 | speech_len * ddpm_batch_mul, 461 | replacement=True, 462 | ).to(hidden_states.device) 463 | 464 | speech_features_repeated = speech_features.repeat_interleave(ddpm_batch_mul, dim=0) 465 | condition_features_repeated = condition_features.repeat_interleave(ddpm_batch_mul, dim=0) 466 | 467 | noisy_speech_features = self.model.noise_scheduler.add_noise( 468 | speech_features_repeated, noise, timesteps 469 | ) 470 | 471 | model_output = self.model.prediction_head( 472 | noisy_speech_features, 473 | timesteps.type_as(x), 474 | condition_features_repeated 475 | ) 476 | 477 | prediction_type = self.config.diffusion_head_config.prediction_type 478 | if prediction_type == "epsilon": 479 | target_for_loss = noise 480 | elif prediction_type == "v_prediction": 481 | target_for_loss = self.model.noise_scheduler.get_velocity( 482 | speech_features_repeated, noise, timesteps 483 | ) 484 | else: 485 | raise NotImplementedError(f"Prediction type {prediction_type} not implemented") 486 | 487 | diffusion_loss = F.mse_loss(model_output.float(), target_for_loss.float(), reduction='sum') 488 | if latent_size > 0 and ddpm_batch_mul > 0: 489 | diffusion_loss = diffusion_loss / latent_size / ddpm_batch_mul 490 | else: 491 | diffusion_loss = torch.tensor(0.0, device=diffusion_loss.device) 492 | 493 | else: 494 | # Dummy loss for DDP to work when there are no speech samples in a batch, 495 | # but we are in a speech context. 496 | diffusion_loss = sum(p.sum() for p in self.model.prediction_head.parameters()) * 0.0 497 | diffusion_loss += sum(p.sum() for p in self.model.acoustic_connector.parameters()) * 0.0 498 | diffusion_loss += sum(p.sum() for p in self.model.semantic_connector.parameters()) * 0.0 499 | # --- End Diffusion Loss Calculation --- 500 | 501 | if not return_dict: 502 | output = (logits, speech_len) + outputs.to_tuple()[1:] 503 | return (loss, diffusion_loss) + output 504 | 505 | return VibeVoiceCausalLMOutputWithPast( 506 | loss=loss, 507 | diffusion_loss=diffusion_loss, 508 | speech_token_num=speech_len if speech_tensors is not None else 0, 509 | logits=logits, 510 | past_key_values=outputs.past_key_values, 511 | hidden_states=outputs.hidden_states, 512 | attentions=outputs.attentions, 513 | ) 514 | 515 | AutoModel.register(VibeVoiceConfig, VibeVoiceModel) 516 | AutoModelForCausalLM.register(VibeVoiceConfig, VibeVoiceForConditionalGeneration) 517 | 518 | __all__ = [ 519 | "VibeVoiceModel", 520 | "VibeVoicePreTrainedModel", 521 | "VibeVoiceForConditionalGeneration", 522 | "VibeVoiceCausalLMOutputWithPast", 523 | "VibeVoiceGenerationOutput", 524 | ] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VibeVoice ComfyUI Nodes 2 | 3 | A comprehensive ComfyUI integration for Microsoft's VibeVoice text-to-speech model, enabling high-quality single and multi-speaker voice synthesis directly within your ComfyUI workflows. 
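
A typical script for the multi-speaker node uses the `[N]:` speaker labels (1-4) and, optionally, the wrapper's `[pause]` / `[pause:ms]` tags described under Features below, for example:

```text
[1]: Welcome back to the show. [pause:500]
[2]: Thanks for having me, it's great to be here.
[1]: Let's get started!
```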
4 | 5 | ## ✨ Features 6 | 7 | ### Core Functionality 8 | - 🎤 **Single Speaker TTS**: Generate natural speech with optional voice cloning 9 | - 👥 **Multi-Speaker Conversations**: Support for up to 4 distinct speakers 10 | - 🎯 **Voice Cloning**: Clone voices from audio samples 11 | - 🎨 **LoRA Support**: Fine-tune voices with custom LoRA adapters (v1.4.0+) 12 | - 🎚️ **Voice Speed Control**: Adjust speech rate by modifying reference voice speed (v1.5.0+) 13 | - 📝 **Text File Loading**: Load scripts from text files 14 | - 📚 **Automatic Text Chunking**: Handles long texts seamlessly with configurable chunk size 15 | - ⏸️ **Custom Pause Tags**: Insert silences with `[pause]` and `[pause:ms]` tags (wrapper feature) 16 | - 🔄 **Node Chaining**: Connect multiple VibeVoice nodes for complex workflows 17 | - ⏹️ **Interruption Support**: Cancel operations before or between generations 18 | - 🔧 **Flexible Configuration**: Control temperature, sampling, and guidance scale 19 | 20 | ### Performance & Optimization 21 | - ⚡ **Attention Mechanisms**: Choose between auto, eager, sdpa, flash_attention_2 or sage 22 | - 🎛️ **Diffusion Steps**: Adjustable quality vs speed trade-off (default: 20) 23 | - 💾 **Memory Management**: Toggle automatic VRAM cleanup after generation 24 | - 🧹 **Free Memory Node**: Manual memory control for complex workflows 25 | - 🍎 **Apple Silicon Support**: Native GPU acceleration on M1/M2/M3 Macs via MPS 26 | - 🔢 **8-Bit Quantization**: Perfect audio quality with high VRAM reduction 27 | - 🔢 **4-Bit Quantization**: Maximum VRAM savings with minimal quality loss 28 | 29 | ### Compatibility & Installation 30 | - 📦 **Self-Contained**: Embedded VibeVoice code, no external dependencies 31 | - 🔄 **Universal Compatibility**: Adaptive support for transformers v4.51.3+ 32 | - 🖥️ **Cross-Platform**: Works on Windows, Linux, and macOS 33 | - 🎮 **Multi-Backend**: Supports CUDA, CPU, and MPS (Apple Silicon) 34 | 35 | ## 🎥 Video Demo 36 |
41 | Click to watch the demo video
42 |