├── vvembed
    ├── __init__.py
    ├── modular
    │   ├── __init__.py
    │   ├── modular_vibevoice_text_tokenizer.py
    │   ├── modular_vibevoice_diffusion_head.py
    │   ├── streamer.py
    │   ├── configuration_vibevoice.py
    │   └── modeling_vibevoice.py
    ├── schedule
    │   ├── __init__.py
    │   └── timestep_sampler.py
    ├── scripts
    │   ├── __init__.py
    │   └── convert_nnscaler_checkpoint_to_transformers.py
    ├── processor
    │   ├── __init__.py
    │   └── vibevoice_tokenizer_processor.py
    ├── README.md
    └── LICENSE
├── requirements.txt
├── node_list.json
├── nodes
    ├── __init__.py
    ├── free_memory_node.py
    ├── load_text_node.py
    ├── lora_node.py
    ├── single_speaker_node.py
    └── multi_speaker_node.py
├── pyproject.toml
├── LICENSE
├── __init__.py
├── examples
    ├── Single-Speaker.json
    ├── Pause-Tag.json
    ├── Multiple-Speaker.json
    └── VibeVoice-Unload-Memory.json
└── README.md
/vvembed/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/vvembed/modular/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/vvembed/schedule/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/vvembed/scripts/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/vvembed/processor/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | accelerate>=1.6.0
 2 | transformers>=4.51.3
 3 | diffusers
 4 | tqdm
 5 | scipy
 6 | ml-collections
 7 | torch>=2.0.0
 8 | torchaudio>=2.0.0
 9 | numpy>=1.20.0
10 | librosa>=0.9.0
11 | soundfile>=0.12.0
12 | av>=14.3.0
13 | peft>=0.17.0
14 | huggingface_hub>=0.25.1
15 | absl-py
16 | aiortc
17 | bitsandbytes>=0.48.1
18 | protobuf
--------------------------------------------------------------------------------
/node_list.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "VibeVoice Load Text From File": "Load .txt from ComfyUI input/output/temp",
 3 |   "VibeVoice Single Speaker": "Single-speaker TTS with optional voice cloning",
 4 |   "VibeVoice Multiple Speakers": "Multi-speaker TTS ([1]..[4]) with optional clones",
 5 |   "VibeVoice Free Memory": "Frees loaded VibeVoice models; passthrough audio",
 6 |   "VibeVoice LoRA": "Configure LoRA adapters for fine-tuned VibeVoice models"
 7 | }
--------------------------------------------------------------------------------
/nodes/__init__.py:
--------------------------------------------------------------------------------
 1 | # Created by Fabio Sarracino
 2 | # Nodes module for VibeVoiceWrapper
 3 | """
 4 | This module contains all the ComfyUI nodes for VibeVoice integration.
5 | """ 6 | 7 | from .load_text_node import LoadTextFromFileNode 8 | from .single_speaker_node import VibeVoiceSingleSpeakerNode 9 | from .multi_speaker_node import VibeVoiceMultipleSpeakersNode 10 | from .free_memory_node import VibeVoiceFreeMemoryNode 11 | 12 | __all__ = [ 13 | 'LoadTextFromFileNode', 14 | 'VibeVoiceSingleSpeakerNode', 15 | 'VibeVoiceMultipleSpeakersNode', 16 | 'VibeVoiceFreeMemoryNode' 17 | ] -------------------------------------------------------------------------------- /vvembed/schedule/timestep_sampler.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | 5 | class UniformSampler: 6 | def __init__(self, timesteps = 1000): 7 | self.timesteps = timesteps 8 | def sample(self, batch_size, device): 9 | return torch.randint(0, self.timesteps, (batch_size,), device=device) 10 | 11 | class LogitNormalSampler: 12 | def __init__(self, timesteps = 1000, m = 0, s = 1): 13 | self.timesteps = timesteps 14 | timesteps = torch.linspace(0, 1, timesteps) 15 | logit = torch.log(timesteps / (1 - timesteps)) 16 | self.prob = torch.exp(-0.5 * (logit - m) ** 2 / s ** 2) / (s * math.sqrt(2 * math.pi)) 17 | def sample(self, batch_size, device): 18 | return torch.multinomial(self.prob, batch_size, replacement=True).to(device) 19 | -------------------------------------------------------------------------------- /vvembed/README.md: -------------------------------------------------------------------------------- 1 | # Embedded VibeVoice 2 | 3 | This folder contains the embedded VibeVoice code from Microsoft. 4 | 5 | ## Why Embedded? 6 | 7 | The original VibeVoice repository (https://github.com/microsoft/VibeVoice) has been removed from GitHub. Since VibeVoice is licensed under MIT, we have embedded the code here to ensure continued functionality of the ComfyUI wrapper. 8 | 9 | ## License 10 | 11 | The code in this folder is licensed under the MIT License (see LICENSE file). Original copyright belongs to Microsoft Corporation. 12 | 13 | ## Modifications 14 | 15 | The only modifications made to the original code are: 16 | - Changed absolute imports from `vibevoice` to relative imports 17 | - No functional changes to the core logic 18 | 19 | ## Note 20 | 21 | This is a preservation copy to ensure the continued availability of VibeVoice for the ComfyUI community. 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "VibeVoice-ComfyUI" 3 | version = "1.8.1" 4 | description = "ComfyUI wrapper for Microsoft VibeVoice TTS model. 
Supports single speaker, multi-speaker, and text file loading" 5 | license = {file = "LICENSE"} 6 | authors = [{name = "Fabio Sarracino"}] 7 | dependencies = ["accelerate>=1.6.0", "transformers>=4.51.3", "diffusers", "tqdm", "scipy", "ml-collections", "torch>=2.0.0", "torchaudio>=2.0.0", "numpy>=1.20.0", "librosa>=0.9.0", "soundfile>=0.12.0", "av>=14.3.0", "peft>=0.17.0", "huggingface_hub>=0.25.1", "absl-py", "aiortc", "bitsandbytes>=0.48.1", "protobuf"] 8 | 9 | [project.urls] 10 | Repository = "https://github.com/Enemyx-net/VibeVoice-ComfyUI" 11 | "Bug Tracker" = "https://github.com/Enemyx-net/VibeVoice-ComfyUI/issues" 12 | 13 | [tool.comfy] 14 | PublisherId = "enemyx" 15 | DisplayName = "VibeVoice ComfyUI" 16 | Icon = "" 17 | includes = [] 18 | 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Fabio Sarracino - enemyx.net 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /vvembed/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | --- 24 | 25 | This is the original VibeVoice code from Microsoft, embedded here as the 26 | repository has been removed from GitHub. The code is used under the MIT license. -------------------------------------------------------------------------------- /nodes/free_memory_node.py: -------------------------------------------------------------------------------- 1 | # Created by Fabio Sarracino 2 | # Node to free VibeVoice model memory 3 | 4 | import logging 5 | import torch 6 | import gc 7 | from typing import Any 8 | 9 | # Setup logging 10 | logger = logging.getLogger("VibeVoice") 11 | 12 | class VibeVoiceFreeMemoryNode: 13 | """Node to explicitly free VibeVoice model memory""" 14 | 15 | # Class variables to store node instances 16 | _single_speaker_instances = [] 17 | _multi_speaker_instances = [] 18 | 19 | @classmethod 20 | def INPUT_TYPES(cls): 21 | return { 22 | "required": { 23 | "audio": ("AUDIO", {"tooltip": "Audio input that triggers memory cleanup and gets passed through"}), 24 | } 25 | } 26 | 27 | RETURN_TYPES = ("AUDIO",) 28 | RETURN_NAMES = ("audio",) 29 | FUNCTION = "free_vibevoice_memory" 30 | CATEGORY = "VibeVoiceWrapper" 31 | DESCRIPTION = "Free all loaded VibeVoice models from memory when audio passes through" 32 | 33 | @classmethod 34 | def register_single_speaker(cls, node_instance): 35 | """Register a single speaker node instance""" 36 | if node_instance not in cls._single_speaker_instances: 37 | cls._single_speaker_instances.append(node_instance) 38 | 39 | @classmethod 40 | def register_multi_speaker(cls, node_instance): 41 | """Register a multi speaker node instance""" 42 | if node_instance not in cls._multi_speaker_instances: 43 | cls._multi_speaker_instances.append(node_instance) 44 | 45 | def free_vibevoice_memory(self, audio): 46 | """Free memory from all VibeVoice nodes and pass through the audio""" 47 | 48 | try: 49 | freed_count = 0 50 | 51 | # Try to access and free memory from globally cached instances 52 | # ComfyUI might cache node instances 53 | try: 54 | import sys 55 | from .base_vibevoice import BaseVibeVoiceNode 56 | 57 | # Search in all modules for BaseVibeVoiceNode instances 58 | for module_name, module in sys.modules.items(): 59 | if module and 'vibevoice' in module_name.lower(): 60 | for attr_name in dir(module): 61 | if not attr_name.startswith('_'): 62 | try: 63 | attr = getattr(module, attr_name) 64 | if isinstance(attr, type) and issubclass(attr, BaseVibeVoiceNode): 65 | # Check if the class has any cached instances 66 | for instance_attr in dir(attr): 67 | instance = getattr(attr, instance_attr) 68 | if isinstance(instance, BaseVibeVoiceNode) and hasattr(instance, 'free_memory'): 69 | instance.free_memory() 70 | freed_count += 1 71 | except: 72 | pass 73 | except: 74 | pass 75 | 76 | # Free from registered single speaker instances 77 | for node in self._single_speaker_instances: 78 | if hasattr(node, 'free_memory'): 79 | node.free_memory() 80 | freed_count += 1 81 | 82 | # Free from registered multi speaker instances 83 | for node in self._multi_speaker_instances: 84 | if hasattr(node, 'free_memory'): 85 | node.free_memory() 86 | freed_count += 1 87 | 88 | # Force garbage collection 89 | gc.collect() 90 | 91 | # Clear CUDA cache if available 92 | if torch.cuda.is_available(): 93 | torch.cuda.empty_cache() 94 | torch.cuda.synchronize() 95 | logger.info(f"Freed VibeVoice memory from {freed_count} nodes and cleared CUDA cache") 96 | else: 97 | logger.info(f"Freed VibeVoice memory from {freed_count} nodes") 98 | 99 | # Pass through the 
audio unchanged 100 | return (audio,) 101 | 102 | except Exception as e: 103 | logger.error(f"Error freeing VibeVoice memory: {str(e)}") 104 | # Still pass through audio even if error occurs 105 | return (audio,) 106 | 107 | @classmethod 108 | def IS_CHANGED(cls, **kwargs): 109 | """Always execute this node""" 110 | return float("nan") # Forces re-execution every time -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Created by Fabio Sarracino 2 | __version__ = "1.8.1" 3 | __author__ = "Fabio Sarracino" 4 | __title__ = "VibeVoice ComfyUI" 5 | 6 | import logging 7 | import os 8 | import sys 9 | import subprocess 10 | 11 | # Setup logging 12 | logger = logging.getLogger("VibeVoice") 13 | logger.propagate = False 14 | 15 | if not logger.handlers: 16 | handler = logging.StreamHandler() 17 | formatter = logging.Formatter('[VibeVoice] %(message)s') 18 | handler.setFormatter(formatter) 19 | logger.addHandler(handler) 20 | logger.setLevel(logging.INFO) 21 | 22 | def apply_timm_compatibility_patches(): 23 | """Apply compatibility patches for timm package conflicts""" 24 | try: 25 | import timm.data 26 | 27 | # Patch missing functions that cause import errors 28 | patches = { 29 | 'ImageNetInfo': lambda: type('ImageNetInfo', (), {'__init__': lambda self: None})(), 30 | 'infer_imagenet_subset': lambda class_to_idx: 'imagenet', 31 | 'get_imagenet_subset_labels': lambda *args, **kwargs: [], 32 | 'get_imagenet_subset_info': lambda *args, **kwargs: {}, 33 | 'resolve_data_config': lambda *args, **kwargs: {} 34 | } 35 | 36 | for attr_name, patch_func in patches.items(): 37 | if not hasattr(timm.data, attr_name): 38 | if attr_name == 'ImageNetInfo': 39 | setattr(timm.data, attr_name, type('ImageNetInfo', (), {'__init__': lambda self: None})) 40 | else: 41 | setattr(timm.data, attr_name, patch_func) 42 | 43 | return True 44 | except Exception as e: 45 | return False 46 | 47 | def check_embedded_vibevoice(): 48 | """Check if embedded VibeVoice is available""" 49 | vvembed_path = os.path.join(os.path.dirname(__file__), 'vvembed') 50 | if not os.path.exists(vvembed_path): 51 | logger.error(f"Embedded VibeVoice not found at {vvembed_path}") 52 | return False 53 | 54 | # Add vvembed to path if not already there 55 | if vvembed_path not in sys.path: 56 | sys.path.insert(0, vvembed_path) 57 | 58 | logger.info("Using embedded VibeVoice (MIT licensed)") 59 | return True 60 | 61 | def ensure_dependencies(): 62 | """Ensure required dependencies are installed""" 63 | try: 64 | import transformers 65 | from packaging import version 66 | if version.parse(transformers.__version__) < version.parse("4.44.0"): 67 | logger.warning("Transformers version < 4.44.0, some features may not work correctly") 68 | except ImportError: 69 | logger.warning("Transformers not installed. 
Please install: pip install transformers>=4.44.0") 70 | return False 71 | 72 | # Apply timm patches if needed 73 | apply_timm_compatibility_patches() 74 | 75 | return True 76 | 77 | # Initialize node mappings 78 | NODE_CLASS_MAPPINGS = {} 79 | NODE_DISPLAY_NAME_MAPPINGS = {} 80 | 81 | # Register text loading node (always available) 82 | try: 83 | from .nodes.load_text_node import LoadTextFromFileNode 84 | NODE_CLASS_MAPPINGS["LoadTextFromFileNode"] = LoadTextFromFileNode 85 | NODE_DISPLAY_NAME_MAPPINGS["LoadTextFromFileNode"] = "VibeVoice Load Text From File" 86 | except Exception as e: 87 | logger.error(f"Failed to register LoadTextFromFile node: {e}") 88 | 89 | # Register VibeVoice nodes (using embedded VibeVoice) 90 | if check_embedded_vibevoice() and ensure_dependencies(): 91 | try: 92 | from .nodes.single_speaker_node import VibeVoiceSingleSpeakerNode 93 | from .nodes.multi_speaker_node import VibeVoiceMultipleSpeakersNode 94 | from .nodes.free_memory_node import VibeVoiceFreeMemoryNode 95 | from .nodes.lora_node import VibeVoiceLoRANode 96 | 97 | # Single speaker node 98 | NODE_CLASS_MAPPINGS["VibeVoiceSingleSpeakerNode"] = VibeVoiceSingleSpeakerNode 99 | NODE_DISPLAY_NAME_MAPPINGS["VibeVoiceSingleSpeakerNode"] = "VibeVoice Single Speaker" 100 | 101 | # Multi speaker node 102 | NODE_CLASS_MAPPINGS["VibeVoiceMultipleSpeakersNode"] = VibeVoiceMultipleSpeakersNode 103 | NODE_DISPLAY_NAME_MAPPINGS["VibeVoiceMultipleSpeakersNode"] = "VibeVoice Multiple Speakers" 104 | 105 | # Free memory node 106 | NODE_CLASS_MAPPINGS["VibeVoiceFreeMemoryNode"] = VibeVoiceFreeMemoryNode 107 | NODE_DISPLAY_NAME_MAPPINGS["VibeVoiceFreeMemoryNode"] = "VibeVoice Free Memory" 108 | 109 | # LoRA configuration node 110 | NODE_CLASS_MAPPINGS["VibeVoiceLoRANode"] = VibeVoiceLoRANode 111 | NODE_DISPLAY_NAME_MAPPINGS["VibeVoiceLoRANode"] = "VibeVoice LoRA" 112 | 113 | logger.info("VibeVoice nodes registered successfully") 114 | 115 | except Exception as e: 116 | logger.error(f"Failed to register VibeVoice nodes: {e}") 117 | logger.info("Please ensure transformers>=4.44.0 is installed") 118 | else: 119 | logger.warning("VibeVoice nodes unavailable - check embedded module and dependencies") 120 | 121 | __all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS', '__version__'] -------------------------------------------------------------------------------- /examples/Single-Speaker.json: -------------------------------------------------------------------------------- 1 | {"id":"c6ef8963-032c-45f6-954f-b5f6b354343b","revision":0,"last_node_id":44,"last_link_id":61,"nodes":[{"id":15,"type":"LoadAudio","pos":[15.256911277770996,126.44892883300781],"size":[270,136],"flags":{},"order":3,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"COMBO","widget":{"name":"audio"},"link":null},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null},{"localized_name":"upload","name":"upload","type":"AUDIOUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"AUDIO","name":"AUDIO","type":"AUDIO","links":[60]}],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"LoadAudio"},"widgets_values":["Voice.mp3",null,null],"color":"#2a363b","bgcolor":"#3f5159"},{"id":21,"type":"Note","pos":[-83.88814544677734,580.3738403320312],"size":[415,88],"flags":{},"order":4,"mode":0,"inputs":[],"outputs":[],"title":"Load Text From File","properties":{},"widgets_values":["Use Load Text From File if you want to use a .txt file instead of text-area. 
You can load .txt files from ComfyUI/input, ComfyUI/output or ComfyUI/temp directories."],"color":"#432","bgcolor":"#653"},{"id":40,"type":"Note","pos":[377.95758056640625,593.4078979492188],"size":[415,88],"flags":{},"order":5,"mode":0,"inputs":[],"outputs":[],"title":"Voice Speed Factor","properties":{},"widgets_values":["The voice speed factor influences the original source audio to attempt to achieve a slower or faster final speech. 1.0 is the normal speed. It is recommended not to exceed values between 0.95 and 1.05. The effect is best when you provide a sample audio of at least 20 seconds."],"color":"#432","bgcolor":"#653"},{"id":16,"type":"PreviewAudio","pos":[894.1837768554688,126.69258117675781],"size":[270,88],"flags":{},"order":7,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","link":61},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null}],"outputs":[],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"PreviewAudio"},"widgets_values":[],"color":"#323","bgcolor":"#535"},{"id":44,"type":"VibeVoiceSingleSpeakerNode","pos":[388.8460693359375,126.70189666748047],"size":[400,420],"flags":{},"order":6,"mode":0,"inputs":[{"localized_name":"voice_to_clone","name":"voice_to_clone","shape":7,"type":"AUDIO","link":60},{"localized_name":"lora","name":"lora","shape":7,"type":"LORA_CONFIG","link":null},{"localized_name":"text","name":"text","type":"STRING","widget":{"name":"text"},"link":null},{"localized_name":"model","name":"model","type":"COMBO","widget":{"name":"model"},"link":null},{"localized_name":"attention_type","name":"attention_type","type":"COMBO","widget":{"name":"attention_type"},"link":null},{"localized_name":"quantize_llm","name":"quantize_llm","type":"COMBO","widget":{"name":"quantize_llm"},"link":null},{"localized_name":"free_memory_after_generate","name":"free_memory_after_generate","type":"BOOLEAN","widget":{"name":"free_memory_after_generate"},"link":null},{"localized_name":"diffusion_steps","name":"diffusion_steps","type":"INT","widget":{"name":"diffusion_steps"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null},{"localized_name":"cfg_scale","name":"cfg_scale","type":"FLOAT","widget":{"name":"cfg_scale"},"link":null},{"localized_name":"use_sampling","name":"use_sampling","type":"BOOLEAN","widget":{"name":"use_sampling"},"link":null},{"localized_name":"temperature","name":"temperature","shape":7,"type":"FLOAT","widget":{"name":"temperature"},"link":null},{"localized_name":"top_p","name":"top_p","shape":7,"type":"FLOAT","widget":{"name":"top_p"},"link":null},{"localized_name":"max_words_per_chunk","name":"max_words_per_chunk","shape":7,"type":"INT","widget":{"name":"max_words_per_chunk"},"link":null},{"localized_name":"voice_speed_factor","name":"voice_speed_factor","shape":7,"type":"FLOAT","widget":{"name":"voice_speed_factor"},"link":null}],"outputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","links":[61]}],"properties":{"Node name for S&R":"VibeVoiceSingleSpeakerNode"},"widgets_values":["Hello, this is a test of the VibeVoice text-to-speech system.","VibeVoice-1.5B","auto","full 
precision",true,20,42,"fixed",1.3,false,0.95,0.95,250,1],"color":"#223","bgcolor":"#335"},{"id":28,"type":"LoadTextFromFileNode","pos":[-11.502296447753906,465.8179626464844],"size":[289.5152282714844,58],"flags":{},"order":0,"mode":4,"inputs":[{"localized_name":"file","name":"file","type":"COMBO","widget":{"name":"file"},"link":null}],"outputs":[{"localized_name":"text","name":"text","type":"STRING","links":null}],"properties":{"Node name for S&R":"LoadTextFromFileNode","cnr_id":"VibeVoice-ComfyUI","ver":"5a24489a7b0bf0c406d291dd51e82a085d338d44"},"widgets_values":["No text files found in any directory"],"color":"#323","bgcolor":"#535"},{"id":22,"type":"Note","pos":[-539.2780151367188,186.78372192382812],"size":[408.66363525390625,236.39089965820312],"flags":{},"order":1,"mode":0,"inputs":[],"outputs":[],"title":"1) Download Models","properties":{},"widgets_values":["You have to manually download the models you would like to use and put them into: ComfyUI/models/vibevoice/\n\nMake a directory for each model and put all the files inside them.\n\nVibeVoice-1.5B model (~ 5.4 GB):\nhttps://huggingface.co/microsoft/VibeVoice-1.5B/tree/main\n\nVibeVoice-Large model (~ 18.7 GB):\nhttps://huggingface.co/aoi-ot/VibeVoice-Large/tree/main\n\nVibeVoice-Large-Q-8bit model (~ 11.6 GB):\nhttps://huggingface.co/FabioSarracino/VibeVoice-Large-Q8/tree/main\n\nVibeVoice-Large-Q-4bit model (~ 6.6 GB):\nhttps://huggingface.co/DevParker/VibeVoice7b-low-vram/tree/main/4bit"],"color":"#432","bgcolor":"#653"},{"id":42,"type":"Note","pos":[-538.9786987304688,486.52374267578125],"size":[407.2561950683594,155.19009399414062],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[],"title":"2) Download Tokenizer","properties":{},"widgets_values":["You have to manually download the Qwen2.5 Tokenizer files and put them into: ComfyUI/models/vibevoice/tokenizer/\n\nhttps://huggingface.co/Qwen/Qwen2.5-1.5B/tree/main\n\nRequired files: tokenizer_config.json, vocab.json, merges.txt, tokenizer.json (~11MB)\n\nPut the files directly inside tokenizer directory without make another directory inside."],"color":"#432","bgcolor":"#653"}],"links":[[60,15,0,44,0,"AUDIO"],[61,44,0,16,0,"AUDIO"]],"groups":[{"id":2,"title":"Instructions before use:","bounding":[-562.1800537109375,89.24514770507812,453.3775939941406,595.2697143554688],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"ds":{"scale":0.9090909090909091,"offset":[795.8030854327329,-23.374334793282447]}},"version":0.4} -------------------------------------------------------------------------------- /nodes/load_text_node.py: -------------------------------------------------------------------------------- 1 | # Created by Fabio Sarracino 2 | 3 | import os 4 | import logging 5 | import hashlib 6 | import folder_paths 7 | 8 | # Setup logging 9 | logger = logging.getLogger("VibeVoice") 10 | 11 | class LoadTextFromFileNode: 12 | @classmethod 13 | def INPUT_TYPES(cls): 14 | # Get all text files from all directories 15 | all_files = [] 16 | 17 | # Add files from each directory with prefix 18 | for dir_name in ["input", "output", "temp"]: 19 | files = cls.get_files_for_directory(dir_name) 20 | for f in files: 21 | if f != "No text files found": 22 | all_files.append(f"{dir_name}/{f}") 23 | 24 | if not all_files: 25 | all_files = ["No text files found in any directory"] 26 | 27 | return { 28 | "required": { 29 | "file": (sorted(all_files), { 30 | "tooltip": "Select a text file to load (format: directory/filename)" 31 | }), 32 | } 33 | } 34 | 35 | @classmethod 36 | def 
get_files_for_directory(cls, source_dir): 37 | """Get list of text files for the selected directory""" 38 | # Get the appropriate directory path 39 | if source_dir == "input": 40 | dir_path = folder_paths.get_input_directory() 41 | elif source_dir == "output": 42 | dir_path = folder_paths.get_output_directory() 43 | elif source_dir == "temp": 44 | dir_path = folder_paths.get_temp_directory() 45 | else: 46 | return [] 47 | 48 | files = [] 49 | try: 50 | for f in os.listdir(dir_path): 51 | if os.path.isfile(os.path.join(dir_path, f)): 52 | # Check for text file extensions 53 | if f.lower().endswith(('.txt')): 54 | files.append(f) 55 | except Exception as e: 56 | logger.warning(f"Error listing files in {source_dir}: {e}") 57 | 58 | return files 59 | 60 | RETURN_TYPES = ("STRING",) 61 | RETURN_NAMES = ("text",) 62 | FUNCTION = "load_text" 63 | CATEGORY = "VibeVoiceWrapper" 64 | DESCRIPTION = "Load text content from a .txt file" 65 | 66 | def load_text(self, file: str): 67 | """Load text content from file""" 68 | 69 | try: 70 | # Check if no file selected 71 | if not file or file == "No text files found in any directory": 72 | raise Exception("Please select a valid text file.") 73 | 74 | # Parse directory and filename from the combined string 75 | if "/" not in file: 76 | raise Exception(f"Invalid file format: {file}") 77 | 78 | source_dir, filename = file.split("/", 1) 79 | 80 | # Get the appropriate directory path 81 | if source_dir == "input": 82 | dir_path = folder_paths.get_input_directory() 83 | elif source_dir == "output": 84 | dir_path = folder_paths.get_output_directory() 85 | elif source_dir == "temp": 86 | dir_path = folder_paths.get_temp_directory() 87 | else: 88 | raise Exception(f"Invalid source directory: {source_dir}") 89 | 90 | # Build full file path 91 | file_path = os.path.join(dir_path, filename) 92 | 93 | if not os.path.exists(file_path): 94 | raise Exception(f"File not found: {file_path}") 95 | 96 | # Read file with UTF-8 encoding (most common) 97 | with open(file_path, 'r', encoding='utf-8') as f: 98 | text_content = f.read() 99 | 100 | if not text_content.strip(): 101 | raise Exception("File is empty or contains only whitespace") 102 | 103 | return (text_content,) 104 | 105 | except UnicodeDecodeError as e: 106 | raise Exception(f"Encoding error reading file: {str(e)}. 
File may not be UTF-8 encoded.") 107 | except Exception as e: 108 | logger.error(f"Failed to load text file: {str(e)}") 109 | raise Exception(f"Error loading text file: {str(e)}") 110 | 111 | @classmethod 112 | def IS_CHANGED(cls, file): 113 | """Cache key for ComfyUI""" 114 | if not file or file == "No text files found in any directory": 115 | return "no_file" 116 | 117 | # Parse directory and filename 118 | if "/" not in file: 119 | return f"{file}_invalid" 120 | 121 | source_dir, filename = file.split("/", 1) 122 | 123 | # Get the appropriate directory path 124 | if source_dir == "input": 125 | dir_path = folder_paths.get_input_directory() 126 | elif source_dir == "output": 127 | dir_path = folder_paths.get_output_directory() 128 | elif source_dir == "temp": 129 | dir_path = folder_paths.get_temp_directory() 130 | else: 131 | return f"{file}_invalid_dir" 132 | 133 | file_path = os.path.join(dir_path, filename) 134 | 135 | if not os.path.exists(file_path): 136 | return f"{file}_not_found" 137 | 138 | # Use file hash for cache invalidation 139 | try: 140 | m = hashlib.sha256() 141 | with open(file_path, 'rb') as f: 142 | m.update(f.read()) 143 | return m.digest().hex() 144 | except: 145 | return f"{file}_error" 146 | 147 | @classmethod 148 | def VALIDATE_INPUTS(cls, file, **kwargs): 149 | """Validate that the file exists""" 150 | if not file or file == "No text files found in any directory": 151 | return "No valid text file selected" 152 | 153 | # Parse directory and filename 154 | if "/" not in file: 155 | return f"Invalid file format: {file}" 156 | 157 | source_dir, filename = file.split("/", 1) 158 | 159 | # Get the appropriate directory path 160 | if source_dir == "input": 161 | dir_path = folder_paths.get_input_directory() 162 | elif source_dir == "output": 163 | dir_path = folder_paths.get_output_directory() 164 | elif source_dir == "temp": 165 | dir_path = folder_paths.get_temp_directory() 166 | else: 167 | return f"Invalid source directory: {source_dir}" 168 | 169 | file_path = os.path.join(dir_path, filename) 170 | if not os.path.exists(file_path): 171 | return f"File not found: {filename} in {source_dir}" 172 | 173 | return True -------------------------------------------------------------------------------- /examples/Pause-Tag.json: -------------------------------------------------------------------------------- 1 | {"id":"b70cf6f7-8531-4faa-9843-9c963a4ba577","revision":0,"last_node_id":47,"last_link_id":58,"nodes":[{"id":28,"type":"LoadTextFromFileNode","pos":[-51.13530731201172,497.1748352050781],"size":[289.5152282714844,58],"flags":{},"order":0,"mode":4,"inputs":[{"localized_name":"file","name":"file","type":"COMBO","widget":{"name":"file"},"link":null}],"outputs":[{"localized_name":"text","name":"text","type":"STRING","links":null}],"properties":{"Node name for S&R":"LoadTextFromFileNode","cnr_id":"VibeVoice-ComfyUI","ver":"5a24489a7b0bf0c406d291dd51e82a085d338d44"},"widgets_values":["No text files found in any directory"],"color":"#323","bgcolor":"#535"},{"id":38,"type":"Note","pos":[775.2548828125,307.8158874511719],"size":[415,88],"flags":{},"order":1,"mode":0,"inputs":[],"outputs":[],"title":"Pause System","properties":{},"widgets_values":["[pause]: add 1 second of silence.\n[pause:{number}] add {number}ms of pause\nWARNING: the pause tag forces the text to be split into chunks. This may worsen the model’s ability to understand the context. 
The model’s context is represented ONLY by its own chunk."],"color":"#432","bgcolor":"#653"},{"id":15,"type":"LoadAudio","pos":[-52.503074645996094,163.9591064453125],"size":[270,136],"flags":{},"order":2,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"COMBO","widget":{"name":"audio"},"link":null},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null},{"localized_name":"upload","name":"upload","type":"AUDIOUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"AUDIO","name":"AUDIO","type":"AUDIO","links":[57]}],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"LoadAudio"},"widgets_values":["Voice.mp3",null,null],"color":"#2a363b","bgcolor":"#3f5159"},{"id":21,"type":"Note","pos":[-119.67156219482422,637.6148071289062],"size":[415,88],"flags":{},"order":3,"mode":0,"inputs":[],"outputs":[],"title":"Load Text From File","properties":{},"widgets_values":["Use Load Text From File if you want to use a .txt file instead of text-area. You can load .txt files from ComfyUI/input, ComfyUI/output or ComfyUI/temp directories."],"color":"#432","bgcolor":"#653"},{"id":16,"type":"PreviewAudio","pos":[845.1698608398438,163.10276794433594],"size":[270,88],"flags":{},"order":8,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","link":58},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null}],"outputs":[],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"PreviewAudio"},"widgets_values":[],"color":"#323","bgcolor":"#535"},{"id":40,"type":"Note","pos":[325.02294921875,636.903564453125],"size":[415,88],"flags":{},"order":4,"mode":0,"inputs":[],"outputs":[],"title":"Voice Speed Factor","properties":{},"widgets_values":["The voice speed factor influences the original source audio to attempt to achieve a slower or faster final speech. 1.0 is the normal speed. It is recommended not to exceed values between 0.95 and 1.05. 
The effect is best when you provide a sample audio of at least 20 seconds."],"color":"#432","bgcolor":"#653"},{"id":45,"type":"VibeVoiceSingleSpeakerNode","pos":[327.48126220703125,164.61436462402344],"size":[400,420],"flags":{},"order":7,"mode":0,"inputs":[{"localized_name":"voice_to_clone","name":"voice_to_clone","shape":7,"type":"AUDIO","link":57},{"localized_name":"lora","name":"lora","shape":7,"type":"LORA_CONFIG","link":null},{"localized_name":"text","name":"text","type":"STRING","widget":{"name":"text"},"link":null},{"localized_name":"model","name":"model","type":"COMBO","widget":{"name":"model"},"link":null},{"localized_name":"attention_type","name":"attention_type","type":"COMBO","widget":{"name":"attention_type"},"link":null},{"localized_name":"quantize_llm","name":"quantize_llm","type":"COMBO","widget":{"name":"quantize_llm"},"link":null},{"localized_name":"free_memory_after_generate","name":"free_memory_after_generate","type":"BOOLEAN","widget":{"name":"free_memory_after_generate"},"link":null},{"localized_name":"diffusion_steps","name":"diffusion_steps","type":"INT","widget":{"name":"diffusion_steps"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null},{"localized_name":"cfg_scale","name":"cfg_scale","type":"FLOAT","widget":{"name":"cfg_scale"},"link":null},{"localized_name":"use_sampling","name":"use_sampling","type":"BOOLEAN","widget":{"name":"use_sampling"},"link":null},{"localized_name":"temperature","name":"temperature","shape":7,"type":"FLOAT","widget":{"name":"temperature"},"link":null},{"localized_name":"top_p","name":"top_p","shape":7,"type":"FLOAT","widget":{"name":"top_p"},"link":null},{"localized_name":"max_words_per_chunk","name":"max_words_per_chunk","shape":7,"type":"INT","widget":{"name":"max_words_per_chunk"},"link":null},{"localized_name":"voice_speed_factor","name":"voice_speed_factor","shape":7,"type":"FLOAT","widget":{"name":"voice_speed_factor"},"link":null}],"outputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","links":[58]}],"properties":{"Node name for S&R":"VibeVoiceSingleSpeakerNode"},"widgets_values":["Hello, this is a test of the VibeVoice text-to-speech system. [pause] Do you like my voice? 
[pause:500] What's your name?","VibeVoice-1.5B","auto","full precision",true,20,42,"fixed",1.3,false,0.95,0.95,250,1],"color":"#223","bgcolor":"#335"},{"id":46,"type":"Note","pos":[-576.477294921875,222.4726104736328],"size":[408.66363525390625,236.39089965820312],"flags":{},"order":5,"mode":0,"inputs":[],"outputs":[],"title":"1) Download Models","properties":{},"widgets_values":["You have to manually download the models you would like to use and put them into: ComfyUI/models/vibevoice/\n\nMake a directory for each model and put all the files inside them.\n\nVibeVoice-1.5B model (~ 5.4 GB):\nhttps://huggingface.co/microsoft/VibeVoice-1.5B/tree/main\n\nVibeVoice-Large model (~ 18.7 GB):\nhttps://huggingface.co/aoi-ot/VibeVoice-Large/tree/main\n\nVibeVoice-Large-Q-8bit model (~ 11.6 GB):\nhttps://huggingface.co/FabioSarracino/VibeVoice-Large-Q8/tree/main\n\nVibeVoice-Large-Q-4bit model (~ 6.6 GB):\nhttps://huggingface.co/DevParker/VibeVoice7b-low-vram/tree/main/4bit"],"color":"#432","bgcolor":"#653"},{"id":47,"type":"Note","pos":[-576.177978515625,522.212646484375],"size":[407.2561950683594,155.19009399414062],"flags":{},"order":6,"mode":0,"inputs":[],"outputs":[],"title":"2) Download Tokenizer","properties":{},"widgets_values":["You have to manually download the Qwen2.5 Tokenizer files and put them into: ComfyUI/models/vibevoice/tokenizer/\n\nhttps://huggingface.co/Qwen/Qwen2.5-1.5B/tree/main\n\nRequired files: tokenizer_config.json, vocab.json, merges.txt, tokenizer.json (~11MB)\n\nPut the files directly inside tokenizer directory without make another directory inside."],"color":"#432","bgcolor":"#653"}],"links":[[57,15,0,45,0,"AUDIO"],[58,45,0,16,0,"AUDIO"]],"groups":[{"id":2,"title":"Instructions before use:","bounding":[-599.3793334960938,124.93412017822266,453.3775939941406,595.2697143554688],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"ds":{"scale":0.8264462809917354,"offset":[815.9689977237014,-22.084207406969263]}},"version":0.4} -------------------------------------------------------------------------------- /vvembed/scripts/convert_nnscaler_checkpoint_to_transformers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import argparse 5 | import json 6 | import os 7 | from pathlib import Path 8 | import re 9 | import torch 10 | from typing import Dict, List, Tuple 11 | 12 | from modular.configuration_vibevoice import ( 13 | VibeVoiceConfig 14 | ) 15 | from modular.modeling_vibevoice import VibeVoiceForConditionalGeneration 16 | from transformers.utils import logging 17 | 18 | logger = logging.get_logger(__name__) 19 | 20 | def convert_vibevoice_nnscaler_checkpoint_to_hf( 21 | checkpoint_path: str, 22 | pytorch_dump_folder_path: str, 23 | config_path: str = None, 24 | ): 25 | """ 26 | Convert a nnscaler VibeVoice checkpoint to HuggingFace format. 27 | Supports both regular checkpoints and tensor parallel checkpoints. 
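    Illustrative call (editor's sketch; the checkpoint and output paths below are
    placeholders, not files shipped with this repository):

        convert_vibevoice_nnscaler_checkpoint_to_hf(
            checkpoint_path="/path/to/nnscaler_checkpoint.pt",   # nnscaler .pt checkpoint
            pytorch_dump_folder_path="./vibevoice_hf",           # output dir for HF config/weights
            config_path=None,                                    # optional JSON config override
        )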
28 | """ 29 | 30 | # Load regular checkpoint 31 | logger.info(f"Loading regular checkpoint from {checkpoint_path}") 32 | checkpoint = torch.load(checkpoint_path, map_location="cpu") # ['model', 'optimizer', 'lr_scheduler', 'train_status', 'train_args', 'rng_states', 'nnscaler', 'dataloader'] 33 | 34 | # config = checkpoint['train_args'] 35 | init_config_name = checkpoint['train_args']['vars']['model_args']['config_path']['relative_path'] 36 | pretrained_name = checkpoint['train_args']['vars']['data_args']['tokenizer_path'] 37 | 38 | init_config_path = Path(__file__).parent.parent / 'configs' / init_config_name.split('/')[-1] 39 | if init_config_path.exists(): 40 | logger.info(f"Loading initial config from {init_config_path}") 41 | with open(init_config_path, 'r') as f: 42 | init_config = json.load(f) 43 | else: 44 | raise FileNotFoundError(f"Initial config file {init_config_path} not found. Please provide a valid path.") 45 | 46 | tie_word_embeddings = init_config['decoder_config'].get('tie_word_embeddings', True) 47 | logger.info(f"Tie word embeddings: {tie_word_embeddings}") 48 | 49 | init_config['decoder_config']['use_cache'] = True 50 | config = VibeVoiceConfig(**init_config, tie_word_embeddings=tie_word_embeddings) 51 | 52 | # # Extract the model state dict 53 | model_state_dict = {k.replace('model.model.', 'model.'): v for k, v in checkpoint["model"].items() if k.startswith('model.model.')} 54 | if not tie_word_embeddings and 'model.lm_head.weight' in checkpoint["model"].keys(): 55 | # If not tying weights, we need to add the lm_head weight separately 56 | model_state_dict['lm_head.weight'] = checkpoint["model"]['model.lm_head.weight'] 57 | 58 | # Override with provided config if available 59 | if config_path: 60 | logger.info(f"Loading config from {config_path}") 61 | with open(config_path, 'r') as f: 62 | config_dict = json.load(f) 63 | config = VibeVoiceConfig.from_dict(config_dict) 64 | 65 | # Set the default dtype to bfloat16 before creating the model 66 | original_dtype = torch.get_default_dtype() 67 | torch.set_default_dtype(torch.bfloat16) 68 | 69 | # Create the HuggingFace model 70 | logger.info("Creating HuggingFace VibeVoiceForConditionalGeneration model") 71 | model = VibeVoiceForConditionalGeneration(config) 72 | 73 | # Restore original dtype 74 | torch.set_default_dtype(original_dtype) 75 | 76 | # Load the state dict 77 | logger.info("Loading weights into model") 78 | missing_keys, unexpected_keys = model.load_state_dict(model_state_dict, strict=False) 79 | 80 | if missing_keys: 81 | logger.warning(f"Missing keys: {missing_keys}") 82 | if unexpected_keys: 83 | logger.warning(f"Unexpected keys: {unexpected_keys}") 84 | 85 | # Create output directory 86 | os.makedirs(pytorch_dump_folder_path, exist_ok=True) 87 | 88 | # Save the model and config 89 | logger.info(f"Saving model to {pytorch_dump_folder_path}") 90 | 91 | # Save config 92 | config.save_pretrained(pytorch_dump_folder_path) 93 | 94 | # Save VibeVoiceProcessor configuration 95 | logger.info("Saving VibeVoiceProcessor configuration") 96 | processor_config = { 97 | "processor_class": "VibeVoiceProcessor", 98 | "speech_tok_compress_ratio": 3200, 99 | "db_normalize": True, 100 | # Audio processor configuration 101 | "audio_processor": { 102 | "feature_extractor_type": "VibeVoiceTokenizerProcessor", 103 | "sampling_rate": 24000, 104 | "normalize_audio": True, 105 | "target_dB_FS": -25, 106 | "eps": 1e-6, 107 | }, 108 | "language_model_pretrained_name": pretrained_name, 109 | } 110 | 111 | processor_config_path = 
os.path.join(pytorch_dump_folder_path, "preprocessor_config.json") 112 | with open(processor_config_path, 'w') as f: 113 | json.dump(processor_config, f, indent=2) 114 | logger.info(f"Saved processor config to {processor_config_path}") 115 | 116 | # Save model with sharding 117 | # save_pretrained handles tied weights automatically 118 | logger.info("Saving model weights with sharding...") 119 | model.save_pretrained( 120 | pytorch_dump_folder_path, 121 | max_shard_size="2GB", # Set maximum size for each shard 122 | safe_serialization=True # Ensure saving in .safetensors format 123 | ) 124 | logger.info(f"Model weights saved to {pytorch_dump_folder_path}") 125 | 126 | logger.info("Conversion complete!") 127 | 128 | # Verify the saved model can be loaded 129 | logger.info("Verifying saved model...") 130 | loaded_model = VibeVoiceForConditionalGeneration.from_pretrained(pytorch_dump_folder_path) 131 | logger.info("Model successfully loaded from saved checkpoint!") 132 | 133 | def main(): 134 | parser = argparse.ArgumentParser() 135 | parser.add_argument( 136 | "--nnscaler_checkpoint_path", 137 | type=str, 138 | required=True, 139 | help="Path to the fairseq checkpoint (.pt file). For tensor parallel checkpoints, " 140 | "provide any one of the part files (e.g., checkpoint_1_5000-model_part-0.pt), " 141 | "and the script will automatically detect and merge all parts.", 142 | ) 143 | parser.add_argument( 144 | "--pytorch_dump_folder_path", 145 | type=str, 146 | required=True, 147 | help="Path to the output PyTorch model directory", 148 | ) 149 | parser.add_argument( 150 | "--config_path", 151 | type=str, 152 | default=None, 153 | help="Optional path to a config JSON file to override extracted config", 154 | ) 155 | 156 | args = parser.parse_args() 157 | 158 | convert_vibevoice_nnscaler_checkpoint_to_hf( 159 | args.nnscaler_checkpoint_path, 160 | args.pytorch_dump_folder_path, 161 | args.config_path, 162 | ) 163 | 164 | 165 | if __name__ == "__main__": 166 | main() -------------------------------------------------------------------------------- /examples/Multiple-Speaker.json: -------------------------------------------------------------------------------- 1 | {"id":"e5ca15c5-18b5-4d37-8852-795692a14b29","revision":0,"last_node_id":38,"last_link_id":57,"nodes":[{"id":19,"type":"LoadTextFromFileNode","pos":[9.889446258544922,621.1560668945312],"size":[270,58],"flags":{},"order":0,"mode":4,"inputs":[{"localized_name":"file","name":"file","type":"COMBO","widget":{"name":"file"},"link":null}],"outputs":[{"localized_name":"text","name":"text","type":"STRING","links":null}],"properties":{"Node name for S&R":"LoadTextFromFileNode","cnr_id":"VibeVoice-ComfyUI","ver":"5a24489a7b0bf0c406d291dd51e82a085d338d44"},"widgets_values":["No text files found in any directory"],"color":"#323","bgcolor":"#535"},{"id":31,"type":"Note","pos":[379.3583984375,725.9093627929688],"size":[415,88],"flags":{},"order":1,"mode":0,"inputs":[],"outputs":[],"title":"Voice Speed Factor","properties":{},"widgets_values":["The voice speed factor influences the original source audio to attempt to achieve a slower or faster final speech. 1.0 is the normal speed. It is recommended not to exceed values between 0.95 and 1.05. 
The effect is best when you provide a sample audio of at least 20 seconds."],"color":"#432","bgcolor":"#653"},{"id":16,"type":"PreviewAudio","pos":[896.3719482421875,189.1308135986328],"size":[270,88],"flags":{},"order":8,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","link":57},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null}],"outputs":[],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"PreviewAudio"},"widgets_values":[],"color":"#323","bgcolor":"#535"},{"id":20,"type":"Note","pos":[-55.931907653808594,726.6131591796875],"size":[415,88],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[],"title":"Load Text From File","properties":{},"widgets_values":["Use Load Text From File if you want to use a .txt file instead of text-area. You can load .txt files from ComfyUI/input, ComfyUI/output or ComfyUI/temp directories."],"color":"#432","bgcolor":"#653"},{"id":15,"type":"LoadAudio","pos":[-12.263749122619629,190.64144897460938],"size":[270,136],"flags":{},"order":3,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"COMBO","widget":{"name":"audio"},"link":null},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null},{"localized_name":"upload","name":"upload","type":"AUDIOUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"AUDIO","name":"AUDIO","type":"AUDIO","links":[55]}],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"LoadAudio"},"widgets_values":["Voice1.mp3",null,null],"color":"#2a363b","bgcolor":"#3f5159"},{"id":17,"type":"LoadAudio","pos":[-11.774602890014648,403.2247009277344],"size":[270,136],"flags":{},"order":4,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"COMBO","widget":{"name":"audio"},"link":null},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null},{"localized_name":"upload","name":"upload","type":"AUDIOUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"AUDIO","name":"AUDIO","type":"AUDIO","links":[56]}],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for 
S&R":"LoadAudio"},"widgets_values":["Voice2.mp3",null,null],"color":"#2a363b","bgcolor":"#3f5159"},{"id":36,"type":"VibeVoiceMultipleSpeakersNode","pos":[393.1620178222656,189.6568145751953],"size":[400,456],"flags":{},"order":7,"mode":0,"inputs":[{"localized_name":"speaker1_voice","name":"speaker1_voice","shape":7,"type":"AUDIO","link":55},{"localized_name":"speaker2_voice","name":"speaker2_voice","shape":7,"type":"AUDIO","link":56},{"localized_name":"speaker3_voice","name":"speaker3_voice","shape":7,"type":"AUDIO","link":null},{"localized_name":"speaker4_voice","name":"speaker4_voice","shape":7,"type":"AUDIO","link":null},{"localized_name":"lora","name":"lora","shape":7,"type":"LORA_CONFIG","link":null},{"localized_name":"text","name":"text","type":"STRING","widget":{"name":"text"},"link":null},{"localized_name":"model","name":"model","type":"COMBO","widget":{"name":"model"},"link":null},{"localized_name":"attention_type","name":"attention_type","type":"COMBO","widget":{"name":"attention_type"},"link":null},{"localized_name":"quantize_llm","name":"quantize_llm","type":"COMBO","widget":{"name":"quantize_llm"},"link":null},{"localized_name":"free_memory_after_generate","name":"free_memory_after_generate","type":"BOOLEAN","widget":{"name":"free_memory_after_generate"},"link":null},{"localized_name":"diffusion_steps","name":"diffusion_steps","type":"INT","widget":{"name":"diffusion_steps"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null},{"localized_name":"cfg_scale","name":"cfg_scale","type":"FLOAT","widget":{"name":"cfg_scale"},"link":null},{"localized_name":"use_sampling","name":"use_sampling","type":"BOOLEAN","widget":{"name":"use_sampling"},"link":null},{"localized_name":"temperature","name":"temperature","shape":7,"type":"FLOAT","widget":{"name":"temperature"},"link":null},{"localized_name":"top_p","name":"top_p","shape":7,"type":"FLOAT","widget":{"name":"top_p"},"link":null},{"localized_name":"voice_speed_factor","name":"voice_speed_factor","shape":7,"type":"FLOAT","widget":{"name":"voice_speed_factor"},"link":null}],"outputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","links":[57]}],"properties":{"Node name for S&R":"VibeVoiceMultipleSpeakersNode"},"widgets_values":["[1]: Hello, this is the first speaker.\n[2]: Hi there, I'm the second speaker.\n[1]: Nice to meet you!\n[2]: Nice to meet you too!","VibeVoice-Large","auto","4bit",true,20,42,"fixed",1.3,false,0.95,0.95,1],"color":"#223","bgcolor":"#335"},{"id":37,"type":"Note","pos":[-530.1146850585938,279.4844055175781],"size":[408.66363525390625,236.39089965820312],"flags":{},"order":5,"mode":0,"inputs":[],"outputs":[],"title":"1) Download Models","properties":{},"widgets_values":["You have to manually download the models you would like to use and put them into: ComfyUI/models/vibevoice/\n\nMake a directory for each model and put all the files inside them.\n\nVibeVoice-1.5B model (~ 5.4 GB):\nhttps://huggingface.co/microsoft/VibeVoice-1.5B/tree/main\n\nVibeVoice-Large model (~ 18.7 GB):\nhttps://huggingface.co/aoi-ot/VibeVoice-Large/tree/main\n\nVibeVoice-Large-Q-8bit model (~ 11.6 GB):\nhttps://huggingface.co/FabioSarracino/VibeVoice-Large-Q8/tree/main\n\nVibeVoice-Large-Q-4bit model (~ 6.6 
GB):\nhttps://huggingface.co/DevParker/VibeVoice7b-low-vram/tree/main/4bit"],"color":"#432","bgcolor":"#653"},{"id":38,"type":"Note","pos":[-529.8153686523438,579.2252807617188],"size":[407.2561950683594,155.19009399414062],"flags":{},"order":6,"mode":0,"inputs":[],"outputs":[],"title":"2) Download Tokenizer","properties":{},"widgets_values":["You have to manually download the Qwen2.5 Tokenizer files and put them into: ComfyUI/models/vibevoice/tokenizer/\n\nhttps://huggingface.co/Qwen/Qwen2.5-1.5B/tree/main\n\nRequired files: tokenizer_config.json, vocab.json, merges.txt, tokenizer.json (~11MB)\n\nPut the files directly inside tokenizer directory without make another directory inside."],"color":"#432","bgcolor":"#653"}],"links":[[55,15,0,36,0,"AUDIO"],[56,17,0,36,1,"AUDIO"],[57,36,0,16,0,"AUDIO"]],"groups":[{"id":1,"title":"Instructions before use:","bounding":[-553.0167846679688,181.94606018066406,453.3775939941406,595.2697143554688],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"ds":{"scale":0.9090909090909097,"offset":[944.6168885013626,-55.446182500052494]}},"version":0.4} -------------------------------------------------------------------------------- /examples/VibeVoice-Unload-Memory.json: -------------------------------------------------------------------------------- 1 | {"id":"fc471b7e-ccef-427f-be3f-29dec93a90ea","revision":0,"last_node_id":45,"last_link_id":56,"nodes":[{"id":34,"type":"VibeVoiceFreeMemoryNode","pos":[913.2552490234375,127.35599517822266],"size":[189.03964233398438,26],"flags":{},"order":8,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","link":56}],"outputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","links":[42]}],"properties":{"Node name for S&R":"VibeVoiceFreeMemoryNode","cnr_id":"VibeVoice-ComfyUI","ver":"5a24489a7b0bf0c406d291dd51e82a085d338d44"},"widgets_values":[],"color":"#322","bgcolor":"#533"},{"id":16,"type":"PreviewAudio","pos":[1273.2957763671875,127.3007583618164],"size":[270,88],"flags":{},"order":9,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","link":42},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null}],"outputs":[],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for S&R":"PreviewAudio"},"widgets_values":[],"color":"#323","bgcolor":"#535"},{"id":35,"type":"Note","pos":[809.6192016601562,208.98324584960938],"size":[432.1000061035156,126.30000305175781],"flags":{},"order":1,"mode":0,"inputs":[],"outputs":[],"title":"Free Memory Node","properties":{},"widgets_values":["The VibeVoice Free Memory node releases memory as soon as it receives the audio input (acting as a passthrough for the audio itself). In this specific use case, however, it’s redundant, since it would be enough to enable the “free_memory_after_generate” parameter of the previous node. 
The ideal use case is, for example, when you have a loop generating multiple audio clips, and only after the final generation you pass the last audio and free the memory."],"color":"#432","bgcolor":"#653"},{"id":28,"type":"LoadTextFromFileNode","pos":[-30.95530128479004,453.30511474609375],"size":[289.5152282714844,58],"flags":{},"order":2,"mode":4,"inputs":[{"localized_name":"file","name":"file","type":"COMBO","widget":{"name":"file"},"link":null}],"outputs":[{"localized_name":"text","name":"text","type":"STRING","links":null}],"properties":{"Node name for S&R":"LoadTextFromFileNode","cnr_id":"VibeVoice-ComfyUI","ver":"5a24489a7b0bf0c406d291dd51e82a085d338d44"},"widgets_values":["No text files found in any directory"],"color":"#323","bgcolor":"#535"},{"id":40,"type":"Note","pos":[367.98895263671875,597.8056640625],"size":[415,88],"flags":{},"order":3,"mode":0,"inputs":[],"outputs":[],"title":"Voice Speed Factor","properties":{},"widgets_values":["The voice speed factor influences the original source audio to attempt to achieve a slower or faster final speech. 1.0 is the normal speed. It is recommended not to exceed values between 0.95 and 1.05. The effect is best when you provide a sample audio of at least 20 seconds."],"color":"#432","bgcolor":"#653"},{"id":15,"type":"LoadAudio","pos":[-21.549091339111328,127.7799301147461],"size":[270,136],"flags":{},"order":4,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"COMBO","widget":{"name":"audio"},"link":null},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null},{"localized_name":"upload","name":"upload","type":"AUDIOUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"AUDIO","name":"AUDIO","type":"AUDIO","links":[55]}],"properties":{"cnr_id":"comfy-core","ver":"0.3.49","Node name for 
S&R":"LoadAudio"},"widgets_values":["Voice.mp3",null,null],"color":"#2a363b","bgcolor":"#3f5159"},{"id":43,"type":"VibeVoiceSingleSpeakerNode","pos":[373.596435546875,128.40489196777344],"size":[400,420],"flags":{},"order":7,"mode":0,"inputs":[{"localized_name":"voice_to_clone","name":"voice_to_clone","shape":7,"type":"AUDIO","link":55},{"localized_name":"lora","name":"lora","shape":7,"type":"LORA_CONFIG","link":null},{"localized_name":"text","name":"text","type":"STRING","widget":{"name":"text"},"link":null},{"localized_name":"model","name":"model","type":"COMBO","widget":{"name":"model"},"link":null},{"localized_name":"attention_type","name":"attention_type","type":"COMBO","widget":{"name":"attention_type"},"link":null},{"localized_name":"quantize_llm","name":"quantize_llm","type":"COMBO","widget":{"name":"quantize_llm"},"link":null},{"localized_name":"free_memory_after_generate","name":"free_memory_after_generate","type":"BOOLEAN","widget":{"name":"free_memory_after_generate"},"link":null},{"localized_name":"diffusion_steps","name":"diffusion_steps","type":"INT","widget":{"name":"diffusion_steps"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null},{"localized_name":"cfg_scale","name":"cfg_scale","type":"FLOAT","widget":{"name":"cfg_scale"},"link":null},{"localized_name":"use_sampling","name":"use_sampling","type":"BOOLEAN","widget":{"name":"use_sampling"},"link":null},{"localized_name":"temperature","name":"temperature","shape":7,"type":"FLOAT","widget":{"name":"temperature"},"link":null},{"localized_name":"top_p","name":"top_p","shape":7,"type":"FLOAT","widget":{"name":"top_p"},"link":null},{"localized_name":"max_words_per_chunk","name":"max_words_per_chunk","shape":7,"type":"INT","widget":{"name":"max_words_per_chunk"},"link":null},{"localized_name":"voice_speed_factor","name":"voice_speed_factor","shape":7,"type":"FLOAT","widget":{"name":"voice_speed_factor"},"link":null}],"outputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","links":[56]}],"properties":{"Node name for S&R":"VibeVoiceSingleSpeakerNode"},"widgets_values":["Hello, this is a test of the VibeVoice text-to-speech system.","VibeVoice-1.5B","auto","full precision",true,20,42,"fixed",1.3,false,0.95,0.95,250,1],"color":"#223","bgcolor":"#335"},{"id":44,"type":"Note","pos":[-546.2021484375,184.94338989257812],"size":[408.66363525390625,236.39089965820312],"flags":{},"order":5,"mode":0,"inputs":[],"outputs":[],"title":"1) Download Models","properties":{},"widgets_values":["You have to manually download the models you would like to use and put them into: ComfyUI/models/vibevoice/\n\nMake a directory for each model and put all the files inside them.\n\nVibeVoice-1.5B model (~ 5.4 GB):\nhttps://huggingface.co/microsoft/VibeVoice-1.5B/tree/main\n\nVibeVoice-Large model (~ 18.7 GB):\nhttps://huggingface.co/aoi-ot/VibeVoice-Large/tree/main\n\nVibeVoice-Large-Q-8bit model (~ 11.6 GB):\nhttps://huggingface.co/FabioSarracino/VibeVoice-Large-Q8/tree/main\n\nVibeVoice-Large-Q-4bit model (~ 6.6 GB):\nhttps://huggingface.co/DevParker/VibeVoice7b-low-vram/tree/main/4bit"],"color":"#432","bgcolor":"#653"},{"id":45,"type":"Note","pos":[-545.90283203125,484.68328857421875],"size":[407.2561950683594,155.19009399414062],"flags":{},"order":6,"mode":0,"inputs":[],"outputs":[],"title":"2) Download Tokenizer","properties":{},"widgets_values":["You have to manually download the Qwen2.5 Tokenizer files and put them into: 
ComfyUI/models/vibevoice/tokenizer/\n\nhttps://huggingface.co/Qwen/Qwen2.5-1.5B/tree/main\n\nRequired files: tokenizer_config.json, vocab.json, merges.txt, tokenizer.json (~11MB)\n\nPut the files directly inside tokenizer directory without make another directory inside."],"color":"#432","bgcolor":"#653"},{"id":21,"type":"Note","pos":[-84.54156494140625,599.46435546875],"size":[415,88],"flags":{},"order":0,"mode":0,"inputs":[],"outputs":[],"title":"Load Text From File","properties":{},"widgets_values":["Use Load Text From File if you want to use a .txt file instead of text-area. You can load .txt files from ComfyUI/input, ComfyUI/output or ComfyUI/temp directories."],"color":"#432","bgcolor":"#653"}],"links":[[42,34,0,16,0,"AUDIO"],[55,15,0,43,0,"AUDIO"],[56,43,0,34,0,"AUDIO"]],"groups":[{"id":2,"title":"Instructions before use:","bounding":[-569.1041870117188,87.40498352050781,453.3775939941406,595.2697143554688],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"ds":{"scale":0.9090909090909091,"offset":[570.2036733851843,-33.504933709055805]}},"version":0.4} -------------------------------------------------------------------------------- /vvembed/modular/modular_vibevoice_text_tokenizer.py: -------------------------------------------------------------------------------- 1 | """Tokenization classes for vibevoice.""" 2 | 3 | from typing import List, Optional, Union 4 | 5 | from transformers.utils import logging 6 | from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer 7 | from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast 8 | 9 | logger = logging.get_logger(__name__) 10 | 11 | 12 | class VibeVoiceTextTokenizer(Qwen2Tokenizer): 13 | """ 14 | Construct a VibeVoice tokenizer. Based on the Qwen2 tokenizer with additional special tokens for speech. 15 | 16 | Args: 17 | vocab_file (`str`): 18 | Path to the vocabulary file. 19 | merges_file (`str`): 20 | Path to the merges file. 21 | errors (`str`, *optional*, defaults to `"replace"`): 22 | Paradigm to follow when decoding bytes to UTF-8. 23 | unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 24 | The unknown token. 25 | bos_token (`str`, *optional*): 26 | The beginning of sequence token. Not used for vibevoice. 27 | eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 28 | The end of sequence token. 29 | pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 30 | The token used for padding. 31 | add_special_tokens (`bool`, *optional*, defaults to `True`): 32 | Whether or not to add special tokens when encoding. 
33 | """ 34 | 35 | model_input_names = ["input_ids", "attention_mask"] 36 | 37 | def __init__( 38 | self, 39 | vocab_file, 40 | merges_file, 41 | errors="replace", 42 | unk_token="<|endoftext|>", 43 | bos_token=None, 44 | eos_token="<|endoftext|>", 45 | pad_token="<|endoftext|>", 46 | add_prefix_space=False, 47 | add_special_tokens=True, 48 | **kwargs, 49 | ): 50 | super().__init__( 51 | vocab_file=vocab_file, 52 | merges_file=merges_file, 53 | errors=errors, 54 | unk_token=unk_token, 55 | bos_token=bos_token, 56 | eos_token=eos_token, 57 | pad_token=pad_token, 58 | add_prefix_space=add_prefix_space, 59 | add_special_tokens=add_special_tokens, 60 | **kwargs, 61 | ) 62 | 63 | # Add VibeVoice-specific special tokens 64 | self._add_vibevoice_special_tokens() 65 | 66 | def _add_vibevoice_special_tokens(self): 67 | """Add VibeVoice-specific special tokens.""" 68 | special_tokens = { 69 | "additional_special_tokens": [ 70 | "<|vision_start|>", # Speech start (reusing vision tokens) 71 | "<|vision_end|>", # Speech end 72 | "<|vision_pad|>", # Speech diffusion pad 73 | ] 74 | } 75 | num_added = self.add_special_tokens(special_tokens) 76 | 77 | # Cache special token IDs 78 | self._speech_start_id = self.convert_tokens_to_ids("<|vision_start|>") 79 | self._speech_end_id = self.convert_tokens_to_ids("<|vision_end|>") 80 | self._speech_diffusion_id = self.convert_tokens_to_ids("<|vision_pad|>") 81 | 82 | self._eos_id = self.convert_tokens_to_ids('<|endoftext|>') 83 | 84 | return num_added 85 | 86 | @property 87 | def eos_id(self) -> int: 88 | """Id of the end of sequence token.""" 89 | return self._eos_id 90 | 91 | @property 92 | def speech_start_id(self) -> int: 93 | """Id of the speech start token.""" 94 | return self._speech_start_id 95 | 96 | @property 97 | def speech_end_id(self) -> int: 98 | """Id of the speech end token.""" 99 | return self._speech_end_id 100 | 101 | @property 102 | def speech_diffusion_id(self) -> int: 103 | """Id of the speech diffusion token.""" 104 | return self._speech_diffusion_id 105 | 106 | @property 107 | def pad_id(self) -> int: 108 | """Id used for padding (returns -100 for loss masking).""" 109 | return -100 110 | 111 | 112 | class VibeVoiceTextTokenizerFast(Qwen2TokenizerFast): 113 | """ 114 | Construct a "fast" VibeVoice tokenizer (backed by HuggingFace's *tokenizers* library). 115 | Based on the Qwen2 tokenizer with additional special tokens for speech. 116 | 117 | Args: 118 | vocab_file (`str`, *optional*): 119 | Path to the vocabulary file. 120 | merges_file (`str`, *optional*): 121 | Path to the merges file. 122 | tokenizer_file (`str`, *optional*): 123 | Path to [tokenizers](https://github.com/huggingface/tokenizers) file. 124 | unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 125 | The unknown token. 126 | bos_token (`str`, *optional*): 127 | The beginning of sequence token. Not used for vibevoice. 128 | eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 129 | The end of sequence token. 130 | pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): 131 | The token used for padding. 
132 | """ 133 | 134 | model_input_names = ["input_ids", "attention_mask"] 135 | 136 | def __init__( 137 | self, 138 | vocab_file=None, 139 | merges_file=None, 140 | tokenizer_file=None, 141 | unk_token="<|endoftext|>", 142 | bos_token=None, 143 | eos_token="<|endoftext|>", 144 | pad_token="<|endoftext|>", 145 | add_prefix_space=False, 146 | **kwargs, 147 | ): 148 | super().__init__( 149 | vocab_file=vocab_file, 150 | merges_file=merges_file, 151 | tokenizer_file=tokenizer_file, 152 | unk_token=unk_token, 153 | bos_token=bos_token, 154 | eos_token=eos_token, 155 | pad_token=pad_token, 156 | add_prefix_space=add_prefix_space, 157 | **kwargs, 158 | ) 159 | 160 | # Add VibeVoice-specific special tokens 161 | self._add_vibevoice_special_tokens() 162 | 163 | def _add_vibevoice_special_tokens(self): 164 | """Add VibeVoice-specific special tokens.""" 165 | special_tokens = { 166 | "additional_special_tokens": [ 167 | "<|vision_start|>", # Speech start (reusing vision tokens) 168 | "<|vision_end|>", # Speech end 169 | "<|vision_pad|>", # Speech diffusion pad 170 | ] 171 | } 172 | num_added = self.add_special_tokens(special_tokens) 173 | 174 | # Cache special token IDs 175 | self._speech_start_id = self.convert_tokens_to_ids("<|vision_start|>") 176 | self._speech_end_id = self.convert_tokens_to_ids("<|vision_end|>") 177 | self._speech_diffusion_id = self.convert_tokens_to_ids("<|vision_pad|>") 178 | 179 | # self._eos_id = self.convert_tokens_to_ids('<|endoftext|>') 180 | self._eos_id = self.eos_token_id # qwen2 / qwen3 181 | self._pad_id = self.convert_tokens_to_ids('<|image_pad|>') 182 | 183 | return num_added 184 | 185 | @property 186 | def eos_id(self) -> int: 187 | """Id of the end of sequence token.""" 188 | return self._eos_id 189 | 190 | @property 191 | def speech_start_id(self) -> int: 192 | """Id of the speech start token.""" 193 | return self._speech_start_id 194 | 195 | @property 196 | def speech_end_id(self) -> int: 197 | """Id of the speech end token.""" 198 | return self._speech_end_id 199 | 200 | @property 201 | def speech_diffusion_id(self) -> int: 202 | """Id of the speech diffusion token.""" 203 | return self._speech_diffusion_id 204 | 205 | @property 206 | def pad_id(self) -> int: 207 | """Id used for padding (returns -100 for loss masking).""" 208 | return self._pad_id 209 | 210 | 211 | __all__ = [ 212 | "VibeVoiceTextTokenizer", 213 | "VibeVoiceTextTokenizerFast", 214 | ] -------------------------------------------------------------------------------- /nodes/lora_node.py: -------------------------------------------------------------------------------- 1 | # Created by Fabio Sarracino 2 | # Original LoRa code implementation by jpgallegoar-vpai user via PR #127 3 | # LoRA configuration node for VibeVoice 4 | 5 | import logging 6 | import os 7 | from typing import Dict, Any, List 8 | 9 | # Setup logging 10 | logger = logging.getLogger("VibeVoice") 11 | 12 | # Cache for LoRA scanning to avoid repeated logs 13 | _lora_cache = { 14 | "first_load_logged": False 15 | } 16 | 17 | def get_available_loras() -> List[str]: 18 | """Get list of available LoRA folders in ComfyUI/models/vibevoice/loras""" 19 | try: 20 | import folder_paths 21 | 22 | # Get the ComfyUI models directory 23 | models_dir = folder_paths.get_folder_paths("checkpoints")[0] 24 | # Navigate to vibevoice/loras directory 25 | loras_dir = os.path.join(os.path.dirname(models_dir), "vibevoice", "loras") 26 | 27 | # Create directory if it doesn't exist 28 | os.makedirs(loras_dir, exist_ok=True) 29 | 30 | # List all directories 
in the loras folder 31 | lora_folders = [] 32 | if os.path.exists(loras_dir): 33 | for item in os.listdir(loras_dir): 34 | item_path = os.path.join(loras_dir, item) 35 | if os.path.isdir(item_path): 36 | # Check if it contains LoRA files 37 | adapter_config = os.path.join(item_path, "adapter_config.json") 38 | adapter_model_st = os.path.join(item_path, "adapter_model.safetensors") 39 | adapter_model_bin = os.path.join(item_path, "adapter_model.bin") 40 | 41 | # Consider it a valid LoRA if it has config or model files 42 | if os.path.exists(adapter_config) or os.path.exists(adapter_model_st) or os.path.exists(adapter_model_bin): 43 | lora_folders.append(item) 44 | 45 | # Only log on first scan to avoid spam 46 | if not _lora_cache["first_load_logged"]: 47 | if not lora_folders: 48 | logger.info("No LoRA adapters found in ComfyUI/models/vibevoice/loras") 49 | _lora_cache["first_load_logged"] = True 50 | 51 | # Always include "None" option to disable LoRA 52 | if not lora_folders: 53 | return ["None"] 54 | 55 | # Sort alphabetically and add None option at the beginning 56 | lora_folders.sort() 57 | return ["None"] + lora_folders 58 | 59 | except Exception as e: 60 | logger.error(f"Error listing LoRA folders: {e}") 61 | return ["None"] 62 | 63 | class VibeVoiceLoRANode: 64 | """Node for configuring LoRA adapters for VibeVoice models""" 65 | 66 | def __init__(self): 67 | pass 68 | 69 | @classmethod 70 | def INPUT_TYPES(cls): 71 | # Get available LoRA folders dynamically 72 | available_loras = get_available_loras() 73 | 74 | return { 75 | "required": { 76 | "lora_name": (available_loras, { 77 | "default": "None", 78 | "tooltip": "Select a LoRA adapter from ComfyUI/models/vibevoice/loras folder" 79 | }), 80 | "llm_strength": ("FLOAT", { 81 | "default": 1.0, 82 | "min": 0.0, 83 | "max": 2.0, 84 | "step": 0.05, 85 | "tooltip": "Strength of the LLM LoRA adapter. Controls how much the LoRA affects the language model" 86 | }), 87 | "use_llm": ("BOOLEAN", { 88 | "default": True, 89 | "tooltip": "Apply LLM (language model) LoRA component when available" 90 | }), 91 | "use_diffusion_head": ("BOOLEAN", { 92 | "default": True, 93 | "tooltip": "Apply diffusion head LoRA/replacement when available" 94 | }), 95 | "use_acoustic_connector": ("BOOLEAN", { 96 | "default": True, 97 | "tooltip": "Apply acoustic connector LoRA component when available" 98 | }), 99 | "use_semantic_connector": ("BOOLEAN", { 100 | "default": True, 101 | "tooltip": "Apply semantic connector LoRA component when available" 102 | }), 103 | } 104 | } 105 | 106 | RETURN_TYPES = ("LORA_CONFIG",) 107 | RETURN_NAMES = ("lora",) 108 | FUNCTION = "configure_lora" 109 | CATEGORY = "VibeVoiceWrapper" 110 | DESCRIPTION = "Configure LoRA adapters for fine-tuned VibeVoice models. 
Place LoRA folders in ComfyUI/models/vibevoice/loras/" 111 | 112 | def configure_lora(self, lora_name: str = "None", llm_strength: float = 1.0, 113 | use_llm: bool = True, use_diffusion_head: bool = True, 114 | use_acoustic_connector: bool = True, use_semantic_connector: bool = True): 115 | """Configure LoRA settings and validate the path""" 116 | 117 | # Handle "None" selection 118 | if lora_name == "None": 119 | logger.info("No LoRA selected, using base model") 120 | return ({ 121 | "path": None, 122 | "llm_strength": llm_strength, 123 | "use_llm": use_llm, 124 | "use_diffusion_head": use_diffusion_head, 125 | "use_acoustic_connector": use_acoustic_connector, 126 | "use_semantic_connector": use_semantic_connector 127 | },) 128 | 129 | try: 130 | import folder_paths 131 | 132 | # Build full path to the LoRA folder 133 | models_dir = folder_paths.get_folder_paths("checkpoints")[0] 134 | loras_dir = os.path.join(os.path.dirname(models_dir), "vibevoice", "loras") 135 | lora_path = os.path.join(loras_dir, lora_name) 136 | 137 | # Validate the path exists 138 | if not os.path.exists(lora_path): 139 | logger.error(f"LoRA path does not exist: {lora_path}") 140 | raise Exception(f"LoRA folder not found: {lora_name}") 141 | 142 | if not os.path.isdir(lora_path): 143 | logger.error(f"LoRA path is not a directory: {lora_path}") 144 | raise Exception(f"LoRA path must be a directory: {lora_name}") 145 | 146 | # Check for required files 147 | adapter_config = os.path.join(lora_path, "adapter_config.json") 148 | adapter_model_st = os.path.join(lora_path, "adapter_model.safetensors") 149 | adapter_model_bin = os.path.join(lora_path, "adapter_model.bin") 150 | 151 | if not os.path.exists(adapter_config): 152 | logger.warning(f"adapter_config.json not found in {lora_name}") 153 | 154 | if not os.path.exists(adapter_model_st) and not os.path.exists(adapter_model_bin): 155 | logger.warning(f"No adapter model file found in {lora_name}") 156 | logger.warning("Expected: adapter_model.safetensors or adapter_model.bin") 157 | 158 | logger.info(f"LoRA configured: {lora_name} ({lora_path})") 159 | 160 | # Check for optional components 161 | components_found = [] 162 | diffusion_head_path = os.path.join(lora_path, "diffusion_head") 163 | acoustic_connector_path = os.path.join(lora_path, "acoustic_connector") 164 | semantic_connector_path = os.path.join(lora_path, "semantic_connector") 165 | 166 | if os.path.exists(diffusion_head_path): 167 | components_found.append("diffusion_head") 168 | if os.path.exists(acoustic_connector_path): 169 | components_found.append("acoustic_connector") 170 | if os.path.exists(semantic_connector_path): 171 | components_found.append("semantic_connector") 172 | 173 | if components_found: 174 | logger.info(f"Additional LoRA components found: {', '.join(components_found)}") 175 | 176 | # Create configuration dictionary 177 | lora_config = { 178 | "path": lora_path, 179 | "llm_strength": llm_strength, 180 | "use_llm": use_llm, 181 | "use_diffusion_head": use_diffusion_head, 182 | "use_acoustic_connector": use_acoustic_connector, 183 | "use_semantic_connector": use_semantic_connector 184 | } 185 | 186 | # Log configuration 187 | enabled_components = [] 188 | if use_llm: 189 | enabled_components.append(f"LLM (strength: {llm_strength})") 190 | if use_diffusion_head: 191 | enabled_components.append("Diffusion Head") 192 | if use_acoustic_connector: 193 | enabled_components.append("Acoustic Connector") 194 | if use_semantic_connector: 195 | enabled_components.append("Semantic Connector") 196 | 
197 | if enabled_components: 198 | logger.info(f"LoRA components enabled: {', '.join(enabled_components)}") 199 | else: 200 | logger.warning("All LoRA components are disabled") 201 | 202 | return (lora_config,) 203 | 204 | except ImportError: 205 | logger.error("Could not import folder_paths from ComfyUI") 206 | raise Exception("Failed to access ComfyUI folders") 207 | except Exception as e: 208 | logger.error(f"Error configuring LoRA: {e}") 209 | raise 210 | 211 | @classmethod 212 | def IS_CHANGED(cls, lora_name: str = "None", **kwargs): 213 | """Cache key for ComfyUI - includes all parameters""" 214 | return f"{lora_name}_{kwargs.get('llm_strength', 1.0)}_{kwargs.get('use_llm', True)}_{kwargs.get('use_diffusion_head', True)}_{kwargs.get('use_acoustic_connector', True)}_{kwargs.get('use_semantic_connector', True)}" -------------------------------------------------------------------------------- /vvembed/modular/modular_vibevoice_diffusion_head.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Optional, Tuple, Union 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from transformers.models.auto import AutoModel 9 | from transformers.modeling_utils import PreTrainedModel 10 | # from transformers.modeling_layers import GradientCheckpointingLayer 11 | from transformers.activations import ACT2FN 12 | from transformers.utils import logging 13 | 14 | from .configuration_vibevoice import VibeVoiceDiffusionHeadConfig 15 | 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | class RMSNorm(nn.Module): 21 | def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine=True, memory_efficient=False): 22 | super().__init__() 23 | self.dim = dim 24 | self.eps = eps 25 | self.elementwise_affine = elementwise_affine 26 | if self.elementwise_affine: 27 | self.weight = nn.Parameter(torch.ones(dim)) 28 | else: 29 | self.register_parameter('weight', None) 30 | 31 | def _norm(self, x): 32 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 33 | 34 | def forward(self, x): 35 | output = self._norm(x.float()).type_as(x) 36 | if self.weight is not None: 37 | output = output * self.weight 38 | return output 39 | 40 | def extra_repr(self) -> str: 41 | return f'dim={self.dim}, eps={self.eps}, elementwise_affine={self.elementwise_affine}' 42 | 43 | def modulate(x, shift, scale): 44 | """Apply modulation to input tensor.""" 45 | return x * (1 + scale) + shift 46 | 47 | 48 | class TimestepEmbedder(nn.Module): 49 | """ 50 | Embeds scalar timesteps into vector representations. 51 | 52 | Args: 53 | hidden_size (`int`): Size of the output embedding 54 | frequency_embedding_size (`int`, optional): Size of the intermediate frequency embedding 55 | """ 56 | def __init__(self, hidden_size, frequency_embedding_size=256): 57 | super().__init__() 58 | self.mlp = nn.Sequential( 59 | nn.Linear(frequency_embedding_size, hidden_size, bias=False), 60 | # nn.SiLU(), 61 | ACT2FN['silu'], 62 | nn.Linear(hidden_size, hidden_size, bias=False), 63 | ) 64 | self.frequency_embedding_size = frequency_embedding_size 65 | 66 | @staticmethod 67 | def timestep_embedding(t, dim, max_period=10000): 68 | """ 69 | Create sinusoidal timestep embeddings. 70 | 71 | Args: 72 | t (`torch.Tensor`): A 1-D Tensor of N indices, one per batch element. 73 | These may be fractional. 74 | dim (`int`): The dimension of the output. 75 | max_period (`int`, optional): Controls the minimum frequency of the embeddings. 
76 | 77 | Returns: 78 | `torch.Tensor`: An [N, D] Tensor of positional embeddings. 79 | """ 80 | half = dim // 2 81 | freqs = torch.exp( 82 | -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half 83 | ).to(t.device) 84 | args = t[:, None].float() * freqs[None] 85 | embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) 86 | if dim % 2: 87 | embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) 88 | return embedding.to(t.dtype) 89 | 90 | def forward(self, t): 91 | t_freq = self.timestep_embedding(t, self.frequency_embedding_size) 92 | t_emb = self.mlp(t_freq) 93 | return t_emb 94 | 95 | 96 | class FeedForwardNetwork(nn.Module): 97 | """ 98 | Standard feed-forward network with SwiGLU activation. 99 | 100 | Args: 101 | embed_dim (`int`): Input dimension 102 | ffn_dim (`int`): Hidden dimension 103 | """ 104 | def __init__( 105 | self, 106 | embed_dim, 107 | ffn_dim, 108 | ): 109 | super().__init__() 110 | self.embed_dim = embed_dim 111 | self.gate_proj = nn.Linear(self.embed_dim, ffn_dim, bias=False) 112 | self.up_proj = nn.Linear(self.embed_dim, ffn_dim, bias=False) 113 | self.down_proj = nn.Linear(ffn_dim, self.embed_dim, bias=False) 114 | self.act_fn = ACT2FN['silu'] # Using SiLU as the activation function 115 | 116 | def forward(self, x): 117 | gate = self.gate_proj(x) 118 | up = self.up_proj(x) 119 | 120 | # SwiGLU activation 121 | # gate = F.silu(gate) 122 | gate = self.act_fn(gate) 123 | return self.down_proj(gate * up) 124 | 125 | 126 | class HeadLayer(nn.Module): 127 | """ 128 | A layer in the diffusion head. 129 | 130 | Args: 131 | embed_dim (`int`): Input dimension 132 | ffn_dim (`int`): Hidden dimension 133 | cond_dim (`int`): Condition embedding dimension 134 | norm_eps (`float`, optional): Epsilon for normalization 135 | """ 136 | def __init__( 137 | self, 138 | embed_dim, 139 | ffn_dim, 140 | cond_dim, 141 | norm_eps=1e-5, 142 | ): 143 | super().__init__() 144 | self.embed_dim = embed_dim 145 | self.cond_dim = cond_dim 146 | self.ffn_dim = ffn_dim 147 | self.ffn = FeedForwardNetwork( 148 | self.embed_dim, 149 | self.ffn_dim, 150 | ) 151 | self.norm = RMSNorm(self.embed_dim, eps=norm_eps) 152 | self.adaLN_modulation = nn.Sequential( 153 | # nn.SiLU(), 154 | ACT2FN['silu'], 155 | nn.Linear(cond_dim, 3 * self.embed_dim, bias=False) 156 | ) 157 | 158 | def forward(self, x, c): 159 | shift_ffn, scale_ffn, gate_ffn = self.adaLN_modulation(c).chunk(3, dim=-1) 160 | x = x + gate_ffn * self.ffn(modulate(self.norm(x), shift_ffn, scale_ffn)) 161 | return x 162 | 163 | 164 | class FinalLayer(nn.Module): 165 | """ 166 | Final layer in the diffusion head. 
167 | 168 | Args: 169 | hidden_size (`int`): Input dimension 170 | output_size (`int`): Output dimension 171 | cond_size (`int`): Condition embedding dimension 172 | norm_eps (`float`, optional): Epsilon for normalization 173 | """ 174 | def __init__(self, hidden_size, output_size, cond_size, norm_eps=1e-5): 175 | super().__init__() 176 | self.norm_final = RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=False) 177 | self.linear = nn.Linear(hidden_size, output_size, bias=False) 178 | self.adaLN_modulation = nn.Sequential( 179 | # nn.SiLU(), 180 | ACT2FN['silu'], 181 | nn.Linear(cond_size, 2 * hidden_size, bias=False) 182 | ) 183 | 184 | def forward(self, x, c): 185 | shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1) 186 | x = modulate(self.norm_final(x), shift, scale) 187 | x = self.linear(x) 188 | return x 189 | 190 | 191 | class VibeVoiceDiffusionHead(PreTrainedModel): 192 | """ 193 | Diffusion head model for vibevoice. 194 | 195 | Args: 196 | config (`VibeVoiceDiffusionHeadConfig`): Model configuration 197 | latent_size (`int`, optional): Size of the latent space. If not provided, uses `config.latent_size`. 198 | """ 199 | config_class = VibeVoiceDiffusionHeadConfig 200 | supports_gradient_checkpointing = True 201 | _supports_flash_attn_2 = True 202 | _supports_sdpa = True 203 | 204 | def __init__( 205 | self, 206 | config, 207 | ): 208 | super().__init__(config) 209 | self.config = config 210 | self.cond_dim = config.hidden_size 211 | latent_size = config.latent_size 212 | 213 | self.noisy_images_proj = nn.Linear(latent_size, config.hidden_size, bias=False) 214 | self.cond_proj = nn.Linear(config.hidden_size, self.cond_dim, bias=False) 215 | self.t_embedder = TimestepEmbedder(self.cond_dim) 216 | 217 | ffn_dim = int(config.hidden_size * config.head_ffn_ratio) 218 | 219 | # Create the intermediate layers 220 | self.layers = nn.ModuleList([ 221 | HeadLayer( 222 | embed_dim=config.hidden_size, 223 | ffn_dim=ffn_dim, 224 | cond_dim=self.cond_dim, 225 | norm_eps=config.rms_norm_eps 226 | ) 227 | for _ in range(config.head_layers) 228 | ]) 229 | 230 | # Final layer for output 231 | self.final_layer = FinalLayer( 232 | hidden_size=config.hidden_size, 233 | output_size=latent_size, 234 | cond_size=self.cond_dim, 235 | norm_eps=config.rms_norm_eps 236 | ) 237 | 238 | self.initialize_weights() 239 | 240 | def initialize_weights(self): 241 | """Initialize the weights of the model.""" 242 | # Initialize timestep embedder 243 | nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02) 244 | nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02) 245 | 246 | # Zero-out adaLN modulation layers 247 | for layer in self.layers: 248 | nn.init.constant_(layer.adaLN_modulation[-1].weight, 0) 249 | 250 | # Zero-out output layers 251 | nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0) 252 | nn.init.constant_(self.final_layer.linear.weight, 0) 253 | 254 | def forward( 255 | self, 256 | noisy_images, 257 | timesteps, 258 | condition, 259 | ): 260 | """ 261 | Forward pass of the prediction head. 
262 | 263 | Args: 264 | noisy_images (`torch.Tensor`): Noisy images/latents to denoise 265 | timesteps (`torch.Tensor`): Timesteps for diffusion 266 | condition (`torch.Tensor`): Conditioning information 267 | 268 | Returns: 269 | `torch.Tensor`: The predicted noise/velocity 270 | """ 271 | x = self.noisy_images_proj(noisy_images) 272 | t = self.t_embedder(timesteps) 273 | condition = self.cond_proj(condition) 274 | c = condition + t 275 | 276 | for layer in self.layers: 277 | x = layer(x, c) 278 | 279 | x = self.final_layer(x, c) 280 | return x 281 | 282 | 283 | AutoModel.register(VibeVoiceDiffusionHeadConfig, VibeVoiceDiffusionHead) 284 | 285 | __all__ = [ 286 | "VibeVoiceDiffusionHead", 287 | ] -------------------------------------------------------------------------------- /vvembed/modular/streamer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import torch 4 | 5 | import asyncio 6 | from queue import Queue 7 | from typing import TYPE_CHECKING, Optional 8 | 9 | 10 | from transformers.generation import BaseStreamer 11 | 12 | 13 | class AudioStreamer(BaseStreamer): 14 | """ 15 | Audio streamer that stores audio chunks in queues for each sample in the batch. 16 | This allows streaming audio generation for multiple samples simultaneously. 17 | 18 | Parameters: 19 | batch_size (`int`): 20 | The batch size for generation 21 | stop_signal (`any`, *optional*): 22 | The signal to put in the queue when generation ends. Defaults to None. 23 | timeout (`float`, *optional*): 24 | The timeout for the audio queue. If `None`, the queue will block indefinitely. 25 | """ 26 | 27 | def __init__( 28 | self, 29 | batch_size: int, 30 | stop_signal: Optional[any] = None, 31 | timeout: Optional[float] = None, 32 | ): 33 | self.batch_size = batch_size 34 | self.stop_signal = stop_signal 35 | self.timeout = timeout 36 | 37 | # Create a queue for each sample in the batch 38 | self.audio_queues = [Queue() for _ in range(batch_size)] 39 | self.finished_flags = [False for _ in range(batch_size)] 40 | self.sample_indices_map = {} # Maps from sample index to queue index 41 | 42 | def put(self, audio_chunks: torch.Tensor, sample_indices: torch.Tensor): 43 | """ 44 | Receives audio chunks and puts them in the appropriate queues. 45 | 46 | Args: 47 | audio_chunks: Tensor of shape (num_samples, ...) containing audio chunks 48 | sample_indices: Tensor indicating which samples these chunks belong to 49 | """ 50 | for i, sample_idx in enumerate(sample_indices): 51 | idx = sample_idx.item() 52 | if idx < self.batch_size and not self.finished_flags[idx]: 53 | # Convert to numpy or keep as tensor based on preference 54 | audio_chunk = audio_chunks[i].detach().cpu() 55 | self.audio_queues[idx].put(audio_chunk, timeout=self.timeout) 56 | 57 | def end(self, sample_indices: Optional[torch.Tensor] = None): 58 | """ 59 | Signals the end of generation for specified samples or all samples. 60 | 61 | Args: 62 | sample_indices: Optional tensor of sample indices to end. If None, ends all. 
63 | """ 64 | if sample_indices is None: 65 | # End all samples 66 | for idx in range(self.batch_size): 67 | if not self.finished_flags[idx]: 68 | self.audio_queues[idx].put(self.stop_signal, timeout=self.timeout) 69 | self.finished_flags[idx] = True 70 | else: 71 | # End specific samples 72 | for sample_idx in sample_indices: 73 | idx = sample_idx.item() if torch.is_tensor(sample_idx) else sample_idx 74 | if idx < self.batch_size and not self.finished_flags[idx]: 75 | self.audio_queues[idx].put(self.stop_signal, timeout=self.timeout) 76 | self.finished_flags[idx] = True 77 | 78 | def __iter__(self): 79 | """Returns an iterator over the batch of audio streams.""" 80 | return AudioBatchIterator(self) 81 | 82 | def get_stream(self, sample_idx: int): 83 | """Get the audio stream for a specific sample.""" 84 | if sample_idx >= self.batch_size: 85 | raise ValueError(f"Sample index {sample_idx} exceeds batch size {self.batch_size}") 86 | return AudioSampleIterator(self, sample_idx) 87 | 88 | 89 | class AudioSampleIterator: 90 | """Iterator for a single audio stream from the batch.""" 91 | 92 | def __init__(self, streamer: AudioStreamer, sample_idx: int): 93 | self.streamer = streamer 94 | self.sample_idx = sample_idx 95 | 96 | def __iter__(self): 97 | return self 98 | 99 | def __next__(self): 100 | value = self.streamer.audio_queues[self.sample_idx].get(timeout=self.streamer.timeout) 101 | if value == self.streamer.stop_signal: 102 | raise StopIteration() 103 | return value 104 | 105 | 106 | class AudioBatchIterator: 107 | """Iterator that yields audio chunks for all samples in the batch.""" 108 | 109 | def __init__(self, streamer: AudioStreamer): 110 | self.streamer = streamer 111 | self.active_samples = set(range(streamer.batch_size)) 112 | 113 | def __iter__(self): 114 | return self 115 | 116 | def __next__(self): 117 | if not self.active_samples: 118 | raise StopIteration() 119 | 120 | batch_chunks = {} 121 | samples_to_remove = set() 122 | 123 | # Try to get chunks from all active samples 124 | for idx in self.active_samples: 125 | try: 126 | value = self.streamer.audio_queues[idx].get(block=False) 127 | if value == self.streamer.stop_signal: 128 | samples_to_remove.add(idx) 129 | else: 130 | batch_chunks[idx] = value 131 | except: 132 | # Queue is empty for this sample, skip it this iteration 133 | pass 134 | 135 | # Remove finished samples 136 | self.active_samples -= samples_to_remove 137 | 138 | if batch_chunks: 139 | return batch_chunks 140 | elif self.active_samples: 141 | # If no chunks were ready but we still have active samples, 142 | # wait a bit and try again 143 | import time 144 | time.sleep(0.01) 145 | return self.__next__() 146 | else: 147 | raise StopIteration() 148 | 149 | 150 | class AsyncAudioStreamer(AudioStreamer): 151 | """ 152 | Async version of AudioStreamer for use in async contexts. 
153 | """ 154 | 155 | def __init__( 156 | self, 157 | batch_size: int, 158 | stop_signal: Optional[any] = None, 159 | timeout: Optional[float] = None, 160 | ): 161 | super().__init__(batch_size, stop_signal, timeout) 162 | # Replace regular queues with async queues 163 | self.audio_queues = [asyncio.Queue() for _ in range(batch_size)] 164 | self.loop = asyncio.get_running_loop() 165 | 166 | def put(self, audio_chunks: torch.Tensor, sample_indices: torch.Tensor): 167 | """Put audio chunks in the appropriate async queues.""" 168 | for i, sample_idx in enumerate(sample_indices): 169 | idx = sample_idx.item() 170 | if idx < self.batch_size and not self.finished_flags[idx]: 171 | audio_chunk = audio_chunks[i].detach().cpu() 172 | self.loop.call_soon_threadsafe( 173 | self.audio_queues[idx].put_nowait, audio_chunk 174 | ) 175 | 176 | def end(self, sample_indices: Optional[torch.Tensor] = None): 177 | """Signal the end of generation for specified samples.""" 178 | if sample_indices is None: 179 | indices_to_end = range(self.batch_size) 180 | else: 181 | indices_to_end = [s.item() if torch.is_tensor(s) else s for s in sample_indices] 182 | 183 | for idx in indices_to_end: 184 | if idx < self.batch_size and not self.finished_flags[idx]: 185 | self.loop.call_soon_threadsafe( 186 | self.audio_queues[idx].put_nowait, self.stop_signal 187 | ) 188 | self.finished_flags[idx] = True 189 | 190 | async def get_stream(self, sample_idx: int): 191 | """Get async iterator for a specific sample's audio stream.""" 192 | if sample_idx >= self.batch_size: 193 | raise ValueError(f"Sample index {sample_idx} exceeds batch size {self.batch_size}") 194 | 195 | while True: 196 | value = await self.audio_queues[sample_idx].get() 197 | if value == self.stop_signal: 198 | break 199 | yield value 200 | 201 | def __aiter__(self): 202 | """Returns an async iterator over all audio streams.""" 203 | return AsyncAudioBatchIterator(self) 204 | 205 | 206 | class AsyncAudioBatchIterator: 207 | """Async iterator for batch audio streaming.""" 208 | 209 | def __init__(self, streamer: AsyncAudioStreamer): 210 | self.streamer = streamer 211 | self.active_samples = set(range(streamer.batch_size)) 212 | 213 | def __aiter__(self): 214 | return self 215 | 216 | async def __anext__(self): 217 | if not self.active_samples: 218 | raise StopAsyncIteration() 219 | 220 | batch_chunks = {} 221 | samples_to_remove = set() 222 | 223 | # Create tasks for all active samples 224 | tasks = { 225 | idx: asyncio.create_task(self._get_chunk(idx)) 226 | for idx in self.active_samples 227 | } 228 | 229 | # Wait for at least one chunk to be ready 230 | done, pending = await asyncio.wait( 231 | tasks.values(), 232 | return_when=asyncio.FIRST_COMPLETED, 233 | timeout=self.streamer.timeout 234 | ) 235 | 236 | # Cancel pending tasks 237 | for task in pending: 238 | task.cancel() 239 | 240 | # Process completed tasks 241 | for idx, task in tasks.items(): 242 | if task in done: 243 | try: 244 | value = await task 245 | if value == self.streamer.stop_signal: 246 | samples_to_remove.add(idx) 247 | else: 248 | batch_chunks[idx] = value 249 | except asyncio.CancelledError: 250 | pass 251 | 252 | self.active_samples -= samples_to_remove 253 | 254 | if batch_chunks: 255 | return batch_chunks 256 | elif self.active_samples: 257 | # Try again if we still have active samples 258 | return await self.__anext__() 259 | else: 260 | raise StopAsyncIteration() 261 | 262 | async def _get_chunk(self, idx): 263 | """Helper to get a chunk from a specific queue.""" 264 | return await 
self.streamer.audio_queues[idx].get() -------------------------------------------------------------------------------- /vvembed/modular/configuration_vibevoice.py: -------------------------------------------------------------------------------- 1 | # Original code by Microsoft 2 | # updated by Fabio Sarracino - Enemyx-net 3 | 4 | """ VibeVoice_AcousticTokenizer model configuration""" 5 | 6 | from typing import Dict, List, Optional, Tuple 7 | 8 | from transformers.configuration_utils import PretrainedConfig 9 | from transformers.utils import logging 10 | 11 | from transformers.models.qwen2.configuration_qwen2 import Qwen2Config 12 | 13 | logger = logging.get_logger(__name__) 14 | 15 | # to be improved... 16 | 17 | 18 | class VibeVoiceAcousticTokenizerConfig(PretrainedConfig): 19 | model_type = "vibevoice_acoustic_tokenizer" 20 | 21 | def __init__( 22 | self, 23 | channels: int = 1, 24 | corpus_normalize: float = 0.0, 25 | causal: bool = True, 26 | vae_dim: int = 64, 27 | fix_std: float = 0.5, 28 | std_dist_type: str = 'gaussian', 29 | # common 30 | mixer_layer: str = 'depthwise_conv', 31 | conv_norm: str = 'none', 32 | pad_mode: str = 'constant', 33 | disable_last_norm: bool = True, 34 | layernorm: str = 'RMSNorm', 35 | layernorm_eps: float = 1e-5, 36 | layernorm_elementwise_affine: bool = True, 37 | conv_bias: bool = True, 38 | layer_scale_init_value: float = 1e-6, 39 | weight_init_value: float = 1e-2, 40 | # encoder specific 41 | encoder_n_filters: int = 32, 42 | encoder_ratios: Optional[List[int]] = [8,5,5,4,2,2], 43 | encoder_depths: str = "3-3-3-3-3-3-8", 44 | # decoder specific 45 | decoder_n_filters: int = 32, 46 | decoder_ratios: Optional[List[int]] = None, # if None, same as encoder 47 | decoder_depths: Optional[str] = None, 48 | **kwargs 49 | ): 50 | super().__init__(**kwargs) 51 | self.channels = channels 52 | self.corpus_normalize = corpus_normalize 53 | self.causal = causal 54 | self.vae_dim = vae_dim 55 | self.fix_std = fix_std 56 | self.std_dist_type = std_dist_type 57 | 58 | # common parameters 59 | self.conv_norm = conv_norm 60 | self.pad_mode = pad_mode 61 | self.layernorm_eps = layernorm_eps 62 | self.disable_last_norm = disable_last_norm 63 | self.layernorm = layernorm 64 | self.layernorm_elementwise_affine = layernorm_elementwise_affine 65 | self.conv_bias = conv_bias 66 | self.layer_scale_init_value = layer_scale_init_value 67 | self.weight_init_value = weight_init_value 68 | self.mixer_layer = mixer_layer 69 | 70 | # encoder specific parameters 71 | self.encoder_n_filters = encoder_n_filters 72 | self.encoder_ratios = encoder_ratios 73 | self.encoder_depths = encoder_depths 74 | 75 | # decoder specific parameters 76 | self.decoder_ratios = decoder_ratios if decoder_ratios is not None else encoder_ratios 77 | self.decoder_n_filters = decoder_n_filters 78 | self.decoder_depths = decoder_depths 79 | 80 | 81 | class VibeVoiceSemanticTokenizerConfig(PretrainedConfig): 82 | model_type = "vibevoice_semantic_tokenizer" 83 | 84 | def __init__( 85 | self, 86 | channels: int = 1, 87 | corpus_normalize: float = 0.0, 88 | causal: bool = True, 89 | vae_dim: int = 64, 90 | fix_std: float = 0, 91 | std_dist_type: str = 'none', 92 | # common 93 | mixer_layer: str = 'depthwise_conv', 94 | conv_norm: str = 'none', 95 | pad_mode: str = 'constant', 96 | disable_last_norm: bool = True, 97 | layernorm: str = 'RMSNorm', 98 | layernorm_eps: float = 1e-5, 99 | layernorm_elementwise_affine: bool = True, 100 | conv_bias: bool = True, 101 | layer_scale_init_value: float = 1e-6, 102 | 
weight_init_value: float = 1e-2, 103 | # encoder specific 104 | encoder_n_filters: int = 32, 105 | encoder_ratios: Optional[List[int]] = [8,5,5,4,2,2], 106 | encoder_depths: str = "3-3-3-3-3-3-8", 107 | **kwargs 108 | ): 109 | super().__init__(**kwargs) 110 | self.channels = channels 111 | self.corpus_normalize = corpus_normalize 112 | self.causal = causal 113 | self.vae_dim = vae_dim 114 | self.fix_std = fix_std 115 | self.std_dist_type = std_dist_type 116 | 117 | # common parameters 118 | self.conv_norm = conv_norm 119 | self.pad_mode = pad_mode 120 | self.layernorm_eps = layernorm_eps 121 | self.disable_last_norm = disable_last_norm 122 | self.layernorm = layernorm 123 | self.layernorm_elementwise_affine = layernorm_elementwise_affine 124 | self.conv_bias = conv_bias 125 | self.layer_scale_init_value = layer_scale_init_value 126 | self.weight_init_value = weight_init_value 127 | self.mixer_layer = mixer_layer 128 | 129 | # encoder specific parameters 130 | self.encoder_n_filters = encoder_n_filters 131 | self.encoder_ratios = encoder_ratios 132 | self.encoder_depths = encoder_depths 133 | 134 | 135 | class VibeVoiceDiffusionHeadConfig(PretrainedConfig): 136 | model_type = "vibevoice_diffusion_head" 137 | 138 | def __init__( 139 | self, 140 | hidden_size=768, 141 | head_layers=4, 142 | head_ffn_ratio=3.0, 143 | rms_norm_eps=1e-5, 144 | latent_size=64, 145 | speech_vae_dim=None, 146 | prediction_type="v_prediction", 147 | diffusion_type="ddpm", 148 | ddpm_num_steps=1000, 149 | ddpm_num_inference_steps=20, 150 | ddpm_beta_schedule="cosine", 151 | ddpm_batch_mul=4, 152 | **kwargs 153 | ): 154 | self.hidden_size = hidden_size 155 | self.head_layers = head_layers 156 | self.head_ffn_ratio = head_ffn_ratio 157 | self.rms_norm_eps = rms_norm_eps 158 | self.latent_size = latent_size 159 | self.speech_vae_dim = speech_vae_dim 160 | self.prediction_type = prediction_type 161 | self.diffusion_type = diffusion_type 162 | self.ddpm_num_steps = ddpm_num_steps 163 | self.ddpm_num_inference_steps = ddpm_num_inference_steps 164 | self.ddpm_beta_schedule = ddpm_beta_schedule 165 | self.ddpm_batch_mul = ddpm_batch_mul 166 | 167 | super().__init__(**kwargs) 168 | 169 | class VibeVoiceConfig(PretrainedConfig): 170 | model_type = "vibevoice" 171 | is_composition = True 172 | sub_configs = { 173 | "acoustic_tokenizer_config": VibeVoiceAcousticTokenizerConfig, 174 | "semantic_tokenizer_config": VibeVoiceSemanticTokenizerConfig, 175 | "decoder_config": Qwen2Config, 176 | "diffusion_head_config": VibeVoiceDiffusionHeadConfig, 177 | } 178 | # keys_to_ignore_at_inference = ["past_key_values"] 179 | # Default tensor parallel plan for base model `Qwen2` 180 | base_model_tp_plan = { 181 | "layers.*.self_attn.q_proj": "colwise", 182 | "layers.*.self_attn.k_proj": "colwise", 183 | "layers.*.self_attn.v_proj": "colwise", 184 | "layers.*.self_attn.o_proj": "rowwise", 185 | "layers.*.mlp.gate_proj": "colwise", 186 | "layers.*.mlp.up_proj": "colwise", 187 | "layers.*.mlp.down_proj": "rowwise", 188 | } 189 | 190 | def __init__( 191 | self, 192 | acoustic_tokenizer_config=None, 193 | semantic_tokenizer_config=None, 194 | decoder_config=None, 195 | diffusion_head_config=None, 196 | **kwargs 197 | ): 198 | 199 | # kwargs["_attn_implementation"] = "flash_attention_2" 200 | kwargs["_attn_implementation_autoset"] = False 201 | 202 | if acoustic_tokenizer_config is None: 203 | self.acoustic_tokenizer_config = self.sub_configs["acoustic_tokenizer_config"]() 204 | elif isinstance(acoustic_tokenizer_config, dict): 205 | 
acoustic_tokenizer_config["model_type"] = "vibevoice_acoustic_tokenizer" 206 | self.acoustic_tokenizer_config = self.sub_configs["acoustic_tokenizer_config"](**acoustic_tokenizer_config) 207 | elif isinstance(acoustic_tokenizer_config, VibeVoiceAcousticTokenizerConfig): 208 | # If an instance of the config class is provided 209 | self.acoustic_tokenizer_config = acoustic_tokenizer_config 210 | 211 | if semantic_tokenizer_config is None: 212 | self.semantic_tokenizer_config = self.sub_configs["semantic_tokenizer_config"]() 213 | elif isinstance(semantic_tokenizer_config, dict): 214 | semantic_tokenizer_config["model_type"] = "vibevoice_semantic_tokenizer" 215 | self.semantic_tokenizer_config = self.sub_configs["semantic_tokenizer_config"](**semantic_tokenizer_config) 216 | elif isinstance(semantic_tokenizer_config, VibeVoiceSemanticTokenizerConfig): 217 | # If an instance of the config class is provided 218 | self.semantic_tokenizer_config = semantic_tokenizer_config 219 | 220 | if decoder_config is None: 221 | self.decoder_config = self.sub_configs["decoder_config"]() 222 | elif isinstance(decoder_config, dict): 223 | # If a dictionary is provided, instantiate the config class with it 224 | # self.decoder_config = self.sub_configs["decoder_config"](**decoder_config) 225 | if decoder_config.get("model_type", '') == "qwen2": 226 | self.decoder_config = Qwen2Config(**decoder_config) 227 | else: 228 | raise ValueError(f"Unsupported decoder model type: {decoder_config.get('model_type', '')}") 229 | elif isinstance(decoder_config, (Qwen2Config,)): 230 | # If an instance of the config class is provided 231 | self.decoder_config = decoder_config 232 | 233 | if diffusion_head_config is None: 234 | self.diffusion_head_config = self.sub_configs["diffusion_head_config"]() 235 | elif isinstance(diffusion_head_config, dict): 236 | diffusion_head_config["model_type"] = "vibevoice_diffusion_head" 237 | self.diffusion_head_config = self.sub_configs["diffusion_head_config"](**diffusion_head_config) 238 | elif isinstance(diffusion_head_config, VibeVoiceDiffusionHeadConfig): 239 | # If an instance of the config class is provided 240 | self.diffusion_head_config = diffusion_head_config 241 | 242 | # other parameters 243 | self.acoustic_vae_dim = getattr(self.acoustic_tokenizer_config, 'vae_dim', 64) 244 | self.semantic_vae_dim = getattr(self.semantic_tokenizer_config, 'vae_dim', 128) 245 | 246 | # Add attributes required by newer transformers versions from decoder_config 247 | # These are used by GenerationMixin in newer versions 248 | if hasattr(self.decoder_config, 'num_hidden_layers'): 249 | self.num_hidden_layers = self.decoder_config.num_hidden_layers 250 | if hasattr(self.decoder_config, 'vocab_size'): 251 | self.vocab_size = self.decoder_config.vocab_size 252 | if hasattr(self.decoder_config, 'hidden_size'): 253 | self.hidden_size = self.decoder_config.hidden_size 254 | if hasattr(self.decoder_config, 'num_attention_heads'): 255 | self.num_attention_heads = self.decoder_config.num_attention_heads 256 | if hasattr(self.decoder_config, 'num_key_value_heads'): 257 | self.num_key_value_heads = self.decoder_config.num_key_value_heads 258 | if hasattr(self.decoder_config, 'intermediate_size'): 259 | self.intermediate_size = self.decoder_config.intermediate_size 260 | if hasattr(self.decoder_config, 'max_position_embeddings'): 261 | self.max_position_embeddings = self.decoder_config.max_position_embeddings 262 | 263 | super().__init__(**kwargs) 264 | 265 | __all__ = [ 266 | "VibeVoiceAcousticTokenizerConfig", 
267 | "VibeVoiceSemanticTokenizerConfig", 268 | "VibeVoiceDiffusionHeadConfig", 269 | "VibeVoiceConfig" 270 | ] -------------------------------------------------------------------------------- /nodes/single_speaker_node.py: -------------------------------------------------------------------------------- 1 | # Created by Fabio Sarracino 2 | 3 | import logging 4 | import os 5 | import tempfile 6 | import torch 7 | import numpy as np 8 | import re 9 | from typing import List, Optional 10 | 11 | from .base_vibevoice import BaseVibeVoiceNode, get_available_models 12 | 13 | # Setup logging 14 | logger = logging.getLogger("VibeVoice") 15 | 16 | class VibeVoiceSingleSpeakerNode(BaseVibeVoiceNode): 17 | def __init__(self): 18 | super().__init__() 19 | # Register this instance for memory management 20 | try: 21 | from .free_memory_node import VibeVoiceFreeMemoryNode 22 | VibeVoiceFreeMemoryNode.register_single_speaker(self) 23 | except: 24 | pass 25 | 26 | @classmethod 27 | def INPUT_TYPES(cls): 28 | # Get available models dynamically 29 | available_models = get_available_models() 30 | model_choices = [display_name for _, display_name in available_models] 31 | default_model = model_choices[0] if model_choices else "No models found" 32 | 33 | return { 34 | "required": { 35 | "text": ("STRING", { 36 | "multiline": True, 37 | "default": "Hello, this is a test of the VibeVoice text-to-speech system.", 38 | "tooltip": "Text to convert to speech. Gets disabled when connected to another node.", 39 | "forceInput": False, 40 | "dynamicPrompts": True 41 | }), 42 | "model": (model_choices if model_choices else ["No models found"], { 43 | "default": default_model, 44 | "tooltip": "Select a model from ComfyUI/models/vibevoice/ folder" 45 | }), 46 | "attention_type": (["auto", "eager", "sdpa", "flash_attention_2", "sage"], { 47 | "default": "auto", 48 | "tooltip": "Attention implementation. Auto selects the best available, eager is standard, sdpa is optimized PyTorch, flash_attention_2 requires compatible GPU, sage uses quantized attention for speedup (CUDA only)" 49 | }), 50 | "quantize_llm": (["full precision", "4bit", "8bit"], { 51 | "default": "full precision", 52 | "tooltip": "Dynamically quantize only the LLM component for non-quantized models. 4bit: major VRAM savings with minimal quality loss. 8bit: good balance of quality and memory usage. Full precision: original quality. Note: ignored for pre-quantized models. Requires CUDA GPU." 53 | }), 54 | "free_memory_after_generate": ("BOOLEAN", {"default": True, "tooltip": "Free model from memory after generation to save VRAM/RAM. Disable to keep model loaded for faster subsequent generations"}), 55 | "diffusion_steps": ("INT", {"default": 20, "min": 1, "max": 100, "step": 1, "tooltip": "Number of denoising steps. More steps = theoretically better quality but slower. Default: 20"}), 56 | "seed": ("INT", {"default": 42, "min": 0, "max": 2**32-1, "tooltip": "Random seed for generation. Default 42 is used in official examples"}), 57 | "cfg_scale": ("FLOAT", {"default": 1.3, "min": 0.5, "max": 3.5, "step": 0.05, "tooltip": "Classifier-free guidance scale (official default: 1.3)"}), 58 | "use_sampling": ("BOOLEAN", {"default": False, "tooltip": "Enable sampling mode. When False (default), uses deterministic generation like official examples"}), 59 | }, 60 | "optional": { 61 | "voice_to_clone": ("AUDIO", {"tooltip": "Optional: Reference voice to clone. 
If not provided, synthetic voice will be used."}), 62 | "lora": ("LORA_CONFIG", {"tooltip": "Optional: LoRA configuration from VibeVoice LoRA node"}), 63 | "temperature": ("FLOAT", {"default": 0.95, "min": 0.1, "max": 2.0, "step": 0.05, "tooltip": "Only used when sampling is enabled"}), 64 | "top_p": ("FLOAT", {"default": 0.95, "min": 0.1, "max": 1.0, "step": 0.05, "tooltip": "Only used when sampling is enabled"}), 65 | "max_words_per_chunk": ("INT", {"default": 250, "min": 100, "max": 500, "step": 50, "tooltip": "Maximum words per chunk for long texts. Lower values prevent speed issues but create more chunks."}), 66 | "voice_speed_factor": ("FLOAT", { 67 | "default": 1.0, 68 | "min": 0.8, 69 | "max": 1.2, 70 | "step": 0.01, 71 | "tooltip": "1.0 = normal speed, <1.0 = slower speed, >1.0 = faster speed" 72 | }), 73 | } 74 | } 75 | 76 | RETURN_TYPES = ("AUDIO",) 77 | RETURN_NAMES = ("audio",) 78 | FUNCTION = "generate_speech" 79 | CATEGORY = "VibeVoiceWrapper" 80 | DESCRIPTION = "Generate speech from text using Microsoft VibeVoice with optional voice cloning" 81 | 82 | def _prepare_voice_samples(self, speakers: list, voice_to_clone, voice_speed_factor: float = 1.0) -> List[np.ndarray]: 83 | """Prepare voice samples from input audio or create synthetic ones""" 84 | 85 | if voice_to_clone is not None: 86 | # Use the base class method to prepare audio with speed adjustment 87 | audio_np = self._prepare_audio_from_comfyui(voice_to_clone, speed_factor=voice_speed_factor) 88 | if audio_np is not None: 89 | return [audio_np] 90 | 91 | # Create synthetic voice samples for speakers 92 | voice_samples = [] 93 | for i, speaker in enumerate(speakers): 94 | voice_sample = self._create_synthetic_voice_sample(i) 95 | voice_samples.append(voice_sample) 96 | 97 | return voice_samples 98 | 99 | def generate_speech(self, text: str = "", model: str = "VibeVoice-1.5B", 100 | attention_type: str = "auto", quantize_llm: str = "full precision", 101 | free_memory_after_generate: bool = True, 102 | diffusion_steps: int = 20, seed: int = 42, cfg_scale: float = 1.3, 103 | use_sampling: bool = False, voice_to_clone=None, lora=None, 104 | temperature: float = 0.95, top_p: float = 0.95, 105 | max_words_per_chunk: int = 250, voice_speed_factor: float = 1.0): 106 | """Generate speech from text using VibeVoice""" 107 | 108 | try: 109 | # Use text directly (it now serves as both manual input and connection input) 110 | if text and text.strip(): 111 | final_text = text 112 | else: 113 | raise Exception("No text provided. 
Please enter text or connect from LoadTextFromFile node.") 114 | 115 | # Get the actual folder path for the selected model 116 | available_models = get_available_models() 117 | model_path = None 118 | for folder, display_name in available_models: 119 | if display_name == model: 120 | model_path = folder 121 | break 122 | 123 | if not model_path: 124 | raise Exception(f"Model '{model}' not found in models/vibevoice/") 125 | 126 | # Extract LoRA configuration if provided 127 | lora_path = None 128 | llm_lora_strength = 1.0 129 | if lora and isinstance(lora, dict): 130 | lora_path = lora.get("path", None) 131 | llm_lora_strength = lora.get("llm_strength", 1.0) 132 | 133 | # Set LoRA component flags based on configuration 134 | self.use_llm_lora = lora.get("use_llm", True) 135 | self.use_diffusion_head_lora = lora.get("use_diffusion_head", True) 136 | self.use_acoustic_connector_lora = lora.get("use_acoustic_connector", True) 137 | self.use_semantic_connector_lora = lora.get("use_semantic_connector", True) 138 | 139 | if lora_path: 140 | logger.info(f"Using LoRA from: {lora_path}") 141 | 142 | # Load model with optional LoRA 143 | self.load_model(model, model_path, attention_type, quantize_llm=quantize_llm, lora_path=lora_path) 144 | 145 | # For single speaker, we just use ["Speaker 1"] 146 | speakers = ["Speaker 1"] 147 | 148 | # Parse pause keywords from text 149 | segments = self._parse_pause_keywords(final_text) 150 | 151 | # Process segments 152 | all_audio_segments = [] 153 | voice_samples = None # Will be created on first text segment 154 | sample_rate = 24000 # VibeVoice uses 24kHz 155 | 156 | for seg_idx, (seg_type, seg_content) in enumerate(segments): 157 | if seg_type == 'pause': 158 | # Generate silence for pause 159 | duration_ms = seg_content 160 | logger.info(f"Adding {duration_ms}ms pause") 161 | silence_audio = self._generate_silence(duration_ms, sample_rate) 162 | all_audio_segments.append(silence_audio) 163 | 164 | elif seg_type == 'text': 165 | # Process text segment (with chunking if needed) 166 | word_count = len(seg_content.split()) 167 | 168 | if word_count > max_words_per_chunk: 169 | # Split long text into chunks 170 | logger.info(f"Text segment {seg_idx+1} has {word_count} words, splitting into chunks...") 171 | text_chunks = self._split_text_into_chunks(seg_content, max_words_per_chunk) 172 | 173 | for chunk_idx, chunk in enumerate(text_chunks): 174 | logger.info(f"Processing chunk {chunk_idx+1}/{len(text_chunks)} of segment {seg_idx+1}...") 175 | 176 | # Format chunk for VibeVoice 177 | formatted_text = self._format_text_for_vibevoice(chunk, speakers) 178 | 179 | # Create voice samples on first text segment 180 | if voice_samples is None: 181 | voice_samples = self._prepare_voice_samples(speakers, voice_to_clone, voice_speed_factor) 182 | 183 | # Generate audio for this chunk 184 | chunk_audio = self._generate_with_vibevoice( 185 | formatted_text, voice_samples, cfg_scale, 186 | seed, # Use same seed for voice consistency 187 | diffusion_steps, use_sampling, temperature, top_p, 188 | llm_lora_strength=llm_lora_strength 189 | ) 190 | 191 | all_audio_segments.append(chunk_audio) 192 | else: 193 | # Process as single chunk 194 | logger.info(f"Processing text segment {seg_idx+1} ({word_count} words)") 195 | 196 | # Format text for VibeVoice 197 | formatted_text = self._format_text_for_vibevoice(seg_content, speakers) 198 | 199 | # Create voice samples on first text segment 200 | if voice_samples is None: 201 | voice_samples = self._prepare_voice_samples(speakers, 
voice_to_clone, voice_speed_factor) 202 | 203 | # Generate audio 204 | segment_audio = self._generate_with_vibevoice( 205 | formatted_text, voice_samples, cfg_scale, seed, diffusion_steps, 206 | use_sampling, temperature, top_p, llm_lora_strength=llm_lora_strength 207 | ) 208 | 209 | all_audio_segments.append(segment_audio) 210 | 211 | # Concatenate all audio segments (including pauses) 212 | if all_audio_segments: 213 | logger.info(f"Concatenating {len(all_audio_segments)} audio segments (including pauses)...") 214 | 215 | # Extract waveforms from all segments 216 | waveforms = [] 217 | for audio_segment in all_audio_segments: 218 | if isinstance(audio_segment, dict) and "waveform" in audio_segment: 219 | waveforms.append(audio_segment["waveform"]) 220 | 221 | if waveforms: 222 | # Filter out None values if any 223 | valid_waveforms = [w for w in waveforms if w is not None] 224 | 225 | if valid_waveforms: 226 | # Concatenate along the time dimension (last dimension) 227 | combined_waveform = torch.cat(valid_waveforms, dim=-1) 228 | 229 | # Create final audio dict 230 | audio_dict = { 231 | "waveform": combined_waveform, 232 | "sample_rate": sample_rate 233 | } 234 | logger.info(f"Successfully generated audio with {len(segments)} segments") 235 | else: 236 | raise Exception("No valid audio waveforms generated") 237 | else: 238 | raise Exception("Failed to extract waveforms from audio segments") 239 | else: 240 | raise Exception("No audio segments generated") 241 | 242 | # Free memory if requested 243 | if free_memory_after_generate: 244 | self.free_memory() 245 | 246 | return (audio_dict,) 247 | 248 | except Exception as e: 249 | # Check if this is an interruption by the user 250 | import comfy.model_management as mm 251 | if isinstance(e, mm.InterruptProcessingException): 252 | # User interrupted - just log it and re-raise to stop the workflow 253 | logger.info("Generation interrupted by user") 254 | raise # Propagate the interruption to stop the workflow 255 | else: 256 | # Real error - show it 257 | logger.error(f"Single speaker speech generation failed: {str(e)}") 258 | raise Exception(f"Error generating speech: {str(e)}") 259 | 260 | @classmethod 261 | def IS_CHANGED(cls, text="", model="VibeVoice-1.5B", voice_to_clone=None, lora=None, **kwargs): 262 | """Cache key for ComfyUI""" 263 | voice_hash = hash(str(voice_to_clone)) if voice_to_clone else 0 264 | lora_hash = hash(str(lora)) if lora else 0 265 | return f"{hash(text)}_{model}_{voice_hash}_{lora_hash}_{kwargs.get('cfg_scale', 1.3)}_{kwargs.get('seed', 0)}" -------------------------------------------------------------------------------- /vvembed/processor/vibevoice_tokenizer_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Processor class for VibeVoice models. 3 | """ 4 | 5 | import os 6 | import json 7 | import warnings 8 | from typing import List, Optional, Union, Dict, Any 9 | 10 | import numpy as np 11 | import torch 12 | 13 | from transformers.feature_extraction_utils import FeatureExtractionMixin 14 | from transformers.utils import logging 15 | 16 | logger = logging.get_logger(__name__) 17 | 18 | 19 | class AudioNormalizer: 20 | """ 21 | Audio normalization class for VibeVoice tokenizer. 22 | 23 | This class provides audio normalization to ensure consistent input levels 24 | for the VibeVoice tokenizer while maintaining audio quality. 25 | """ 26 | 27 | def __init__(self, target_dB_FS: float = -25, eps: float = 1e-6): 28 | """ 29 | Initialize the audio normalizer. 
30 | 31 | Args: 32 | target_dB_FS (float): Target dB FS level for the audio. Default: -25 33 | eps (float): Small value to avoid division by zero. Default: 1e-6 34 | """ 35 | self.target_dB_FS = target_dB_FS 36 | self.eps = eps 37 | 38 | def tailor_dB_FS(self, audio: np.ndarray) -> tuple: 39 | """ 40 | Adjust the audio to the target dB FS level. 41 | 42 | Args: 43 | audio (np.ndarray): Input audio signal 44 | 45 | Returns: 46 | tuple: (normalized_audio, rms, scalar) 47 | """ 48 | rms = np.sqrt(np.mean(audio**2)) 49 | scalar = 10 ** (self.target_dB_FS / 20) / (rms + self.eps) 50 | normalized_audio = audio * scalar 51 | return normalized_audio, rms, scalar 52 | 53 | def avoid_clipping(self, audio: np.ndarray, scalar: Optional[float] = None) -> tuple: 54 | """ 55 | Avoid clipping by scaling down if necessary. 56 | 57 | Args: 58 | audio (np.ndarray): Input audio signal 59 | scalar (float, optional): Explicit scaling factor 60 | 61 | Returns: 62 | tuple: (normalized_audio, scalar) 63 | """ 64 | if scalar is None: 65 | max_val = np.max(np.abs(audio)) 66 | if max_val > 1.0: 67 | scalar = max_val + self.eps 68 | else: 69 | scalar = 1.0 70 | 71 | return audio / scalar, scalar 72 | 73 | def __call__(self, audio: np.ndarray) -> np.ndarray: 74 | """ 75 | Normalize the audio by adjusting to target dB FS and avoiding clipping. 76 | 77 | Args: 78 | audio (np.ndarray): Input audio signal 79 | 80 | Returns: 81 | np.ndarray: Normalized audio signal 82 | """ 83 | # First adjust to target dB FS 84 | audio, _, _ = self.tailor_dB_FS(audio) 85 | # Then avoid clipping 86 | audio, _ = self.avoid_clipping(audio) 87 | return audio 88 | 89 | 90 | # Change from ProcessorMixin to FeatureExtractionMixin which is designed for single components 91 | class VibeVoiceTokenizerProcessor(FeatureExtractionMixin): 92 | """ 93 | Processor for VibeVoice acoustic tokenizer models. 94 | 95 | This processor handles audio preprocessing for VibeVoice models, including: 96 | - Audio format conversion (stereo to mono) 97 | - Optional audio normalization 98 | - Streaming support for infinite-length audio 99 | 100 | Args: 101 | sampling_rate (int, optional): Expected sampling rate. Defaults to 24000. 102 | normalize_audio (bool, optional): Whether to normalize audio. Defaults to True. 103 | target_dB_FS (float, optional): Target dB FS for normalization. Defaults to -25. 104 | eps (float, optional): Small value for numerical stability. Defaults to 1e-6. 105 | """ 106 | model_input_names = ["input_features"] 107 | 108 | def __init__( 109 | self, 110 | sampling_rate: int = 24000, 111 | normalize_audio: bool = True, 112 | target_dB_FS: float = -25, 113 | eps: float = 1e-6, 114 | **kwargs, 115 | ): 116 | super().__init__(**kwargs) 117 | 118 | self.sampling_rate = sampling_rate 119 | self.normalize_audio = normalize_audio 120 | 121 | # Initialize audio normalizer if needed 122 | if self.normalize_audio: 123 | self.normalizer = AudioNormalizer(target_dB_FS=target_dB_FS, eps=eps) 124 | else: 125 | self.normalizer = None 126 | 127 | # Save config 128 | self.feature_extractor_dict = { 129 | "sampling_rate": sampling_rate, 130 | "normalize_audio": normalize_audio, 131 | "target_dB_FS": target_dB_FS, 132 | "eps": eps, 133 | } 134 | 135 | def _ensure_mono(self, audio: np.ndarray) -> np.ndarray: 136 | """ 137 | Convert stereo audio to mono if needed. 
138 | 139 | Args: 140 | audio (np.ndarray): Input audio array 141 | 142 | Returns: 143 | np.ndarray: Mono audio array 144 | """ 145 | if len(audio.shape) == 1: 146 | return audio 147 | elif len(audio.shape) == 2: 148 | if audio.shape[0] == 2: # (2, time) 149 | return np.mean(audio, axis=0) 150 | elif audio.shape[1] == 2: # (time, 2) 151 | return np.mean(audio, axis=1) 152 | else: 153 | # If one dimension is 1, squeeze it 154 | if audio.shape[0] == 1: 155 | return audio.squeeze(0) 156 | elif audio.shape[1] == 1: 157 | return audio.squeeze(1) 158 | else: 159 | raise ValueError(f"Unexpected audio shape: {audio.shape}") 160 | else: 161 | raise ValueError(f"Audio should be 1D or 2D, got shape: {audio.shape}") 162 | 163 | def _process_single_audio(self, audio: Union[np.ndarray, List[float]]) -> np.ndarray: 164 | """ 165 | Process a single audio array. 166 | 167 | Args: 168 | audio: Single audio input 169 | 170 | Returns: 171 | np.ndarray: Processed audio 172 | """ 173 | # Convert to numpy array 174 | if not isinstance(audio, np.ndarray): 175 | audio = np.array(audio, dtype=np.float32) 176 | else: 177 | audio = audio.astype(np.float32) 178 | 179 | # Ensure mono 180 | audio = self._ensure_mono(audio) 181 | 182 | # Normalize if requested 183 | if self.normalize_audio and self.normalizer is not None: 184 | audio = self.normalizer(audio) 185 | 186 | return audio 187 | 188 | def __call__( 189 | self, 190 | audio: Union[str, np.ndarray, List[float], List[np.ndarray], List[List[float]], List[str]] = None, 191 | sampling_rate: Optional[int] = None, 192 | return_tensors: Optional[str] = None, 193 | **kwargs, 194 | ): 195 | """ 196 | Process audio for VibeVoice models. 197 | 198 | Args: 199 | audio: Audio input(s) to process. Can be: 200 | - str: Path to audio file 201 | - np.ndarray: Audio array 202 | - List[float]: Audio as list of floats 203 | - List[np.ndarray]: Batch of audio arrays 204 | - List[str]: Batch of audio file paths 205 | sampling_rate (int, optional): Sampling rate of the input audio 206 | return_tensors (str, optional): Return format ('pt' for PyTorch, 'np' for NumPy) 207 | 208 | Returns: 209 | dict: Processed audio inputs with keys: 210 | - input_features: Audio tensor(s) ready for the model 211 | """ 212 | if audio is None: 213 | raise ValueError("Audio input is required") 214 | 215 | # Validate sampling rate 216 | if sampling_rate is not None and sampling_rate != self.sampling_rate: 217 | logger.warning( 218 | f"Input sampling rate ({sampling_rate}) differs from expected " 219 | f"sampling rate ({self.sampling_rate}). Please resample your audio." 
220 | ) 221 | 222 | # Handle different input types 223 | if isinstance(audio, str): 224 | # Single audio file path 225 | audio = self._load_audio_from_path(audio) 226 | is_batched = False 227 | elif isinstance(audio, list): 228 | if len(audio) == 0: 229 | raise ValueError("Empty audio list provided") 230 | 231 | # Check if it's a list of file paths 232 | if all(isinstance(item, str) for item in audio): 233 | # Batch of audio file paths 234 | audio = [self._load_audio_from_path(path) for path in audio] 235 | is_batched = True 236 | else: 237 | # Check if it's batched audio arrays 238 | is_batched = isinstance(audio[0], (np.ndarray, list)) 239 | else: 240 | # Single audio array or list 241 | is_batched = False 242 | 243 | # Process audio 244 | if is_batched: 245 | processed_audio = [self._process_single_audio(a) for a in audio] 246 | else: 247 | processed_audio = [self._process_single_audio(audio)] 248 | 249 | # Convert to tensors if requested 250 | if return_tensors == "pt": 251 | if len(processed_audio) == 1: 252 | # Create a proper batch dimension (B, T) 253 | input_features = torch.from_numpy(processed_audio[0]).unsqueeze(0).unsqueeze(1) 254 | else: 255 | # For batched input with different lengths, create a batch properly 256 | input_features = torch.stack([torch.from_numpy(a) for a in processed_audio]).unsqueeze(1) 257 | elif return_tensors == "np": 258 | if len(processed_audio) == 1: 259 | input_features = processed_audio[0][np.newaxis, np.newaxis, :] 260 | else: 261 | input_features = np.stack(processed_audio)[:, np.newaxis, :] 262 | else: 263 | input_features = processed_audio[0] if len(processed_audio) == 1 else processed_audio 264 | 265 | outputs = { 266 | "audio": input_features, # Use "audio" instead of "input_features" 267 | } 268 | 269 | return outputs 270 | 271 | def _load_audio_from_path(self, audio_path: str) -> np.ndarray: 272 | """ 273 | Load audio from file path. 274 | 275 | Args: 276 | audio_path (str): Path to audio file 277 | 278 | Returns: 279 | np.ndarray: Loaded audio array 280 | """ 281 | # Get file extension to determine loading method 282 | file_ext = os.path.splitext(audio_path)[1].lower() 283 | 284 | if file_ext in ['.wav', '.mp3', '.flac', '.m4a', '.ogg']: 285 | # Audio file - use librosa 286 | import librosa 287 | audio_array, sr = librosa.load( 288 | audio_path, 289 | sr=self.sampling_rate, 290 | mono=True 291 | ) 292 | return audio_array 293 | elif file_ext == '.pt': 294 | # PyTorch tensor file 295 | audio_tensor = torch.load(audio_path, map_location='cpu').squeeze() 296 | if isinstance(audio_tensor, torch.Tensor): 297 | audio_array = audio_tensor.numpy() 298 | else: 299 | audio_array = np.array(audio_tensor) 300 | return audio_array.astype(np.float32) 301 | elif file_ext == '.npy': 302 | # NumPy file 303 | audio_array = np.load(audio_path) 304 | return audio_array.astype(np.float32) 305 | else: 306 | raise ValueError( 307 | f"Unsupported file format: {file_ext}. " 308 | f"Supported formats: .wav, .mp3, .flac, .m4a, .ogg, .pt, .npy, .npz" 309 | ) 310 | 311 | def preprocess_audio( 312 | self, 313 | audio_path_or_array: Union[str, np.ndarray], 314 | normalize: Optional[bool] = None, 315 | ) -> np.ndarray: 316 | """ 317 | Convenience method to preprocess audio from file path or array. 318 | This method is kept for backward compatibility but __call__ is recommended. 
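        Example (illustrative path and arguments):
            processor = VibeVoiceTokenizerProcessor()
            audio = processor.preprocess_audio("path/to/voice.wav", normalize=True)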
319 | 320 | Args: 321 | audio_path_or_array: Path to audio file or numpy array 322 | normalize: Whether to normalize (overrides default setting) 323 | 324 | Returns: 325 | np.ndarray: Preprocessed audio array 326 | """ 327 | if isinstance(audio_path_or_array, str): 328 | audio_array = self._load_audio_from_path(audio_path_or_array) 329 | else: 330 | audio_array = np.array(audio_path_or_array, dtype=np.float32) 331 | 332 | # Override normalization setting if specified 333 | original_normalize = self.normalize_audio 334 | if normalize is not None: 335 | self.normalize_audio = normalize 336 | 337 | try: 338 | processed = self._process_single_audio(audio_array) 339 | finally: 340 | # Restore original setting 341 | self.normalize_audio = original_normalize 342 | 343 | return processed 344 | 345 | # Override to_dict method for configuration saving 346 | def to_dict(self) -> Dict[str, Any]: 347 | """ 348 | Convert the object to a dict containing all attributes needed for serialization. 349 | """ 350 | return self.feature_extractor_dict 351 | 352 | def save_audio( 353 | self, 354 | audio: Union[torch.Tensor, np.ndarray, List[Union[torch.Tensor, np.ndarray]]], 355 | output_path: str = "output.wav", 356 | sampling_rate: Optional[int] = None, 357 | normalize: bool = False, 358 | batch_prefix: str = "audio_", 359 | ): 360 | """ 361 | Save audio data to WAV file(s). 362 | 363 | Args: 364 | audio: Audio data to save. Can be: 365 | - torch.Tensor: PyTorch tensor with shape (B, C, T) or (B, T) or (T) 366 | - np.ndarray: NumPy array with shape (B, C, T) or (B, T) or (T) 367 | - List of tensors or arrays 368 | output_path: Path where to save the audio. If saving multiple files, 369 | this is treated as a directory and individual files will be saved inside. 370 | sampling_rate: Sampling rate for the saved audio. Defaults to the processor's rate. 371 | normalize: Whether to normalize audio before saving. 372 | batch_prefix: Prefix for batch files when saving multiple audios. 373 | 374 | Returns: 375 | List[str]: Paths to the saved audio files. 376 | """ 377 | if sampling_rate is None: 378 | sampling_rate = self.sampling_rate 379 | 380 | try: 381 | import soundfile as sf 382 | except ImportError: 383 | raise ImportError( 384 | "soundfile is required to save audio files. 
" 385 | "Install it with: pip install soundfile" 386 | ) 387 | 388 | # Ensure audio is in the right format 389 | if isinstance(audio, torch.Tensor): 390 | # Convert PyTorch tensor to numpy 391 | audio_np = audio.float().detach().cpu().numpy() 392 | elif isinstance(audio, np.ndarray): 393 | audio_np = audio 394 | elif isinstance(audio, list): 395 | # Handle list of tensors or arrays 396 | if all(isinstance(a, torch.Tensor) for a in audio): 397 | audio_np = [a.float().detach().cpu().numpy() for a in audio] 398 | else: 399 | audio_np = audio 400 | else: 401 | raise ValueError(f"Unsupported audio type: {type(audio)}") 402 | 403 | saved_paths = [] 404 | 405 | # Handle based on shape or type 406 | if isinstance(audio_np, list): 407 | # Multiple separate audios to save 408 | output_dir = output_path 409 | 410 | # Ensure output directory exists 411 | os.makedirs(output_dir, exist_ok=True) 412 | 413 | # Save each audio 414 | for i, audio_item in enumerate(audio_np): 415 | audio_item = self._prepare_audio_for_save(audio_item, normalize) 416 | file_path = os.path.join(output_dir, f"{batch_prefix}{i}.wav") 417 | sf.write(file_path, audio_item, sampling_rate) 418 | saved_paths.append(file_path) 419 | 420 | else: 421 | # Handle different dimensions 422 | if len(audio_np.shape) >= 3: # (B, C, T) or similar 423 | # Get batch size 424 | batch_size = audio_np.shape[0] 425 | 426 | if batch_size > 1: 427 | # Multiple audios in a batch 428 | output_dir = output_path 429 | 430 | # Ensure output directory exists 431 | os.makedirs(output_dir, exist_ok=True) 432 | 433 | # Save each audio in the batch 434 | for i in range(batch_size): 435 | # Extract single audio and remove channel dim if present 436 | single_audio = audio_np[i] 437 | if len(single_audio.shape) > 1: 438 | if single_audio.shape[0] == 1: # (1, T) 439 | single_audio = single_audio.squeeze(0) 440 | 441 | single_audio = self._prepare_audio_for_save(single_audio, normalize) 442 | file_path = os.path.join(output_dir, f"{batch_prefix}{i}.wav") 443 | sf.write(file_path, single_audio, sampling_rate) 444 | saved_paths.append(file_path) 445 | else: 446 | # Single audio with batch and channel dims 447 | audio_item = audio_np.squeeze() # Remove batch and channel dimensions 448 | audio_item = self._prepare_audio_for_save(audio_item, normalize) 449 | sf.write(output_path, audio_item, sampling_rate) 450 | saved_paths.append(output_path) 451 | else: 452 | # Single audio without batch dimension 453 | audio_item = self._prepare_audio_for_save(audio_np, normalize) 454 | sf.write(output_path, audio_item, sampling_rate) 455 | saved_paths.append(output_path) 456 | 457 | return saved_paths 458 | 459 | def _prepare_audio_for_save(self, audio: np.ndarray, normalize: bool) -> np.ndarray: 460 | """ 461 | Prepare audio for saving by ensuring it's the right shape and optionally normalizing. 
462 | 463 | Args: 464 | audio: Audio data as numpy array 465 | normalize: Whether to normalize audio 466 | 467 | Returns: 468 | np.ndarray: Processed audio ready for saving 469 | """ 470 | # Ensure right dimensionality 471 | if len(audio.shape) > 1 and audio.shape[0] == 1: # (1, T) 472 | audio = audio.squeeze(0) 473 | 474 | # Normalize if requested 475 | if normalize: 476 | max_val = np.abs(audio).max() 477 | if max_val > 0: 478 | audio = audio / max_val 479 | 480 | return audio 481 | 482 | 483 | __all__ = ["VibeVoiceTokenizerProcessor", "AudioNormalizer"] -------------------------------------------------------------------------------- /nodes/multi_speaker_node.py: -------------------------------------------------------------------------------- 1 | # Created by Fabio Sarracino 2 | 3 | import logging 4 | import os 5 | import re 6 | import tempfile 7 | import torch 8 | import numpy as np 9 | from typing import List, Optional 10 | 11 | from .base_vibevoice import BaseVibeVoiceNode, get_available_models 12 | 13 | # Setup logging 14 | logger = logging.getLogger("VibeVoice") 15 | 16 | class VibeVoiceMultipleSpeakersNode(BaseVibeVoiceNode): 17 | def __init__(self): 18 | super().__init__() 19 | # Register this instance for memory management 20 | try: 21 | from .free_memory_node import VibeVoiceFreeMemoryNode 22 | VibeVoiceFreeMemoryNode.register_multi_speaker(self) 23 | except: 24 | pass 25 | 26 | @classmethod 27 | def INPUT_TYPES(cls): 28 | # Get available models dynamically 29 | available_models = get_available_models() 30 | model_choices = [display_name for _, display_name in available_models] 31 | # Try to select Large model by default if available 32 | default_model = "VibeVoice-Large" 33 | if default_model not in model_choices: 34 | default_model = model_choices[0] if model_choices else "No models found" 35 | 36 | return { 37 | "required": { 38 | "text": ("STRING", { 39 | "multiline": True, 40 | "default": "[1]: Hello, this is the first speaker.\n[2]: Hi there, I'm the second speaker.\n[1]: Nice to meet you!\n[2]: Nice to meet you too!", 41 | "tooltip": "Text with speaker labels. Use '[N]:' format where N is 1-4. Gets disabled when connected to another node.", 42 | "forceInput": False, 43 | "dynamicPrompts": True 44 | }), 45 | "model": (model_choices if model_choices else ["No models found"], { 46 | "default": default_model, 47 | "tooltip": "Select a model from ComfyUI/models/vibevoice/ folder. Large is recommended for multi-speaker" 48 | }), 49 | "attention_type": (["auto", "eager", "sdpa", "flash_attention_2", "sage"], { 50 | "default": "auto", 51 | "tooltip": "Attention implementation. Auto selects the best available, eager is standard, sdpa is optimized PyTorch, flash_attention_2 requires compatible GPU, sage uses quantized attention for speedup (CUDA only)" 52 | }), 53 | "quantize_llm": (["full precision", "4bit", "8bit"], { 54 | "default": "full precision", 55 | "tooltip": "Dynamically quantize only the LLM component for non-quantized models. 4bit: major VRAM savings with minimal quality loss. 8bit: good balance of quality and memory usage. Full precision: original quality. Note: ignored for pre-quantized models. Requires CUDA GPU." 56 | }), 57 | "free_memory_after_generate": ("BOOLEAN", {"default": True, "tooltip": "Free model from memory after generation to save VRAM/RAM. Disable to keep model loaded for faster subsequent generations"}), 58 | "diffusion_steps": ("INT", {"default": 20, "min": 1, "max": 100, "step": 1, "tooltip": "Number of denoising steps. 
More steps = theoretically better quality but slower. Default: 20"}), 59 | "seed": ("INT", {"default": 42, "min": 0, "max": 2**32-1, "tooltip": "Random seed for generation. Default 42 is used in official examples"}), 60 | "cfg_scale": ("FLOAT", {"default": 1.3, "min": 0.5, "max": 3.5, "step": 0.05, "tooltip": "Classifier-free guidance scale (official default: 1.3)"}), 61 | "use_sampling": ("BOOLEAN", {"default": False, "tooltip": "Enable sampling mode. When False (default), uses deterministic generation like official examples"}), 62 | }, 63 | "optional": { 64 | "speaker1_voice": ("AUDIO", {"tooltip": "Optional: Voice sample for Speaker 1. If not provided, synthetic voice will be used."}), 65 | "speaker2_voice": ("AUDIO", {"tooltip": "Optional: Voice sample for Speaker 2. If not provided, synthetic voice will be used."}), 66 | "speaker3_voice": ("AUDIO", {"tooltip": "Optional: Voice sample for Speaker 3. If not provided, synthetic voice will be used."}), 67 | "speaker4_voice": ("AUDIO", {"tooltip": "Optional: Voice sample for Speaker 4. If not provided, synthetic voice will be used."}), 68 | "lora": ("LORA_CONFIG", {"tooltip": "Optional: LoRA configuration from VibeVoice LoRA node"}), 69 | "temperature": ("FLOAT", {"default": 0.95, "min": 0.1, "max": 2.0, "step": 0.05, "tooltip": "Only used when sampling is enabled"}), 70 | "top_p": ("FLOAT", {"default": 0.95, "min": 0.1, "max": 1.0, "step": 0.05, "tooltip": "Only used when sampling is enabled"}), 71 | "voice_speed_factor": ("FLOAT", { 72 | "default": 1.0, 73 | "min": 0.8, 74 | "max": 1.2, 75 | "step": 0.01, 76 | "tooltip": "1.0 = normal speed, <1.0 = slower speed, >1.0 = faster speed (applies to all speakers)" 77 | }), 78 | } 79 | } 80 | 81 | RETURN_TYPES = ("AUDIO",) 82 | RETURN_NAMES = ("audio",) 83 | FUNCTION = "generate_speech" 84 | CATEGORY = "VibeVoiceWrapper" 85 | DESCRIPTION = "Generate multi-speaker conversations with up to 4 distinct voices using Microsoft VibeVoice" 86 | 87 | def _prepare_voice_sample(self, voice_audio, speaker_idx: int, voice_speed_factor: float = 1.0) -> Optional[np.ndarray]: 88 | """Prepare a single voice sample from input audio with speed adjustment""" 89 | return self._prepare_audio_from_comfyui(voice_audio, speed_factor=voice_speed_factor) 90 | 91 | def generate_speech(self, text: str = "", model: str = "VibeVoice-7B-Preview", 92 | attention_type: str = "auto", quantize_llm: str = "full precision", 93 | free_memory_after_generate: bool = True, 94 | diffusion_steps: int = 20, seed: int = 42, cfg_scale: float = 1.3, 95 | use_sampling: bool = False, lora=None, 96 | speaker1_voice=None, speaker2_voice=None, 97 | speaker3_voice=None, speaker4_voice=None, 98 | temperature: float = 0.95, top_p: float = 0.95, 99 | voice_speed_factor: float = 1.0): 100 | """Generate multi-speaker speech from text using VibeVoice""" 101 | 102 | try: 103 | # Check text input 104 | if not text or not text.strip(): 105 | raise Exception("No text provided. 
Please enter text with speaker labels (e.g., '[1]: Hello' or '[2]: Hi')") 106 | 107 | # First detect how many speakers are in the text 108 | bracket_pattern = r'\[(\d+)\]\s*:' 109 | speakers_numbers = sorted(list(set([int(m) for m in re.findall(bracket_pattern, text)]))) 110 | 111 | # Limit to 1-4 speakers 112 | if not speakers_numbers: 113 | num_speakers = 1 # Default to 1 if no speaker format found 114 | else: 115 | num_speakers = min(max(speakers_numbers), 4) # Max speaker number, capped at 4 116 | if max(speakers_numbers) > 4: 117 | print(f"[VibeVoice] Warning: Found {max(speakers_numbers)} speakers, limiting to 4") 118 | 119 | # Direct conversion from [N]: to Speaker (N-1): for VibeVoice processor 120 | # This avoids multiple conversion steps 121 | converted_text = text 122 | 123 | # Find all [N]: patterns in the text 124 | speakers_in_text = sorted(list(set([int(m) for m in re.findall(bracket_pattern, text)]))) 125 | 126 | if not speakers_in_text: 127 | # No [N]: format found, try Speaker N: format 128 | speaker_pattern = r'Speaker\s+(\d+)\s*:' 129 | speakers_in_text = sorted(list(set([int(m) for m in re.findall(speaker_pattern, text)]))) 130 | 131 | if speakers_in_text: 132 | # Text already in Speaker N format, convert to 0-based 133 | for speaker_num in sorted(speakers_in_text, reverse=True): 134 | pattern = f'Speaker\\s+{speaker_num}\\s*:' 135 | replacement = f'Speaker {speaker_num - 1}:' 136 | converted_text = re.sub(pattern, replacement, converted_text) 137 | else: 138 | # No speaker format found 139 | speakers_in_text = [1] 140 | 141 | # Parse pause keywords even for single speaker 142 | pause_segments = self._parse_pause_keywords(text) 143 | 144 | # Store speaker segments for pause processing 145 | speaker_segments_with_pauses = [] 146 | segments = [] 147 | 148 | for seg_type, seg_content in pause_segments: 149 | if seg_type == 'pause': 150 | speaker_segments_with_pauses.append(('pause', seg_content, None)) 151 | else: 152 | # Clean up newlines 153 | text_clean = seg_content.replace('\n', ' ').replace('\r', ' ') 154 | text_clean = ' '.join(text_clean.split()) 155 | 156 | if text_clean: 157 | speaker_segments_with_pauses.append(('text', text_clean, 1)) 158 | segments.append(f"Speaker 0: {text_clean}") 159 | 160 | # Join all segments for fallback 161 | converted_text = '\n'.join(segments) if segments else f"Speaker 0: {text}" 162 | else: 163 | # Convert [N]: directly to Speaker (N-1): and handle multi-line text 164 | # Split text to preserve speaker segments while cleaning up newlines within each segment 165 | segments = [] 166 | 167 | # Find all speaker markers with their positions 168 | speaker_matches = list(re.finditer(f'\\[({"|".join(map(str, speakers_in_text))})\\]\\s*:', converted_text)) 169 | 170 | # Store speaker segments for pause processing 171 | speaker_segments_with_pauses = [] 172 | 173 | for i, match in enumerate(speaker_matches): 174 | speaker_num = int(match.group(1)) 175 | start = match.end() 176 | 177 | # Find where this speaker's text ends (at next speaker or end of text) 178 | if i + 1 < len(speaker_matches): 179 | end = speaker_matches[i + 1].start() 180 | else: 181 | end = len(converted_text) 182 | 183 | # Extract the speaker's text (keep pause keywords for now) 184 | speaker_text = converted_text[start:end].strip() 185 | 186 | # Parse pause keywords within this speaker's text 187 | pause_segments = self._parse_pause_keywords(speaker_text) 188 | 189 | # Process each segment (text or pause) for this speaker 190 | for seg_type, seg_content in pause_segments: 
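                        # _parse_pause_keywords yields (type, content) tuples:
                        # ('pause', duration_ms) for [pause]/[pause:ms] tags, or ('text', raw_text).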
191 | if seg_type == 'pause': 192 | # Add pause segment 193 | speaker_segments_with_pauses.append(('pause', seg_content, None)) 194 | else: 195 | # Clean up the text segment 196 | text_clean = seg_content.replace('\n', ' ').replace('\r', ' ') 197 | text_clean = ' '.join(text_clean.split()) 198 | 199 | if text_clean: # Only add non-empty text 200 | # Add text segment with speaker info 201 | speaker_segments_with_pauses.append(('text', text_clean, speaker_num)) 202 | # Also build the traditional segments for fallback 203 | segments.append(f'Speaker {speaker_num - 1}: {text_clean}') 204 | 205 | # Join all segments with newlines (required for multi-speaker format) - for fallback 206 | converted_text = '\n'.join(segments) if segments else "" 207 | 208 | # Build speaker names list - these are just for logging, not used by processor 209 | # The processor uses the speaker labels in the text itself 210 | speakers = [f"Speaker {i}" for i in range(len(speakers_in_text))] 211 | 212 | # Get the actual folder path for the selected model 213 | available_models = get_available_models() 214 | model_path = None 215 | for folder, display_name in available_models: 216 | if display_name == model: 217 | model_path = folder 218 | break 219 | 220 | if not model_path: 221 | raise Exception(f"Model '{model}' not found in models/vibevoice/") 222 | 223 | # Extract LoRA configuration if provided 224 | lora_path = None 225 | llm_lora_strength = 1.0 226 | if lora and isinstance(lora, dict): 227 | lora_path = lora.get("path", None) 228 | llm_lora_strength = lora.get("llm_strength", 1.0) 229 | 230 | # Set LoRA component flags based on configuration 231 | self.use_llm_lora = lora.get("use_llm", True) 232 | self.use_diffusion_head_lora = lora.get("use_diffusion_head", True) 233 | self.use_acoustic_connector_lora = lora.get("use_acoustic_connector", True) 234 | self.use_semantic_connector_lora = lora.get("use_semantic_connector", True) 235 | 236 | if lora_path: 237 | logger.info(f"Using LoRA from: {lora_path}") 238 | 239 | # Load model with optional LoRA 240 | self.load_model(model, model_path, attention_type, quantize_llm=quantize_llm, lora_path=lora_path) 241 | 242 | voice_inputs = [speaker1_voice, speaker2_voice, speaker3_voice, speaker4_voice] 243 | 244 | # Prepare voice samples in order of appearance 245 | voice_samples = [] 246 | for i, speaker_num in enumerate(speakers_in_text): 247 | idx = speaker_num - 1 # Convert to 0-based for voice array 248 | 249 | # Try to use provided voice sample 250 | if idx < len(voice_inputs) and voice_inputs[idx] is not None: 251 | voice_sample = self._prepare_voice_sample(voice_inputs[idx], idx, voice_speed_factor) 252 | if voice_sample is None: 253 | # Use the actual speaker index for consistent synthetic voice 254 | voice_sample = self._create_synthetic_voice_sample(idx) 255 | else: 256 | # Use the actual speaker index for consistent synthetic voice 257 | voice_sample = self._create_synthetic_voice_sample(idx) 258 | 259 | voice_samples.append(voice_sample) 260 | 261 | # Ensure voice_samples count matches detected speakers 262 | if len(voice_samples) != len(speakers_in_text): 263 | logger.error(f"Mismatch: {len(speakers_in_text)} speakers but {len(voice_samples)} voice samples!") 264 | raise Exception(f"Voice sample count mismatch: expected {len(speakers_in_text)}, got {len(voice_samples)}") 265 | 266 | # Check if we have pause segments to process 267 | if 'speaker_segments_with_pauses' in locals() and speaker_segments_with_pauses: 268 | # Process segments with pauses 269 | 
all_audio_segments = [] 270 | sample_rate = 24000 # VibeVoice uses 24kHz 271 | 272 | # Group consecutive text segments from same speaker for efficiency 273 | grouped_segments = [] 274 | current_group = [] 275 | current_speaker = None 276 | 277 | for seg_type, seg_content, speaker_num in speaker_segments_with_pauses: 278 | if seg_type == 'pause': 279 | # Save current group if any 280 | if current_group: 281 | grouped_segments.append(('text_group', current_group, current_speaker)) 282 | current_group = [] 283 | current_speaker = None 284 | # Add pause 285 | grouped_segments.append(('pause', seg_content, None)) 286 | else: 287 | # Text segment 288 | if speaker_num == current_speaker: 289 | # Same speaker, add to current group 290 | current_group.append(seg_content) 291 | else: 292 | # Different speaker, save current group and start new one 293 | if current_group: 294 | grouped_segments.append(('text_group', current_group, current_speaker)) 295 | current_group = [seg_content] 296 | current_speaker = speaker_num 297 | 298 | # Save last group if any 299 | if current_group: 300 | grouped_segments.append(('text_group', current_group, current_speaker)) 301 | 302 | # Process grouped segments 303 | for seg_type, seg_content, speaker_num in grouped_segments: 304 | if seg_type == 'pause': 305 | # Generate silence 306 | duration_ms = seg_content 307 | logger.info(f"Adding {duration_ms}ms pause") 308 | silence_audio = self._generate_silence(duration_ms, sample_rate) 309 | all_audio_segments.append(silence_audio) 310 | else: 311 | # Process text group for a speaker 312 | combined_text = ' '.join(seg_content) 313 | formatted_text = f"Speaker {speaker_num - 1}: {combined_text}" 314 | 315 | # Get voice sample for this speaker 316 | speaker_idx = speakers_in_text.index(speaker_num) 317 | speaker_voice_samples = [voice_samples[speaker_idx]] 318 | 319 | logger.info(f"Generating audio for Speaker {speaker_num}: {len(combined_text.split())} words") 320 | 321 | # Generate audio for this speaker's text 322 | segment_audio = self._generate_with_vibevoice( 323 | formatted_text, speaker_voice_samples, cfg_scale, seed, 324 | diffusion_steps, use_sampling, temperature, top_p, 325 | llm_lora_strength=llm_lora_strength 326 | ) 327 | 328 | all_audio_segments.append(segment_audio) 329 | 330 | # Concatenate all audio segments 331 | if all_audio_segments: 332 | logger.info(f"Concatenating {len(all_audio_segments)} audio segments (including pauses)...") 333 | 334 | # Extract waveforms 335 | waveforms = [] 336 | for audio_segment in all_audio_segments: 337 | if isinstance(audio_segment, dict) and "waveform" in audio_segment: 338 | waveforms.append(audio_segment["waveform"]) 339 | 340 | if waveforms: 341 | # Filter out None values if any 342 | valid_waveforms = [w for w in waveforms if w is not None] 343 | 344 | if valid_waveforms: 345 | # Concatenate along time dimension 346 | combined_waveform = torch.cat(valid_waveforms, dim=-1) 347 | 348 | audio_dict = { 349 | "waveform": combined_waveform, 350 | "sample_rate": sample_rate 351 | } 352 | logger.info(f"Successfully generated multi-speaker audio with pauses") 353 | else: 354 | raise Exception("No valid audio waveforms generated") 355 | else: 356 | raise Exception("Failed to extract waveforms from audio segments") 357 | else: 358 | raise Exception("No audio segments generated") 359 | else: 360 | # Fallback to original method without pause support 361 | logger.info("Processing without pause support (no pause keywords found)") 362 | audio_dict = self._generate_with_vibevoice( 363 
| converted_text, voice_samples, cfg_scale, seed, diffusion_steps, 364 | use_sampling, temperature, top_p, llm_lora_strength=llm_lora_strength 365 | ) 366 | 367 | # Free memory if requested 368 | if free_memory_after_generate: 369 | self.free_memory() 370 | 371 | return (audio_dict,) 372 | 373 | except Exception as e: 374 | # Check if this is an interruption by the user 375 | import comfy.model_management as mm 376 | if isinstance(e, mm.InterruptProcessingException): 377 | # User interrupted - just log it and re-raise to stop the workflow 378 | logger.info("Generation interrupted by user") 379 | raise # Propagate the interruption to stop the workflow 380 | else: 381 | # Real error - show it 382 | logger.error(f"Multi-speaker speech generation failed: {str(e)}") 383 | raise Exception(f"Error generating multi-speaker speech: {str(e)}") 384 | 385 | @classmethod 386 | def IS_CHANGED(cls, text="", model="VibeVoice-7B-Preview", 387 | speaker1_voice=None, speaker2_voice=None, 388 | speaker3_voice=None, speaker4_voice=None, lora=None, **kwargs): 389 | """Cache key for ComfyUI""" 390 | voices_hash = hash(str([speaker1_voice, speaker2_voice, speaker3_voice, speaker4_voice])) 391 | lora_hash = hash(str(lora)) if lora else 0 392 | return f"{hash(text)}_{model}_{voices_hash}_{lora_hash}_{kwargs.get('cfg_scale', 1.3)}_{kwargs.get('seed', 0)}" -------------------------------------------------------------------------------- /vvembed/modular/modeling_vibevoice.py: -------------------------------------------------------------------------------- 1 | # Original code by Microsoft 2 | # updated by Fabio Sarracino - Enemyx-net 3 | 4 | from dataclasses import dataclass 5 | from typing import Dict, List, Optional, Tuple, Union, Callable 6 | from tqdm import tqdm 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torch.distributed as dist 11 | 12 | from transformers.models.auto import AutoModel, AutoModelForCausalLM 13 | 14 | from transformers.activations import ACT2FN 15 | from transformers.modeling_outputs import CausalLMOutput, BaseModelOutputWithPast, ModelOutput 16 | from transformers.models.llama.modeling_llama import LlamaRMSNorm 17 | from transformers import modeling_utils 18 | from transformers.modeling_utils import PreTrainedModel 19 | from transformers.modeling_flash_attention_utils import FlashAttentionKwargs 20 | from transformers.utils import logging 21 | 22 | 23 | from .modular_vibevoice_tokenizer import VibeVoiceTokenizerStreamingCache, VibeVoiceAcousticTokenizerModel, VibeVoiceSemanticTokenizerModel 24 | from .modular_vibevoice_diffusion_head import VibeVoiceDiffusionHead 25 | 26 | # Import schedule module with robust path handling to avoid conflicts with PyPI 'schedule' package 27 | import sys 28 | import os 29 | 30 | # Get the path to vvembed directory 31 | _current_dir = os.path.dirname(os.path.abspath(__file__)) 32 | _vvembed_dir = os.path.dirname(_current_dir) 33 | _schedule_path = os.path.join(_vvembed_dir, 'schedule') 34 | 35 | # Ensure vvembed is at the front of sys.path to prioritize our schedule module 36 | if _vvembed_dir not in sys.path: 37 | sys.path.insert(0, _vvembed_dir) 38 | elif sys.path.index(_vvembed_dir) > 0: 39 | # Move it to the front if it's not already 40 | sys.path.remove(_vvembed_dir) 41 | sys.path.insert(0, _vvembed_dir) 42 | 43 | # Verify the schedule module exists 44 | if not os.path.exists(_schedule_path): 45 | raise ImportError( 46 | f"Cannot find 'schedule' directory in vvembed. 
" 47 | f"Expected at: {_schedule_path}" 48 | ) 49 | 50 | # Import with our schedule module prioritized 51 | try: 52 | from schedule.dpm_solver import DPMSolverMultistepScheduler 53 | except ImportError as e: 54 | raise ImportError( 55 | f"Failed to import DPMSolverMultistepScheduler from {_schedule_path}. " 56 | f"There might be a conflict with another Python package. " 57 | f"Original error: {e}" 58 | ) 59 | 60 | from .configuration_vibevoice import VibeVoiceConfig 61 | 62 | 63 | logger = logging.get_logger(__name__) 64 | 65 | if not hasattr(modeling_utils, "ALL_PARALLEL_STYLES") or modeling_utils.ALL_PARALLEL_STYLES is None: 66 | modeling_utils.ALL_PARALLEL_STYLES = ["tp", "none", "colwise", "rowwise"] 67 | 68 | @dataclass 69 | class VibeVoiceCausalLMOutputWithPast(ModelOutput): 70 | loss: Optional[torch.FloatTensor] = None 71 | diffusion_loss: Optional[torch.FloatTensor] = None 72 | speech_token_num: Optional[int] = None 73 | logits: torch.FloatTensor = None 74 | past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None 75 | hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None 76 | attentions: Optional[Tuple[torch.FloatTensor, ...]] = None 77 | 78 | 79 | @dataclass 80 | class VibeVoiceGenerationOutput(ModelOutput): 81 | """ 82 | Output type for VibeVoice generation. 83 | 84 | Args: 85 | sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`): 86 | The generated sequences. 87 | speech_outputs (`List[torch.FloatTensor]`, *optional*): 88 | List of generated speech waveforms or latents for each speech segment. 89 | """ 90 | sequences: torch.LongTensor = None 91 | speech_outputs: Optional[List[torch.FloatTensor]] = None 92 | 93 | 94 | class SpeechConnector(nn.Module): 95 | def __init__(self, input_dim, output_dim): 96 | super().__init__() 97 | self.fc1 = nn.Linear(input_dim, output_dim) 98 | self.norm = LlamaRMSNorm(output_dim, eps=1e-6) 99 | self.fc2 = nn.Linear(output_dim, output_dim) 100 | 101 | def forward(self, features, **kwargs): 102 | x = self.fc1(features) 103 | x = self.norm(x) 104 | x = self.fc2(x) 105 | return x 106 | 107 | 108 | # @auto_docstring 109 | class VibeVoicePreTrainedModel(PreTrainedModel): 110 | config_class = VibeVoiceConfig 111 | base_model_prefix = "model" 112 | supports_gradient_checkpointing = True 113 | _skip_keys_device_placement = "past_key_values" 114 | _supports_cache_class = True 115 | _supports_flash_attn_2 = True 116 | _supports_sdpa = True 117 | _supports_quantized_cache = True 118 | _supports_static_cache = True 119 | _supports_attention_backend = True 120 | 121 | def _init_weights(self, module): 122 | if isinstance(module, VibeVoiceDiffusionHead): 123 | module.initialize_weights() 124 | return 125 | 126 | # Use the language model's initializer_range if available 127 | if hasattr(self.config, 'language_model_config') and hasattr(self.config.language_model_config, 'initializer_range'): 128 | std = self.config.language_model_config.initializer_range 129 | elif hasattr(self.config, 'decoder_config') and hasattr(self.config.decoder_config, 'initializer_range'): 130 | std = self.config.decoder_config.initializer_range 131 | else: 132 | std = 0.02 # Default value 133 | 134 | if isinstance(module, nn.Linear): 135 | module.weight.data.normal_(mean=0.0, std=std) 136 | if module.bias is not None: 137 | module.bias.data.zero_() 138 | elif isinstance(module, nn.LayerNorm): 139 | module.weight.data.fill_(1.0) 140 | module.bias.data.zero_() 141 | 142 | # @auto_docstring 143 | class VibeVoiceModel(VibeVoicePreTrainedModel): 144 | def 
__init__(self, config): 145 | super().__init__(config) 146 | 147 | if hasattr(config, 'torch_dtype') and config.torch_dtype is not None: 148 | if isinstance(config.torch_dtype, str): 149 | dtype = getattr(torch, config.torch_dtype) 150 | else: 151 | dtype = config.torch_dtype 152 | else: 153 | dtype = torch.float32 154 | 155 | # Initialize Qwen2 model for language modeling 156 | lm_config = config.decoder_config 157 | self.language_model = AutoModel.from_config(lm_config) 158 | 159 | # Initialize speech components if needed 160 | self.acoustic_tokenizer = AutoModel.from_config(config.acoustic_tokenizer_config).to(dtype) 161 | self.semantic_tokenizer = AutoModel.from_config(config.semantic_tokenizer_config).to(dtype) 162 | 163 | self.acoustic_connector = SpeechConnector(config.acoustic_vae_dim, lm_config.hidden_size).to(dtype) 164 | self.semantic_connector = SpeechConnector(config.semantic_vae_dim, lm_config.hidden_size).to(dtype) 165 | 166 | # Register scaling factors as buffers - use 1D tensors for FSDP compatibility 167 | self.register_buffer('speech_scaling_factor', torch.tensor(float('nan'))) 168 | self.register_buffer('speech_bias_factor', torch.tensor(float('nan'))) 169 | 170 | # Initialize prediction head for speech generation 171 | self.prediction_head = AutoModel.from_config(config.diffusion_head_config).to(dtype) 172 | 173 | # Initialize noise scheduler 174 | self.noise_scheduler = DPMSolverMultistepScheduler( 175 | num_train_timesteps=config.diffusion_head_config.ddpm_num_steps, 176 | beta_schedule=config.diffusion_head_config.ddpm_beta_schedule, 177 | prediction_type=config.diffusion_head_config.prediction_type 178 | ) 179 | 180 | def get_input_embeddings(self): 181 | if hasattr(self.language_model, 'embed_tokens'): 182 | # If the language model has an embed_tokens attribute, return it 183 | return self.language_model.embed_tokens 184 | 185 | for name, attr in self.language_model.fullmap.items(): # parallel by nnscaler, the name is changed 186 | if attr.orig_name == 'embed_tokens.weight': 187 | return getattr(self.language_model, name) 188 | assert False, 'should not arrive here' 189 | 190 | def set_input_embeddings(self, value): 191 | self.language_model.embed_tokens = value 192 | 193 | def set_speech_tokenizers(self, acoustic_tokenizer=None, semantic_tokenizer=None): 194 | """Set the speech tokenizers used for encoding and decoding speech.""" 195 | self.acoustic_tokenizer = acoustic_tokenizer 196 | self.semantic_tokenizer = semantic_tokenizer 197 | 198 | # Reset the encoder to evaluation mode 199 | if self.acoustic_tokenizer is not None: 200 | self.acoustic_tokenizer.eval() 201 | 202 | if self.semantic_tokenizer is not None: 203 | self.semantic_tokenizer.eval() 204 | 205 | def forward( 206 | self, 207 | input_ids: torch.LongTensor = None, 208 | attention_mask: Optional[torch.Tensor] = None, 209 | position_ids: Optional[torch.LongTensor] = None, 210 | past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, 211 | inputs_embeds: Optional[torch.FloatTensor] = None, 212 | use_cache: Optional[bool] = None, 213 | output_attentions: Optional[bool] = None, 214 | output_hidden_states: Optional[bool] = None, 215 | return_dict: Optional[bool] = None, 216 | cache_position: Optional[torch.LongTensor] = None, 217 | **kwargs, 218 | ) -> Union[Tuple, BaseModelOutputWithPast]: 219 | 220 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 221 | 222 | # Forward through language model 223 | outputs = self.language_model( 224 | input_ids=input_ids, 
225 | attention_mask=attention_mask, 226 | position_ids=position_ids, 227 | past_key_values=past_key_values, 228 | inputs_embeds=inputs_embeds, 229 | use_cache=use_cache, 230 | output_attentions=output_attentions, 231 | output_hidden_states=output_hidden_states, 232 | return_dict=return_dict, 233 | cache_position=cache_position, 234 | **kwargs, 235 | ) 236 | 237 | if not return_dict: 238 | return outputs 239 | 240 | return BaseModelOutputWithPast( 241 | last_hidden_state=outputs.last_hidden_state, 242 | past_key_values=outputs.past_key_values, 243 | hidden_states=outputs.hidden_states, 244 | attentions=outputs.attentions, 245 | ) 246 | 247 | 248 | class VibeVoiceForConditionalGeneration(VibeVoicePreTrainedModel): 249 | _tied_weights_keys = ["lm_head.weight"] 250 | _tp_plan = {"lm_head": "colwise_rep"} 251 | 252 | def __init__(self, config): 253 | super().__init__(config) 254 | self.model = VibeVoiceModel(config) 255 | self.vocab_size = config.decoder_config.vocab_size 256 | self.lm_head = nn.Linear(config.decoder_config.hidden_size, self.vocab_size, bias=False) 257 | 258 | self.post_init() 259 | 260 | def get_input_embeddings(self): 261 | return self.model.get_input_embeddings() 262 | 263 | def set_input_embeddings(self, value): 264 | self.model.set_input_embeddings(value) 265 | 266 | def get_output_embeddings(self): 267 | return self.lm_head 268 | 269 | def set_decoder(self, decoder): 270 | self.model.language_model = decoder 271 | 272 | def get_decoder(self): 273 | return self.model.language_model 274 | 275 | def tie_weights(self): 276 | """ 277 | Tie the weights between the input embeddings and the output embeddings. 278 | """ 279 | if getattr(self.config.decoder_config, 'tie_word_embeddings', False): 280 | # The standard PreTrainedModel method will handle the tying. 281 | # It typically does a simple parameter object assignment, which is 282 | # CORRECT to do BEFORE FSDP wraps the model. 283 | output_embeddings = self.get_output_embeddings() 284 | input_embeddings = self.get_input_embeddings() 285 | if hasattr(input_embeddings, 'weight'): 286 | output_embeddings.weight = input_embeddings.weight 287 | else: 288 | # maybe returned input_embeddings a tensor directly 289 | output_embeddings.weight = input_embeddings 290 | 291 | if getattr(output_embeddings, "bias", None) is not None: 292 | output_embeddings.bias.data = nn.functional.pad( 293 | output_embeddings.bias.data, 294 | (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0]), 295 | "constant", 296 | 0, 297 | ) 298 | print("✅ Tied input and output embeddings using standard assignment.") 299 | else: 300 | print("ℹ️ tie_word_embeddings is False, not tying weights.") 301 | 302 | # Also, ensure set_output_embeddings is safe, though your implementation looks okay. 303 | # The key is to avoid calling it after accelerator.prepare(). 304 | def set_output_embeddings(self, new_embeddings): 305 | # Your current implementation using data.copy_ is good practice, 306 | # but the best way is to not call this after prepare(). 
307 | self.lm_head = new_embeddings 308 | 309 | def forward_speech_features( 310 | self, 311 | speech_tensors=None, 312 | speech_masks=None, 313 | speech_type="audio", 314 | return_unmask=False 315 | ): 316 | if speech_tensors is None: 317 | # Use config to get vae_dim instead of non-existent self.args 318 | vae_dim = self.config.acoustic_tokenizer_config.vae_dim 319 | audio_features = torch.zeros(1, 1, vae_dim).to(self.get_input_embeddings().weight) 320 | connect_features = self.model.acoustic_connector(audio_features) 321 | return audio_features, connect_features 322 | else: 323 | with torch.no_grad(): 324 | if speech_type == "audio": 325 | with torch.no_grad(): 326 | frames = self.model.acoustic_tokenizer.encode(speech_tensors.unsqueeze(1))[0][0] 327 | audio_tokens = frames.sample(self.model.acoustic_tokenizer.std_dist_type)[0] 328 | 329 | elif speech_type == "vae": 330 | # Use config to get vae_dim instead of non-existent self.args 331 | vae_dim = self.config.acoustic_tokenizer_config.vae_dim 332 | speech_mode = speech_tensors.reshape(speech_tensors.size(0), -1, vae_dim) 333 | 334 | # gaussian sample from the speech_mode 335 | batch_size = speech_mode.size(0) 336 | value = self.model.acoustic_tokenizer.fix_std / 0.8 337 | std = torch.randn(batch_size, dtype=speech_mode.dtype, device=speech_mode.device) * value 338 | std = std.view(-1, *[1] * (speech_mode.dim() - 1)) 339 | audio_tokens = speech_mode + std * torch.randn(speech_mode.shape).to(speech_mode) 340 | else: 341 | raise NotImplementedError(f"Speech type {speech_type} not implemented") 342 | 343 | if torch.isnan(self.model.speech_scaling_factor) or torch.isnan(self.model.speech_bias_factor): 344 | scaling_factor = 1. / audio_tokens[speech_masks].flatten().std() 345 | bias_factor = -audio_tokens[speech_masks].flatten().mean() 346 | 347 | # Only use distributed operations if the process group is initialized 348 | if dist.is_available() and dist.is_initialized(): 349 | dist.all_reduce(scaling_factor, op=dist.ReduceOp.SUM) 350 | dist.all_reduce(bias_factor, op=dist.ReduceOp.SUM) 351 | world_size = dist.get_world_size() 352 | self.model.speech_scaling_factor.copy_(scaling_factor / world_size) 353 | self.model.speech_bias_factor.copy_(bias_factor / world_size) 354 | print(f"Speech scaling factor (distributed): {self.model.speech_scaling_factor}, bias factor: {self.model.speech_bias_factor}", flush=True) 355 | else: 356 | # Single process case 357 | self.model.speech_scaling_factor.copy_(scaling_factor) 358 | self.model.speech_bias_factor.copy_(bias_factor) 359 | print(f"Speech scaling factor (single process): {self.model.speech_scaling_factor}, bias factor: {self.model.speech_bias_factor}", flush=True) 360 | 361 | audio_features = (audio_tokens + self.model.speech_bias_factor) * self.model.speech_scaling_factor 362 | 363 | connect_features = self.model.acoustic_connector(audio_features) 364 | if return_unmask: 365 | return audio_features, connect_features 366 | return audio_features[speech_masks], connect_features[speech_masks] 367 | 368 | def forward( 369 | self, 370 | input_ids: torch.LongTensor = None, 371 | attention_mask: Optional[torch.Tensor] = None, 372 | position_ids: Optional[torch.LongTensor] = None, 373 | past_key_values: Optional[List[torch.FloatTensor]] = None, 374 | inputs_embeds: Optional[torch.FloatTensor] = None, 375 | labels: Optional[torch.LongTensor] = None, 376 | use_cache: Optional[bool] = False, 377 | output_attentions: Optional[bool] = None, 378 | output_hidden_states: Optional[bool] = None, 379 | return_dict: 
Optional[bool] = None, 380 | cache_position: Optional[torch.LongTensor] = None, 381 | # New arguments for speech processing and loss calculation 382 | speech_tensors: Optional[torch.FloatTensor] = None, 383 | speech_masks: Optional[torch.BoolTensor] = None, 384 | speeches_loss_input: Optional[torch.FloatTensor] = None, 385 | speech_semantic_tensors: Optional[torch.FloatTensor] = None, 386 | acoustic_input_mask: Optional[torch.BoolTensor] = None, 387 | acoustic_loss_mask: Optional[torch.BoolTensor] = None, 388 | ddpm_batch_mul: int = 1, 389 | **kwargs: Optional[Dict[str, Union[torch.Tensor, str]]], 390 | ) -> Union[Tuple, VibeVoiceCausalLMOutputWithPast]: 391 | 392 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 393 | 394 | x = self.get_input_embeddings()(input_ids) 395 | 396 | semantic_speech_all_connect_features = self.model.semantic_connector(speech_semantic_tensors) 397 | if speeches_loss_input is not None: 398 | # only part audio need diffuse 399 | speech_all_features, speech_all_connect_features = self.forward_speech_features( 400 | speech_tensors=speech_tensors.type_as(x) if speech_tensors is not None else None, 401 | speech_masks=speech_masks, 402 | speech_type=kwargs.get("speech_type", "audio"), 403 | return_unmask=True 404 | ) 405 | if speech_tensors is not None: 406 | if semantic_speech_all_connect_features is not None: 407 | x[acoustic_input_mask] = speech_all_connect_features[speech_masks] + semantic_speech_all_connect_features[speech_masks] 408 | else: 409 | x[acoustic_input_mask] = speech_all_connect_features[speech_masks] 410 | speech_features = speech_all_features[speeches_loss_input.unsqueeze(-1) & speech_masks] # only part audio need diffuse 411 | speech_connect_features = speech_all_connect_features[speeches_loss_input.unsqueeze(-1) & speech_masks] 412 | else: 413 | speech_features, speech_connect_features = self.forward_speech_features( 414 | speech_tensors=speech_tensors.type_as(x) if speech_tensors is not None else None, 415 | speech_masks=speech_masks, 416 | speech_type=kwargs.get("speech_type", "audio"), 417 | ) 418 | if speech_tensors is not None: 419 | x[acoustic_input_mask] = speech_connect_features 420 | 421 | outputs = self.model( 422 | input_ids=None, 423 | attention_mask=attention_mask, 424 | position_ids=position_ids, 425 | past_key_values=past_key_values, 426 | inputs_embeds=x, 427 | use_cache=use_cache, 428 | output_attentions=output_attentions, 429 | output_hidden_states=False, 430 | return_dict=return_dict, 431 | cache_position=cache_position, 432 | ) 433 | 434 | hidden_states = outputs.last_hidden_state 435 | logits = self.lm_head(hidden_states) 436 | # logits = logits.float() 437 | 438 | loss = None 439 | if labels is not None: 440 | # The custom CE loss with masking is calculated in the training script. 441 | # We leave the standard loss calculation here as None. 442 | pass 443 | 444 | # --- Diffusion Loss Calculation --- 445 | diffusion_loss = None 446 | # This block is executed only if we are in a context that involves speech. 
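        # In the speech branch below: LM hidden states at acoustic_loss_mask positions
        # condition the diffusion head; Gaussian noise is added to the target speech
        # latents with the DPM-Solver scheduler at uniformly sampled timesteps (each
        # latent repeated ddpm_batch_mul times), and the head is trained with an MSE
        # loss against the noise ("epsilon") or velocity ("v_prediction") target,
        # normalized by latent size and ddpm_batch_mul.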
447 | if speech_tensors is not None and acoustic_loss_mask.sum().item() > 0: 448 | condition_features = hidden_states[acoustic_loss_mask] 449 | 450 | speech_len, latent_size = speech_features.shape 451 | 452 | noise = torch.randn( 453 | (speech_len * ddpm_batch_mul, latent_size), 454 | device=hidden_states.device, 455 | dtype=hidden_states.dtype 456 | ) 457 | 458 | timesteps = torch.multinomial( 459 | torch.ones(self.config.diffusion_head_config.ddpm_num_steps), 460 | speech_len * ddpm_batch_mul, 461 | replacement=True, 462 | ).to(hidden_states.device) 463 | 464 | speech_features_repeated = speech_features.repeat_interleave(ddpm_batch_mul, dim=0) 465 | condition_features_repeated = condition_features.repeat_interleave(ddpm_batch_mul, dim=0) 466 | 467 | noisy_speech_features = self.model.noise_scheduler.add_noise( 468 | speech_features_repeated, noise, timesteps 469 | ) 470 | 471 | model_output = self.model.prediction_head( 472 | noisy_speech_features, 473 | timesteps.type_as(x), 474 | condition_features_repeated 475 | ) 476 | 477 | prediction_type = self.config.diffusion_head_config.prediction_type 478 | if prediction_type == "epsilon": 479 | target_for_loss = noise 480 | elif prediction_type == "v_prediction": 481 | target_for_loss = self.model.noise_scheduler.get_velocity( 482 | speech_features_repeated, noise, timesteps 483 | ) 484 | else: 485 | raise NotImplementedError(f"Prediction type {prediction_type} not implemented") 486 | 487 | diffusion_loss = F.mse_loss(model_output.float(), target_for_loss.float(), reduction='sum') 488 | if latent_size > 0 and ddpm_batch_mul > 0: 489 | diffusion_loss = diffusion_loss / latent_size / ddpm_batch_mul 490 | else: 491 | diffusion_loss = torch.tensor(0.0, device=diffusion_loss.device) 492 | 493 | else: 494 | # Dummy loss for DDP to work when there are no speech samples in a batch, 495 | # but we are in a speech context. 496 | diffusion_loss = sum(p.sum() for p in self.model.prediction_head.parameters()) * 0.0 497 | diffusion_loss += sum(p.sum() for p in self.model.acoustic_connector.parameters()) * 0.0 498 | diffusion_loss += sum(p.sum() for p in self.model.semantic_connector.parameters()) * 0.0 499 | # --- End Diffusion Loss Calculation --- 500 | 501 | if not return_dict: 502 | output = (logits, speech_len) + outputs.to_tuple()[1:] 503 | return (loss, diffusion_loss) + output 504 | 505 | return VibeVoiceCausalLMOutputWithPast( 506 | loss=loss, 507 | diffusion_loss=diffusion_loss, 508 | speech_token_num=speech_len if speech_tensors is not None else 0, 509 | logits=logits, 510 | past_key_values=outputs.past_key_values, 511 | hidden_states=outputs.hidden_states, 512 | attentions=outputs.attentions, 513 | ) 514 | 515 | AutoModel.register(VibeVoiceConfig, VibeVoiceModel) 516 | AutoModelForCausalLM.register(VibeVoiceConfig, VibeVoiceForConditionalGeneration) 517 | 518 | __all__ = [ 519 | "VibeVoiceModel", 520 | "VibeVoicePreTrainedModel", 521 | "VibeVoiceForConditionalGeneration", 522 | "VibeVoiceCausalLMOutputWithPast", 523 | "VibeVoiceGenerationOutput", 524 | ] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VibeVoice ComfyUI Nodes 2 | 3 | A comprehensive ComfyUI integration for Microsoft's VibeVoice text-to-speech model, enabling high-quality single and multi-speaker voice synthesis directly within your ComfyUI workflows. 
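
A typical script for the multi-speaker node uses the `[N]:` speaker labels (1-4) and, optionally, the wrapper's `[pause]` / `[pause:ms]` tags described under Features below, for example:

```text
[1]: Welcome back to the show. [pause:500]
[2]: Thanks for having me, it's great to be here.
[1]: Let's get started!
```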
4 | 5 | ## ✨ Features 6 | 7 | ### Core Functionality 8 | - 🎤 **Single Speaker TTS**: Generate natural speech with optional voice cloning 9 | - 👥 **Multi-Speaker Conversations**: Support for up to 4 distinct speakers 10 | - 🎯 **Voice Cloning**: Clone voices from audio samples 11 | - 🎨 **LoRA Support**: Fine-tune voices with custom LoRA adapters (v1.4.0+) 12 | - 🎚️ **Voice Speed Control**: Adjust speech rate by modifying reference voice speed (v1.5.0+) 13 | - 📝 **Text File Loading**: Load scripts from text files 14 | - 📚 **Automatic Text Chunking**: Handles long texts seamlessly with configurable chunk size 15 | - ⏸️ **Custom Pause Tags**: Insert silences with `[pause]` and `[pause:ms]` tags (wrapper feature) 16 | - 🔄 **Node Chaining**: Connect multiple VibeVoice nodes for complex workflows 17 | - ⏹️ **Interruption Support**: Cancel operations before or between generations 18 | - 🔧 **Flexible Configuration**: Control temperature, sampling, and guidance scale 19 | 20 | ### Performance & Optimization 21 | - ⚡ **Attention Mechanisms**: Choose between auto, eager, sdpa, flash_attention_2 or sage 22 | - 🎛️ **Diffusion Steps**: Adjustable quality vs speed trade-off (default: 20) 23 | - 💾 **Memory Management**: Toggle automatic VRAM cleanup after generation 24 | - 🧹 **Free Memory Node**: Manual memory control for complex workflows 25 | - 🍎 **Apple Silicon Support**: Native GPU acceleration on M1/M2/M3 Macs via MPS 26 | - 🔢 **8-Bit Quantization**: Perfect audio quality with high VRAM reduction 27 | - 🔢 **4-Bit Quantization**: Maximum VRAM savings with minimal quality loss 28 | 29 | ### Compatibility & Installation 30 | - 📦 **Self-Contained**: Embedded VibeVoice code, no external dependencies 31 | - 🔄 **Universal Compatibility**: Adaptive support for transformers v4.51.3+ 32 | - 🖥️ **Cross-Platform**: Works on Windows, Linux, and macOS 33 | - 🎮 **Multi-Backend**: Supports CUDA, CPU, and MPS (Apple Silicon) 34 | 35 | ## 🎥 Video Demo 36 |
41 | Click to watch the demo video
42 |