├── example_workflow ├── Maya1_TTS-example_workflow.png └── Maya1_TTS-example_workflow.json ├── resources ├── emotions.txt └── prompt_examples.txt ├── .github └── workflows │ └── publish_action.yml ├── requirements.txt ├── pyproject.toml ├── .gitignore ├── nodes ├── __init__.py ├── maya1_tts_combined.py └── maya1_tts_barebones.py ├── core ├── __init__.py ├── chunking.py ├── snac_decoder.py ├── utils.py └── model_wrapper.py ├── __init__.py ├── js └── config.js ├── LICENSE └── README.md /example_workflow/Maya1_TTS-example_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Saganaki22/ComfyUI-Maya1_TTS/HEAD/example_workflow/Maya1_TTS-example_workflow.png -------------------------------------------------------------------------------- /resources/emotions.txt: -------------------------------------------------------------------------------- 1 | laugh 2 | laugh_harder 3 | giggle 4 | chuckle 5 | cry 6 | sigh 7 | gasp 8 | whisper 9 | angry 10 | scream 11 | snort 12 | yawn 13 | cough 14 | sneeze 15 | breathing 16 | humming 17 | throat_clearing 18 | -------------------------------------------------------------------------------- /.github/workflows/publish_action.yml: -------------------------------------------------------------------------------- 1 | name: Publish to ComfyUI Registry 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: [ pyproject.toml ] 7 | 8 | jobs: 9 | publish: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v4 14 | with: 15 | fetch-depth: 0 16 | 17 | - name: Publish Custom Node to Registry 18 | uses: Comfy-Org/publish-node-action@main 19 | with: 20 | personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies for Maya1 TTS ComfyUI integration 2 | torch>=2.0.0 3 | transformers>=4.40.0 4 | numpy>=1.24.0 5 | 6 | # SNAC audio codec 7 | snac>=1.0.0 8 | 9 | # Audio processing 10 | soundfile>=0.12.0 11 | 12 | # Optional: Accelerated attention mechanisms (uncomment to enable) 13 | # flash-attn>=2.5.0 # Flash Attention 2 (fastest, CUDA only) 14 | # sageattention>=1.0.0 # Sage Attention (memory efficient) 15 | 16 | # Optional: Performance optimization 17 | # accelerate>=0.20.0 # For better device management 18 | # xformers>=0.0.20 # Additional attention optimizations 19 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "ComfyUI-Maya1_TTS" 3 | description = "ComfyUI node for Maya1 TTS - Expressive voice generation with 20+ emotions, voice design, and SNAC neural codec" 4 | version = "1.0.6" 5 | license = {file = "LICENSE"} 6 | dependencies = [ 7 | "torch>=2.0.0", 8 | "transformers>=4.50.0", 9 | "numpy>=1.21.0", 10 | "snac>=1.0.0", 11 | ] 12 | 13 | [project.urls] 14 | Repository = "https://github.com/Saganaki22/ComfyUI-Maya1_TTS" 15 | # Used by Comfy Registry https://comfyregistry.org 16 | 17 | [tool.comfy] 18 | PublisherId = "saganaki22" 19 | DisplayName = "Maya1 TTS" 20 | Icon = "https://avatars.githubusercontent.com/u/84208527" 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 
Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Virtual environments 24 | venv/ 25 | ENV/ 26 | env/ 27 | 28 | # IDEs 29 | .vscode/ 30 | .idea/ 31 | *.swp 32 | *.swo 33 | *~ 34 | 35 | # OS 36 | .DS_Store 37 | Thumbs.db 38 | 39 | # Model cache (users should download models separately) 40 | models/ 41 | *.safetensors 42 | *.bin 43 | *.pth 44 | 45 | # Audio outputs (for testing) 46 | *.wav 47 | *.mp3 48 | *.ogg 49 | 50 | # Logs 51 | *.log 52 | 53 | # Temporary files 54 | tmp/ 55 | temp/ 56 | -------------------------------------------------------------------------------- /nodes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | ComfyUI nodes for Maya1 TTS. 3 | """ 4 | 5 | from .maya1_tts_combined import ( 6 | Maya1TTSCombinedNode, 7 | NODE_CLASS_MAPPINGS as COMBINED_MAPPINGS, 8 | NODE_DISPLAY_NAME_MAPPINGS as COMBINED_DISPLAY_MAPPINGS 9 | ) 10 | 11 | from .maya1_tts_barebones import ( 12 | Maya1TTSBarebonesNode, 13 | NODE_CLASS_MAPPINGS as BAREBONES_MAPPINGS, 14 | NODE_DISPLAY_NAME_MAPPINGS as BAREBONES_DISPLAY_MAPPINGS 15 | ) 16 | 17 | # Merge the mappings from both nodes 18 | NODE_CLASS_MAPPINGS = {**COMBINED_MAPPINGS, **BAREBONES_MAPPINGS} 19 | NODE_DISPLAY_NAME_MAPPINGS = {**COMBINED_DISPLAY_MAPPINGS, **BAREBONES_DISPLAY_MAPPINGS} 20 | 21 | __all__ = [ 22 | "Maya1TTSCombinedNode", 23 | "Maya1TTSBarebonesNode", 24 | "NODE_CLASS_MAPPINGS", 25 | "NODE_DISPLAY_NAME_MAPPINGS", 26 | ] 27 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core modules for Maya1 TTS ComfyUI integration. 3 | """ 4 | 5 | from .model_wrapper import Maya1Model, Maya1ModelLoader 6 | from .snac_decoder import SNACDecoder 7 | from .chunking import ( 8 | smart_chunk_text, 9 | estimate_tokens_for_text, 10 | should_chunk_text 11 | ) 12 | from .utils import ( 13 | discover_maya1_models, 14 | get_model_path, 15 | get_maya1_models_dir, 16 | load_emotions_list, 17 | format_prompt, 18 | check_interruption, 19 | ProgressCallback, 20 | crossfade_audio 21 | ) 22 | 23 | __all__ = [ 24 | "Maya1Model", 25 | "Maya1ModelLoader", 26 | "SNACDecoder", 27 | "smart_chunk_text", 28 | "estimate_tokens_for_text", 29 | "should_chunk_text", 30 | "discover_maya1_models", 31 | "get_model_path", 32 | "get_maya1_models_dir", 33 | "load_emotions_list", 34 | "format_prompt", 35 | "check_interruption", 36 | "ProgressCallback", 37 | "crossfade_audio", 38 | ] 39 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | ComfyUI-Maya1_TTS: Maya1 Text-to-Speech Integration for ComfyUI 3 | 4 | Maya1 is a 3B-parameter speech model built for expressive voice generation 5 | with rich human emotion and precise voice design. 
6 | 7 | Features: 8 | - Voice design through natural language descriptions 9 | - 20+ emotions: laugh, cry, whisper, angry, sigh, gasp, and more 10 | - Real-time streaming with SNAC neural codec 11 | - Multiple attention mechanisms: SDPA, Flash Attention 2, Sage Attention 12 | - Native ComfyUI cancel support 13 | 14 | Author: Maya Research 15 | License: Apache 2.0 16 | """ 17 | 18 | import os 19 | from .nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS 20 | from .core.model_wrapper import Maya1ModelLoader 21 | 22 | __version__ = "1.0.6" 23 | 24 | # ComfyUI requires these exports 25 | __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS", "WEB_DIRECTORY"] 26 | 27 | # Tell ComfyUI where to find our JavaScript extensions 28 | WEB_DIRECTORY = "./js" 29 | 30 | # Note: VRAM management is controlled by the keep_model_in_vram toggle in the node 31 | # Maya1 models are kept in a separate cache and are not affected by ComfyUI's 32 | # "Unload Models" button. Use the toggle in the node to control VRAM usage. 33 | 34 | # Print banner on load 35 | print("=" * 70) 36 | print("🎤 ComfyUI-Maya1_TTS") 37 | print(" Expressive Voice Generation with Emotions") 38 | print("=" * 70) 39 | print("📦 Nodes loaded:") 40 | for node_name in NODE_CLASS_MAPPINGS.keys(): 41 | display_name = NODE_DISPLAY_NAME_MAPPINGS.get(node_name, node_name) 42 | print(f" • {display_name} ({node_name})") 43 | print("=" * 70) 44 | -------------------------------------------------------------------------------- /resources/prompt_examples.txt: -------------------------------------------------------------------------------- 1 | # Maya1 Voice Description Examples 2 | 3 | ## Basic Voice Descriptions 4 | 5 | ### Female Voices 6 | - Female, in her 30s with an American accent and is an event host, energetic, clear diction 7 | - Female voice in their 20s with a British accent. High pitch, warm timbre, fast pacing, happy tone 8 | - Mythical godlike magical character, Female voice in their 30s slow pacing, curious tone at medium intensity 9 | 10 | ### Male Voices 11 | - Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing 12 | - Dark villain character, Male voice in their 40s with a British accent. low pitch, gravelly timbre, slow pacing, angry tone at high intensity 13 | - Demon character, Male voice in their 30s with a Middle Eastern accent. screaming tone at high intensity 14 | 15 | ## Emotion Tag Examples 16 | 17 | ### Text with Emotions 18 | - Hello! This is Maya1 the best open source voice AI model with emotions. 19 | - Wow. This place looks even better than I imagined. How did they set all this up so perfectly? 20 | - Welcome back to another episode of our podcast! Today we are diving into an absolutely fascinating topic 21 | - After all we went through to pull him out of that mess I can't believe he was the traitor 22 | - You dare challenge me, mortal how amusing. 
Your kind always thinks they can win 23 | - I can't believe you did that we talked about this so many times already 24 | 25 | ## Available Emotion Tags 26 | <laugh>, <laugh_harder>, <giggle>, <chuckle>, <cry>, <sigh>, <gasp>, <whisper>, <angry>, <scream>, <snort>, <yawn>, <cough>, <sneeze>, <breathing>, <humming>, <throat_clearing> 27 | 28 | ## Voice Description Components 29 | 30 | ### Age 31 | - in their 20s 32 | - in their 30s 33 | - in their 40s 34 | - in their 50s 35 | 36 | ### Accent 37 | - American accent 38 | - British accent 39 | - Middle Eastern accent 40 | - Australian accent 41 | - Indian accent 42 | 43 | ### Pitch 44 | - high pitch 45 | - normal pitch 46 | - low pitch 47 | 48 | ### Timbre 49 | - warm timbre 50 | - gravelly timbre 51 | - smooth timbre 52 | - raspy timbre 53 | 54 | ### Pacing 55 | - fast pacing 56 | - conversational pacing 57 | - slow pacing 58 | 59 | ### Tone & Intensity 60 | - happy tone at high intensity 61 | - angry tone at medium intensity 62 | - curious tone at low intensity 63 | - energetic 64 | - calm 65 | - dramatic 66 | -------------------------------------------------------------------------------- /core/chunking.py: -------------------------------------------------------------------------------- 1 | """ 2 | Smart text chunking utilities for Maya1 TTS. 3 | Handles splitting long texts into manageable chunks for generation. 4 | """ 5 | 6 | import re 7 | from typing import List 8 | 9 | 10 | def smart_chunk_text(text: str, max_chunk_chars: int = 200) -> List[str]: 11 | """ 12 | Split text into chunks at sentence boundaries for natural TTS. 13 | 14 | Tries to split at: 15 | 1. Sentence boundaries (. ! ?) 16 | 2. Clause boundaries (, ; :) 17 | 3. Word boundaries (spaces) 18 | 19 | Args: 20 | text: Full text to chunk 21 | max_chunk_chars: Maximum characters per chunk (soft limit) 22 | 23 | Returns: 24 | List of text chunks 25 | """ 26 | if len(text) <= max_chunk_chars: 27 | return [text] 28 | 29 | chunks = [] 30 | 31 | # Split on sentence boundaries first 32 | sentence_pattern = r'(?<=[.!?])\s+' 33 | sentences = re.split(sentence_pattern, text) 34 | 35 | current_chunk = "" 36 | 37 | for sentence in sentences: 38 | # If sentence itself is too long, split it further 39 | if len(sentence) > max_chunk_chars: 40 | # First, save current chunk if exists 41 | if current_chunk: 42 | chunks.append(current_chunk.strip()) 43 | current_chunk = "" 44 | 45 | # Split long sentence on clause boundaries 46 | clause_pattern = r'(?<=[,;:])\s+' 47 | clauses = re.split(clause_pattern, sentence) 48 | 49 | for clause in clauses: 50 | # If clause is still too long, split on words 51 | if len(clause) > max_chunk_chars: 52 | if current_chunk: 53 | chunks.append(current_chunk.strip()) 54 | current_chunk = "" 55 | 56 | words = clause.split() 57 | for word in words: 58 | if len(current_chunk) + len(word) + 1 > max_chunk_chars: 59 | if current_chunk: 60 | chunks.append(current_chunk.strip()) 61 | current_chunk = word 62 | else: 63 | current_chunk += (" " if current_chunk else "") + word 64 | else: 65 | # Add clause to current chunk 66 | if len(current_chunk) + len(clause) + 1 > max_chunk_chars: 67 | chunks.append(current_chunk.strip()) 68 | current_chunk = clause 69 | else: 70 | current_chunk += (" " if current_chunk else "") + clause 71 | else: 72 | # Try to add sentence to current chunk 73 | if len(current_chunk) + len(sentence) + 1 > max_chunk_chars: 74 | # Current chunk is full, save it 75 | if current_chunk: 76 | chunks.append(current_chunk.strip()) 77 | current_chunk = sentence 78 | else: 79 | # Add sentence to current chunk 80 | current_chunk += (" " if current_chunk else "") + sentence 81 | 82 | # Add remaining chunk 83 | 
if current_chunk: 84 | chunks.append(current_chunk.strip()) 85 | 86 | return chunks 87 | 88 | 89 | def estimate_tokens_for_text(text: str) -> int: 90 | """ 91 | Rough estimate of how many tokens the text will generate. 92 | 93 | Maya1 typically uses: 94 | - ~1 text token per word 95 | - ~7 SNAC tokens per frame 96 | - ~0.021 seconds per frame 97 | - Roughly 350 SNAC tokens per second of audio 98 | 99 | Args: 100 | text: Input text 101 | 102 | Returns: 103 | Estimated number of tokens 104 | """ 105 | # Rough heuristic: 1 word = 3-4 SNAC frames = ~25 tokens 106 | word_count = len(text.split()) 107 | estimated_tokens = word_count * 25 108 | 109 | return estimated_tokens 110 | 111 | 112 | def should_chunk_text(text: str, max_tokens: int) -> bool: 113 | """ 114 | Determine if text should be chunked based on estimated token count. 115 | 116 | Args: 117 | text: Input text 118 | max_tokens: Maximum tokens allowed per generation 119 | 120 | Returns: 121 | True if text should be chunked 122 | """ 123 | estimated = estimate_tokens_for_text(text) 124 | 125 | # Use 80% of max_tokens as threshold to be safe 126 | threshold = int(max_tokens * 0.8) 127 | 128 | return estimated > threshold 129 | -------------------------------------------------------------------------------- /js/config.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Maya1 TTS Configuration 3 | * Tooltips, Character Presets, and Emotion Tags 4 | */ 5 | 6 | export const tooltips = { 7 | // Model settings 8 | model_name: "Select your downloaded Maya1 model from models/maya1-TTS/", 9 | dtype: "Model precision:\n• float16: Fast, 8GB VRAM, good quality\n• bfloat16: Most stable, 8GB VRAM (recommended)\n• float32: Best quality, 16GB VRAM, slower\n• 4bit/8bit: Saves VRAM but slower generation", 10 | attention_mechanism: "Attention implementation:\n• sdpa: Default, fast, works everywhere\n• flash_attention_2: Fastest (requires: pip install flash-attn)\n• sage_attention: Memory efficient for long sequences", 11 | device: "Hardware to run on:\n• cuda: Use GPU (fast, needs VRAM)\n• cpu: Use CPU only (slow, no VRAM needed)", 12 | 13 | // Voice and text 14 | voice_description: "Describe the voice characteristics:\n• Age, gender, accent\n• Tone (warm, cold, energetic)\n• Pacing (slow, conversational, fast)\n• Timbre (deep, high-pitched, raspy)\n\nExample: 'Female in 20s, British accent, warm tone, conversational pacing'\n\n💡 Ctrl+Enter to save | Escape to cancel | Click outside to save\n⏎ Enter for new line", 15 | text: "Your text to speak. 
Can include emotion tags like:\n<laugh> <whisper> <angry>\n\nClick emotion buttons below to insert tags easily!\n\n💡 Ctrl+Enter to save | Escape to cancel | Click outside to save\n⏎ Enter for new line\n⛶ Click expand button for longform text editor", 16 | 17 | // Generation settings 18 | keep_model_in_vram: "Keep model loaded after generation:\n• ON: Faster repeated generations (uses 8-16GB VRAM)\n• OFF: Clears VRAM after each generation", 19 | temperature: "Controls randomness (0.1-2.0):\n• 0.4: Recommended, balanced\n• Lower (0.1-0.3): More consistent, robotic\n• Higher (0.5-1.0): More creative, varied", 20 | top_p: "Nucleus sampling (0.1-1.0):\n• 0.9: Recommended, natural speech\n• Lower: More focused, less variety\n• Higher: More diverse but less coherent", 21 | max_new_tokens: "Maximum NEW audio tokens to generate (excludes input prompt):\n• ~500 tokens ≈ 10 seconds\n• ~1000 tokens ≈ 20 seconds\n• ~2000 tokens ≈ 40 seconds\n• ~4000 tokens ≈ 80 seconds\n\nFor longform chunking: Each chunk respects this limit", 22 | repetition_penalty: "Prevents repetitive patterns:\n• 1.1: Recommended\n• Higher (1.2-1.5): Reduces loops but may affect quality\n• 1.0: No penalty (may loop)", 23 | seed: "Random seed for reproducibility:\n• 0: Random output each time\n• Fixed number (e.g., 42): Same output with same inputs", 24 | chunk_longform: "⚠️ EXPERIMENTAL: Auto-split long text:\n• ON: Splits text >80 words at sentences, combines audio\n• OFF: Generates entire text at once (may fail if too long)", 25 | debug_mode: "Console output verbosity:\n• ON: Shows detailed info (token IDs, timings, stats)\n• OFF: Shows only essentials (seed, VRAM, progress)", 26 | 27 | // Emotion tag insert dropdown 28 | emotion_tag_insert: "Legacy emotion tag selector\n(Use clickable buttons below instead!)" 29 | }; 30 | 31 | export const characterPresets = [ 32 | { 33 | emoji: "♂️", 34 | name: "Male US", 35 | description: "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing." 36 | }, 37 | { 38 | emoji: "♀️", 39 | name: "Female UK", 40 | description: "Realistic female voice in the 20s age with british accent. Normal pitch, warm timbre, conversational pacing." 41 | }, 42 | { 43 | emoji: "🎙️", 44 | name: "Announcer", 45 | description: "Professional male announcer voice in the 40s age with american accent. Rich pitch, powerful timbre, clear measured pacing." 46 | }, 47 | { 48 | emoji: "🤖", 49 | name: "Robot", 50 | description: "Robotic AI voice, neutral gender in synthetic age. Monotone pitch, metallic timbre, precise mechanical pacing, emotionless delivery." 51 | }, 52 | { 53 | emoji: "😈", 54 | name: "Demon", 55 | description: "Demonic entity voice, deep male in unknown age with hellish accent. Very low pitch, gravelly timbre, menacing pacing, evil tone." 
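// Hypothetical example (not part of the shipped preset list): presets are plain
// { emoji, name, description } objects, so a custom voice can be added by
// appending another entry to this array, e.g.:
// { emoji: "🧙", name: "Wizard", description: "Elderly wizard character. Male voice in his 70s with a British accent. Low pitch, raspy timbre, slow pacing." }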
56 | } 57 | ] 58 | 59 | // All emotion tags use the same purple gradient color for consistency 60 | const EMOTION_COLOR = "#667eea"; // Purple accent matching theme 61 | 62 | export const emotionTags = [ 63 | { tag: "<laugh>", display: "laugh", color: EMOTION_COLOR }, 64 | { tag: "<laugh_harder>", display: "laugh harder", color: EMOTION_COLOR }, 65 | { tag: "<chuckle>", display: "chuckle", color: EMOTION_COLOR }, 66 | { tag: "<giggle>", display: "giggle", color: EMOTION_COLOR }, 67 | { tag: "<sigh>", display: "sigh", color: EMOTION_COLOR }, 68 | { tag: "<gasp>", display: "gasp", color: EMOTION_COLOR }, 69 | { tag: "<angry>", display: "angry", color: EMOTION_COLOR }, 70 | { tag: "<excited>", display: "excited", color: EMOTION_COLOR }, 71 | { tag: "<whisper>", display: "whisper", color: EMOTION_COLOR }, 72 | { tag: "<cry>", display: "cry", color: EMOTION_COLOR }, 73 | { tag: "<scream>", display: "scream", color: EMOTION_COLOR }, 74 | { tag: "<sing>", display: "sing", color: EMOTION_COLOR }, 75 | { tag: "<snort>", display: "snort", color: EMOTION_COLOR }, 76 | { tag: "<exhale>", display: "exhale", color: EMOTION_COLOR }, 77 | { tag: "<gulp>", display: "gulp", color: EMOTION_COLOR }, 78 | { tag: "<sarcastic>", display: "sarcastic", color: EMOTION_COLOR } 79 | ]; 80 | -------------------------------------------------------------------------------- /core/snac_decoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | SNAC (Speech Neural Audio Codec) decoder for Maya1 TTS. 3 | Handles unpacking 7-token frames and decoding to 24kHz audio. 4 | """ 5 | 6 | import torch 7 | import numpy as np 8 | from typing import List, Tuple 9 | 10 | 11 | # Maya1 SNAC token range: 128266 to 156937 12 | SNAC_TOKEN_START = 128266 13 | SNAC_TOKEN_END = 156937 14 | SNAC_CODEBOOK_SIZE = 4096 # Each level uses 4096 codes 15 | 16 | 17 | def is_snac_token(token_id: int) -> bool: 18 | """ 19 | Check if a token ID is a SNAC audio token. 20 | 21 | Args: 22 | token_id: Token ID to check 23 | 24 | Returns: 25 | True if the token is a SNAC token 26 | """ 27 | return SNAC_TOKEN_START <= token_id <= SNAC_TOKEN_END 28 | 29 | 30 | def filter_snac_tokens(token_ids: List[int]) -> List[int]: 31 | """ 32 | Filter only SNAC tokens from a list of token IDs. 33 | 34 | Args: 35 | token_ids: List of token IDs (may include text tokens) 36 | 37 | Returns: 38 | List of only SNAC tokens 39 | """ 40 | return [t for t in token_ids if is_snac_token(t)] 41 | 42 | 43 | def unpack_snac_tokens(snac_tokens: List[int]) -> Tuple[List[List[int]], int]: 44 | """ 45 | Unpack 7-token SNAC frames into 3 hierarchical codebook levels. 46 | 47 | Maya1 packs SNAC codes into 7 tokens per frame: 48 | - Frame: [slot0, slot1, slot2, slot3, slot4, slot5, slot6] 49 | - L1 (12Hz): slot0 50 | - L2 (23Hz): slot1, slot4 51 | - L3 (47Hz): slot2, slot3, slot5, slot6 52 | 53 | Args: 54 | snac_tokens: List of SNAC token IDs (should be multiple of 7) 55 | 56 | Returns: 57 | Tuple of (codes, num_frames): 58 | - codes: List of 3 lists [L1, L2, L3] with unpacked codes 59 | - num_frames: Number of frames processed 60 | """ 61 | num_frames = len(snac_tokens) // 7 62 | 63 | if len(snac_tokens) % 7 != 0: 64 | print(f"⚠️ Warning: SNAC tokens ({len(snac_tokens)}) not divisible by 7. 
" 65 | f"Truncating to {num_frames * 7} tokens.") 66 | 67 | # Initialize codebook levels 68 | l1_codes = [] # 1 code per frame (12 Hz) 69 | l2_codes = [] # 2 codes per frame (23 Hz) 70 | l3_codes = [] # 4 codes per frame (47 Hz) 71 | 72 | for i in range(num_frames): 73 | # Extract 7 tokens for this frame 74 | frame_start = i * 7 75 | slots = snac_tokens[frame_start:frame_start + 7] 76 | 77 | # Unpack to codebook indices (subtract offset and mod by codebook size) 78 | l1_codes.append((slots[0] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 79 | 80 | l2_codes.append((slots[1] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 81 | l2_codes.append((slots[4] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 82 | 83 | l3_codes.append((slots[2] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 84 | l3_codes.append((slots[3] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 85 | l3_codes.append((slots[5] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 86 | l3_codes.append((slots[6] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 87 | 88 | codes = [l1_codes, l2_codes, l3_codes] 89 | return codes, num_frames 90 | 91 | 92 | def decode_snac_to_audio(codes: List[List[int]], snac_model, device: str = "cuda") -> np.ndarray: 93 | """ 94 | Decode SNAC codes to audio waveform using the SNAC decoder. 95 | 96 | Args: 97 | codes: List of 3 lists [L1, L2, L3] with unpacked codes 98 | snac_model: Loaded SNAC model with decoder 99 | device: Device to run decoding on 100 | 101 | Returns: 102 | Audio waveform as numpy array (24kHz, mono, float32) 103 | """ 104 | # Convert codes to tensors 105 | codes_tensor = [ 106 | torch.tensor(level_codes, dtype=torch.long, device=device).unsqueeze(0) 107 | for level_codes in codes 108 | ] 109 | 110 | # Decode using SNAC quantizer + decoder 111 | with torch.inference_mode(): 112 | quantized = snac_model.quantizer.from_codes(codes_tensor) 113 | audio_tensor = snac_model.decoder(quantized) 114 | 115 | # Extract audio: shape is [batch, channels, samples] 116 | audio = audio_tensor[0, 0].cpu().numpy() 117 | 118 | # Trim warmup samples (first 2048 samples) - from official transformers_inference.py 119 | if len(audio) > 2048: 120 | audio = audio[2048:] 121 | 122 | return audio 123 | 124 | 125 | class SNACDecoder: 126 | """ 127 | Wrapper class for SNAC decoding with model caching. 128 | """ 129 | 130 | _cached_model = None 131 | _cached_device = None 132 | 133 | @classmethod 134 | def load_snac_model(cls, device: str = "cuda"): 135 | """ 136 | Load SNAC 24kHz model with caching. 137 | 138 | Args: 139 | device: Device to load model on 140 | 141 | Returns: 142 | Loaded SNAC model 143 | """ 144 | # Return cached model if available 145 | if cls._cached_model is not None and cls._cached_device == device: 146 | return cls._cached_model 147 | 148 | print("📦 Loading SNAC 24kHz decoder...") 149 | 150 | try: 151 | from snac import SNAC 152 | except ImportError: 153 | raise ImportError( 154 | "SNAC package not found. Install with: pip install snac\n" 155 | "GitHub: https://github.com/hubertsiuzdak/snac" 156 | ) 157 | 158 | # Load SNAC 24kHz model 159 | snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device) 160 | 161 | # Cache the model 162 | cls._cached_model = snac_model 163 | cls._cached_device = device 164 | 165 | print(f"✅ SNAC decoder loaded on {device}") 166 | 167 | return snac_model 168 | 169 | @classmethod 170 | def decode(cls, snac_tokens: List[int], device: str = "cuda") -> np.ndarray: 171 | """ 172 | Full pipeline: filter tokens → unpack → decode to audio. 
173 | 174 | Args: 175 | snac_tokens: List of SNAC token IDs 176 | device: Device to run on 177 | 178 | Returns: 179 | Audio waveform as numpy array (24kHz, mono, float32) 180 | """ 181 | # Load SNAC model (cached) 182 | snac_model = cls.load_snac_model(device) 183 | 184 | # Unpack tokens to codes 185 | codes, num_frames = unpack_snac_tokens(snac_tokens) 186 | 187 | if num_frames == 0: 188 | print("⚠️ No SNAC frames to decode!") 189 | return np.zeros(0, dtype=np.float32) 190 | 191 | print(f"🎵 Decoding {num_frames} SNAC frames (~{num_frames * 0.021:.2f}s audio)...") 192 | 193 | # Decode to audio 194 | audio = decode_snac_to_audio(codes, snac_model, device) 195 | 196 | return audio 197 | -------------------------------------------------------------------------------- /core/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for Maya1 TTS ComfyUI nodes. 3 | Includes ComfyUI-native cancel support and progress tracking. 4 | """ 5 | 6 | import os 7 | from pathlib import Path 8 | from typing import List, Optional 9 | 10 | 11 | def get_maya1_models_dir() -> Path: 12 | """ 13 | Get the Maya1 models directory within ComfyUI's models folder. 14 | 15 | Returns: 16 | Path to ComfyUI/models/maya1-TTS/ 17 | """ 18 | try: 19 | # Try to use ComfyUI's folder_paths 20 | import folder_paths 21 | comfyui_models_dir = Path(folder_paths.models_dir) 22 | except Exception: 23 | # Fallback: try to detect ComfyUI directory 24 | # Look for ComfyUI installation in common locations 25 | current_file = Path(__file__).resolve() 26 | 27 | # Navigate up from custom_nodes/ComfyUI-Maya1_TTS/core/utils.py 28 | # to find ComfyUI root (should have a 'models' folder) 29 | for parent in current_file.parents: 30 | if (parent / "models").exists() and (parent / "custom_nodes").exists(): 31 | comfyui_models_dir = parent / "models" 32 | break 33 | else: 34 | # Ultimate fallback: use current directory 35 | comfyui_models_dir = Path.cwd() / "models" 36 | 37 | maya1_models_dir = comfyui_models_dir / "maya1-TTS" 38 | return maya1_models_dir 39 | 40 | 41 | def discover_maya1_models() -> List[str]: 42 | """ 43 | Scan ComfyUI/models/maya1-TTS/ for available Maya1 models. 
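    For example (hypothetical layout), if the folder contains
    models/maya1-TTS/maya1/config.json, the scan reports that model:

        discover_maya1_models()  # -> ["maya1"]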
44 | 45 | Returns: 46 | List of model directory names (relative to ComfyUI/models/maya1-TTS/) 47 | """ 48 | models_dir = get_maya1_models_dir() 49 | 50 | if not models_dir.exists(): 51 | print(f"⚠️ Maya1 models directory not found: {models_dir}") 52 | print(f"💡 Create it and download models with:") 53 | print(f" mkdir -p {models_dir}") 54 | print(f" huggingface-cli download maya-research/maya1 --local-dir {models_dir}/maya1") 55 | return ["(No models folder found - see console for instructions)"] 56 | 57 | # Find directories with config.json (HuggingFace model format) 58 | models = [] 59 | for item in models_dir.iterdir(): 60 | if item.is_dir(): 61 | # Check for config.json in root or in a checkpoint subdirectory 62 | if (item / "config.json").exists(): 63 | models.append(item.name) 64 | elif any((ckpt / "config.json").exists() for ckpt in item.glob("checkpoint-*")): 65 | models.append(item.name) 66 | 67 | if not models: 68 | print(f"⚠️ No valid Maya1 models found in {models_dir}") 69 | print(f"💡 Download a model with:") 70 | print(f" huggingface-cli download maya-research/maya1 --local-dir {models_dir}/maya1") 71 | return ["(No valid models found - see console for instructions)"] 72 | 73 | return sorted(models) 74 | 75 | 76 | def get_model_path(model_name: str) -> Path: 77 | """ 78 | Get the full path to a model directory. 79 | 80 | Args: 81 | model_name: Name of the model folder 82 | 83 | Returns: 84 | Full path to the model directory 85 | """ 86 | return get_maya1_models_dir() / model_name 87 | 88 | 89 | def load_emotions_list() -> List[str]: 90 | """ 91 | Load the list of supported emotion tags from resources/emotions.txt. 92 | 93 | Returns: 94 | List of emotion tag names (without angle brackets) 95 | """ 96 | emotions_file = Path(__file__).parent.parent / "resources" / "emotions.txt" 97 | 98 | if not emotions_file.exists(): 99 | # Fallback list if file doesn't exist 100 | return [ 101 | "laugh", "laugh_harder", "giggle", "chuckle", "cry", "sigh", 102 | "gasp", "whisper", "angry", "scream", "snort", "yawn", 103 | "cough", "sneeze", "breathing", "humming", "throat_clearing" 104 | ] 105 | 106 | with open(emotions_file, 'r') as f: 107 | emotions = [line.strip() for line in f if line.strip()] 108 | 109 | return emotions 110 | 111 | 112 | def format_prompt(voice_description: str, text: str) -> str: 113 | """ 114 | Format the prompt using Maya1's expected format with chat template. 115 | 116 | Args: 117 | voice_description: Natural language voice description 118 | text: Text to synthesize (may contain emotion tags) 119 | 120 | Returns: 121 | Formatted prompt string 122 | """ 123 | # Maya1 uses a chat-like format with system/user messages 124 | # The voice description acts as the "system" instruction 125 | # The text to synthesize is the "user" message 126 | 127 | # Format as a conversation to trigger audio generation 128 | prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> 129 | 130 | You are a voice synthesis system. Generate natural speech audio using SNAC codes for the following voice characteristics: {voice_description}<|eot_id|><|start_header_id|>user<|end_header_id|> 131 | 132 | {text}<|eot_id|><|start_header_id|>assistant<|end_header_id|> 133 | 134 | """ 135 | 136 | return prompt 137 | 138 | 139 | def check_interruption(): 140 | """ 141 | Check if ComfyUI has requested interruption. 142 | Raises an exception if cancellation was requested. 143 | 144 | This integrates with ComfyUI's native cancel functionality. 
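    Typical call pattern (sketch only; the loop body is illustrative):

        for step in range(max_new_tokens):
            check_interruption()          # raises InterruptedError if Cancel was pressed
            next_token = sample_step()    # hypothetical per-step generation work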
145 | """ 146 | try: 147 | # Try to import ComfyUI's execution module 148 | import execution 149 | if hasattr(execution, 'interruption_requested') and execution.interruption_requested(): 150 | raise InterruptedError("🛑 Generation cancelled by user") 151 | except ImportError: 152 | # If ComfyUI modules aren't available (e.g., testing), just continue 153 | pass 154 | except InterruptedError: 155 | # Re-raise interruption errors 156 | raise 157 | except Exception as e: 158 | # Silently ignore other errors (module might not have the attribute in older versions) 159 | pass 160 | 161 | 162 | class ProgressCallback: 163 | """ 164 | Progress tracking callback for ComfyUI integration. 165 | Shows generation progress in the ComfyUI UI. 166 | """ 167 | 168 | def __init__(self, total_steps: int, desc: str = "Generating"): 169 | self.total_steps = total_steps 170 | self.current_step = 0 171 | self.desc = desc 172 | self.pbar = None 173 | 174 | # Try to use ComfyUI's progress bar 175 | try: 176 | from comfy.utils import ProgressBar 177 | self.pbar = ProgressBar(total_steps) 178 | except ImportError: 179 | # Fallback: just print progress 180 | self.pbar = None 181 | 182 | def update(self, steps: int = 1): 183 | """Update progress by the specified number of steps.""" 184 | self.current_step += steps 185 | 186 | if self.pbar is not None: 187 | self.pbar.update(steps) 188 | else: 189 | # Fallback: print percentage 190 | if self.current_step % max(1, self.total_steps // 10) == 0: 191 | pct = (self.current_step / self.total_steps) * 100 192 | print(f"⏳ {self.desc}: {pct:.1f}%") 193 | 194 | # Check for cancellation on each update 195 | check_interruption() 196 | 197 | def close(self): 198 | """Close the progress bar.""" 199 | if self.pbar is not None: 200 | self.pbar.update(self.total_steps - self.current_step) 201 | 202 | 203 | def crossfade_audio(audio1, audio2, crossfade_samples: int = 1200): 204 | """ 205 | Crossfade two audio arrays for smooth transitions. 
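    The overlap is mixed with complementary linear ramps,
    out[i] = a1[i] * (1 - t[i]) + a2[i] * t[i], where t rises from 0 to 1
    across the crossfade window. A quick sketch with dummy signals:

        import numpy as np
        a = np.ones(24000, dtype=np.float32)     # 1 second at 24kHz
        b = np.zeros(24000, dtype=np.float32)
        out = crossfade_audio(a, b)              # len(out) == 48000 - 1200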
206 | 207 | Args: 208 | audio1: First audio array (numpy or torch) 209 | audio2: Second audio array (numpy or torch) 210 | crossfade_samples: Number of samples to crossfade (default 1200 = 50ms at 24kHz) 211 | 212 | Returns: 213 | Crossfaded audio array 214 | """ 215 | import numpy as np 216 | import torch 217 | 218 | # Convert to numpy for processing 219 | is_torch = False 220 | if isinstance(audio1, torch.Tensor): 221 | is_torch = True 222 | audio1_np = audio1.cpu().numpy() 223 | audio2_np = audio2.cpu().numpy() 224 | else: 225 | audio1_np = audio1 226 | audio2_np = audio2 227 | 228 | # Handle different shapes: [batch, channels, samples] or [samples] 229 | if audio1_np.ndim == 3: 230 | # Shape: [batch, channels, samples] 231 | batch, channels, samples1 = audio1_np.shape 232 | samples2 = audio2_np.shape[2] 233 | 234 | # Ensure crossfade_samples doesn't exceed audio length 235 | crossfade_samples = min(crossfade_samples, samples1, samples2) 236 | 237 | if crossfade_samples > 0: 238 | # Create fade curves 239 | fade_out = np.linspace(1.0, 0.0, crossfade_samples).reshape(1, 1, -1) 240 | fade_in = np.linspace(0.0, 1.0, crossfade_samples).reshape(1, 1, -1) 241 | 242 | # Apply crossfade to overlapping region 243 | audio1_fade = audio1_np.copy() 244 | audio1_fade[:, :, -crossfade_samples:] *= fade_out 245 | 246 | audio2_fade = audio2_np.copy() 247 | audio2_fade[:, :, :crossfade_samples] *= fade_in 248 | 249 | # Combine: audio1 (minus fade region) + crossfade + audio2 (minus fade region) 250 | result = np.concatenate([ 251 | audio1_fade[:, :, :-crossfade_samples], 252 | audio1_fade[:, :, -crossfade_samples:] + audio2_fade[:, :, :crossfade_samples], 253 | audio2_fade[:, :, crossfade_samples:] 254 | ], axis=2) 255 | else: 256 | # No crossfade, just concatenate 257 | result = np.concatenate([audio1_np, audio2_np], axis=2) 258 | 259 | elif audio1_np.ndim == 1: 260 | # Shape: [samples] 261 | samples1 = len(audio1_np) 262 | samples2 = len(audio2_np) 263 | 264 | crossfade_samples = min(crossfade_samples, samples1, samples2) 265 | 266 | if crossfade_samples > 0: 267 | fade_out = np.linspace(1.0, 0.0, crossfade_samples) 268 | fade_in = np.linspace(0.0, 1.0, crossfade_samples) 269 | 270 | audio1_fade = audio1_np.copy() 271 | audio1_fade[-crossfade_samples:] *= fade_out 272 | 273 | audio2_fade = audio2_np.copy() 274 | audio2_fade[:crossfade_samples] *= fade_in 275 | 276 | result = np.concatenate([ 277 | audio1_fade[:-crossfade_samples], 278 | audio1_fade[-crossfade_samples:] + audio2_fade[:crossfade_samples], 279 | audio2_fade[crossfade_samples:] 280 | ]) 281 | else: 282 | result = np.concatenate([audio1_np, audio2_np]) 283 | 284 | else: 285 | raise ValueError(f"Unexpected audio shape: {audio1_np.shape}") 286 | 287 | # Convert back to torch if needed 288 | if is_torch: 289 | result = torch.from_numpy(result) 290 | 291 | return result 292 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2025 Saganaki22 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /example_workflow/Maya1_TTS-example_workflow.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "418a17f1-fce3-46a3-8e6d-035b3399740f", 3 | "revision": 0, 4 | "last_node_id": 9, 5 | "last_link_id": 2, 6 | "nodes": [ 7 | { 8 | "id": 4, 9 | "type": "SaveAudioMP3", 10 | "pos": [ 11 | 1105.8284912109375, 12 | 157.3507843017578 13 | ], 14 | "size": [ 15 | 270, 16 | 136 17 | ], 18 | "flags": { 19 | "pinned": true 20 | }, 21 | "order": 6, 22 | "mode": 4, 23 | "inputs": [ 24 | { 25 | "name": "audio", 26 | "type": "AUDIO", 27 | "link": 2 28 | } 29 | ], 30 | "outputs": [], 31 | "properties": { 32 | "cnr_id": "comfy-core", 33 | "ver": "0.3.64", 34 | "Node name for S&R": "SaveAudioMP3", 35 | "ue_properties": { 36 | "widget_ue_connectable": {}, 37 | "input_ue_unconnectable": {}, 38 | "version": "7.4.1" 39 | } 40 | }, 41 | "widgets_values": [ 42 | "audio/ComfyUI", 43 | "320k" 44 | ] 45 | }, 46 | { 47 | "id": 1, 48 | "type": "Maya1TTS_Combined", 49 | "pos": [ 50 | 547.344970703125, 51 | 154.19032287597656 52 | ], 53 | "size": [ 54 | 513.5526123046875, 55 | 935 56 | ], 57 | "flags": { 58 | "pinned": true 59 | }, 60 | "order": 0, 61 | "mode": 4, 62 | "inputs": [], 63 | "outputs": [ 64 | { 65 | "name": "audio", 66 | "type": "AUDIO", 67 | "links": [ 68 | 2 69 | ] 70 | } 71 | ], 72 | "properties": { 73 | "cnr_id": "ComfyUI-Maya1_TTS", 74 | "ver": "2152b6fc507e414bc8538059e1f228bfe7be2dec", 75 | "Node name for S&R": "Maya1TTS_Combined", 76 | "ue_properties": { 77 | "widget_ue_connectable": {}, 78 | "input_ue_unconnectable": {}, 79 | "version": "7.4.1" 80 | } 81 | }, 82 | "widgets_values": [ 83 | "maya1", 84 | "bfloat16", 85 | "sdpa", 86 | "cuda", 87 | "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.", 88 | "Hello! 
This is Maya1 the best open source voice AI model with emotions.", 89 | true, 90 | 0.4, 91 | 0.9, 92 | 4000, 93 | 1.1, 94 | 380055393262059, 95 | "randomize", 96 | false 97 | ] 98 | }, 99 | { 100 | "id": 9, 101 | "type": "MarkdownNote", 102 | "pos": [ 103 | 1109.0732421875, 104 | -32.40277099609375 105 | ], 106 | "size": [ 107 | 361.2737731933594, 108 | 126.04345703125 109 | ], 110 | "flags": { 111 | "pinned": true 112 | }, 113 | "order": 1, 114 | "mode": 0, 115 | "inputs": [], 116 | "outputs": [], 117 | "properties": {}, 118 | "widgets_values": [ 119 | "# If you are facing JS Loading / rendering issues with the bottom node use the top native node" 120 | ], 121 | "color": "#422342", 122 | "bgcolor": "rgba(24,24,27,.9)" 123 | }, 124 | { 125 | "id": 6, 126 | "type": "MarkdownNote", 127 | "pos": [ 128 | -16.587236404418945, 129 | 296.7104797363281 130 | ], 131 | "size": [ 132 | 504.536376953125, 133 | 795.1909790039062 134 | ], 135 | "flags": { 136 | "pinned": true 137 | }, 138 | "order": 2, 139 | "mode": 0, 140 | "inputs": [], 141 | "outputs": [], 142 | "title": "Voice Description examples", 143 | "properties": { 144 | "ue_properties": { 145 | "widget_ue_connectable": {}, 146 | "version": "7.4.1", 147 | "input_ue_unconnectable": {} 148 | } 149 | }, 150 | "widgets_values": [ 151 | "# Realistic Voices\n\n## 1. Professional Podcast Host\nRealistic male voice in his 30s with an American accent. Normal pitch, warm timbre, conversational pacing, neutral emotion at medium intensity. Podcast domain, podcast host role, formal register.\n\n## 2. Energetic Young Instructor\nRealistic female voice in her 20s with a British accent. High pitch, smooth timbre, brisk pacing, energetic emotion at high intensity. Education domain, elearning instructor role, neutral register.\n\n## 3. Customer Support Agent\nRealistic female voice in her 30s with an Indian accent. Normal pitch, warm timbre, conversational pacing, neutral emotion at medium intensity. Support domain, customer support agent role, neutral register.\n\n## 4. Corporate Explainer Voice\nRealistic male voice in his 40s with a Middle Eastern accent. Low pitch, deep timbre, slow pacing, neutral emotion at medium intensity. Corporate domain, explainer video voice role, formal register.\n\n## 5. Social Media Creator\nRealistic female voice in her 20s with an American accent. Normal pitch, smooth timbre, brisk pacing, energetic emotion at medium intensity. Social content domain, social media creator role, casual register.\n\n
\n\n# Creative Voices\n\n## 1. AI Robot Voice\nCreative, AI machine voice character. Male voice in his 20s with an American accent. Normal pitch, robotic timbre, conversational pacing, neutral emotion at medium intensity.\n\n## 2. Pirate\nCreative pirate character. Male voice in his 30s with a British accent. Low pitch, gravelly timbre, slow pacing, energetic emotion at high intensity.\n\n## 3. Mythical Godlike Voice\nCreative, mythical godlike magical character. Female voice in her 40s with an American accent. Low pitch, ethereal timbre, very slow pacing, neutral emotion at medium intensity.\n\n## 4. Flirty Anime Character\nCreative, anime and flirty character. Female voice in her 20s with an Asian American accent. High pitch, smooth timbre, slow pacing, energetic emotion at medium intensity.\n\n## 5. Dark Villain\nCreative dark villain character. Male voice in his 30s with a Middle Eastern accent. Low pitch, raspy timbre, conversational pacing, sarcastic emotion at high intensity.\n" 152 | ], 153 | "color": "#006691", 154 | "bgcolor": "rgba(24,24,27,.9)" 155 | }, 156 | { 157 | "id": 8, 158 | "type": "MarkdownNote", 159 | "pos": [ 160 | -24.692302703857422, 161 | -1134.520751953125 162 | ], 163 | "size": [ 164 | 1406.847900390625, 165 | 697.9292602539062 166 | ], 167 | "flags": { 168 | "pinned": true 169 | }, 170 | "order": 3, 171 | "mode": 0, 172 | "inputs": [], 173 | "outputs": [], 174 | "properties": { 175 | "ue_properties": { 176 | "widget_ue_connectable": {}, 177 | "version": "7.4.1", 178 | "input_ue_unconnectable": {} 179 | } 180 | }, 181 | "widgets_values": [ 182 | "# TTS Prompts with Full Emotion Tag Variety (30-Second Each)\n\n## 1. Professional Podcast Host\nWelcome to today’s episode! We’re diving deep into the latest tech trends, exploring breakthroughs in AI, robotics, and space exploration. Later, we’ll speak with an expert about how these technologies are shaping industries and impacting daily life. Stay tuned, because some of these developments might completely change the way you think about the future. By the end, you’ll feel inspired and ready to explore these innovations yourself.\n\n## 2. Energetic Young Instructor\nGood morning class! Today we’re exploring some of the most exciting physics concepts that can seem tricky at first. Don’t worry—I’ll break them down step by step, using simple experiments and examples you can relate to. We’ll cover motion, energy, and forces in ways that make sense. By the end of the session, you’ll understand why these principles govern everything around us, and you might even start seeing the world differently.\n\n## 3. Customer Support Agent\nHello, I’m here to help with your issue today. I know how frustrating it can be when things don’t work as expected, but we’ll figure it out together. Let’s carefully go through every step, from troubleshooting your settings to checking account details. If anything goes wrong, I’ll guide you patiently until it’s fixed. By the end of this call, everything should be running smoothly, and you’ll feel confident using the system again.\n\n## 4. Corporate Explainer Voice\nOur new platform enhances productivity and streamlines operations like never before. From scheduling and task management to analytics and reporting, every feature is designed to save time and reduce errors. You’ll notice a smoother workflow immediately, whether you’re collaborating with a team or managing projects individually. Trust me, once you start using this, you’ll wonder how you ever survived without it. 
By the end, your efficiency will be transformed, and your team will thank you.\n\n## 5. Casual YouTube Influencer\nHey everyone! Today I’m sharing my top 10 life hacks that have completely changed how I organize my day. Some of them are really unexpected, like using everyday items in creative ways. I’ll also give tips for saving money, boosting productivity, and making small routines fun. Stick around for the last hack—it’s a game-changer you’ll definitely want to try. By the end of this video, you’ll have at least a few new tricks to make life easier and more exciting.\n\n## 6. AI Robot Voice\nGreetings, human. I am your AI assistant, programmed to optimize your environment, manage tasks efficiently, and provide detailed data analysis. Today, I will guide you through scheduling, prioritizing important notifications, and organizing your workflow. Please provide your input carefully, as my systems will adapt to your preferences in real-time. By the end of this session, you will notice increased productivity and a fully organized digital environment.\n\n## 7. Pirate Captain\nAhoy, mateys! Today we set sail on treacherous seas in search of hidden treasure. The waves are rough, and the winds are fierce, but only the bravest shall prevail. Keep your eyes on the horizon, sharpen your swords, and be ready for anything. Along the journey, we’ll swap tales of old adventures, sing shanties to lift our spirits, and outwit rival crews. By the end of the voyage, either the treasure will be ours or we’ll have epic stories to tell.\n\n## 8. Mythical Godlike Voice\nListen carefully, mortals. The heavens speak through me, revealing ancient secrets hidden in the cosmos. Those who pay attention will gain wisdom, understanding the delicate balance of the universe. Ignore these warnings, and consequences may follow. Today, I will guide you through visions of stars, planets, and forces that govern existence. By the end of this message, you will feel both awe and responsibility for the knowledge granted to you.\n\n## 9. Flirty Anime Character\nOh my, you didn’t expect to see me here, did you? I’ve got a little surprise waiting just for you, something playful and fun. Let’s enjoy the moment together, teasing, laughing, and sharing a few secrets. I might even challenge you to a small game or dare to make it more exciting. By the end, I hope you’ll remember this encounter fondly, feeling entertained and charmed in equal measure.\n\n## 10. Dark Villain\nSo, you finally arrive. Did you really think it would be easy to find me? Every step you took has brought you straight into my plan. The traps, the misdirections, everything was set for this moment. Now, witness the full extent of my scheme and see whether you can survive. 
By the end of this encounter, either you’ll escape, or you’ll be part of my legacy forever.\n" 183 | ], 184 | "color": "#006691", 185 | "bgcolor": "rgba(24,24,27,.9)" 186 | }, 187 | { 188 | "id": 2, 189 | "type": "Maya1TTS_Barebones", 190 | "pos": [ 191 | 548.1511840820312, 192 | -377.1851806640625 193 | ], 194 | "size": [ 195 | 498.0000305175781, 196 | 468 197 | ], 198 | "flags": { 199 | "pinned": true 200 | }, 201 | "order": 4, 202 | "mode": 0, 203 | "inputs": [], 204 | "outputs": [ 205 | { 206 | "name": "audio", 207 | "type": "AUDIO", 208 | "links": [ 209 | 1 210 | ] 211 | } 212 | ], 213 | "properties": { 214 | "cnr_id": "ComfyUI-Maya1_TTS", 215 | "ver": "2152b6fc507e414bc8538059e1f228bfe7be2dec", 216 | "Node name for S&R": "Maya1TTS_Barebones", 217 | "ue_properties": { 218 | "widget_ue_connectable": {}, 219 | "input_ue_unconnectable": {}, 220 | "version": "7.4.1" 221 | } 222 | }, 223 | "widgets_values": [ 224 | "Creative pirate character. Male voice in his 30s with a British accent. Low pitch, gravelly timbre, slow pacing, energetic emotion at high intensity.", 225 | "Ahoy, mateys! Today we set sail on treacherous seas in search of hidden treasure. The waves are rough, and the winds are fierce, but only the bravest shall prevail. Keep your eyes on the horizon, sharpen your swords, and be ready for anything. Along the journey, we’ll swap tales of old adventures, sing shanties to lift our spirits, and outwit rival crews. By the end of the voyage, either the treasure will be ours or we’ll have epic stories to tell.", 226 | "maya1", 227 | "bfloat16", 228 | "sdpa", 229 | "cuda", 230 | true, 231 | false, 232 | 4000, 233 | 0.4, 234 | 0.9, 235 | 1.1, 236 | 477370007007039, 237 | "randomize" 238 | ] 239 | }, 240 | { 241 | "id": 3, 242 | "type": "SaveAudioMP3", 243 | "pos": [ 244 | 1106.2691650390625, 245 | -376.3995666503906 246 | ], 247 | "size": [ 248 | 270, 249 | 136 250 | ], 251 | "flags": { 252 | "pinned": true 253 | }, 254 | "order": 7, 255 | "mode": 0, 256 | "inputs": [ 257 | { 258 | "name": "audio", 259 | "type": "AUDIO", 260 | "link": 1 261 | } 262 | ], 263 | "outputs": [], 264 | "properties": { 265 | "cnr_id": "comfy-core", 266 | "ver": "0.3.64", 267 | "Node name for S&R": "SaveAudioMP3", 268 | "ue_properties": { 269 | "widget_ue_connectable": {}, 270 | "input_ue_unconnectable": {}, 271 | "version": "7.4.1" 272 | } 273 | }, 274 | "widgets_values": [ 275 | "audio/ComfyUI", 276 | "320k" 277 | ] 278 | }, 279 | { 280 | "id": 5, 281 | "type": "MarkdownNote", 282 | "pos": [ 283 | -21.270532608032227, 284 | -378.1773376464844 285 | ], 286 | "size": [ 287 | 508.7115173339844, 288 | 613.2767333984375 289 | ], 290 | "flags": { 291 | "pinned": true 292 | }, 293 | "order": 5, 294 | "mode": 0, 295 | "inputs": [], 296 | "outputs": [], 297 | "title": "Maya1 Model dir instruction", 298 | "properties": { 299 | "ue_properties": { 300 | "widget_ue_connectable": {}, 301 | "version": "7.4.1", 302 | "input_ue_unconnectable": {} 303 | } 304 | }, 305 | "widgets_values": [ 306 | "# [Maya1 Huggingface](https://huggingface.co/maya-research/maya1/tree/main)\n\n\n\n\n### Model Location\n\nModels go in: `ComfyUI/models/maya1-TTS/`\n\n\n### Expected Folder Structure\n\nAfter downloading, your model folder should look like this:\n\n```\nComfyUI/\n└── models/\n └── maya1-TTS/\n └── maya1/ # Model name (can be anything)\n ├── chat_template.jinja # Chat template\n ├── config.json # Model configuration\n ├── generation_config.json # Generation settings\n ├── model-00001-of-00002.safetensors # Model weights (shard 1)\n 
├── model-00002-of-00002.safetensors # Model weights (shard 2)\n ├── model.safetensors.index.json # Weight index\n ├── special_tokens_map.json # Special tokens\n └── tokenizer/ # Tokenizer subfolder\n ├── chat_template.jinja # Chat template (duplicate)\n ├── special_tokens_map.json # Special tokens (duplicate)\n ├── tokenizer.json # Tokenizer vocabulary (22.9 MB)\n └── tokenizer_config.json # Tokenizer config\n\n```\n\n# Install HF CLI\n```pip install huggingface-hub```\n\n# Create directory\n```cd ComfyUI```
\n```mkdir -p models/maya1-TTS```\n\n# Download model\n```hf download maya-research/maya1 --local-dir models/maya1-TTS/maya1```\n\n" 307 | ], 308 | "color": "#c09430", 309 | "bgcolor": "rgba(24,24,27,.9)" 310 | } 311 | ], 312 | "links": [ 313 | [ 314 | 1, 315 | 2, 316 | 0, 317 | 3, 318 | 0, 319 | "AUDIO" 320 | ], 321 | [ 322 | 2, 323 | 1, 324 | 0, 325 | 4, 326 | 0, 327 | "AUDIO" 328 | ] 329 | ], 330 | "groups": [], 331 | "config": {}, 332 | "extra": { 333 | "ue_links": [], 334 | "ds": { 335 | "scale": 0.5131581182307068, 336 | "offset": [ 337 | 850.4592685668841, 338 | 1247.5450982664793 339 | ] 340 | }, 341 | "links_added_by_ue": [], 342 | "frontendVersion": "1.27.10", 343 | "VHS_latentpreview": false, 344 | "VHS_latentpreviewrate": 0, 345 | "VHS_MetadataImage": true, 346 | "VHS_KeepIntermediate": true 347 | }, 348 | "version": 0.4 349 | } 350 | -------------------------------------------------------------------------------- /core/model_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model loading and management for Maya1 TTS. 3 | Supports multiple attention mechanisms: SDPA, Flash Attention 2, Sage Attention. 4 | """ 5 | 6 | import torch 7 | from pathlib import Path 8 | from typing import Optional, Dict, Any 9 | import gc 10 | 11 | 12 | class Maya1Model: 13 | """ 14 | Wrapper class for Maya1 model with tokenizer and attention mechanism support. 15 | """ 16 | 17 | def __init__( 18 | self, 19 | model, 20 | tokenizer, 21 | model_name: str, 22 | attention_type: str, 23 | dtype: str, 24 | device: str 25 | ): 26 | self.model = model 27 | self.tokenizer = tokenizer 28 | self.model_name = model_name 29 | self.attention_type = attention_type 30 | self.dtype = dtype 31 | self.device = device 32 | 33 | def __repr__(self): 34 | return (f"Maya1Model(name={self.model_name}, " 35 | f"attention={self.attention_type}, " 36 | f"dtype={self.dtype}, " 37 | f"device={self.device})") 38 | 39 | 40 | class Maya1ModelLoader: 41 | """ 42 | Model loader with caching and attention mechanism configuration. 43 | """ 44 | 45 | # Cache for loaded models 46 | _model_cache: Dict[str, Maya1Model] = {} 47 | 48 | @staticmethod 49 | def _get_cache_key(model_path: str, attention_type: str, dtype: str) -> str: 50 | """Generate a unique cache key for a model configuration.""" 51 | return f"{model_path}|{attention_type}|{dtype}" 52 | 53 | @classmethod 54 | def load_model( 55 | cls, 56 | model_path: Path, 57 | attention_type: str = "sdpa", 58 | dtype: str = "bfloat16", 59 | device: str = "cuda" 60 | ) -> Maya1Model: 61 | """ 62 | Load Maya1 model with specified configuration. 
63 | 64 | Args: 65 | model_path: Path to model directory 66 | attention_type: Attention mechanism ("sdpa", "flash_attention_2", "sage_attention") 67 | dtype: Data type ("bfloat16", "float16", "float32", "8bit", "4bit") 68 | device: Device to load on ("cuda", "cpu") 69 | 70 | Returns: 71 | Maya1Model wrapper with model and tokenizer 72 | """ 73 | # Check if dtype OR attention changed from cached model 74 | # If either changed, clear cache to reload with new settings 75 | model_path_str = str(model_path) 76 | for cached_key, cached_model in list(cls._model_cache.items()): 77 | if model_path_str in cached_key: 78 | dtype_changed = cached_model.dtype != dtype 79 | attention_changed = cached_model.attention_type != attention_type 80 | 81 | if dtype_changed or attention_changed: 82 | if dtype_changed: 83 | print(f"🔄 Dtype changed: {cached_model.dtype} → {dtype}") 84 | if attention_changed: 85 | print(f"🔄 Attention changed: {cached_model.attention_type} → {attention_type}") 86 | 87 | print(f"🗑️ Clearing VRAM and reloading model with new settings...") 88 | cls.clear_cache(force=True) 89 | print(f"✅ VRAM cleared, loading fresh model...") 90 | break 91 | 92 | # Check cache 93 | cache_key = cls._get_cache_key(str(model_path), attention_type, dtype) 94 | if cache_key in cls._model_cache: 95 | print(f"✅ Using cached Maya1 model: {model_path.name}") 96 | return cls._model_cache[cache_key] 97 | 98 | print(f"📦 Loading Maya1 model: {model_path.name}") 99 | print(f" Attention: {attention_type}") 100 | print(f" Dtype: {dtype}") 101 | print(f" Device: {device}") 102 | 103 | # Import required libraries 104 | try: 105 | from transformers import AutoModelForCausalLM, AutoTokenizer 106 | except ImportError: 107 | raise ImportError( 108 | "Transformers library not found. Install with:\n" 109 | "pip install transformers" 110 | ) 111 | 112 | # Check if using bitsandbytes quantization 113 | use_quantization = dtype in ["8bit", "4bit"] 114 | 115 | if use_quantization: 116 | # Bitsandbytes quantization 117 | torch_dtype = torch.bfloat16 # Base dtype for quantization 118 | print(f"🔧 Quantization requested: {dtype}") 119 | else: 120 | # Standard dtype 121 | torch_dtype = getattr(torch, dtype) 122 | 123 | # Configure attention mechanism 124 | attn_kwargs = cls._configure_attention(attention_type) 125 | 126 | # Load tokenizer 127 | tokenizer = cls._load_tokenizer(model_path) 128 | 129 | # Load model 130 | model = cls._load_model_with_attention( 131 | model_path, 132 | torch_dtype, 133 | device, 134 | attn_kwargs, 135 | quantization=dtype if use_quantization else None 136 | ) 137 | 138 | # Apply Sage Attention if selected 139 | if attention_type == "sage_attention": 140 | model = cls._apply_sage_attention(model) 141 | 142 | # Create wrapper 143 | maya1_model = Maya1Model( 144 | model=model, 145 | tokenizer=tokenizer, 146 | model_name=model_path.name, 147 | attention_type=attention_type, 148 | dtype=dtype, 149 | device=device 150 | ) 151 | 152 | # Cache the model 153 | cls._model_cache[cache_key] = maya1_model 154 | 155 | # Verify actual settings applied 156 | print(f"✅ Maya1 model loaded successfully!") 157 | cls._verify_model_config(model, attention_type, dtype) 158 | 159 | return maya1_model 160 | 161 | @staticmethod 162 | def _verify_model_config(model, expected_attention: str, expected_dtype: str): 163 | """Verify that the model is actually using the requested configuration.""" 164 | print("🔍 Verifying model configuration:") 165 | 166 | # Check actual dtype 167 | actual_dtype = next(model.parameters()).dtype 168 | 
print(f" ✓ Dtype: {actual_dtype} (requested: {expected_dtype})") 169 | 170 | # Check attention implementation 171 | if hasattr(model.config, '_attn_implementation'): 172 | actual_attn = model.config._attn_implementation 173 | 174 | # Special handling for Sage Attention 175 | if expected_attention == "sage_attention": 176 | # Sage Attention uses eager as base, so this is expected 177 | if actual_attn == "eager": 178 | print(f" ✓ Attention: sage_attention (base: eager) ✅") 179 | else: 180 | print(f" ✓ Attention: {actual_attn} (requested: {expected_attention})") 181 | else: 182 | # For other attention types, show normally 183 | print(f" ✓ Attention: {actual_attn} (requested: {expected_attention})") 184 | else: 185 | # For Sage Attention, check if hooks are registered 186 | if expected_attention == "sage_attention": 187 | # Check if forward hooks exist (Sage adds hooks) 188 | has_hooks = any( 189 | hasattr(module, '_forward_hooks') and len(module._forward_hooks) > 0 190 | for module in model.modules() 191 | ) 192 | if has_hooks: 193 | print(f" ✓ Attention: sage_attention hooks applied ✅") 194 | else: 195 | print(f" ⚠ Attention: sage_attention hooks may not be applied") 196 | else: 197 | print(f" ⚠ Attention: Unable to verify (config._attn_implementation not found)") 198 | 199 | @staticmethod 200 | def _configure_attention(attention_type: str) -> Dict[str, Any]: 201 | """ 202 | Configure attention mechanism parameters. 203 | 204 | Args: 205 | attention_type: Type of attention mechanism 206 | 207 | Returns: 208 | Dictionary of kwargs for model loading 209 | """ 210 | if attention_type == "sdpa": 211 | # PyTorch's scaled_dot_product_attention (default, most compatible) 212 | return {"attn_implementation": "sdpa"} 213 | 214 | elif attention_type == "flash_attention_2": 215 | # Flash Attention 2 (fastest, requires flash-attn package) 216 | try: 217 | import flash_attn 218 | return {"attn_implementation": "flash_attention_2"} 219 | except ImportError: 220 | print("⚠️ flash-attn not found, falling back to SDPA") 221 | print(" Install with: pip install flash-attn") 222 | return {"attn_implementation": "sdpa"} 223 | 224 | elif attention_type == "sage_attention": 225 | # Sage Attention (memory efficient, requires sageattention package) 226 | # Use eager mode first, then apply Sage Attention manually 227 | return {"attn_implementation": "eager"} 228 | 229 | elif attention_type == "eager": 230 | # Standard PyTorch eager attention (slowest but most compatible) 231 | return {"attn_implementation": "eager"} 232 | 233 | else: 234 | print(f"⚠️ Unknown attention type: {attention_type}, using SDPA") 235 | return {"attn_implementation": "sdpa"} 236 | 237 | @staticmethod 238 | def _load_tokenizer(model_path: Path): 239 | """ 240 | Load tokenizer from model path. 241 | Handles both root and tokenizer/ subdirectory structures. 
242 | 243 | Args: 244 | model_path: Path to model directory 245 | 246 | Returns: 247 | Loaded tokenizer 248 | """ 249 | from transformers import AutoTokenizer 250 | 251 | # Check if tokenizer is in a subdirectory 252 | if (model_path / "tokenizer").exists(): 253 | print(" Loading tokenizer from tokenizer/ subdirectory...") 254 | tokenizer = AutoTokenizer.from_pretrained( 255 | str(model_path), 256 | subfolder="tokenizer", 257 | trust_remote_code=True 258 | ) 259 | else: 260 | print(" Loading tokenizer from root...") 261 | tokenizer = AutoTokenizer.from_pretrained( 262 | str(model_path), 263 | trust_remote_code=True 264 | ) 265 | 266 | return tokenizer 267 | 268 | @staticmethod 269 | def _load_model_with_attention( 270 | model_path: Path, 271 | torch_dtype, 272 | device: str, 273 | attn_kwargs: Dict[str, Any], 274 | quantization: Optional[str] = None 275 | ): 276 | """ 277 | Load the model with specified attention configuration. 278 | 279 | Args: 280 | model_path: Path to model directory 281 | torch_dtype: PyTorch data type 282 | device: Device to load on 283 | attn_kwargs: Attention configuration kwargs 284 | quantization: Quantization type ("8bit", "4bit", None) 285 | 286 | Returns: 287 | Loaded model 288 | """ 289 | from transformers import AutoModelForCausalLM 290 | 291 | # Prepare loading kwargs 292 | load_kwargs = { 293 | "torch_dtype": torch_dtype, 294 | "device_map": "auto" if device == "cuda" else device, 295 | "trust_remote_code": True, 296 | **attn_kwargs 297 | } 298 | 299 | # Add bitsandbytes quantization if requested 300 | if quantization == "8bit": 301 | try: 302 | import bitsandbytes 303 | print(f" Using 8-bit quantization (bitsandbytes)") 304 | load_kwargs["load_in_8bit"] = True 305 | # Remove device_map incompatibility 306 | if device == "cpu": 307 | print(f" ⚠️ 8-bit quantization requires CUDA, ignoring device=cpu") 308 | load_kwargs["device_map"] = "auto" 309 | except ImportError: 310 | print(f"⚠️ bitsandbytes not found, loading in bfloat16 instead") 311 | print(f" Install with: pip install bitsandbytes") 312 | quantization = None 313 | 314 | elif quantization == "4bit": 315 | try: 316 | import bitsandbytes 317 | from transformers import BitsAndBytesConfig 318 | print(f" Using 4-bit quantization (bitsandbytes NF4)") 319 | 320 | bnb_config = BitsAndBytesConfig( 321 | load_in_4bit=True, 322 | bnb_4bit_compute_dtype=torch_dtype, 323 | bnb_4bit_use_double_quant=True, # Nested quantization for better quality 324 | bnb_4bit_quant_type="nf4" # NormalFloat4 - best quality 325 | ) 326 | load_kwargs["quantization_config"] = bnb_config 327 | # Remove incompatible parameters 328 | load_kwargs.pop("torch_dtype", None) 329 | if device == "cpu": 330 | print(f" ⚠️ 4-bit quantization requires CUDA, ignoring device=cpu") 331 | load_kwargs["device_map"] = "auto" 332 | except ImportError: 333 | print(f"⚠️ bitsandbytes not found, loading in bfloat16 instead") 334 | print(f" Install with: pip install bitsandbytes") 335 | quantization = None 336 | 337 | model = AutoModelForCausalLM.from_pretrained( 338 | str(model_path), 339 | **load_kwargs 340 | ) 341 | 342 | model.eval() # Set to evaluation mode 343 | 344 | if quantization: 345 | print(f"✅ Model quantized to {quantization}") 346 | 347 | return model 348 | 349 | @staticmethod 350 | def _apply_sage_attention(model): 351 | """ 352 | Apply Sage Attention to the model. 353 | Supports both Sage Attention v1.x and v2.x APIs. 
354 | 355 | Args: 356 | model: Loaded model 357 | 358 | Returns: 359 | Model with Sage Attention applied 360 | """ 361 | try: 362 | # Try Sage Attention v1.x API first 363 | try: 364 | from sageattention import apply_sage_attn 365 | print(" Applying Sage Attention (v1.x)...") 366 | model = apply_sage_attn(model) 367 | print(" ✅ Sage Attention v1.x applied successfully") 368 | return model 369 | except ImportError: 370 | # Try Sage Attention v2.x API 371 | from sageattention import sageattn 372 | print(" Applying Sage Attention (v2.x)...") 373 | # For v2.x, we need to replace attention in each layer 374 | for name, module in model.named_modules(): 375 | if hasattr(module, 'self_attn') or 'attention' in name.lower(): 376 | # Sage Attention v2+ auto-replaces attention when imported 377 | pass 378 | print(" ✅ Sage Attention v2.x detected and enabled") 379 | return model 380 | 381 | except ImportError: 382 | print("⚠️ sageattention not found, using standard eager attention") 383 | print(" Install with: pip install sageattention") 384 | return model 385 | except Exception as e: 386 | print(f"⚠️ Failed to apply Sage Attention: {e}") 387 | print(" Continuing with standard eager attention") 388 | return model 389 | 390 | @classmethod 391 | def clear_cache(cls, force: bool = False): 392 | """ 393 | Clear the model cache and free VRAM using ComfyUI's native memory management. 394 | This actually removes models from VRAM, not just moves them to CPU. 395 | """ 396 | if not cls._model_cache: 397 | return # Nothing to clear 398 | 399 | try: 400 | # Import ComfyUI's model management 401 | import comfy.model_management as mm 402 | 403 | # Step 1: Delete model references from our cache 404 | # This removes the Python references to the models 405 | for cache_key, maya1_model in list(cls._model_cache.items()): 406 | try: 407 | # Delete the model object to free references 408 | if maya1_model.model is not None: 409 | del maya1_model.model 410 | if maya1_model.tokenizer is not None: 411 | del maya1_model.tokenizer 412 | except Exception as e: 413 | print(f" ⚠ Warning: Failed to delete {maya1_model.model_name}: {e}") 414 | 415 | # Step 2: Clear our cache dictionary 416 | cls._model_cache.clear() 417 | 418 | # Step 3: Use ComfyUI's native VRAM cleanup 419 | # This unloads ALL models from VRAM (including ours) 420 | mm.unload_all_models() 421 | 422 | # Step 4: Clear ComfyUI's internal cache 423 | mm.soft_empty_cache() 424 | 425 | # Step 5: Python garbage collection 426 | gc.collect() 427 | 428 | # Step 6: Clear CUDA caches 429 | if torch.cuda.is_available(): 430 | torch.cuda.empty_cache() 431 | torch.cuda.ipc_collect() 432 | 433 | except ImportError: 434 | # Fallback if comfy.model_management is not available 435 | print(" ⚠ Warning: ComfyUI model_management not available, using fallback cleanup") 436 | 437 | # Fallback: Just clear the cache and force GC 438 | cls._model_cache.clear() 439 | gc.collect() 440 | if torch.cuda.is_available(): 441 | torch.cuda.empty_cache() 442 | torch.cuda.ipc_collect() 443 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI-Maya1_TTS 2 | 3 | **Expressive Voice Generation with Emotions for ComfyUI** 4 | 5 | A ComfyUI node pack for [Maya1](https://huggingface.co/maya-research/maya1), a 3B-parameter speech model built for expressive voice generation with rich human emotion and precise voice design. 
6 | 7 | ![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg) 8 | ![Python](https://img.shields.io/badge/python-3.11+-blue.svg) 9 | ![ComfyUI](https://img.shields.io/badge/ComfyUI-compatible-green.svg) 10 | 11 | https://github.com/user-attachments/assets/1be0c2a0-22fb-4890-9147-d20abeb2e067 12 | 13 | 14 | --- 15 | 16 | ## ✨ Features 17 | 18 | ### Core Features 19 | - 🎭 **Voice Design** through natural language descriptions 20 | - 😊 **16 Emotion Tags**: laugh, cry, whisper, angry, sigh, gasp, scream, and more 21 | - ⚡ **Real-time Generation** with SNAC neural codec (24kHz audio) 22 | - 🔧 **Multiple Attention Mechanisms**: SDPA, eager, Flash Attention 2, Sage Attention (1/2) 23 | - 💾 **Quantization Support**: 4-bit and 8-bit for memory-constrained GPUs 24 | - 🛑 **Native ComfyUI Cancel**: Stop generation anytime 25 | - 📊 **Progress Tracking**: Real-time token generation speed (it/s) 26 | - 🔄 **Model Caching**: Fast subsequent generations 27 | - 🎯 **Smart VRAM Management**: Auto-clears on dtype changes 28 | 29 | ### Custom Canvas UI 30 | - 🎨 **Beautiful Dark Theme** with purple accents and smooth animations 31 | - 👤 **5 Character Presets**: Quick-load voice templates (Male US, Female UK, Announcer, Robot, Demon) 32 | - 🎭 **16 Visual Emotion Buttons**: One-click emotion tag insertion at cursor position 33 | - ⛶ **Professional HTML Modal Editor**: Fullscreen text editor with native textarea for longform content 34 | - 🔤 **Font Size Controls**: Adjustable 12-20px font size with visual slider 35 | - ⌨️ **Advanced Keyboard Shortcuts**: Ctrl+A, Ctrl+C, Ctrl+V, Ctrl+X, Ctrl+Enter to save, ESC to cancel 36 | - 🔔 **Toast Notifications**: Visual feedback for save success and validation errors 37 | - 📝 **Inline Text Editing**: Click-to-edit with cursor positioning and drag-to-select 38 | - 🖱️ **Scroll Support**: Custom themed scrollbars with mouse wheel scrolling 39 | - 📱 **Responsive Design**: Modal adapts to all screen sizes 40 | - 💡 **Contextual Tooltips**: Helpful hints on every control 41 | - 🎬 **Collapsible Sections**: Clean, organized interface 42 | - 🔄 **Smart Audio Processing**: Auto-chunking for long text with crossfade blending for seamless output 43 | 44 | --- 45 | 46 | ## 📦 Installation 47 | 48 |
49 | Quick Install (Click to expand) 50 | 51 | ### 1. Clone the Repository 52 | 53 | ```bash 54 | cd ComfyUI/custom_nodes/ 55 | git clone https://github.com/Saganaki22/ComfyUI-Maya1_TTS.git 56 | cd ComfyUI-Maya1_TTS 57 | ``` 58 | 59 | ### 2. Install Dependencies 60 | 61 | **Core dependencies** (required): 62 | ```bash 63 | pip install "torch>=2.0.0" "transformers>=4.50.0" "numpy>=1.21.0" "snac>=1.0.0" 64 | ``` 65 | 66 | **Or install from requirements.txt:** 67 | ```bash 68 | pip install -r requirements.txt 69 | ``` 70 | 71 |
72 | 73 |
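After installing, you can optionally sanity-check that the core imports resolve before restarting ComfyUI. This one-liner is purely illustrative and not part of the node pack:

```python
# Verify the core dependencies import cleanly (illustrative check only).
import torch, transformers, snac, soundfile
print(f"torch {torch.__version__} | CUDA available: {torch.cuda.is_available()}")
```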
74 | Optional: Enhanced Performance (Click to expand) 75 | 76 | ### Quantization (Memory Savings) 77 | 78 | For 4-bit/8-bit quantization support: 79 | ```bash 80 | pip install "bitsandbytes>=0.41.0" 81 | ``` 82 | 83 | **Memory footprint:** 84 | - 4-bit: ~8-9GB → ~6GB VRAM (slight quality loss) 85 | - 8-bit: ~8-9GB → ~7GB VRAM (minimal quality loss) 86 | 87 | ### Accelerated Attention 88 | 89 | **Flash Attention 2** (CUDA only): 90 | ```bash 91 | pip install "flash-attn>=2.0.0" 92 | ``` 93 | 94 | **Sage Attention** (memory efficient for batch): 95 | ```bash 96 | pip install "sageattention>=1.0.0" 97 | ``` 98 | 99 | ### Install All Optional Dependencies 100 | 101 | ```bash 102 | pip install bitsandbytes flash-attn sageattention 103 | ``` 104 | 105 |
106 | 107 |
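If you are unsure which optional packages actually installed, a small probe like the one below mirrors the graceful-fallback behavior in `core/model_wrapper.py` (the node drops back to SDPA or bfloat16 when a package is missing):

```python
# Probe the optional acceleration packages; the node falls back gracefully
# when any of these are absent (see core/model_wrapper.py).
for pkg in ("flash_attn", "sageattention", "bitsandbytes"):
    try:
        __import__(pkg)
        print(f"{pkg}: available")
    except ImportError:
        print(f"{pkg}: not installed (node will fall back)")
```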
108 | Download Maya1 Model (Click to expand) 109 | 110 | ### Model Location 111 | 112 | Models go in: `ComfyUI/models/maya1-TTS/` 113 | 114 | ### Expected Folder Structure 115 | 116 | After downloading, your model folder should look like this: 117 | 118 | ``` 119 | ComfyUI/ 120 | └── models/ 121 | └── maya1-TTS/ 122 | └── maya1/ # Model name (can be anything) 123 | ├── chat_template.jinja # Chat template 124 | ├── config.json # Model configuration 125 | ├── generation_config.json # Generation settings 126 | ├── model-00001-of-00002.safetensors # Model weights (shard 1) 127 | ├── model-00002-of-00002.safetensors # Model weights (shard 2) 128 | ├── model.safetensors.index.json # Weight index 129 | ├── special_tokens_map.json # Special tokens 130 | └── tokenizer/ # Tokenizer subfolder 131 | ├── chat_template.jinja # Chat template (duplicate) 132 | ├── special_tokens_map.json # Special tokens (duplicate) 133 | ├── tokenizer.json # Tokenizer vocabulary (22.9 MB) 134 | └── tokenizer_config.json # Tokenizer config 135 | ``` 136 | 137 | **Critical files required:** 138 | - `config.json` - Model architecture configuration 139 | - `generation_config.json` - Default generation parameters 140 | - `model-00001-of-00002.safetensors` & `model-00002-of-00002.safetensors` - Model weights (2 shards) 141 | - `model.safetensors.index.json` - Weight index mapping 142 | - `chat_template.jinja` & `special_tokens_map.json` - In root folder 143 | - `tokenizer/` folder with all 4 tokenizer files 144 | 145 | **Note:** You can have multiple models by creating separate folders like `maya1`, `maya1-finetuned`, etc. 146 | 147 | ### Option 1: Hugging Face CLI (Recommended) 148 | 149 | ```bash 150 | # Install HF CLI 151 | pip install huggingface-hub 152 | 153 | # Create directory 154 | cd ComfyUI 155 | mkdir -p models/maya1-TTS 156 | 157 | # Download model 158 | hf download maya-research/maya1 --local-dir models/maya1-TTS/maya1 159 | ``` 160 | 161 | ### Option 2: Python Script 162 | 163 | ```python 164 | from huggingface_hub import snapshot_download 165 | 166 | snapshot_download( 167 | repo_id="maya-research/maya1", 168 | local_dir="ComfyUI/models/maya1-TTS/maya1", 169 | local_dir_use_symlinks=False 170 | ) 171 | ``` 172 | 173 | ### Option 3: Manual Download 174 | 175 | 1. Go to [Maya1 on HuggingFace](https://huggingface.co/maya-research/maya1) 176 | 2. Download all files to `ComfyUI/models/maya1-TTS/maya1/` 177 | 178 |
179 | 180 |
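Before restarting ComfyUI, you can verify that the critical files landed in the right place. This sketch mirrors the validation the node itself runs at load time; adjust `model_dir` if your install lives elsewhere:

```python
# Check the critical model files listed above (illustrative; the node performs
# a similar validation before loading).
from pathlib import Path

model_dir = Path("ComfyUI/models/maya1-TTS/maya1")
for name in [
    "config.json",
    "generation_config.json",
    "model.safetensors.index.json",
    "model-00001-of-00002.safetensors",
    "model-00002-of-00002.safetensors",
    "tokenizer/tokenizer.json",
]:
    print(f"{name}: {'ok' if (model_dir / name).exists() else 'MISSING'}")
```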
181 | Restart ComfyUI 182 | 183 | Restart ComfyUI to load the new nodes. The node will appear under: 184 | 185 | **Add Node → audio → Maya1 TTS (AIO) / Maya1 TTS (AIO) Barebones** 186 | 187 |
188 | 189 | --- 190 | 191 | ## 🎮 Usage 192 | 193 | ### Two Node Options 194 | 195 | **Maya1 TTS (AIO)** - Full custom UI with visual controls (recommended) 196 | - Beautiful dark theme with character presets, emotion buttons, and modal editor 197 | - Best user experience with visual feedback and tooltips 198 | 199 | **Maya1 TTS (AIO) Barebones** - Standard ComfyUI widgets only 200 | - For users experiencing JavaScript rendering issues (black box) 201 | - Same functionality, simpler interface 202 | - All inputs stacked vertically with standard dropdowns and text boxes 203 | 204 | --- 205 | 206 | ### Node: Maya1 TTS (AIO) 207 | 208 | All-in-one node for loading models and generating speech with a beautiful custom canvas UI. 209 | 210 | | Maya1 TTS (AIO) | Maya1 TTS (AIO) Barebones | 211 | |:---:|:---:| 212 | | Screenshot 2025-11-07 084153 | image | 213 | 214 | 215 | 216 | ### ✨ Custom Canvas Interface 217 | 218 | The node features a completely custom-built interface with: 219 | 220 | **Character Presets** (Top Row) 221 | - Click any preset to instantly load a pre-configured voice description 222 | - 5 presets: ♂️ Male US, ♀️ Female UK, 🎙️ Announcer, 🤖 Robot, 😈 Demon 223 | 224 | **Text Fields** 225 | - **Voice Description**: Describe your desired voice characteristics 226 | - **Text**: Your script with optional emotion tags 227 | - Click inside to edit with full keyboard support 228 | - Press **Enter** for new line, **Ctrl+Enter** to save, **Escape** to cancel 229 | 230 | **Emotion Tags** (Collapsible Grid) 231 | - 16 emotion buttons in 4×4 grid 232 | - Click any emotion to insert tag at cursor position 233 | - Tags insert where you're typing, not just at the end 234 | - Click header to collapse/expand section 235 | 236 | **⛶ Professional HTML Modal** (Bottom right of Text field) 237 | - Click the expand button (⛶) for fullscreen text editing 238 | - Native HTML textarea with proper newline and whitespace support 239 | - **Font Size Slider**: Adjust text size from 12px to 20px with visual A/A controls 240 | - All 16 emotion buttons available inside modal for quick tag insertion 241 | - **Custom Themed Scrollbar**: Purple accents matching the node design 242 | - **Toast Notifications**: Green checkmark for "Text Saved", red X for validation errors 243 | - **Empty Text Validation**: Prevents saving blank text with helpful error message 244 | - **Keyboard Shortcuts**: 245 | - **Ctrl+Enter**: Save and close 246 | - **ESC**: Cancel without saving 247 | - Full text selection and clipboard support (Ctrl+A, C, V, X) 248 | - **Responsive Design**: Modal adapts to small and large screens, buttons always visible 249 | - **Visual Hints**: Subtle grey text under buttons showing keyboard shortcuts 250 | 251 | **Keyboard Shortcuts** (Inline Editing & Modal) 252 | - `Enter`: New line (in multiline text fields) 253 | - `Ctrl+Enter`: Save and apply changes 254 | - `Escape`: Cancel editing without saving 255 | - `Ctrl+A`: Select all text 256 | - `Ctrl+C/V/X`: Copy, paste, cut selected text 257 | - Click outside field: Auto-save (inline editing only) 258 | 259 |
260 | Model Settings 261 | 262 | **model_name** (dropdown) 263 | - Select from models in `ComfyUI/models/maya1-TTS/` 264 | - Models are auto-discovered on startup 265 | 266 | **dtype** (dropdown) 267 | - `4bit`: NF4 quantization (~6GB VRAM, requires bitsandbytes, **SLOWER**) 268 | - `8bit`: INT8 quantization (~7GB VRAM, requires bitsandbytes, **SLOWER**) 269 | - `float16`: 16-bit half precision (~8-9GB VRAM, **FAST**, good quality) 270 | - `bfloat16`: 16-bit brain float (~8-9GB VRAM, **FAST**, recommended) 271 | - `float32`: 32-bit full precision (~16GB VRAM, highest quality, slower) 272 | 273 | ⚠️ **IMPORTANT:** Quantization (4-bit/8-bit) is **SLOWER** than float16/bfloat16! 274 | - Only use quantization if you have **limited VRAM** (<10GB) 275 | - If you have **10GB+ VRAM**, use **float16** or **bfloat16** for best speed 276 | 277 | **attention_mechanism** (dropdown) 278 | - `sdpa`: PyTorch SDPA (**default**, fastest for single TTS) 279 | - `flash_attention_2`: Flash Attention 2 (batch inference) 280 | - `sage_attention`: Sage Attention (memory efficient); `eager` is also available as a maximum-compatibility fallback 281 | 282 | **device** (dropdown) 283 | - `cuda`: Use GPU (recommended) 284 | - `cpu`: Use CPU (slower) 285 | 286 |
287 | 288 |
289 | Voice & Text Settings 290 | 291 | **voice_description** 292 | 293 | Describe the voice using natural language. Click inside to edit or use character presets. 294 | 295 | **Example:** 296 | ``` 297 | Realistic male voice in the 30s with American accent. Normal pitch, warm timbre, conversational pacing. 298 | ``` 299 | 300 | **Voice Components:** 301 | - **Age**: `in their 20s`, `30s`, `40s`, `50s` 302 | - **Gender**: `Male voice`, `Female voice` 303 | - **Accent**: `American`, `British`, `Australian`, `Indian`, `Middle Eastern` 304 | - **Pitch**: `high pitch`, `normal pitch`, `low pitch` 305 | - **Timbre**: `warm`, `gravelly`, `smooth`, `raspy` 306 | - **Pacing**: `fast pacing`, `conversational`, `slow pacing` 307 | - **Tone**: `happy`, `angry`, `curious`, `energetic`, `calm` 308 | 309 | **💡 Tip**: Use character presets for quick voice templates! 310 | 311 | **text** 312 | 313 | Text to synthesize with optional emotion tags. Click emotion buttons to insert tags at cursor. 314 | 315 | **Example:** 316 | ``` 317 | Hello! This is Maya1 the best open source voice AI! 318 | ``` 319 | 320 | **💡 Tip**: Click ⛶ expand button for longform text editing in fullscreen modal! 321 | 322 |
323 | 324 |
325 | Generation Settings 326 | 327 | **keep_model_in_vram** (boolean) 328 | - `True`: Keep model loaded for faster repeated generations 329 | - `False`: Clear VRAM after generation (saves memory) 330 | - Auto-clears when dtype changes 331 | 332 | **chunk_longform** (boolean) ⚠️ EXPERIMENTAL 333 | - `True`: Auto-split long text (>80 words) at sentences, combines audio 334 | - `False`: Generate entire text at once (may fail if too long) 335 | - **Note**: This feature is experimental and may have quality/timing issues 336 | 337 | **temperature** (0.1-2.0, default: 0.4) 338 | - Lower = more consistent 339 | - Higher = more varied/creative 340 | 341 | **top_p** (0.1-1.0, default: 0.9) 342 | - Nucleus sampling parameter 343 | - 0.9 recommended for natural speech 344 | 345 | **max_tokens** (100-16000, default: 4000) 346 | - Maximum audio tokens to generate per chunk 347 | - Higher = longer audio 348 | 349 | **repetition_penalty** (1.0-2.0, default: 1.1) 350 | - Reduces repetitive speech 351 | - 1.1 is good default 352 | 353 | **seed** (integer, default: 0) 354 | - `0` = random seed; any other value gives reproducible results 355 | - Use ComfyUI's control_after_generate for random/increment 356 | 357 |
358 | 359 |
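For intuition, here is roughly how these widgets map onto a Hugging Face `generate()` call. This is a simplified sketch, not the node's actual code path: the real prompt is built from the voice description and text (see `core/utils.py`), and the resulting SNAC tokens still need decoding into audio afterwards:

```python
# Simplified sketch of the generation settings as transformers kwargs.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "ComfyUI/models/maya1-TTS/maya1"   # default install path
tokenizer = AutoTokenizer.from_pretrained(model_dir, subfolder="tokenizer")
model = AutoModelForCausalLM.from_pretrained(
    model_dir, torch_dtype=torch.bfloat16, device_map="auto"
)

torch.manual_seed(1337)                        # "seed" widget (0 = random)
inputs = tokenizer("Hello! This is Maya1.", return_tensors="pt").to(model.device)
output_ids = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.4,                           # "temperature"
    top_p=0.9,                                 # "top_p"
    repetition_penalty=1.1,                    # "repetition_penalty"
    max_new_tokens=4000,                       # "max_tokens"
)
```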
360 | Outputs 361 | 362 | **audio** (ComfyUI AUDIO type) 363 | - 24kHz mono audio 364 | - Compatible with all ComfyUI audio nodes 365 | - Connect to PreviewAudio, SaveAudio, etc. 366 | 367 |
368 | 369 | --- 370 | 371 | ### Node: Maya1 TTS (AIO) Barebones 372 | 373 | Standard ComfyUI widgets version for users experiencing JavaScript rendering issues. 374 | 375 | **When to use Barebones:** 376 | - Custom UI shows as a black box 377 | - Browser console shows JavaScript errors 378 | - You prefer simple, standard ComfyUI widgets 379 | - Working with older ComfyUI versions 380 | 381 | **Inputs (in order):** 382 | 383 | 1. **voice_description** (multiline text) 384 | - Describe voice characteristics in natural language 385 | - Same as main node, just standard text box 386 | 387 | 2. **text** (multiline text) 388 | - Your script with manual emotion tags like `<laugh>` or `<whisper>` 389 | - Type emotion tags manually (no visual buttons in barebones version) 390 | 391 | 3. **model_name** (dropdown) 392 | - Select Maya1 model from `ComfyUI/models/maya1-TTS/` 393 | 394 | 4. **dtype** (dropdown) 395 | - `4bit (BNB)`, `8bit (BNB)`, `float16`, `bfloat16`, `float32` 396 | 397 | 5. **attention_mechanism** (dropdown) 398 | - `sdpa` (default), `eager`, `flash_attention_2`, `sage_attention` 399 | 400 | 6. **device** (dropdown) 401 | - `cuda` (GPU) or `cpu` 402 | 403 | 7. **keep_model_in_vram** (boolean toggle) 404 | - Keep model loaded for faster subsequent generations 405 | 406 | 8. **chunk_longform** (boolean toggle) 407 | - Split long text with crossfading for unlimited length 408 | 409 | 9. **max_tokens** (integer) 410 | - Max SNAC tokens per chunk (default: 4000) 411 | 412 | 10. **temperature** (float) 413 | - Generation randomness (default: 0.4) 414 | 415 | 11. **top_p** (float) 416 | - Nucleus sampling (default: 0.9) 417 | 418 | 12. **repetition_penalty** (float) 419 | - Reduce repetition (default: 1.1) 420 | 421 | 13. **seed** (integer) 422 | - 0 = random, or set specific seed for reproducibility 423 | - Use control_after_generate widget for seed management 424 | 425 | **All other features (model loading, VRAM management, chunking, progress tracking) work identically to the main node.** 426 | 427 | --- 428 | 429 | ## 🎭 Emotion Tags 430 | 431 | Add emotions anywhere in your text using `<emotion>` tag syntax, or click the visual emotion buttons in the UI! 432 | 433 | **Examples:** 434 | ``` 435 | Hello! <laugh> This is amazing I can't believe it! 436 | ``` 437 | 438 | ``` 439 | After all we went through <sigh> I can't believe he was the traitor. 440 | ``` 441 | 442 | ``` 443 | Wow! <gasp> This place looks incredible! 444 | ``` 445 | 446 |
447 | All 16 Available Emotions (Click to expand) 448 | 449 | **Laughter & Joy:** 450 | - `<laugh>` - Normal laugh 451 | - `<laugh_harder>` - Intense laughing 452 | - `<giggle>` - Light giggling 453 | - `<chuckle>` - Soft chuckle 454 | 455 | **Sadness & Sighs:** 456 | - `<cry>` - Crying 457 | - `<sigh>` - Sighing 458 | 459 | **Surprise & Breath:** 460 | - `<gasp>` - Surprised gasp 461 | - `<excited>` - Excited tone 462 | 463 | **Intensity & Emotion:** 464 | - `<whisper>` - Whispering 465 | - `<angry>` - Angry tone 466 | - `<scream>` - Screaming 467 | - `<sarcastic>` - Sarcastic delivery 468 | 469 | **Natural Sounds:** 470 | - `<snort>` - Snorting 471 | - `<exhale>` - Exhaling 472 | - `<gulp>` - Gulping 473 | - `<sing>` - Singing 474 | 475 |
476 | 477 | **💡 Tip:** Click emotion buttons in the node UI to insert tags at cursor position! 478 | 479 | --- 480 | 481 | ## 🎬 Example Character Speeches 482 | 483 |
484 | Generative AI & ComfyUI Examples (Click to expand) 485 | 486 | ### Example 1: Excited AI Researcher 487 | 488 | **Voice Description:** 489 | ``` 490 | Female voice in her 30s with American accent. High pitch, energetic tone at high intensity, fast pacing. 491 | ``` 492 | 493 | **Text:** 494 | ``` 495 | Oh my god! Have you seen the new Stable Diffusion model in ComfyUI? The quality is absolutely incredible! I just generated a photorealistic portrait in like 20 seconds. This is game-changing for our workflow! 496 | ``` 497 | 498 | --- 499 | 500 | ### Example 2: Skeptical Developer 501 | 502 | **Voice Description:** 503 | ``` 504 | Male voice in his 40s with British accent. Low pitch, calm tone, conversational pacing. 505 | ``` 506 | 507 | **Text:** 508 | ``` 509 | I've been testing this new node pack in ComfyUI and honestly, I'm impressed. At first I was skeptical about the whole generative AI hype, but the control you get with custom nodes is remarkable. This changes everything. 510 | ``` 511 | 512 | --- 513 | 514 | ### Example 3: Enthusiastic Tutorial Creator 515 | 516 | **Voice Description:** 517 | ``` 518 | Female voice in her 20s with Australian accent. Normal pitch, warm timbre, energetic tone at medium intensity. 519 | ``` 520 | 521 | **Text:** 522 | ``` 523 | Hey everyone! Welcome back to my ComfyUI tutorial series! Today we're diving into the most powerful image generation workflow I've ever seen. You're not gonna believe how easy this is! Let's get started! 524 | ``` 525 | 526 | --- 527 | 528 | ### Example 4: Frustrated Beginner 529 | 530 | **Voice Description:** 531 | ``` 532 | Male voice in his 30s with American accent. Normal pitch, stressed tone at medium intensity, fast pacing. 533 | ``` 534 | 535 | **Text:** 536 | ``` 537 | Why won't this workflow run? I've connected all the nodes exactly like the tutorial showed! Wait... Oh no. I forgot to load the checkpoint model. Classic beginner mistake! Okay, let's try this again. 538 | ``` 539 | 540 | --- 541 | 542 | ### Example 5: Amazed AI Artist 543 | 544 | **Voice Description:** 545 | ``` 546 | Female voice in her 40s with Indian accent. Normal pitch, curious tone, slow pacing, dramatic delivery. 547 | ``` 548 | 549 | **Text:** 550 | ``` 551 | When I first discovered ComfyUI I thought it was just another image generator. But then I realized you can chain workflows together, use custom models, and even generate animations! This is the future of digital art! 552 | ``` 553 | 554 | --- 555 | 556 | ### Example 6: Confident AI Entrepreneur 557 | 558 | **Voice Description:** 559 | ``` 560 | Male voice in his 50s with Middle Eastern accent. Low pitch, gravelly timbre, slow pacing, confident tone at high intensity. 561 | ``` 562 | 563 | **Text:** 564 | ``` 565 | The generative AI revolution is here. ComfyUI gives us the tools to build production-ready workflows. While others are still playing with web UIs, we're automating entire creative pipelines. This is how you stay ahead of the curve. 566 | ``` 567 | 568 |
569 | 570 | --- 571 | 572 | ## ⚙️ Advanced Configuration 573 | 574 |
575 | Attention Mechanisms Comparison 576 | 577 | | Mechanism | Speed | Memory | Best For | Requirements | 578 | |-----------|-------|--------|----------|--------------| 579 | | **SDPA** | ⚡⚡⚡ | Good | Single TTS generation | PyTorch ≥2.0 | 580 | | **Flash Attention 2** | ⚡⚡ | Good | Batch processing | flash-attn, CUDA | 581 | | **Sage Attention** | ⚡⚡ | Excellent | Long sequences | sageattention | 582 | 583 | **Why is SDPA fastest for TTS?** 584 | - Optimized for single-sequence autoregressive generation 585 | - Lower kernel launch overhead (~20μs vs 50-60μs) 586 | - Flash/Sage Attention shine with batch size ≥8 587 | 588 | **Recommendation:** Use **SDPA** (default) for single audio generation. 589 | 590 |
591 | 592 |
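Under the hood, the dropdown maps onto transformers' `attn_implementation` flag (see `core/model_wrapper.py`). A minimal equivalent load looks like this, with the model path shown as the default install location:

```python
# Minimal sketch of the attention selection performed in core/model_wrapper.py.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "ComfyUI/models/maya1-TTS/maya1",
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",   # or "flash_attention_2" / "eager"
    device_map="auto",
)
```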
593 | Quantization Details 594 | 595 | ⚠️ **CRITICAL: Quantization is SLOWER than fp16/bf16!** 596 | 597 | ### Memory Usage (Maya1 3B Model) 598 | 599 | | Dtype | VRAM Usage | Speed | Quality | 600 | |-------|------------|-------|---------| 601 | | **4-bit NF4** | ~6GB | Slow ⚡ | Good (slight loss) | 602 | | **8-bit INT8** | ~7GB | Slow ⚡ | Excellent (minimal loss) | 603 | | **float16** | ~8-9GB | **Fast** ⚡⚡⚡ | Excellent | 604 | | **bfloat16** | ~8-9GB | **Fast** ⚡⚡⚡ | Excellent | 605 | | **float32** | ~16GB | Medium ⚡⚡ | Perfect | 606 | 607 | ### 4-bit NF4 Quantization 608 | 609 | **Features:** 610 | - Uses NormalFloat4 (NF4) for best 4-bit quality 611 | - Double quantization (nested) for better accuracy 612 | - Memory savings: ~6GB (vs ~8-9GB for fp16) 613 | 614 | **When to use:** 615 | - You have **limited VRAM** (8GB or less GPU) 616 | - Speed is not critical (inference is slower due to dequantization) 617 | - Need to fit model in smaller VRAM 618 | 619 | **When NOT to use:** 620 | - You have 10GB+ VRAM → Use float16/bfloat16 instead for better speed! 621 | 622 | ### 8-bit INT8 Quantization 623 | 624 | **Features:** 625 | - Standard 8-bit integer quantization 626 | - Memory savings: ~7GB (vs ~8-9GB for fp16) 627 | - Minimal quality impact 628 | 629 | **When to use:** 630 | - You have moderate VRAM constraints (8-10GB GPU) 631 | - Want good quality with some memory savings 632 | - Speed is not critical 633 | 634 | **When NOT to use:** 635 | - You have 10GB+ VRAM → Use float16/bfloat16 instead for better speed! 636 | 637 | ### Why is Quantization Slower? 638 | 639 | Quantized models require **dequantization** on every forward pass: 640 | 1. Model weights stored in 4-bit/8-bit 641 | 2. Weights dequantized to fp16 for computation 642 | 3. Computation happens in fp16 643 | 4. Extra overhead = slower inference 644 | 645 | **Recommendation:** Only use quantization if you truly need the memory savings! 646 | 647 | ### Automatic Dtype Switching 648 | 649 | The node automatically clears VRAM when you switch dtypes: 650 | 651 | ``` 652 | 🔄 Dtype changed from bfloat16 to 4bit 653 | Clearing cache to reload model... 654 | ``` 655 | 656 | This prevents dtype mismatch errors and ensures correct quantization. 657 | 658 |
659 | 660 |
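As a rough sanity check on the table above, a weights-only estimate for a 3B-parameter model explains most of the gap between dtypes; the remainder is KV cache, activations, and CUDA overhead:

```python
# Weights-only VRAM estimate for a 3B-parameter model. Real usage is higher
# (KV cache, activations, CUDA context), which is consistent with the table.
params = 3e9
for name, bytes_per_param in [("4-bit", 0.5), ("8-bit", 1.0),
                              ("fp16/bf16", 2.0), ("fp32", 4.0)]:
    print(f"{name:10s} ≈ {params * bytes_per_param / 1024**3:.1f} GB of weights")
```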
661 | Console Progress Output 662 | 663 | Real-time generation statistics in the console: 664 | 665 | ``` 666 | 🎲 Seed: 1337 667 | 🎵 Generating speech (max 2000 tokens)... 668 | Tokens: 500/2000 | Speed: 12.45 it/s | Elapsed: 40.2s 669 | ✅ Generated 1500 tokens in 120.34s (12.47 it/s) 670 | ``` 671 | 672 | **it/s** = iterations per second (tokens/second) 673 | 674 |
675 | 676 | --- 677 | 678 | ## 🐛 Troubleshooting 679 | 680 |
681 | Node Shows as Black Box (JavaScript Issues) 682 | 683 | **Issue:** Maya1 TTS (AIO) node appears completely black with no widgets visible. 684 | 685 | **Quick Fix:** 686 | Use **Maya1 TTS (AIO) Barebones** instead! 687 | - Same functionality, standard ComfyUI widgets only 688 | - No custom JavaScript required 689 | - Find it under: Add Node → audio → Maya1 TTS (AIO) Barebones 690 | 691 | **Debugging Steps:** 692 | 1. Open browser DevTools (F12) → Console tab 693 | 2. Look for JavaScript errors mentioning "maya1" or "Unexpected token" 694 | 3. Try hard refresh: Ctrl+Shift+R (Windows/Linux) or Cmd+Shift+R (Mac) 695 | 4. Clear browser cache completely 696 | 5. Test in incognito/private window 697 | 6. Check if maya1_tts.js loads in Network tab (should be 200 status) 698 | 7. Disable browser extensions (ad blockers, script blockers) 699 | 8. Update ComfyUI to latest version 700 | 701 | **Note:** The barebones version is specifically designed for this issue! 702 | 703 |
704 | 705 |
706 | Model Not Found 707 | 708 | **Error:** `No valid Maya1 models found` 709 | 710 | **Solutions:** 711 | 1. Check model location: `ComfyUI/models/maya1-TTS/` 712 | 2. Download model (see Installation section) 713 | 3. Restart ComfyUI 714 | 4. Check console for model discovery messages 715 | 716 |
717 | 718 |
719 | Out of Memory (OOM) 720 | 721 | **Error:** `CUDA out of memory` 722 | 723 | **Memory requirements:** 724 | - 4-bit: ~6GB VRAM (slower) 725 | - 8-bit: ~7GB VRAM (slower) 726 | - float16/bfloat16: ~8-9GB VRAM (fast, recommended) 727 | - float32: ~16GB VRAM 728 | 729 | **Solutions (try in order):** 730 | 1. Use **4-bit** dtype if you have ≤8GB VRAM (~6GB usage) 731 | 2. Use **8-bit** dtype if you have ~8-10GB VRAM (~7GB usage) 732 | 3. Use **float16** if you have 10GB+ VRAM (faster than quantization!) 733 | 4. Enable `keep_model_in_vram=False` to free VRAM after generation 734 | 5. Reduce `max_tokens` to 1000-1500 735 | 6. Close other VRAM-heavy applications 736 | 7. Use CPU (much slower but works) 737 | 738 | **Note:** If you have 10GB+ VRAM, use float16/bfloat16 for best speed! 739 | 740 |
741 | 742 |
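To choose between quantization and fp16 up front, you can check free VRAM with standard PyTorch calls:

```python
# Report free/total VRAM so you can pick a dtype before loading the model.
import torch

if torch.cuda.is_available():
    free, total = torch.cuda.mem_get_info()
    print(f"Free VRAM: {free / 1024**3:.1f} GB of {total / 1024**3:.1f} GB")
else:
    print("No CUDA device detected; CPU generation works but is much slower.")
```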
743 | Quantization Errors 744 | 745 | **Error:** `bitsandbytes not found` 746 | 747 | **Solution:** 748 | ```bash 749 | pip install "bitsandbytes>=0.41.0" 750 | ``` 751 | 752 | **Error:** `Quantization requires CUDA` 753 | 754 | **Solution:** 755 | - 4-bit/8-bit only work on CUDA 756 | - Switch to `float16`/`bfloat16` for CPU 757 | 758 |
759 | 760 |
761 | No Audio Generated 762 | 763 | **Error:** `No SNAC audio tokens generated!` 764 | 765 | **Solutions:** 766 | 1. Increase `max_tokens` to 2000-4000 767 | 2. Adjust `temperature` to 0.3-0.5 768 | 3. Simplify voice description 769 | 4. Check text isn't too long 770 | 5. Try different seed value 771 | 772 |
773 | 774 |
775 | Flash Attention Installation Failed 776 | 777 | **Error:** `flash-attn` won't install 778 | 779 | **Solution:** 780 | - Flash Attention requires CUDA and specific setup 781 | - Just use **SDPA** instead (works great, actually faster for TTS!) 782 | - SDPA is the recommended default 783 | 784 |
785 | 786 |
787 | Info Button Not Visible 788 | 789 | **Issue:** Can't see the "?" or "i" icon, only hover tooltip 790 | 791 | **Answer:** This is **normal** and working correctly! 792 | 793 | - ComfyUI's `DESCRIPTION` creates a hover tooltip 794 | - Some ComfyUI versions show no visible icon 795 | - Just hover over the node title area to see help 796 | - Contains all emotion tags and usage examples 797 | 798 |
799 | 800 | --- 801 | 802 | ## 📊 Performance Tips 803 | 804 | 1. **Use float16/bfloat16** if you have 10GB+ VRAM (fastest!) 805 | 2. **Use quantization (4-bit/8-bit)** ONLY if limited VRAM (<10GB) - slower but fits in memory 806 | 3. **Keep SDPA** as attention mechanism (fastest for single TTS) 807 | 4. **Enable model caching** (`keep_model_in_vram=True`) for multiple generations 808 | 5. **Optimize max_tokens**: Start with 1500-2000 809 | 6. **Batch similar requests** with same voice description for efficiency 810 | 811 | ⚠️ **Speed ranking:** float16/bfloat16 (fastest) > float32 > 8-bit > 4-bit (slowest) 812 | 813 | --- 814 | 815 | ## 🏗️ Technical Details 816 | 817 |
818 | Architecture 819 | 820 | - **Model**: 3B-parameter Llama-based transformer 821 | - **Audio Codec**: SNAC (Multi-Scale Neural Audio Codec) 822 | - **Sample Rate**: 24kHz mono 823 | - **Frame Structure**: 7 tokens per frame (3 hierarchical levels) 824 | - **Token Ranges**: 825 |   - SNAC tokens: 128266-156937 826 |   - Text EOS: 128009 827 |   - SNAC EOS: 128258 828 | - **Compression**: ~0.98 kbps streaming 829 | 830 |
831 | 832 |
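To make the frame structure concrete, the sketch below groups generated token ids into 7-token SNAC frames using the ranges listed above. It is illustrative only; the actual unframing lives in `core/snac_decoder.py` and may differ in detail:

```python
# Illustrative grouping of generated ids into 7-token SNAC frames.
# Constants come from the token ranges above; the real decoder may differ.
SNAC_TOKEN_START = 128266   # first SNAC audio token id
TOKENS_PER_FRAME = 7        # 3 hierarchical SNAC levels per frame

def group_frames(token_ids):
    audio = [t - SNAC_TOKEN_START for t in token_ids if t >= SNAC_TOKEN_START]
    usable = len(audio) - (len(audio) % TOKENS_PER_FRAME)
    return [audio[i:i + TOKENS_PER_FRAME] for i in range(0, usable, TOKENS_PER_FRAME)]

print(len(group_frames(range(128266, 128266 + 70))))  # -> 10 frames
```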
833 | File Structure 834 | 835 | ``` 836 | ComfyUI-Maya1_TTS/ 837 | ├── __init__.py # Node registration 838 | ├── nodes/ 839 | │ ├── __init__.py 840 | │ └── maya1_tts_combined.py # AIO node (backend) 841 | ├── js/ 842 | │ ├── maya1_tts.js # Custom canvas UI (1800+ lines) 843 | │ └── config.js # UI config (presets, emotions, tooltips) 844 | ├── core/ 845 | │ ├── model_wrapper.py # Model loading & quantization 846 | │ ├── snac_decoder.py # SNAC audio decoding 847 | │ └── utils.py # Utilities & cancel support 848 | ├── resources/ 849 | │ ├── emotions.txt # 16 emotion tags 850 | │ └── prompt_examples.txt # Voice description examples 851 | ├── pyproject.toml # Package metadata 852 | ├── requirements.txt # Dependencies 853 | └── README.md # This file 854 | ``` 855 | 856 |
857 | 858 |
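For reference, node registration in `__init__.py` typically follows the standard ComfyUI pattern below. The mapping keys and display strings here are illustrative, not the pack's exact values:

```python
# Typical ComfyUI registration pattern for __init__.py (illustrative names).
from .nodes.maya1_tts_combined import Maya1TTSCombinedNode

NODE_CLASS_MAPPINGS = {"Maya1TTS_AIO": Maya1TTSCombinedNode}
NODE_DISPLAY_NAME_MAPPINGS = {"Maya1TTS_AIO": "Maya1 TTS (AIO)"}
WEB_DIRECTORY = "./js"  # serves js/maya1_tts.js and js/config.js

__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS", "WEB_DIRECTORY"]
```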
859 | ComfyUI Integration 860 | 861 | - **Custom Canvas UI**: Full JavaScript UI with LiteGraph.js canvas API 862 | - **Cancel Support**: Native `execution.interruption_requested()` 863 | - **Progress Bars**: `comfy.utils.ProgressBar` 864 | - **Audio Format**: ComfyUI AUDIO type (24kHz mono) 865 | - **Model Caching**: Automatic with dtype change detection 866 | - **VRAM Management**: Manual control via toggle 867 | - **Event Handling**: Document-level keyboard/mouse capture for proper text editing 868 | - **Visual Feedback**: Real-time tooltips, animations, and hover states 869 | 870 |
871 | 872 | 873 | 874 | 875 | --- 876 | 877 | ## 📝 Credits 878 | 879 | - **Maya1 Model**: [Maya Research](https://www.mayaresearch.ai/) 880 | - **HuggingFace**: [maya-research/maya1](https://huggingface.co/maya-research/maya1) 881 | - **SNAC Codec**: [hubertsiuzdak/snac](https://github.com/hubertsiuzdak/snac) 882 | - **ComfyUI**: [comfyanonymous/ComfyUI](https://github.com/comfyanonymous/ComfyUI) 883 | 884 | --- 885 | 886 | ## 📄 License 887 | 888 | Apache 2.0 - See [LICENSE](LICENSE) 889 | 890 | Maya1 model is also licensed under Apache 2.0 by Maya Research. 891 | 892 | --- 893 | 894 | ## 🔗 Links 895 | 896 | - **Issues**: [GitHub Issues](https://github.com/Saganaki22/-ComfyUI-Maya1_TTS/issues) 897 | - **Maya Research**: [Website](https://www.mayaresearch.ai/) | [Twitter](https://twitter.com/mayaresearch_ai) 898 | - **Model Page**: [HuggingFace](https://huggingface.co/maya-research/maya1) 899 | 900 | --- 901 | 902 | ## 📖 Citation 903 | 904 | If you use Maya1 in your research, please cite: 905 | 906 | ```bibtex 907 | @misc{maya1voice2025, 908 | title={Maya1: Open Source Voice AI with Emotional Intelligence}, 909 | author={Maya Research}, 910 | year={2025}, 911 | publisher={Hugging Face}, 912 | howpublished={\url{https://huggingface.co/maya-research/maya1}}, 913 | } 914 | ``` 915 | 916 | --- 917 | 918 | *Bringing expressive voice AI to everyone through open source.* 919 | -------------------------------------------------------------------------------- /nodes/maya1_tts_combined.py: -------------------------------------------------------------------------------- 1 | """ 2 | Maya1 TTS Combined Node for ComfyUI. 3 | All-in-one node with model loading and TTS generation. 4 | """ 5 | 6 | import torch 7 | import numpy as np 8 | import random 9 | import re 10 | import gc 11 | from typing import Tuple, List 12 | import comfy.model_management as mm 13 | 14 | from ..core import ( 15 | Maya1ModelLoader, 16 | SNACDecoder, 17 | discover_maya1_models, 18 | get_model_path, 19 | get_maya1_models_dir, 20 | format_prompt, 21 | load_emotions_list, 22 | crossfade_audio 23 | ) 24 | 25 | 26 | def create_progress_bar(current: int, total: int, width: int = 12, show_numbers: bool = True) -> str: 27 | """ 28 | Create a visual progress bar like ComfyUI's native one. 29 | 30 | Args: 31 | current: Current progress value 32 | total: Total value 33 | width: Width of the progress bar in characters 34 | show_numbers: Whether to show the numbers after the bar 35 | 36 | Returns: 37 | Formatted progress bar string 38 | """ 39 | if total == 0: 40 | percent = 0 41 | else: 42 | percent = min(current / total, 1.0) 43 | 44 | filled = int(width * percent) 45 | empty = width - filled 46 | 47 | bar = '█' * filled + '░' * empty 48 | 49 | if show_numbers: 50 | return f"[{bar}] {current}/{total}" 51 | else: 52 | return f"[{bar}]" 53 | 54 | def split_text_smartly(text: str, max_words_per_chunk: int = 100) -> List[str]: 55 | """ 56 | Split text into chunks at sentence boundaries, keeping emotion tags intact. 57 | Improved to NEVER cut words mid-sentence. 58 | 59 | Args: 60 | text: Input text to split 61 | max_words_per_chunk: Maximum words per chunk (default 100) 62 | 63 | Returns: 64 | List of text chunks 65 | """ 66 | # Better sentence boundary detection that handles emotion tags 67 | # Split on: . ! ? 
followed by whitespace (and optionally capital letter or end of string) 68 | # This regex keeps the punctuation with the sentence 69 | sentence_pattern = r'(?<=[.!?])\s+(?=[A-Z<]|$)' 70 | sentences = re.split(sentence_pattern, text.strip()) 71 | 72 | # Clean up empty sentences 73 | sentences = [s.strip() for s in sentences if s.strip()] 74 | 75 | # Group sentences into chunks 76 | chunks = [] 77 | current_chunk = [] 78 | current_word_count = 0 79 | 80 | for sentence in sentences: 81 | # Count words (emotion tags don't count as words) 82 | # Remove emotion tags temporarily for word count 83 | text_without_tags = re.sub(r'<[^>]+>', '', sentence) 84 | word_count = len(text_without_tags.split()) 85 | 86 | # If single sentence exceeds max, split on commas or semicolons 87 | if word_count > max_words_per_chunk: 88 | # Split long sentence on commas, keeping punctuation 89 | parts = re.split(r'(,\s+|;\s+)', sentence) 90 | 91 | for i, part in enumerate(parts): 92 | if not part.strip(): 93 | continue 94 | 95 | # For delimiters (commas/semicolons), add to previous chunk 96 | if part.strip() in [',', ';']: 97 | if current_chunk: 98 | current_chunk[-1] += part 99 | continue 100 | 101 | # Count words in this part 102 | part_text = re.sub(r'<[^>]+>', '', part) 103 | part_words = len(part_text.split()) 104 | 105 | if current_word_count + part_words > max_words_per_chunk and current_chunk: 106 | # Start new chunk 107 | chunks.append(''.join(current_chunk)) 108 | current_chunk = [part] 109 | current_word_count = part_words 110 | else: 111 | # Add to current chunk 112 | if current_chunk and not current_chunk[-1].endswith((' ', ',', ';')): 113 | current_chunk.append(' ') 114 | current_chunk.append(part) 115 | current_word_count += part_words 116 | else: 117 | # Normal sentence handling 118 | if current_word_count + word_count > max_words_per_chunk and current_chunk: 119 | # Save current chunk and start new one 120 | chunks.append(''.join(current_chunk)) 121 | current_chunk = [sentence] 122 | current_word_count = word_count 123 | else: 124 | # Add to current chunk with space 125 | if current_chunk: 126 | current_chunk.append(' ') 127 | current_chunk.append(sentence) 128 | current_word_count += word_count 129 | 130 | # Add remaining chunk 131 | if current_chunk: 132 | chunks.append(''.join(current_chunk)) 133 | 134 | return chunks if chunks else [text] 135 | 136 | 137 | class Maya1TTSCombinedNode: 138 | """ 139 | Combined Maya1 TTS node - loads model and generates speech in one node. 
140 | 141 | Features: 142 | - Model loading with caching 143 | - Voice design through natural language 144 | - 20+ emotion tags with clickable buttons 145 | - Native ComfyUI cancel support 146 | - Real-time progress tracking 147 | - VRAM management 148 | """ 149 | 150 | DESCRIPTION = "" 151 | 152 | @classmethod 153 | def INPUT_TYPES(cls): 154 | """Define input parameters for the node.""" 155 | return { 156 | "required": { 157 | # Model settings 158 | "model_name": (discover_maya1_models(), { 159 | "default": discover_maya1_models()[0] if discover_maya1_models() else None 160 | }), 161 | "dtype": (["4bit (BNB)", "8bit (BNB)", "float16", "bfloat16", "float32"], { 162 | "default": "bfloat16" 163 | }), 164 | "attention_mechanism": (["sdpa", "eager", "flash_attention_2", "sage_attention"], { 165 | "default": "sdpa" 166 | }), 167 | "device": (["cuda", "cpu"], { 168 | "default": "cuda" 169 | }), 170 | 171 | # Voice and text 172 | "voice_description": ("STRING", { 173 | "multiline": True, 174 | "default": "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.", 175 | "dynamicPrompts": False 176 | }), 177 | "text": ("STRING", { 178 | "multiline": True, 179 | "default": "Hello! This is Maya1 the best open source voice AI model with emotions.", 180 | "dynamicPrompts": False 181 | }), 182 | 183 | # Generation settings 184 | "keep_model_in_vram": ("BOOLEAN", { 185 | "default": True 186 | }), 187 | "temperature": ("FLOAT", { 188 | "default": 0.4, # Official Maya1 recommendation (from transformers_inference.py) 189 | "min": 0.1, 190 | "max": 2.0, 191 | "step": 0.05 192 | }), 193 | "top_p": ("FLOAT", { 194 | "default": 0.9, 195 | "min": 0.1, 196 | "max": 1.0, 197 | "step": 0.05 198 | }), 199 | "max_new_tokens": ("INT", { 200 | "default": 4000, 201 | "min": 100, 202 | "max": 16000, 203 | "step": 100, 204 | "tooltip": "Maximum NEW SNAC tokens to generate per chunk (excludes input prompt tokens). Higher = longer audio per chunk (~50 tokens/word). 4000 tokens ≈ 30-40s audio" 205 | }), 206 | "repetition_penalty": ("FLOAT", { 207 | "default": 1.1, 208 | "min": 1.0, 209 | "max": 2.0, 210 | "step": 0.05 211 | }), 212 | "seed": ("INT", { 213 | "default": 0, 214 | "min": 0, 215 | "max": 0xffffffffffffffff 216 | }), 217 | "chunk_longform": ("BOOLEAN", { 218 | "default": False, 219 | "tooltip": "Split long text into chunks at sentence boundaries with smooth crossfading. Enables unlimited audio length beyond the 18-20s limit" 220 | }), 221 | } 222 | } 223 | 224 | RETURN_TYPES = ("AUDIO",) 225 | RETURN_NAMES = ("audio",) 226 | FUNCTION = "generate_speech" 227 | CATEGORY = "audio/maya1" 228 | 229 | def cleanup_vram(self): 230 | """ 231 | Native ComfyUI VRAM cleanup - unloads all models and clears cache. 232 | Follows best practices from ComfyUI's memory management system. 
233 | """ 234 | print("🗑️ Cleaning up VRAM...") 235 | 236 | # Step 1: Unload all models from VRAM 237 | mm.unload_all_models() 238 | 239 | # Step 2: Clear ComfyUI's internal cache 240 | mm.soft_empty_cache() 241 | 242 | # Step 3: Python garbage collection 243 | gc.collect() 244 | 245 | # Step 4: Clear CUDA caches (if available) 246 | if torch.cuda.is_available(): 247 | torch.cuda.empty_cache() 248 | torch.cuda.ipc_collect() 249 | 250 | print("✅ VRAM cleanup complete") 251 | 252 | def generate_speech( 253 | self, 254 | model_name: str, 255 | dtype: str, 256 | attention_mechanism: str, 257 | device: str, 258 | voice_description: str, 259 | text: str, 260 | keep_model_in_vram: bool, 261 | temperature: float, 262 | top_p: float, 263 | max_new_tokens: int, 264 | repetition_penalty: float, 265 | seed: int, 266 | chunk_longform: bool, 267 | emotion_tag_insert: str = "(none)", 268 | chunk_index: int = None, 269 | total_chunks: int = None 270 | ) -> Tuple[dict]: 271 | """ 272 | Load model (if needed) and generate expressive speech. 273 | 274 | Returns: 275 | Tuple containing audio dictionary for ComfyUI 276 | """ 277 | # Import ComfyUI utilities for progress and cancellation 278 | import comfy.utils 279 | import comfy.model_management as mm 280 | 281 | # Check for cancellation before starting 282 | mm.throw_exception_if_processing_interrupted() 283 | 284 | # Simple seed logic: if seed is 0, randomize; otherwise use the provided seed 285 | # This way seed=0 is always random, and you can set a specific seed for reproducibility 286 | if seed == 0: 287 | actual_seed = random.randint(1, 0xffffffffffffffff) 288 | else: 289 | actual_seed = seed 290 | 291 | print("=" * 70) 292 | print("🎤 Maya1 TTS Generation") 293 | print("=" * 70) 294 | print(f"🎲 Seed: {actual_seed}") 295 | print(f"💾 VRAM setting: {'Keep in VRAM' if keep_model_in_vram else 'Offload after generation'}") 296 | 297 | # ========== MODEL LOADING ========== 298 | # Get the expected models directory 299 | models_dir = get_maya1_models_dir() 300 | 301 | # Validate model name 302 | if model_name.startswith("(No"): 303 | raise ValueError( 304 | f"No valid Maya1 models found!\n\n" 305 | f"Expected location: {models_dir}\n\n" 306 | f"Please download a model:\n" 307 | f" 1. Create models directory:\n" 308 | f" mkdir -p {models_dir}\n\n" 309 | f" 2. Download Maya1:\n" 310 | f" huggingface-cli download maya-research/maya1 \\\n" 311 | f" --local-dir {models_dir}/maya1\n\n" 312 | f" 3. Restart ComfyUI to refresh the dropdown." 
313 | ) 314 | 315 | # Get full model path 316 | model_path = get_model_path(model_name) 317 | 318 | if not model_path.exists(): 319 | raise FileNotFoundError( 320 | f"Model not found: {model_path}\n\n" 321 | f"Make sure the model is properly downloaded to:\n" 322 | f" {model_path}" 323 | ) 324 | 325 | # Check device availability 326 | if device == "cuda" and not torch.cuda.is_available(): 327 | print("⚠️ CUDA not available, falling back to CPU") 328 | device = "cpu" 329 | 330 | # ========== MODEL LOADING ========== 331 | print(f"🔍 Validating model files in: {model_path}") 332 | 333 | critical_files = { 334 | "config.json": model_path / "config.json", 335 | "generation_config.json": model_path / "generation_config.json", 336 | "tokenizer_config.json": model_path / "tokenizer" / "tokenizer_config.json", 337 | "tokenizer.json": model_path / "tokenizer" / "tokenizer.json", 338 | "model weights": model_path / "model-00001-of-00002.safetensors", 339 | } 340 | 341 | missing_files = [] 342 | for file_name, file_path in critical_files.items(): 343 | if file_path.exists(): 344 | print(f" ✅ {file_name}") 345 | else: 346 | print(f" ❌ {file_name} - MISSING!") 347 | missing_files.append(file_name) 348 | 349 | if missing_files: 350 | raise FileNotFoundError( 351 | f"Missing critical model files: {', '.join(missing_files)}\n\n" 352 | f"Model directory: {model_path}\n\n" 353 | f"Please re-download the complete model:\n" 354 | f" huggingface-cli download maya-research/maya1 \\\n" 355 | f" --local-dir {model_path}" 356 | ) 357 | 358 | # Strip "(BNB)" suffix from dtype labels if present 359 | dtype_clean = dtype.replace(" (BNB)", "") 360 | 361 | # Load model using the wrapper (with caching) 362 | try: 363 | maya1_model = Maya1ModelLoader.load_model( 364 | model_path=model_path, 365 | attention_type=attention_mechanism, 366 | dtype=dtype_clean, 367 | device=device 368 | ) 369 | except Exception as e: 370 | raise RuntimeError( 371 | f"Failed to load Maya1 model:\n{str(e)}\n\n" 372 | f"Model: {model_name}\n" 373 | f"Attention: {attention_mechanism}\n" 374 | f"Dtype: {dtype_clean}\n" 375 | f"Device: {device}" 376 | ) 377 | 378 | mm.throw_exception_if_processing_interrupted() 379 | 380 | # ========== SPEECH GENERATION ========== 381 | print(f"Keep in VRAM: {keep_model_in_vram}") 382 | print(f"Voice: {voice_description[:60]}...") 383 | print(f"Text: {text[:60]}...") 384 | print(f"Temperature: {temperature}, Top-p: {top_p}") 385 | print(f"Max tokens: {max_new_tokens}") 386 | print("=" * 70) 387 | 388 | # ========== LONGFORM CHUNKING ========== 389 | # Check if text should be chunked (enabled + text is reasonably long) 390 | word_count = len(text.split()) 391 | if chunk_longform and word_count > 80: # Only chunk if >80 words 392 | print(f"📚 Longform mode enabled: {word_count} words detected") 393 | print(f"🔪 Splitting text into chunks at sentence boundaries...") 394 | 395 | # Calculate words per chunk based on max_new_tokens 396 | # Empirical data: 1 word ≈ 50-55 SNAC tokens 397 | # Leave some headroom (80%) to avoid exceeding max_new_tokens 398 | estimated_words_per_chunk = int((max_new_tokens * 0.8) / 50) 399 | estimated_words_per_chunk = max(50, min(estimated_words_per_chunk, 300)) # Clamp between 50-300 400 | 401 | print(f"📏 Max tokens: {max_new_tokens} → ~{estimated_words_per_chunk} words per chunk (~{estimated_words_per_chunk / 150:.1f}min per chunk)") 402 | 403 | text_chunks = split_text_smartly(text, max_words_per_chunk=estimated_words_per_chunk) 404 | print(f"📦 Split into {len(text_chunks)} chunks") 405 | 
print("=" * 70) 406 | 407 | # Create outer progress bar for chunks (layered progress) 408 | import comfy.utils 409 | chunk_progress = comfy.utils.ProgressBar(len(text_chunks)) 410 | 411 | all_audio_data = [] 412 | sample_rate = None 413 | 414 | for i, chunk_text in enumerate(text_chunks): 415 | # Create visual progress display for chunks 416 | chunk_bar = create_progress_bar(i + 1, len(text_chunks), width=6) 417 | print(f"\n🎤 Chunk Progress: {chunk_bar}") 418 | print(f"📝 Text: {chunk_text[:60]}...") 419 | print("=" * 70) 420 | 421 | # Check for cancellation before each chunk 422 | mm.throw_exception_if_processing_interrupted() 423 | 424 | # Recursively call generate_speech for this chunk with chunk_longform=False 425 | # to avoid infinite recursion 426 | chunk_audio = self.generate_speech( 427 | model_name=model_name, 428 | dtype=dtype, 429 | attention_mechanism=attention_mechanism, 430 | device=device, 431 | voice_description=voice_description, 432 | text=chunk_text, 433 | keep_model_in_vram=True, # Keep in VRAM between chunks 434 | temperature=temperature, 435 | top_p=top_p, 436 | max_new_tokens=max_new_tokens, 437 | repetition_penalty=repetition_penalty, 438 | seed=actual_seed, # Use same seed for all chunks 439 | chunk_longform=False, # Disable chunking for recursive calls 440 | emotion_tag_insert=emotion_tag_insert, 441 | chunk_index=i + 1, # Pass chunk context for layered progress 442 | total_chunks=len(text_chunks) 443 | ) 444 | 445 | # Extract audio data (returns tuple, get first element) 446 | chunk_audio_dict = chunk_audio[0] 447 | chunk_waveform = chunk_audio_dict["waveform"] 448 | sample_rate = chunk_audio_dict["sample_rate"] 449 | 450 | # Update chunk progress (outer progress bar) 451 | chunk_progress.update(1) 452 | all_audio_data.append(chunk_waveform) 453 | 454 | mm.throw_exception_if_processing_interrupted() 455 | 456 | print(f"\n{'=' * 70}") 457 | print(f"🔗 Combining {len(all_audio_data)} audio chunks with crossfading...") 458 | 459 | # Combine audio chunks with crossfading for smooth transitions 460 | # Crossfade duration: 50ms = 1200 samples at 24kHz 461 | combined_waveform_np = all_audio_data[0] 462 | 463 | for i in range(1, len(all_audio_data)): 464 | # Crossfade between chunks (1200 samples = 50ms at 24kHz) 465 | combined_waveform_np = crossfade_audio( 466 | combined_waveform_np, 467 | all_audio_data[i], 468 | crossfade_samples=1200 469 | ) 470 | 471 | # Ensure it's a torch tensor 472 | if not isinstance(combined_waveform_np, torch.Tensor): 473 | combined_waveform = torch.from_numpy(combined_waveform_np) 474 | else: 475 | combined_waveform = combined_waveform_np 476 | 477 | print(f"✅ Generated {combined_waveform.shape[-1] / sample_rate:.2f}s of audio from {len(text_chunks)} chunks") 478 | print("=" * 70) 479 | 480 | # Handle VRAM cleanup if requested 481 | if not keep_model_in_vram: 482 | print("🗑️ Offloading model from VRAM...") 483 | Maya1ModelLoader.clear_cache(force=True) 484 | print("✅ Model offloaded from VRAM") 485 | 486 | return ({ 487 | "waveform": combined_waveform, 488 | "sample_rate": sample_rate 489 | },) 490 | 491 | # ========== SINGLE GENERATION (NO CHUNKING) ========== 492 | # Set seed for reproducibility 493 | torch.manual_seed(actual_seed) 494 | if torch.cuda.is_available(): 495 | torch.cuda.manual_seed_all(actual_seed) 496 | 497 | # Format prompt using Maya1's OFFICIAL format (from transformers_inference.py) 498 | print("🔤 Formatting prompt with control tokens...") 499 | 500 | # Official Maya1 control token IDs 501 | SOH_ID = 128259 # Start of 
Header 502 | EOH_ID = 128260 # End of Header 503 | SOA_ID = 128261 # Start of Audio 504 | CODE_START_TOKEN_ID = 128257 # Start of Speech codes 505 | TEXT_EOT_ID = 128009 # End of Text 506 | 507 | # Decode control tokens 508 | soh_token = maya1_model.tokenizer.decode([SOH_ID]) 509 | eoh_token = maya1_model.tokenizer.decode([EOH_ID]) 510 | soa_token = maya1_model.tokenizer.decode([SOA_ID]) 511 | sos_token = maya1_model.tokenizer.decode([CODE_START_TOKEN_ID]) 512 | eot_token = maya1_model.tokenizer.decode([TEXT_EOT_ID]) 513 | bos_token = maya1_model.tokenizer.bos_token 514 | 515 | # Build formatted text: voice description tag + script 516 | formatted_text = f'<description="{voice_description}"> {text}' 517 | 518 | # Construct full prompt with all control tokens (CRITICAL for avoiding garbling!) 519 | prompt = ( 520 | soh_token + bos_token + formatted_text + eot_token + 521 | eoh_token + soa_token + sos_token 522 | ) 523 | 524 | # Debug: Print formatted prompt 525 | print(f"📝 Formatted text: {formatted_text[:100]}...") 526 | print(f"📝 Full prompt preview (first 200 chars): {repr(prompt[:200])}...") 527 | 528 | # Tokenize input 529 | inputs = maya1_model.tokenizer( 530 | prompt, 531 | return_tensors="pt" 532 | ) 533 | print(f"📊 Input token count: {inputs['input_ids'].shape[1]}") 534 | 535 | # Move to device 536 | inputs = {k: v.to(device) for k, v in inputs.items()} 537 | 538 | # Check for cancellation 539 | mm.throw_exception_if_processing_interrupted() 540 | 541 | # Generate with progress tracking and cancellation checks 542 | print(f"🎵 Generating speech (max {max_new_tokens} tokens)...") 543 | 544 | try: 545 | # Setup progress tracking 546 | from comfy.utils import ProgressBar 547 | progress_bar = ProgressBar(max_new_tokens) 548 | 549 | # Create stopping criteria for cancellation support 550 | from transformers import StoppingCriteria, StoppingCriteriaList 551 | 552 | class InterruptionStoppingCriteria(StoppingCriteria): 553 | """Custom stopping criteria that checks for ComfyUI cancellation.""" 554 | def __init__(self, progress_bar, chunk_index=None, total_chunks=None): 555 | self.progress_bar = progress_bar 556 | self.current_tokens = 0 557 | self.input_length = 0 558 | self.start_time = None 559 | self.last_print_time = None 560 | self.print_interval = 0.5 # Print progress every 0.5 seconds 561 | self.chunk_index = chunk_index 562 | self.total_chunks = total_chunks 563 | 564 | def __call__(self, input_ids, scores, **kwargs): 565 | import time 566 | 567 | # Store input length and start time on first call 568 | if self.input_length == 0: 569 | self.input_length = input_ids.shape[1] 570 | self.start_time = time.time() 571 | self.last_print_time = self.start_time 572 | 573 | # Update progress 574 | new_tokens = input_ids.shape[1] - self.input_length 575 | if new_tokens > self.current_tokens: 576 | self.progress_bar.update(new_tokens - self.current_tokens) 577 | self.current_tokens = new_tokens 578 | 579 | # Print progress with visual bar and it/s to console 580 | current_time = time.time() 581 | if current_time - self.last_print_time >= self.print_interval: 582 | elapsed = current_time - self.start_time 583 | it_per_sec = new_tokens / elapsed if elapsed > 0 else 0 584 | 585 | # Create visual progress bar for tokens 586 | token_bar = create_progress_bar(new_tokens, max_new_tokens, width=12) 587 | 588 | # Show layered progress if in chunked mode 589 | if self.chunk_index is not None and self.total_chunks is not None: 590 | chunk_bar = create_progress_bar(self.chunk_index, self.total_chunks, width=6, show_numbers=False) 591 | print(f" Chunk {chunk_bar} → Token
Progress: {token_bar} | Speed: {it_per_sec:.2f} it/s", end='\r') 592 | else: 593 | print(f" Progress: {token_bar} | Speed: {it_per_sec:.2f} it/s | Elapsed: {elapsed:.1f}s", end='\r') 594 | 595 | self.last_print_time = current_time 596 | 597 | # Check for cancellation using ComfyUI's native method 598 | try: 599 | mm.throw_exception_if_processing_interrupted() 600 | except: 601 | # If interrupted, stop generation gracefully 602 | print("\n🛑 Generation cancelled by user") 603 | return True # Stop generation 604 | 605 | return False # Continue generation 606 | 607 | stopping_criteria = StoppingCriteriaList([ 608 | InterruptionStoppingCriteria(progress_bar, chunk_index=chunk_index, total_chunks=total_chunks) 609 | ]) 610 | 611 | # Generate tokens with cancellation support 612 | # CRITICAL: Maya1 has TWO EOS tokens in generation_config.json: 613 | # - 128009 (<|eot_id|>) - Text completion token 614 | # - 128258 - SNAC audio completion token 615 | # We need to ONLY stop on 128258 (SNAC done), not 128009 (text done) 616 | # Otherwise the model generates text, hits 128009, and stops before SNAC codes! 617 | 618 | print("🎵 Generation settings:") 619 | print(f" Using EOS token: 128258 (SNAC completion only)") 620 | print(f" Ignoring EOS token: 128009 (text completion)") 621 | 622 | import time 623 | generation_start = time.time() 624 | 625 | with torch.inference_mode(): 626 | outputs = maya1_model.model.generate( 627 | **inputs, 628 | max_new_tokens=max_new_tokens, 629 | min_new_tokens=28, # At least 4 SNAC frames (4 frames × 7 tokens = 28) 630 | temperature=temperature, 631 | top_p=top_p, 632 | do_sample=True, 633 | repetition_penalty=repetition_penalty, 634 | pad_token_id=maya1_model.tokenizer.pad_token_id, 635 | eos_token_id=128258, # CODE_END_TOKEN_ID - Stop at end of speech 636 | stopping_criteria=stopping_criteria, 637 | use_cache=True, # Enable KV cache for faster generation 638 | ) 639 | 640 | generation_time = time.time() - generation_start 641 | 642 | # Check for cancellation after generation 643 | mm.throw_exception_if_processing_interrupted() 644 | 645 | # Extract generated tokens (remove input tokens) 646 | generated_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist() 647 | 648 | # Print final generation statistics 649 | final_speed = len(generated_ids) / generation_time if generation_time > 0 else 0 650 | print(f"\n✅ Generated {len(generated_ids)} tokens in {generation_time:.2f}s ({final_speed:.2f} it/s)") 651 | 652 | # Debug: Print first few generated token IDs 653 | print(f"🔍 First 10 generated token IDs: {generated_ids[:10]}") 654 | 655 | # Debug: Decode generated tokens to see what was generated 656 | generated_text = maya1_model.tokenizer.decode(generated_ids, skip_special_tokens=False) 657 | print(f"🔍 Generated text (first 100 chars): {generated_text[:100]}...") 658 | 659 | # Filter SNAC tokens 660 | from ..core.snac_decoder import filter_snac_tokens 661 | snac_tokens = filter_snac_tokens(generated_ids) 662 | 663 | if len(snac_tokens) == 0: 664 | raise ValueError( 665 | "No SNAC audio tokens generated!\n" 666 | "The model may have only generated text tokens.\n" 667 | "Try adjusting the prompt or generation parameters." 
668 | ) 669 | 670 | print(f"🎵 Found {len(snac_tokens)} SNAC tokens ({len(snac_tokens) // 7} frames)") 671 | 672 | # Check for cancellation before decoding 673 | mm.throw_exception_if_processing_interrupted() 674 | 675 | # Decode SNAC tokens to audio 676 | print("🔊 Decoding to audio...") 677 | audio_waveform = SNACDecoder.decode(snac_tokens, device=device) 678 | 679 | # Check for cancellation after decoding 680 | mm.throw_exception_if_processing_interrupted() 681 | 682 | # Convert to ComfyUI audio format 683 | audio_tensor = torch.from_numpy(audio_waveform).float() 684 | 685 | # Add batch and channel dimensions: [samples] -> [1, 1, samples] 686 | if audio_tensor.dim() == 1: 687 | audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(0) 688 | elif audio_tensor.dim() == 2: 689 | audio_tensor = audio_tensor.unsqueeze(0) 690 | 691 | audio_output = { 692 | "waveform": audio_tensor, 693 | "sample_rate": 24000 694 | } 695 | 696 | print(f"✅ Generated {len(audio_waveform) / 24000:.2f}s of audio") 697 | print("=" * 70) 698 | 699 | # Handle VRAM management based on toggle 700 | if not keep_model_in_vram: 701 | print("🗑️ Offloading model from VRAM...") 702 | Maya1ModelLoader.clear_cache(force=True) 703 | print("✅ Model offloaded from VRAM") 704 | else: 705 | print("💾 Model kept in VRAM for faster next generation") 706 | 707 | return (audio_output,) 708 | 709 | except InterruptedError as e: 710 | # User cancelled the generation 711 | print(f"\n{str(e)}") 712 | print("=" * 70) 713 | # Note: VRAM cleanup handled by ComfyUI hooks 714 | raise 715 | 716 | except Exception as e: 717 | # Other errors 718 | print(f"\n❌ Generation failed: {str(e)}") 719 | print("=" * 70) 720 | # Note: VRAM cleanup handled by ComfyUI hooks 721 | raise 722 | 723 | 724 | # ComfyUI node mappings 725 | NODE_CLASS_MAPPINGS = { 726 | "Maya1TTS_Combined": Maya1TTSCombinedNode 727 | } 728 | 729 | NODE_DISPLAY_NAME_MAPPINGS = { 730 | "Maya1TTS_Combined": "Maya1 TTS (AIO)" 731 | } 732 | -------------------------------------------------------------------------------- /nodes/maya1_tts_barebones.py: -------------------------------------------------------------------------------- 1 | """ 2 | Maya1 TTS Barebones Node for ComfyUI. 3 | All-in-one node with standard ComfyUI widgets (no custom JavaScript UI). 4 | Use this version if you have issues with the custom JavaScript rendering. 5 | """ 6 | 7 | import torch 8 | import numpy as np 9 | import random 10 | import re 11 | import gc 12 | from typing import Tuple, List 13 | import comfy.model_management as mm 14 | 15 | from ..core import ( 16 | Maya1ModelLoader, 17 | SNACDecoder, 18 | discover_maya1_models, 19 | get_model_path, 20 | get_maya1_models_dir, 21 | format_prompt, 22 | check_interruption, 23 | load_emotions_list, 24 | crossfade_audio 25 | ) 26 | 27 | 28 | def create_progress_bar(current: int, total: int, width: int = 12, show_numbers: bool = True) -> str: 29 | """ 30 | Create a visual progress bar like ComfyUI's native one. 
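    For example (illustrative, matching the logic below): create_progress_bar(3, 10, width=12) returns "[███░░░░░░░░░] 3/10", since int(12 * 0.3) = 3 filled cells and 9 empty ones.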
31 | 32 | Args: 33 | current: Current progress value 34 | total: Total value 35 | width: Width of the progress bar in characters 36 | show_numbers: Whether to show the numbers after the bar 37 | 38 | Returns: 39 | Formatted progress bar string 40 | """ 41 | if total == 0: 42 | percent = 0 43 | else: 44 | percent = min(current / total, 1.0) 45 | 46 | filled = int(width * percent) 47 | empty = width - filled 48 | 49 | bar = '█' * filled + '░' * empty 50 | 51 | if show_numbers: 52 | return f"[{bar}] {current}/{total}" 53 | else: 54 | return f"[{bar}]" 55 | 56 | def split_text_smartly(text: str, max_words_per_chunk: int = 100) -> List[str]: 57 | """ 58 | Split text into chunks at sentence boundaries, keeping emotion tags intact. 59 | Improved to NEVER cut words mid-sentence. 60 | 61 | Args: 62 | text: Input text to split 63 | max_words_per_chunk: Maximum words per chunk (default 100) 64 | 65 | Returns: 66 | List of text chunks 67 | """ 68 | # Better sentence boundary detection that handles emotion tags 69 | # Split on: . ! ? followed by whitespace (and optionally capital letter or end of string) 70 | # This regex keeps the punctuation with the sentence 71 | sentence_pattern = r'(?<=[.!?])\s+(?=[A-Z<]|$)' 72 | sentences = re.split(sentence_pattern, text.strip()) 73 | 74 | # Clean up empty sentences 75 | sentences = [s.strip() for s in sentences if s.strip()] 76 | 77 | # Group sentences into chunks 78 | chunks = [] 79 | current_chunk = [] 80 | current_word_count = 0 81 | 82 | for sentence in sentences: 83 | # Count words (emotion tags don't count as words) 84 | # Remove emotion tags temporarily for word count 85 | text_without_tags = re.sub(r'<[^>]+>', '', sentence) 86 | word_count = len(text_without_tags.split()) 87 | 88 | # If single sentence exceeds max, split on commas or semicolons 89 | if word_count > max_words_per_chunk: 90 | # Split long sentence on commas, keeping punctuation 91 | parts = re.split(r'(,\s+|;\s+)', sentence) 92 | 93 | for i, part in enumerate(parts): 94 | if not part.strip(): 95 | continue 96 | 97 | # For delimiters (commas/semicolons), add to previous chunk 98 | if part.strip() in [',', ';']: 99 | if current_chunk: 100 | current_chunk[-1] += part 101 | continue 102 | 103 | # Count words in this part 104 | part_text = re.sub(r'<[^>]+>', '', part) 105 | part_words = len(part_text.split()) 106 | 107 | if current_word_count + part_words > max_words_per_chunk and current_chunk: 108 | # Start new chunk 109 | chunks.append(''.join(current_chunk)) 110 | current_chunk = [part] 111 | current_word_count = part_words 112 | else: 113 | # Add to current chunk 114 | if current_chunk and not current_chunk[-1].endswith((' ', ',', ';')): 115 | current_chunk.append(' ') 116 | current_chunk.append(part) 117 | current_word_count += part_words 118 | else: 119 | # Normal sentence handling 120 | if current_word_count + word_count > max_words_per_chunk and current_chunk: 121 | # Save current chunk and start new one 122 | chunks.append(''.join(current_chunk)) 123 | current_chunk = [sentence] 124 | current_word_count = word_count 125 | else: 126 | # Add to current chunk with space 127 | if current_chunk: 128 | current_chunk.append(' ') 129 | current_chunk.append(sentence) 130 | current_word_count += word_count 131 | 132 | # Add remaining chunk 133 | if current_chunk: 134 | chunks.append(''.join(current_chunk)) 135 | 136 | return chunks if chunks else [text] 137 | 138 | 139 | class Maya1TTSBarebonesNode: 140 | """ 141 | Barebones Maya1 TTS node - standard ComfyUI widgets only (no custom 
JavaScript). 142 | 143 | Use this version if you're experiencing issues with the custom UI rendering. 144 | 145 | Features: 146 | - Model loading with caching 147 | - Voice design through natural language 148 | - Emotion tags support (manually type <laugh>, <sigh>, etc.) 149 | - Native ComfyUI cancel support 150 | - Real-time progress tracking 151 | - VRAM management 152 | """ 153 | 154 | DESCRIPTION = "Maya1 TTS with standard widgets (for users with JavaScript rendering issues)" 155 | 156 | @classmethod 157 | def INPUT_TYPES(cls): 158 | """Define input parameters for the node.""" 159 | return { 160 | "required": { 161 | # Voice and text (top) 162 | "voice_description": ("STRING", { 163 | "multiline": True, 164 | "default": "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.", 165 | "dynamicPrompts": False, 166 | "tooltip": "Describe your desired voice using natural language. Include: age (20s-50s), gender (male/female), accent (American/British/etc), pitch (low/normal/high), timbre (warm/gravelly/smooth), pacing (fast/conversational/slow), tone (happy/calm/energetic)" 167 | }), 168 | "text": ("STRING", { 169 | "multiline": True, 170 | "default": "Hello! This is Maya1 the best open source voice AI model with emotions.", 171 | "dynamicPrompts": False, 172 | "tooltip": "Your script text to synthesize. Add emotion tags anywhere in the text (type manually - no visual buttons in barebones mode). All 17 available tags: <laugh>, <laugh_harder>, <giggle>, <chuckle>, <cry>, <sigh>, <gasp>, <whisper>, <angry>, <scream>, <snort>, <yawn>, <cough>, <sneeze>, <breathing>, <humming>, <throat_clearing>" 173 | }), 174 | 175 | # Model settings 176 | "model_name": (discover_maya1_models(), { 177 | "default": discover_maya1_models()[0] if discover_maya1_models() else None, 178 | "tooltip": "Select Maya1 model from ComfyUI/models/maya1-TTS/ folder. Models are auto-discovered on startup. Download from: huggingface.co/maya-research/maya1" 179 | }), 180 | "dtype": (["4bit (BNB)", "8bit (BNB)", "float16", "bfloat16", "float32"], { 181 | "default": "bfloat16", 182 | "tooltip": "Model precision. 4bit/8bit save VRAM but are SLOWER. Use float16/bfloat16 if you have 10GB+ VRAM for best speed. 4bit≈6GB, 8bit≈7GB, float16/bfloat16≈8-9GB, float32≈16GB" 183 | }), 184 | "attention_mechanism": (["sdpa", "eager", "flash_attention_2", "sage_attention"], { 185 | "default": "sdpa", 186 | "tooltip": "Attention algorithm. SDPA (default) is fastest for single TTS. Eager is standard PyTorch (slower). Flash Attention 2 helps with batch processing. Sage Attention is memory efficient" 187 | }), 188 | "device": (["cuda", "cpu"], { 189 | "default": "cuda", 190 | "tooltip": "Processing device. CUDA (GPU) is recommended for speed. CPU works but is much slower. Will auto-fallback to CPU if CUDA unavailable" 191 | }), 192 | 193 | # Generation settings 194 | "keep_model_in_vram": ("BOOLEAN", { 195 | "default": True, 196 | "tooltip": "Keep model loaded in VRAM after generation. True = faster repeated generations but uses VRAM. False = frees VRAM after each generation but slower" 197 | }), 198 | "chunk_longform": ("BOOLEAN", { 199 | "default": False, 200 | "tooltip": "Split long text into chunks at sentence boundaries with smooth crossfading. Enables unlimited audio length beyond the 18-20s limit. EXPERIMENTAL - may have quality/timing issues" 201 | }), 202 | "max_new_tokens": ("INT", { 203 | "default": 4000, 204 | "min": 100, 205 | "max": 16000, 206 | "step": 100, 207 | "tooltip": "Maximum NEW SNAC tokens to generate per chunk (excludes input prompt tokens). Higher = longer audio per chunk (~50 tokens/word). 4000 tokens ≈ 30-40s audio.
Increase if audio cuts off too early" 208 | }), 209 | "temperature": ("FLOAT", { 210 | "default": 0.4, 211 | "min": 0.1, 212 | "max": 2.0, 213 | "step": 0.05, 214 | "tooltip": "Controls randomness/creativity. Lower (0.1-0.3) = more consistent/predictable. Higher (0.5-1.0) = more varied/creative. 0.4 is official Maya1 recommendation" 215 | }), 216 | "top_p": ("FLOAT", { 217 | "default": 0.9, 218 | "min": 0.1, 219 | "max": 1.0, 220 | "step": 0.05, 221 | "tooltip": "Nucleus sampling - controls diversity of token selection. 0.9 (default) works well for natural speech. Lower = more focused, higher = more diverse. Keep at 0.9 unless experimenting" 222 | }), 223 | "repetition_penalty": ("FLOAT", { 224 | "default": 1.1, 225 | "min": 1.0, 226 | "max": 2.0, 227 | "step": 0.05, 228 | "tooltip": "Reduces repetitive speech patterns. 1.0 = no penalty, higher = stronger penalty against repetition. 1.1 is a good default. Increase to 1.2-1.3 if speech sounds too repetitive" 229 | }), 230 | "seed": ("INT", { 231 | "default": 0, 232 | "min": 0, 233 | "max": 0xffffffffffffffff, 234 | "tooltip": "Random seed for reproducibility. 0 = random seed each time. Set specific number (1-999999) for same result every time. Use control_after_generate widget to auto-increment/randomize" 235 | }), 236 | }, 237 | "hidden": { 238 | "control_after_generate": "COMBO" 239 | } 240 | } 241 | 242 | RETURN_TYPES = ("AUDIO",) 243 | RETURN_NAMES = ("audio",) 244 | FUNCTION = "generate_speech" 245 | CATEGORY = "audio/maya1" 246 | 247 | def cleanup_vram(self): 248 | """ 249 | Native ComfyUI VRAM cleanup - unloads all models and clears cache. 250 | Follows best practices from ComfyUI's memory management system. 251 | """ 252 | print("🗑️ Cleaning up VRAM...") 253 | 254 | # Step 1: Unload all models from VRAM 255 | mm.unload_all_models() 256 | 257 | # Step 2: Clear ComfyUI's internal cache 258 | mm.soft_empty_cache() 259 | 260 | # Step 3: Python garbage collection 261 | gc.collect() 262 | 263 | # Step 4: Clear CUDA caches (if available) 264 | if torch.cuda.is_available(): 265 | torch.cuda.empty_cache() 266 | torch.cuda.ipc_collect() 267 | 268 | print("✅ VRAM cleanup complete") 269 | 270 | def generate_speech( 271 | self, 272 | voice_description: str, 273 | text: str, 274 | model_name: str, 275 | dtype: str, 276 | attention_mechanism: str, 277 | device: str, 278 | keep_model_in_vram: bool, 279 | chunk_longform: bool, 280 | max_new_tokens: int, 281 | temperature: float, 282 | top_p: float, 283 | repetition_penalty: float, 284 | seed: int, 285 | control_after_generate: str = "randomize", 286 | chunk_index: int = None, 287 | total_chunks: int = None 288 | ) -> Tuple[dict]: 289 | """ 290 | Load model (if needed) and generate expressive speech. 
291 | 292 | Returns: 293 | Tuple containing audio dictionary for ComfyUI 294 | """ 295 | # Import ComfyUI utilities for progress and cancellation 296 | import comfy.utils 297 | import comfy.model_management as mm 298 | 299 | # Check for cancellation before starting 300 | mm.throw_exception_if_processing_interrupted() 301 | 302 | # Simple seed logic: if seed is 0, randomize; otherwise use the provided seed 303 | # This way seed=0 is always random, and you can set a specific seed for reproducibility 304 | if seed == 0: 305 | actual_seed = random.randint(1, 0xffffffffffffffff) 306 | else: 307 | actual_seed = seed 308 | 309 | print("=" * 70) 310 | print("🎤 Maya1 TTS Generation") 311 | print("=" * 70) 312 | print(f"🎲 Seed: {actual_seed}") 313 | print(f"💾 VRAM setting: {'Keep in VRAM' if keep_model_in_vram else 'Offload after generation'}") 314 | 315 | # ========== MODEL LOADING ========== 316 | # Get the expected models directory 317 | models_dir = get_maya1_models_dir() 318 | 319 | # Validate model name 320 | if model_name.startswith("(No"): 321 | raise ValueError( 322 | f"No valid Maya1 models found!\n\n" 323 | f"Expected location: {models_dir}\n\n" 324 | f"Please download a model:\n" 325 | f" 1. Create models directory:\n" 326 | f" mkdir -p {models_dir}\n\n" 327 | f" 2. Download Maya1:\n" 328 | f" huggingface-cli download maya-research/maya1 \\\n" 329 | f" --local-dir {models_dir}/maya1\n\n" 330 | f" 3. Restart ComfyUI to refresh the dropdown." 331 | ) 332 | 333 | # Get full model path 334 | model_path = get_model_path(model_name) 335 | 336 | if not model_path.exists(): 337 | raise FileNotFoundError( 338 | f"Model not found: {model_path}\n\n" 339 | f"Make sure the model is properly downloaded to:\n" 340 | f" {model_path}" 341 | ) 342 | 343 | # Check device availability 344 | if device == "cuda" and not torch.cuda.is_available(): 345 | print("⚠️ CUDA not available, falling back to CPU") 346 | device = "cpu" 347 | 348 | # ========== MODEL LOADING ========== 349 | print(f"🔍 Validating model files in: {model_path}") 350 | 351 | critical_files = { 352 | "config.json": model_path / "config.json", 353 | "generation_config.json": model_path / "generation_config.json", 354 | "tokenizer_config.json": model_path / "tokenizer" / "tokenizer_config.json", 355 | "tokenizer.json": model_path / "tokenizer" / "tokenizer.json", 356 | "model weights": model_path / "model-00001-of-00002.safetensors", 357 | } 358 | 359 | missing_files = [] 360 | for file_name, file_path in critical_files.items(): 361 | if file_path.exists(): 362 | print(f" ✅ {file_name}") 363 | else: 364 | print(f" ❌ {file_name} - MISSING!") 365 | missing_files.append(file_name) 366 | 367 | if missing_files: 368 | raise FileNotFoundError( 369 | f"Missing critical model files: {', '.join(missing_files)}\n\n" 370 | f"Model directory: {model_path}\n\n" 371 | f"Please re-download the complete model:\n" 372 | f" huggingface-cli download maya-research/maya1 \\\n" 373 | f" --local-dir {model_path}" 374 | ) 375 | 376 | # Strip "(BNB)" suffix from dtype labels if present 377 | dtype_clean = dtype.replace(" (BNB)", "") 378 | 379 | # Load model using the wrapper (with caching) 380 | try: 381 | maya1_model = Maya1ModelLoader.load_model( 382 | model_path=model_path, 383 | attention_type=attention_mechanism, 384 | dtype=dtype_clean, 385 | device=device 386 | ) 387 | except Exception as e: 388 | raise RuntimeError( 389 | f"Failed to load Maya1 model:\n{str(e)}\n\n" 390 | f"Model: {model_name}\n" 391 | f"Attention: {attention_mechanism}\n" 392 | f"Dtype: 
{dtype_clean}\n" 393 | f"Device: {device}" 394 | ) 395 | 396 | mm.throw_exception_if_processing_interrupted() 397 | 398 | # ========== SPEECH GENERATION ========== 399 | print(f"Keep in VRAM: {keep_model_in_vram}") 400 | print(f"Voice: {voice_description[:60]}...") 401 | print(f"Text: {text[:60]}...") 402 | print(f"Temperature: {temperature}, Top-p: {top_p}") 403 | print(f"Max tokens: {max_new_tokens}") 404 | print("=" * 70) 405 | 406 | # ========== LONGFORM CHUNKING ========== 407 | # Check if text should be chunked (enabled + text is reasonably long) 408 | word_count = len(text.split()) 409 | if chunk_longform and word_count > 80: # Only chunk if >80 words 410 | print(f"📚 Longform mode enabled: {word_count} words detected") 411 | print(f"🔪 Splitting text into chunks at sentence boundaries...") 412 | 413 | # Calculate words per chunk based on max_new_tokens 414 | # Empirical data: 1 word ≈ 50-55 SNAC tokens 415 | # Leave some headroom (80%) to avoid exceeding max_new_tokens 416 | estimated_words_per_chunk = int((max_new_tokens * 0.8) / 50) 417 | estimated_words_per_chunk = max(50, min(estimated_words_per_chunk, 300)) # Clamp between 50-300 418 | 419 | print(f"📏 Max tokens: {max_new_tokens} → ~{estimated_words_per_chunk} words per chunk (~{estimated_words_per_chunk / 150:.1f}min per chunk)") 420 | 421 | text_chunks = split_text_smartly(text, max_words_per_chunk=estimated_words_per_chunk) 422 | print(f"📦 Split into {len(text_chunks)} chunks") 423 | print("=" * 70) 424 | 425 | # Create outer progress bar for chunks (layered progress) 426 | chunk_progress = comfy.utils.ProgressBar(len(text_chunks)) 427 | 428 | all_audio_data = [] 429 | sample_rate = None 430 | 431 | for i, chunk_text in enumerate(text_chunks): 432 | # Create visual progress display for chunks 433 | chunk_bar = create_progress_bar(i + 1, len(text_chunks), width=6) 434 | print(f"\n🎤 Chunk Progress: {chunk_bar}") 435 | print(f"📝 Text: {chunk_text[:60]}...") 436 | print("=" * 70) 437 | 438 | # Check for cancellation before each chunk 439 | mm.throw_exception_if_processing_interrupted() 440 | 441 | # Recursively call generate_speech for this chunk with chunk_longform=False 442 | # to avoid infinite recursion 443 | chunk_audio = self.generate_speech( 444 | voice_description=voice_description, 445 | text=chunk_text, 446 | model_name=model_name, 447 | dtype=dtype, 448 | attention_mechanism=attention_mechanism, 449 | device=device, 450 | keep_model_in_vram=True, # Keep in VRAM between chunks 451 | chunk_longform=False, # Disable chunking for recursive calls 452 | max_new_tokens=max_new_tokens, 453 | temperature=temperature, 454 | top_p=top_p, 455 | repetition_penalty=repetition_penalty, 456 | seed=actual_seed, # Use same seed for all chunks 457 | control_after_generate=control_after_generate, 458 | chunk_index=i + 1, # Pass chunk context for layered progress 459 | total_chunks=len(text_chunks) 460 | ) 461 | 462 | # Extract audio data (returns tuple, get first element) 463 | chunk_audio_dict = chunk_audio[0] 464 | chunk_waveform = chunk_audio_dict["waveform"] 465 | sample_rate = chunk_audio_dict["sample_rate"] 466 | all_audio_data.append(chunk_waveform) 467 | 468 | # Update chunk progress (outer progress bar) 469 | chunk_progress.update(1) 470 | 471 | mm.throw_exception_if_processing_interrupted() 472 | 473 | print(f"\n{'=' * 70}") 474 | print(f"🔗 Combining {len(all_audio_data)} audio chunks with crossfading...") 475 | 476 | # Combine audio chunks with crossfading for smooth transitions 477 | # Crossfade duration: 50ms = 1200 samples 
at 24kHz 478 | combined_waveform_np = all_audio_data[0] 479 | 480 | for i in range(1, len(all_audio_data)): 481 | # Crossfade between chunks (1200 samples = 50ms at 24kHz) 482 | combined_waveform_np = crossfade_audio( 483 | combined_waveform_np, 484 | all_audio_data[i], 485 | crossfade_samples=1200 486 | ) 487 | 488 | # Ensure it's a torch tensor 489 | if not isinstance(combined_waveform_np, torch.Tensor): 490 | combined_waveform = torch.from_numpy(combined_waveform_np) 491 | else: 492 | combined_waveform = combined_waveform_np 493 | 494 | print(f"✅ Generated {combined_waveform.shape[-1] / sample_rate:.2f}s of audio from {len(text_chunks)} chunks") 495 | print("=" * 70) 496 | 497 | # Handle VRAM cleanup if requested 498 | if not keep_model_in_vram: 499 | print("🗑️ Offloading model from VRAM...") 500 | Maya1ModelLoader.clear_cache(force=True) 501 | print("✅ Model offloaded from VRAM") 502 | 503 | return ({ 504 | "waveform": combined_waveform, 505 | "sample_rate": sample_rate 506 | },) 507 | 508 | # ========== SINGLE GENERATION (NO CHUNKING) ========== 509 | # Set seed for reproducibility 510 | torch.manual_seed(actual_seed) 511 | if torch.cuda.is_available(): 512 | torch.cuda.manual_seed_all(actual_seed) 513 | 514 | # Format prompt using Maya1's OFFICIAL format (from transformers_inference.py) 515 | print("🔤 Formatting prompt with control tokens...") 516 | 517 | # Official Maya1 control token IDs 518 | SOH_ID = 128259 # Start of Header 519 | EOH_ID = 128260 # End of Header 520 | SOA_ID = 128261 # Start of Audio 521 | CODE_START_TOKEN_ID = 128257 # Start of Speech codes 522 | TEXT_EOT_ID = 128009 # End of Text 523 | 524 | # Decode control tokens 525 | soh_token = maya1_model.tokenizer.decode([SOH_ID]) 526 | eoh_token = maya1_model.tokenizer.decode([EOH_ID]) 527 | soa_token = maya1_model.tokenizer.decode([SOA_ID]) 528 | sos_token = maya1_model.tokenizer.decode([CODE_START_TOKEN_ID]) 529 | eot_token = maya1_model.tokenizer.decode([TEXT_EOT_ID]) 530 | bos_token = maya1_model.tokenizer.bos_token 531 | 532 | # Build formatted text: voice description tag + script 533 | formatted_text = f'<description="{voice_description}"> {text}' 534 | 535 | # Construct full prompt with all control tokens (CRITICAL for avoiding garbling!)
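        # Illustrative layout of the assembled prompt (a sketch, not literal
        # strings; the real marker text comes from the tokenizer.decode() calls above):
        #   <SOH> <BOS> <description="...voice..."> script text <EOT> <EOH> <SOA> <SOS>
        # The header block carries the voice description and script as plain text,
        # and the trailing start-of-audio / start-of-speech markers cue the model
        # to emit SNAC audio codes next rather than more text.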
536 | prompt = ( 537 | soh_token + bos_token + formatted_text + eot_token + 538 | eoh_token + soa_token + sos_token 539 | ) 540 | 541 | # Debug: Print formatted prompt 542 | print(f"📝 Formatted text: {formatted_text[:100]}...") 543 | print(f"📝 Full prompt preview (first 200 chars): {repr(prompt[:200])}...") 544 | 545 | # Tokenize input 546 | inputs = maya1_model.tokenizer( 547 | prompt, 548 | return_tensors="pt" 549 | ) 550 | print(f"📊 Input token count: {inputs['input_ids'].shape[1]}") 551 | 552 | # Move to device 553 | inputs = {k: v.to(device) for k, v in inputs.items()} 554 | 555 | # Check for cancellation 556 | mm.throw_exception_if_processing_interrupted() 557 | 558 | # Generate with progress tracking and cancellation checks 559 | print(f"🎵 Generating speech (max {max_new_tokens} tokens)...") 560 | 561 | try: 562 | # Setup progress tracking (inner progress bar for token generation) 563 | progress_bar = comfy.utils.ProgressBar(max_new_tokens) 564 | 565 | # Create stopping criteria for cancellation support 566 | from transformers import StoppingCriteria, StoppingCriteriaList 567 | 568 | class InterruptionStoppingCriteria(StoppingCriteria): 569 | """Custom stopping criteria that checks for ComfyUI cancellation.""" 570 | def __init__(self, progress_bar, chunk_index=None, total_chunks=None): 571 | self.progress_bar = progress_bar 572 | self.current_tokens = 0 573 | self.input_length = 0 574 | self.start_time = None 575 | self.last_print_time = None 576 | self.print_interval = 0.5 # Print progress every 0.5 seconds 577 | self.chunk_index = chunk_index 578 | self.total_chunks = total_chunks 579 | 580 | def __call__(self, input_ids, scores, **kwargs): 581 | import time 582 | 583 | # Store input length and start time on first call 584 | if self.input_length == 0: 585 | self.input_length = input_ids.shape[1] 586 | self.start_time = time.time() 587 | self.last_print_time = self.start_time 588 | 589 | # Update progress 590 | new_tokens = input_ids.shape[1] - self.input_length 591 | if new_tokens > self.current_tokens: 592 | self.progress_bar.update(new_tokens - self.current_tokens) 593 | self.current_tokens = new_tokens 594 | 595 | # Print progress with visual bar and it/s to console 596 | current_time = time.time() 597 | if current_time - self.last_print_time >= self.print_interval: 598 | elapsed = current_time - self.start_time 599 | it_per_sec = new_tokens / elapsed if elapsed > 0 else 0 600 | 601 | # Create visual progress bar for tokens 602 | token_bar = create_progress_bar(new_tokens, max_new_tokens, width=12) 603 | 604 | # Show layered progress if in chunked mode 605 | if self.chunk_index is not None and self.total_chunks is not None: 606 | chunk_bar = create_progress_bar(self.chunk_index, self.total_chunks, width=6, show_numbers=False) 607 | print(f" Chunk {chunk_bar} → Token Progress: {token_bar} | Speed: {it_per_sec:.2f} it/s", end='\r') 608 | else: 609 | print(f" Progress: {token_bar} | Speed: {it_per_sec:.2f} it/s | Elapsed: {elapsed:.1f}s", end='\r') 610 | 611 | self.last_print_time = current_time 612 | 613 | # Check for cancellation using ComfyUI's native method 614 | try: 615 | mm.throw_exception_if_processing_interrupted() 616 | except: 617 | # If interrupted, stop generation gracefully 618 | print("\n🛑 Generation cancelled by user") 619 | return True # Stop generation 620 | 621 | return False # Continue generation 622 | 623 | stopping_criteria = StoppingCriteriaList([ 624 | InterruptionStoppingCriteria(progress_bar, chunk_index=chunk_index, total_chunks=total_chunks) 625 | ]) 
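            # Back-of-envelope budget (illustrative, derived from the constants used
            # in this file): SNAC packs 7 tokens per frame and ~50 tokens cover one
            # spoken word, so at a conversational ~150 words/min the model emits
            # roughly 50 * 150 / 60 = 125 audio tokens per second of speech. The
            # default max_new_tokens=4000 therefore buys about 32s of audio,
            # consistent with the "4000 tokens ≈ 30-40s audio" tooltip above.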
626 | 627 | # Generate tokens with cancellation support 628 | # CRITICAL: Maya1 has TWO EOS tokens in generation_config.json: 629 | # - 128009 (<|eot_id|>) - Text completion token 630 | # - 128258 - SNAC audio completion token 631 | # We need to ONLY stop on 128258 (SNAC done), not 128009 (text done) 632 | # Otherwise the model generates text, hits 128009, and stops before SNAC codes! 633 | 634 | print("🎵 Generation settings:") 635 | print(f" Using EOS token: 128258 (SNAC completion only)") 636 | print(f" Ignoring EOS token: 128009 (text completion)") 637 | 638 | import time 639 | generation_start = time.time() 640 | 641 | with torch.inference_mode(): 642 | outputs = maya1_model.model.generate( 643 | **inputs, 644 | max_new_tokens=max_new_tokens, 645 | min_new_tokens=28, # At least 4 SNAC frames (4 frames × 7 tokens = 28) 646 | temperature=temperature, 647 | top_p=top_p, 648 | do_sample=True, 649 | repetition_penalty=repetition_penalty, 650 | pad_token_id=maya1_model.tokenizer.pad_token_id, 651 | eos_token_id=128258, # CODE_END_TOKEN_ID - Stop at end of speech 652 | stopping_criteria=stopping_criteria, 653 | use_cache=True, # Enable KV cache for faster generation 654 | ) 655 | 656 | generation_time = time.time() - generation_start 657 | 658 | # Check for cancellation after generation 659 | mm.throw_exception_if_processing_interrupted() 660 | 661 | # Extract generated tokens (remove input tokens) 662 | generated_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist() 663 | 664 | # Print final generation statistics 665 | final_speed = len(generated_ids) / generation_time if generation_time > 0 else 0 666 | print(f"\n✅ Generated {len(generated_ids)} tokens in {generation_time:.2f}s ({final_speed:.2f} it/s)") 667 | 668 | # Debug: Print first few generated token IDs 669 | print(f"🔍 First 10 generated token IDs: {generated_ids[:10]}") 670 | 671 | # Debug: Decode generated tokens to see what was generated 672 | generated_text = maya1_model.tokenizer.decode(generated_ids, skip_special_tokens=False) 673 | print(f"🔍 Generated text (first 100 chars): {generated_text[:100]}...") 674 | 675 | # Filter SNAC tokens 676 | from ..core.snac_decoder import filter_snac_tokens 677 | snac_tokens = filter_snac_tokens(generated_ids) 678 | 679 | if len(snac_tokens) == 0: 680 | raise ValueError( 681 | "No SNAC audio tokens generated!\n" 682 | "The model may have only generated text tokens.\n" 683 | "Try adjusting the prompt or generation parameters." 
684 | ) 685 | 686 | print(f"🎵 Found {len(snac_tokens)} SNAC tokens ({len(snac_tokens) // 7} frames)") 687 | 688 | # Check for cancellation before decoding 689 | mm.throw_exception_if_processing_interrupted() 690 | 691 | # Decode SNAC tokens to audio 692 | print("🔊 Decoding to audio...") 693 | audio_waveform = SNACDecoder.decode(snac_tokens, device=device) 694 | 695 | # Check for cancellation after decoding 696 | mm.throw_exception_if_processing_interrupted() 697 | 698 | # Convert to ComfyUI audio format 699 | audio_tensor = torch.from_numpy(audio_waveform).float() 700 | 701 | # Add batch and channel dimensions: [samples] -> [1, 1, samples] 702 | if audio_tensor.dim() == 1: 703 | audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(0) 704 | elif audio_tensor.dim() == 2: 705 | audio_tensor = audio_tensor.unsqueeze(0) 706 | 707 | audio_output = { 708 | "waveform": audio_tensor, 709 | "sample_rate": 24000 710 | } 711 | 712 | print(f"✅ Generated {len(audio_waveform) / 24000:.2f}s of audio") 713 | print("=" * 70) 714 | 715 | # Handle VRAM management based on toggle 716 | if not keep_model_in_vram: 717 | print("🗑️ Offloading model from VRAM...") 718 | Maya1ModelLoader.clear_cache(force=True) 719 | print("✅ Model offloaded from VRAM") 720 | else: 721 | print("💾 Model kept in VRAM for faster next generation") 722 | 723 | return (audio_output,) 724 | 725 | except InterruptedError as e: 726 | # User cancelled the generation 727 | print(f"\n{str(e)}") 728 | print("=" * 70) 729 | # Note: VRAM cleanup handled by ComfyUI hooks 730 | raise 731 | 732 | except Exception as e: 733 | # Other errors 734 | print(f"\n❌ Generation failed: {str(e)}") 735 | print("=" * 70) 736 | # Note: VRAM cleanup handled by ComfyUI hooks 737 | raise 738 | 739 | 740 | # ComfyUI node mappings 741 | NODE_CLASS_MAPPINGS = { 742 | "Maya1TTS_Barebones": Maya1TTSBarebonesNode 743 | } 744 | 745 | NODE_DISPLAY_NAME_MAPPINGS = { 746 | "Maya1TTS_Barebones": "Maya1 TTS (AIO) Barebones" 747 | } 748 | --------------------------------------------------------------------------------
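Note on crossfade_audio: both nodes import it from the core package, but its implementation is not included in this excerpt. A minimal sketch consistent with how it is called above (two chunk waveforms blended over crossfade_samples=1200, i.e. 50ms at 24kHz) could be a linear crossfade along the last axis; the shipped helper may differ, for example it appears to also accept torch tensors, given the isinstance check at the call sites:

import numpy as np

def crossfade_audio(a, b, crossfade_samples=1200):
    """Blend the tail of chunk `a` into the head of chunk `b` along the last axis (sketch)."""
    n = min(crossfade_samples, a.shape[-1], b.shape[-1])
    if n == 0:
        # Nothing to blend; just butt-join the chunks
        return np.concatenate([a, b], axis=-1)
    fade_out = np.linspace(1.0, 0.0, n)  # gain ramp applied to the end of `a`
    fade_in = 1.0 - fade_out             # complementary ramp for the start of `b`
    blended = a[..., -n:] * fade_out + b[..., :n] * fade_in
    return np.concatenate([a[..., :-n], blended, b[..., n:]], axis=-1)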