├── example_workflow ├── Maya1_TTS-example_workflow.png └── Maya1_TTS-example_workflow.json ├── resources ├── emotions.txt └── prompt_examples.txt ├── .github └── workflows │ └── publish_action.yml ├── requirements.txt ├── pyproject.toml ├── .gitignore ├── nodes ├── __init__.py ├── maya1_tts_combined.py └── maya1_tts_barebones.py ├── core ├── __init__.py ├── chunking.py ├── snac_decoder.py ├── utils.py └── model_wrapper.py ├── __init__.py ├── js └── config.js ├── LICENSE └── README.md /example_workflow/Maya1_TTS-example_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Saganaki22/ComfyUI-Maya1_TTS/HEAD/example_workflow/Maya1_TTS-example_workflow.png -------------------------------------------------------------------------------- /resources/emotions.txt: -------------------------------------------------------------------------------- 1 | laugh 2 | laugh_harder 3 | giggle 4 | chuckle 5 | cry 6 | sigh 7 | gasp 8 | whisper 9 | angry 10 | scream 11 | snort 12 | yawn 13 | cough 14 | sneeze 15 | breathing 16 | humming 17 | throat_clearing 18 | -------------------------------------------------------------------------------- /.github/workflows/publish_action.yml: -------------------------------------------------------------------------------- 1 | name: Publish to ComfyUI Registry 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: [ pyproject.toml ] 7 | 8 | jobs: 9 | publish: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v4 14 | with: 15 | fetch-depth: 0 16 | 17 | - name: Publish Custom Node to Registry 18 | uses: Comfy-Org/publish-node-action@main 19 | with: 20 | personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies for Maya1 TTS ComfyUI integration 2 | torch>=2.0.0 3 | transformers>=4.40.0 4 | numpy>=1.24.0 5 | 6 | # SNAC audio codec 7 | snac>=1.0.0 8 | 9 | # Audio processing 10 | soundfile>=0.12.0 11 | 12 | # Optional: Accelerated attention mechanisms (uncomment to enable) 13 | # flash-attn>=2.5.0 # Flash Attention 2 (fastest, CUDA only) 14 | # sageattention>=1.0.0 # Sage Attention (memory efficient) 15 | 16 | # Optional: Performance optimization 17 | # accelerate>=0.20.0 # For better device management 18 | # xformers>=0.0.20 # Additional attention optimizations 19 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "ComfyUI-Maya1_TTS" 3 | description = "ComfyUI node for Maya1 TTS - Expressive voice generation with 20+ emotions, voice design, and SNAC neural codec" 4 | version = "1.0.6" 5 | license = {file = "LICENSE"} 6 | dependencies = [ 7 | "torch>=2.0.0", 8 | "transformers>=4.50.0", 9 | "numpy>=1.21.0", 10 | "snac>=1.0.0", 11 | ] 12 | 13 | [project.urls] 14 | Repository = "https://github.com/Saganaki22/ComfyUI-Maya1_TTS" 15 | # Used by Comfy Registry https://comfyregistry.org 16 | 17 | [tool.comfy] 18 | PublisherId = "saganaki22" 19 | DisplayName = "Maya1 TTS" 20 | Icon = "https://avatars.githubusercontent.com/u/84208527" 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 
Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Virtual environments 24 | venv/ 25 | ENV/ 26 | env/ 27 | 28 | # IDEs 29 | .vscode/ 30 | .idea/ 31 | *.swp 32 | *.swo 33 | *~ 34 | 35 | # OS 36 | .DS_Store 37 | Thumbs.db 38 | 39 | # Model cache (users should download models separately) 40 | models/ 41 | *.safetensors 42 | *.bin 43 | *.pth 44 | 45 | # Audio outputs (for testing) 46 | *.wav 47 | *.mp3 48 | *.ogg 49 | 50 | # Logs 51 | *.log 52 | 53 | # Temporary files 54 | tmp/ 55 | temp/ 56 | -------------------------------------------------------------------------------- /nodes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | ComfyUI nodes for Maya1 TTS. 3 | """ 4 | 5 | from .maya1_tts_combined import ( 6 | Maya1TTSCombinedNode, 7 | NODE_CLASS_MAPPINGS as COMBINED_MAPPINGS, 8 | NODE_DISPLAY_NAME_MAPPINGS as COMBINED_DISPLAY_MAPPINGS 9 | ) 10 | 11 | from .maya1_tts_barebones import ( 12 | Maya1TTSBarebonesNode, 13 | NODE_CLASS_MAPPINGS as BAREBONES_MAPPINGS, 14 | NODE_DISPLAY_NAME_MAPPINGS as BAREBONES_DISPLAY_MAPPINGS 15 | ) 16 | 17 | # Merge the mappings from both nodes 18 | NODE_CLASS_MAPPINGS = {**COMBINED_MAPPINGS, **BAREBONES_MAPPINGS} 19 | NODE_DISPLAY_NAME_MAPPINGS = {**COMBINED_DISPLAY_MAPPINGS, **BAREBONES_DISPLAY_MAPPINGS} 20 | 21 | __all__ = [ 22 | "Maya1TTSCombinedNode", 23 | "Maya1TTSBarebonesNode", 24 | "NODE_CLASS_MAPPINGS", 25 | "NODE_DISPLAY_NAME_MAPPINGS", 26 | ] 27 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core modules for Maya1 TTS ComfyUI integration. 3 | """ 4 | 5 | from .model_wrapper import Maya1Model, Maya1ModelLoader 6 | from .snac_decoder import SNACDecoder 7 | from .chunking import ( 8 | smart_chunk_text, 9 | estimate_tokens_for_text, 10 | should_chunk_text 11 | ) 12 | from .utils import ( 13 | discover_maya1_models, 14 | get_model_path, 15 | get_maya1_models_dir, 16 | load_emotions_list, 17 | format_prompt, 18 | check_interruption, 19 | ProgressCallback, 20 | crossfade_audio 21 | ) 22 | 23 | __all__ = [ 24 | "Maya1Model", 25 | "Maya1ModelLoader", 26 | "SNACDecoder", 27 | "smart_chunk_text", 28 | "estimate_tokens_for_text", 29 | "should_chunk_text", 30 | "discover_maya1_models", 31 | "get_model_path", 32 | "get_maya1_models_dir", 33 | "load_emotions_list", 34 | "format_prompt", 35 | "check_interruption", 36 | "ProgressCallback", 37 | "crossfade_audio", 38 | ] 39 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | ComfyUI-Maya1_TTS: Maya1 Text-to-Speech Integration for ComfyUI 3 | 4 | Maya1 is a 3B-parameter speech model built for expressive voice generation 5 | with rich human emotion and precise voice design. 
6 | 7 | Features: 8 | - Voice design through natural language descriptions 9 | - 20+ emotions: laugh, cry, whisper, angry, sigh, gasp, and more 10 | - Real-time streaming with SNAC neural codec 11 | - Multiple attention mechanisms: SDPA, Flash Attention 2, Sage Attention 12 | - Native ComfyUI cancel support 13 | 14 | Author: Maya Research 15 | License: Apache 2.0 16 | """ 17 | 18 | import os 19 | from .nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS 20 | from .core.model_wrapper import Maya1ModelLoader 21 | 22 | __version__ = "1.0.6" 23 | 24 | # ComfyUI requires these exports 25 | __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS", "WEB_DIRECTORY"] 26 | 27 | # Tell ComfyUI where to find our JavaScript extensions 28 | WEB_DIRECTORY = "./js" 29 | 30 | # Note: VRAM management is controlled by the keep_model_in_vram toggle in the node 31 | # Maya1 models are kept in a separate cache and are not affected by ComfyUI's 32 | # "Unload Models" button. Use the toggle in the node to control VRAM usage. 33 | 34 | # Print banner on load 35 | print("=" * 70) 36 | print("🎤 ComfyUI-Maya1_TTS") 37 | print(" Expressive Voice Generation with Emotions") 38 | print("=" * 70) 39 | print("📦 Nodes loaded:") 40 | for node_name in NODE_CLASS_MAPPINGS.keys(): 41 | display_name = NODE_DISPLAY_NAME_MAPPINGS.get(node_name, node_name) 42 | print(f" • {display_name} ({node_name})") 43 | print("=" * 70) 44 | -------------------------------------------------------------------------------- /resources/prompt_examples.txt: -------------------------------------------------------------------------------- 1 | # Maya1 Voice Description Examples 2 | 3 | ## Basic Voice Descriptions 4 | 5 | ### Female Voices 6 | - Female, in her 30s with an American accent and is an event host, energetic, clear diction 7 | - Female voice in their 20s with a British accent. High pitch, warm timbre, fast pacing, happy tone 8 | - Mythical godlike magical character, Female voice in their 30s slow pacing, curious tone at medium intensity 9 | 10 | ### Male Voices 11 | - Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing 12 | - Dark villain character, Male voice in their 40s with a British accent. low pitch, gravelly timbre, slow pacing, angry tone at high intensity 13 | - Demon character, Male voice in their 30s with a Middle Eastern accent. screaming tone at high intensity 14 | 15 | ## Emotion Tag Examples 16 | 17 | ### Text with Emotions 18 | - Hello! This is Maya1 the best open source voice AI model with emotions. 19 | - Wow. This place looks even better than I imagined. How did they set all this up so perfectly? 20 | - Welcome back to another episode of our podcast! Today we are diving into an absolutely fascinating topic 21 | - After all we went through to pull him out of that mess I can't believe he was the traitor 22 | - You dare challenge me, mortal how amusing. 
Your kind always thinks they can win 23 | - I can't believe you did that we talked about this so many times already 24 | 25 | ## Available Emotion Tags 26 | <laugh>, <laugh_harder>, <giggle>, <chuckle>, <cry>, <sigh>, <gasp>, <whisper>, <angry>, <scream>, <snort>, <yawn>, <cough>, <sneeze>, <breathing>, <humming>, <throat_clearing> 27 | 28 | ## Voice Description Components 29 | 30 | ### Age 31 | - in their 20s 32 | - in their 30s 33 | - in their 40s 34 | - in their 50s 35 | 36 | ### Accent 37 | - American accent 38 | - British accent 39 | - Middle Eastern accent 40 | - Australian accent 41 | - Indian accent 42 | 43 | ### Pitch 44 | - high pitch 45 | - normal pitch 46 | - low pitch 47 | 48 | ### Timbre 49 | - warm timbre 50 | - gravelly timbre 51 | - smooth timbre 52 | - raspy timbre 53 | 54 | ### Pacing 55 | - fast pacing 56 | - conversational pacing 57 | - slow pacing 58 | 59 | ### Tone & Intensity 60 | - happy tone at high intensity 61 | - angry tone at medium intensity 62 | - curious tone at low intensity 63 | - energetic 64 | - calm 65 | - dramatic 66 | -------------------------------------------------------------------------------- /core/chunking.py: -------------------------------------------------------------------------------- 1 | """ 2 | Smart text chunking utilities for Maya1 TTS. 3 | Handles splitting long texts into manageable chunks for generation. 4 | """ 5 | 6 | import re 7 | from typing import List 8 | 9 | 10 | def smart_chunk_text(text: str, max_chunk_chars: int = 200) -> List[str]: 11 | """ 12 | Split text into chunks at sentence boundaries for natural TTS. 13 | 14 | Tries to split at: 15 | 1. Sentence boundaries (. ! ?) 16 | 2. Clause boundaries (, ; :) 17 | 3. Word boundaries (spaces) 18 | 19 | Args: 20 | text: Full text to chunk 21 | max_chunk_chars: Maximum characters per chunk (soft limit) 22 | 23 | Returns: 24 | List of text chunks 25 | """ 26 | if len(text) <= max_chunk_chars: 27 | return [text] 28 | 29 | chunks = [] 30 | 31 | # Split on sentence boundaries first 32 | sentence_pattern = r'(?<=[.!?])\s+' 33 | sentences = re.split(sentence_pattern, text) 34 | 35 | current_chunk = "" 36 | 37 | for sentence in sentences: 38 | # If sentence itself is too long, split it further 39 | if len(sentence) > max_chunk_chars: 40 | # First, save current chunk if exists 41 | if current_chunk: 42 | chunks.append(current_chunk.strip()) 43 | current_chunk = "" 44 | 45 | # Split long sentence on clause boundaries 46 | clause_pattern = r'(?<=[,;:])\s+' 47 | clauses = re.split(clause_pattern, sentence) 48 | 49 | for clause in clauses: 50 | # If clause is still too long, split on words 51 | if len(clause) > max_chunk_chars: 52 | if current_chunk: 53 | chunks.append(current_chunk.strip()) 54 | current_chunk = "" 55 | 56 | words = clause.split() 57 | for word in words: 58 | if len(current_chunk) + len(word) + 1 > max_chunk_chars: 59 | if current_chunk: 60 | chunks.append(current_chunk.strip()) 61 | current_chunk = word 62 | else: 63 | current_chunk += (" " if current_chunk else "") + word 64 | else: 65 | # Add clause to current chunk 66 | if len(current_chunk) + len(clause) + 1 > max_chunk_chars: 67 | chunks.append(current_chunk.strip()) 68 | current_chunk = clause 69 | else: 70 | current_chunk += (" " if current_chunk else "") + clause 71 | else: 72 | # Try to add sentence to current chunk 73 | if len(current_chunk) + len(sentence) + 1 > max_chunk_chars: 74 | # Current chunk is full, save it 75 | if current_chunk: 76 | chunks.append(current_chunk.strip()) 77 | current_chunk = sentence 78 | else: 79 | # Add sentence to current chunk 80 | current_chunk += (" " if current_chunk else "") + sentence 81 | 82 | # Add remaining chunk 83 | 
if current_chunk: 84 | chunks.append(current_chunk.strip()) 85 | 86 | return chunks 87 | 88 | 89 | def estimate_tokens_for_text(text: str) -> int: 90 | """ 91 | Rough estimate of how many tokens the text will generate. 92 | 93 | Maya1 typically uses: 94 | - ~1 text token per word 95 | - ~7 SNAC tokens per frame 96 | - ~0.021 seconds per frame 97 | - Roughly 350 SNAC tokens per second of audio 98 | 99 | Args: 100 | text: Input text 101 | 102 | Returns: 103 | Estimated number of tokens 104 | """ 105 | # Rough heuristic: 1 word = 3-4 SNAC frames = ~25 tokens 106 | word_count = len(text.split()) 107 | estimated_tokens = word_count * 25 108 | 109 | return estimated_tokens 110 | 111 | 112 | def should_chunk_text(text: str, max_tokens: int) -> bool: 113 | """ 114 | Determine if text should be chunked based on estimated token count. 115 | 116 | Args: 117 | text: Input text 118 | max_tokens: Maximum tokens allowed per generation 119 | 120 | Returns: 121 | True if text should be chunked 122 | """ 123 | estimated = estimate_tokens_for_text(text) 124 | 125 | # Use 80% of max_tokens as threshold to be safe 126 | threshold = int(max_tokens * 0.8) 127 | 128 | return estimated > threshold 129 | -------------------------------------------------------------------------------- /js/config.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Maya1 TTS Configuration 3 | * Tooltips, Character Presets, and Emotion Tags 4 | */ 5 | 6 | export const tooltips = { 7 | // Model settings 8 | model_name: "Select your downloaded Maya1 model from models/maya1-TTS/", 9 | dtype: "Model precision:\n• float16: Fast, 8GB VRAM, good quality\n• bfloat16: Most stable, 8GB VRAM (recommended)\n• float32: Best quality, 16GB VRAM, slower\n• 4bit/8bit: Saves VRAM but slower generation", 10 | attention_mechanism: "Attention implementation:\n• sdpa: Default, fast, works everywhere\n• flash_attention_2: Fastest (requires: pip install flash-attn)\n• sage_attention: Memory efficient for long sequences", 11 | device: "Hardware to run on:\n• cuda: Use GPU (fast, needs VRAM)\n• cpu: Use CPU only (slow, no VRAM needed)", 12 | 13 | // Voice and text 14 | voice_description: "Describe the voice characteristics:\n• Age, gender, accent\n• Tone (warm, cold, energetic)\n• Pacing (slow, conversational, fast)\n• Timbre (deep, high-pitched, raspy)\n\nExample: 'Female in 20s, British accent, warm tone, conversational pacing'\n\n💡 Ctrl+Enter to save | Escape to cancel | Click outside to save\n⏎ Enter for new line", 15 | text: "Your text to speak. 
Can include emotion tags like:\n<laugh> <whisper> <angry>\n\nClick emotion buttons below to insert tags easily!\n\n💡 Ctrl+Enter to save | Escape to cancel | Click outside to save\n⏎ Enter for new line\n⛶ Click expand button for longform text editor", 16 | 17 | // Generation settings 18 | keep_model_in_vram: "Keep model loaded after generation:\n• ON: Faster repeated generations (uses 8-16GB VRAM)\n• OFF: Clears VRAM after each generation", 19 | temperature: "Controls randomness (0.1-2.0):\n• 0.4: Recommended, balanced\n• Lower (0.1-0.3): More consistent, robotic\n• Higher (0.5-1.0): More creative, varied", 20 | top_p: "Nucleus sampling (0.1-1.0):\n• 0.9: Recommended, natural speech\n• Lower: More focused, less variety\n• Higher: More diverse but less coherent", 21 | max_new_tokens: "Maximum NEW audio tokens to generate (excludes input prompt):\n• ~500 tokens ≈ 10 seconds\n• ~1000 tokens ≈ 20 seconds\n• ~2000 tokens ≈ 40 seconds\n• ~4000 tokens ≈ 80 seconds\n\nFor longform chunking: Each chunk respects this limit", 22 | repetition_penalty: "Prevents repetitive patterns:\n• 1.1: Recommended\n• Higher (1.2-1.5): Reduces loops but may affect quality\n• 1.0: No penalty (may loop)", 23 | seed: "Random seed for reproducibility:\n• 0: Random output each time\n• Fixed number (e.g., 42): Same output with same inputs", 24 | chunk_longform: "⚠️ EXPERIMENTAL: Auto-split long text:\n• ON: Splits text >80 words at sentences, combines audio\n• OFF: Generates entire text at once (may fail if too long)", 25 | debug_mode: "Console output verbosity:\n• ON: Shows detailed info (token IDs, timings, stats)\n• OFF: Shows only essentials (seed, VRAM, progress)", 26 | 27 | // Emotion tag insert dropdown 28 | emotion_tag_insert: "Legacy emotion tag selector\n(Use clickable buttons below instead!)" 29 | }; 30 | 31 | export const characterPresets = [ 32 | { 33 | emoji: "♂️", 34 | name: "Male US", 35 | description: "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing." 36 | }, 37 | { 38 | emoji: "♀️", 39 | name: "Female UK", 40 | description: "Realistic female voice in the 20s age with british accent. Normal pitch, warm timbre, conversational pacing." 41 | }, 42 | { 43 | emoji: "🎙️", 44 | name: "Announcer", 45 | description: "Professional male announcer voice in the 40s age with american accent. Rich pitch, powerful timbre, clear measured pacing." 46 | }, 47 | { 48 | emoji: "🤖", 49 | name: "Robot", 50 | description: "Robotic AI voice, neutral gender in synthetic age. Monotone pitch, metallic timbre, precise mechanical pacing, emotionless delivery." 51 | }, 52 | { 53 | emoji: "😈", 54 | name: "Demon", 55 | description: "Demonic entity voice, deep male in unknown age with hellish accent. Very low pitch, gravelly timbre, menacing pacing, evil tone." 
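// Hypothetical example (not part of the shipped preset list): presets are plain
// { emoji, name, description } objects, so a custom voice can be added by
// appending another entry to this array, e.g.:
// { emoji: "🧙", name: "Wizard", description: "Elderly wizard character. Male voice in his 70s with a British accent. Low pitch, raspy timbre, slow pacing." }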
56 | } 57 | ] 58 | 59 | // All emotion tags use the same purple gradient color for consistency 60 | const EMOTION_COLOR = "#667eea"; // Purple accent matching theme 61 | 62 | export const emotionTags = [ 63 | { tag: "<laugh>", display: "laugh", color: EMOTION_COLOR }, 64 | { tag: "<laugh_harder>", display: "laugh harder", color: EMOTION_COLOR }, 65 | { tag: "<chuckle>", display: "chuckle", color: EMOTION_COLOR }, 66 | { tag: "<giggle>", display: "giggle", color: EMOTION_COLOR }, 67 | { tag: "<sigh>", display: "sigh", color: EMOTION_COLOR }, 68 | { tag: "<gasp>", display: "gasp", color: EMOTION_COLOR }, 69 | { tag: "<angry>", display: "angry", color: EMOTION_COLOR }, 70 | { tag: "<excited>", display: "excited", color: EMOTION_COLOR }, 71 | { tag: "<whisper>", display: "whisper", color: EMOTION_COLOR }, 72 | { tag: "<cry>", display: "cry", color: EMOTION_COLOR }, 73 | { tag: "<scream>", display: "scream", color: EMOTION_COLOR }, 74 | { tag: "<sing>", display: "sing", color: EMOTION_COLOR }, 75 | { tag: "<snort>", display: "snort", color: EMOTION_COLOR }, 76 | { tag: "<exhale>", display: "exhale", color: EMOTION_COLOR }, 77 | { tag: "<gulp>", display: "gulp", color: EMOTION_COLOR }, 78 | { tag: "<sarcastic>", display: "sarcastic", color: EMOTION_COLOR } 79 | ]; 80 | -------------------------------------------------------------------------------- /core/snac_decoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | SNAC (Speech Neural Audio Codec) decoder for Maya1 TTS. 3 | Handles unpacking 7-token frames and decoding to 24kHz audio. 4 | """ 5 | 6 | import torch 7 | import numpy as np 8 | from typing import List, Tuple 9 | 10 | 11 | # Maya1 SNAC token range: 128266 to 156937 12 | SNAC_TOKEN_START = 128266 13 | SNAC_TOKEN_END = 156937 14 | SNAC_CODEBOOK_SIZE = 4096 # Each level uses 4096 codes 15 | 16 | 17 | def is_snac_token(token_id: int) -> bool: 18 | """ 19 | Check if a token ID is a SNAC audio token. 20 | 21 | Args: 22 | token_id: Token ID to check 23 | 24 | Returns: 25 | True if the token is a SNAC token 26 | """ 27 | return SNAC_TOKEN_START <= token_id <= SNAC_TOKEN_END 28 | 29 | 30 | def filter_snac_tokens(token_ids: List[int]) -> List[int]: 31 | """ 32 | Filter only SNAC tokens from a list of token IDs. 33 | 34 | Args: 35 | token_ids: List of token IDs (may include text tokens) 36 | 37 | Returns: 38 | List of only SNAC tokens 39 | """ 40 | return [t for t in token_ids if is_snac_token(t)] 41 | 42 | 43 | def unpack_snac_tokens(snac_tokens: List[int]) -> Tuple[List[List[int]], int]: 44 | """ 45 | Unpack 7-token SNAC frames into 3 hierarchical codebook levels. 46 | 47 | Maya1 packs SNAC codes into 7 tokens per frame: 48 | - Frame: [slot0, slot1, slot2, slot3, slot4, slot5, slot6] 49 | - L1 (12Hz): slot0 50 | - L2 (23Hz): slot1, slot4 51 | - L3 (47Hz): slot2, slot3, slot5, slot6 52 | 53 | Args: 54 | snac_tokens: List of SNAC token IDs (should be multiple of 7) 55 | 56 | Returns: 57 | Tuple of (codes, num_frames): 58 | - codes: List of 3 lists [L1, L2, L3] with unpacked codes 59 | - num_frames: Number of frames processed 60 | """ 61 | num_frames = len(snac_tokens) // 7 62 | 63 | if len(snac_tokens) % 7 != 0: 64 | print(f"⚠️ Warning: SNAC tokens ({len(snac_tokens)}) not divisible by 7. 
" 65 | f"Truncating to {num_frames * 7} tokens.") 66 | 67 | # Initialize codebook levels 68 | l1_codes = [] # 1 code per frame (12 Hz) 69 | l2_codes = [] # 2 codes per frame (23 Hz) 70 | l3_codes = [] # 4 codes per frame (47 Hz) 71 | 72 | for i in range(num_frames): 73 | # Extract 7 tokens for this frame 74 | frame_start = i * 7 75 | slots = snac_tokens[frame_start:frame_start + 7] 76 | 77 | # Unpack to codebook indices (subtract offset and mod by codebook size) 78 | l1_codes.append((slots[0] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 79 | 80 | l2_codes.append((slots[1] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 81 | l2_codes.append((slots[4] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 82 | 83 | l3_codes.append((slots[2] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 84 | l3_codes.append((slots[3] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 85 | l3_codes.append((slots[5] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 86 | l3_codes.append((slots[6] - SNAC_TOKEN_START) % SNAC_CODEBOOK_SIZE) 87 | 88 | codes = [l1_codes, l2_codes, l3_codes] 89 | return codes, num_frames 90 | 91 | 92 | def decode_snac_to_audio(codes: List[List[int]], snac_model, device: str = "cuda") -> np.ndarray: 93 | """ 94 | Decode SNAC codes to audio waveform using the SNAC decoder. 95 | 96 | Args: 97 | codes: List of 3 lists [L1, L2, L3] with unpacked codes 98 | snac_model: Loaded SNAC model with decoder 99 | device: Device to run decoding on 100 | 101 | Returns: 102 | Audio waveform as numpy array (24kHz, mono, float32) 103 | """ 104 | # Convert codes to tensors 105 | codes_tensor = [ 106 | torch.tensor(level_codes, dtype=torch.long, device=device).unsqueeze(0) 107 | for level_codes in codes 108 | ] 109 | 110 | # Decode using SNAC quantizer + decoder 111 | with torch.inference_mode(): 112 | quantized = snac_model.quantizer.from_codes(codes_tensor) 113 | audio_tensor = snac_model.decoder(quantized) 114 | 115 | # Extract audio: shape is [batch, channels, samples] 116 | audio = audio_tensor[0, 0].cpu().numpy() 117 | 118 | # Trim warmup samples (first 2048 samples) - from official transformers_inference.py 119 | if len(audio) > 2048: 120 | audio = audio[2048:] 121 | 122 | return audio 123 | 124 | 125 | class SNACDecoder: 126 | """ 127 | Wrapper class for SNAC decoding with model caching. 128 | """ 129 | 130 | _cached_model = None 131 | _cached_device = None 132 | 133 | @classmethod 134 | def load_snac_model(cls, device: str = "cuda"): 135 | """ 136 | Load SNAC 24kHz model with caching. 137 | 138 | Args: 139 | device: Device to load model on 140 | 141 | Returns: 142 | Loaded SNAC model 143 | """ 144 | # Return cached model if available 145 | if cls._cached_model is not None and cls._cached_device == device: 146 | return cls._cached_model 147 | 148 | print("📦 Loading SNAC 24kHz decoder...") 149 | 150 | try: 151 | from snac import SNAC 152 | except ImportError: 153 | raise ImportError( 154 | "SNAC package not found. Install with: pip install snac\n" 155 | "GitHub: https://github.com/hubertsiuzdak/snac" 156 | ) 157 | 158 | # Load SNAC 24kHz model 159 | snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device) 160 | 161 | # Cache the model 162 | cls._cached_model = snac_model 163 | cls._cached_device = device 164 | 165 | print(f"✅ SNAC decoder loaded on {device}") 166 | 167 | return snac_model 168 | 169 | @classmethod 170 | def decode(cls, snac_tokens: List[int], device: str = "cuda") -> np.ndarray: 171 | """ 172 | Full pipeline: filter tokens → unpack → decode to audio. 
173 | 174 | Args: 175 | snac_tokens: List of SNAC token IDs 176 | device: Device to run on 177 | 178 | Returns: 179 | Audio waveform as numpy array (24kHz, mono, float32) 180 | """ 181 | # Load SNAC model (cached) 182 | snac_model = cls.load_snac_model(device) 183 | 184 | # Unpack tokens to codes 185 | codes, num_frames = unpack_snac_tokens(snac_tokens) 186 | 187 | if num_frames == 0: 188 | print("⚠️ No SNAC frames to decode!") 189 | return np.zeros(0, dtype=np.float32) 190 | 191 | print(f"🎵 Decoding {num_frames} SNAC frames (~{num_frames * 0.021:.2f}s audio)...") 192 | 193 | # Decode to audio 194 | audio = decode_snac_to_audio(codes, snac_model, device) 195 | 196 | return audio 197 | -------------------------------------------------------------------------------- /core/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for Maya1 TTS ComfyUI nodes. 3 | Includes ComfyUI-native cancel support and progress tracking. 4 | """ 5 | 6 | import os 7 | from pathlib import Path 8 | from typing import List, Optional 9 | 10 | 11 | def get_maya1_models_dir() -> Path: 12 | """ 13 | Get the Maya1 models directory within ComfyUI's models folder. 14 | 15 | Returns: 16 | Path to ComfyUI/models/maya1-TTS/ 17 | """ 18 | try: 19 | # Try to use ComfyUI's folder_paths 20 | import folder_paths 21 | comfyui_models_dir = Path(folder_paths.models_dir) 22 | except Exception: 23 | # Fallback: try to detect ComfyUI directory 24 | # Look for ComfyUI installation in common locations 25 | current_file = Path(__file__).resolve() 26 | 27 | # Navigate up from custom_nodes/ComfyUI-Maya1_TTS/core/utils.py 28 | # to find ComfyUI root (should have a 'models' folder) 29 | for parent in current_file.parents: 30 | if (parent / "models").exists() and (parent / "custom_nodes").exists(): 31 | comfyui_models_dir = parent / "models" 32 | break 33 | else: 34 | # Ultimate fallback: use current directory 35 | comfyui_models_dir = Path.cwd() / "models" 36 | 37 | maya1_models_dir = comfyui_models_dir / "maya1-TTS" 38 | return maya1_models_dir 39 | 40 | 41 | def discover_maya1_models() -> List[str]: 42 | """ 43 | Scan ComfyUI/models/maya1-TTS/ for available Maya1 models. 
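    For example (hypothetical layout), if the folder contains
    models/maya1-TTS/maya1/config.json, the scan reports that model:

        discover_maya1_models()  # -> ["maya1"]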
44 | 45 | Returns: 46 | List of model directory names (relative to ComfyUI/models/maya1-TTS/) 47 | """ 48 | models_dir = get_maya1_models_dir() 49 | 50 | if not models_dir.exists(): 51 | print(f"⚠️ Maya1 models directory not found: {models_dir}") 52 | print(f"💡 Create it and download models with:") 53 | print(f" mkdir -p {models_dir}") 54 | print(f" huggingface-cli download maya-research/maya1 --local-dir {models_dir}/maya1") 55 | return ["(No models folder found - see console for instructions)"] 56 | 57 | # Find directories with config.json (HuggingFace model format) 58 | models = [] 59 | for item in models_dir.iterdir(): 60 | if item.is_dir(): 61 | # Check for config.json in root or in a checkpoint subdirectory 62 | if (item / "config.json").exists(): 63 | models.append(item.name) 64 | elif any((ckpt / "config.json").exists() for ckpt in item.glob("checkpoint-*")): 65 | models.append(item.name) 66 | 67 | if not models: 68 | print(f"⚠️ No valid Maya1 models found in {models_dir}") 69 | print(f"💡 Download a model with:") 70 | print(f" huggingface-cli download maya-research/maya1 --local-dir {models_dir}/maya1") 71 | return ["(No valid models found - see console for instructions)"] 72 | 73 | return sorted(models) 74 | 75 | 76 | def get_model_path(model_name: str) -> Path: 77 | """ 78 | Get the full path to a model directory. 79 | 80 | Args: 81 | model_name: Name of the model folder 82 | 83 | Returns: 84 | Full path to the model directory 85 | """ 86 | return get_maya1_models_dir() / model_name 87 | 88 | 89 | def load_emotions_list() -> List[str]: 90 | """ 91 | Load the list of supported emotion tags from resources/emotions.txt. 92 | 93 | Returns: 94 | List of emotion tag names (without angle brackets) 95 | """ 96 | emotions_file = Path(__file__).parent.parent / "resources" / "emotions.txt" 97 | 98 | if not emotions_file.exists(): 99 | # Fallback list if file doesn't exist 100 | return [ 101 | "laugh", "laugh_harder", "giggle", "chuckle", "cry", "sigh", 102 | "gasp", "whisper", "angry", "scream", "snort", "yawn", 103 | "cough", "sneeze", "breathing", "humming", "throat_clearing" 104 | ] 105 | 106 | with open(emotions_file, 'r') as f: 107 | emotions = [line.strip() for line in f if line.strip()] 108 | 109 | return emotions 110 | 111 | 112 | def format_prompt(voice_description: str, text: str) -> str: 113 | """ 114 | Format the prompt using Maya1's expected format with chat template. 115 | 116 | Args: 117 | voice_description: Natural language voice description 118 | text: Text to synthesize (may contain emotion tags) 119 | 120 | Returns: 121 | Formatted prompt string 122 | """ 123 | # Maya1 uses a chat-like format with system/user messages 124 | # The voice description acts as the "system" instruction 125 | # The text to synthesize is the "user" message 126 | 127 | # Format as a conversation to trigger audio generation 128 | prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> 129 | 130 | You are a voice synthesis system. Generate natural speech audio using SNAC codes for the following voice characteristics: {voice_description}<|eot_id|><|start_header_id|>user<|end_header_id|> 131 | 132 | {text}<|eot_id|><|start_header_id|>assistant<|end_header_id|> 133 | 134 | """ 135 | 136 | return prompt 137 | 138 | 139 | def check_interruption(): 140 | """ 141 | Check if ComfyUI has requested interruption. 142 | Raises an exception if cancellation was requested. 143 | 144 | This integrates with ComfyUI's native cancel functionality. 
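    Typical call pattern (sketch only; the loop body is illustrative):

        for step in range(max_new_tokens):
            check_interruption()          # raises InterruptedError if Cancel was pressed
            next_token = sample_step()    # hypothetical per-step generation work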
145 | """ 146 | try: 147 | # Try to import ComfyUI's execution module 148 | import execution 149 | if hasattr(execution, 'interruption_requested') and execution.interruption_requested(): 150 | raise InterruptedError("🛑 Generation cancelled by user") 151 | except ImportError: 152 | # If ComfyUI modules aren't available (e.g., testing), just continue 153 | pass 154 | except InterruptedError: 155 | # Re-raise interruption errors 156 | raise 157 | except Exception as e: 158 | # Silently ignore other errors (module might not have the attribute in older versions) 159 | pass 160 | 161 | 162 | class ProgressCallback: 163 | """ 164 | Progress tracking callback for ComfyUI integration. 165 | Shows generation progress in the ComfyUI UI. 166 | """ 167 | 168 | def __init__(self, total_steps: int, desc: str = "Generating"): 169 | self.total_steps = total_steps 170 | self.current_step = 0 171 | self.desc = desc 172 | self.pbar = None 173 | 174 | # Try to use ComfyUI's progress bar 175 | try: 176 | from comfy.utils import ProgressBar 177 | self.pbar = ProgressBar(total_steps) 178 | except ImportError: 179 | # Fallback: just print progress 180 | self.pbar = None 181 | 182 | def update(self, steps: int = 1): 183 | """Update progress by the specified number of steps.""" 184 | self.current_step += steps 185 | 186 | if self.pbar is not None: 187 | self.pbar.update(steps) 188 | else: 189 | # Fallback: print percentage 190 | if self.current_step % max(1, self.total_steps // 10) == 0: 191 | pct = (self.current_step / self.total_steps) * 100 192 | print(f"⏳ {self.desc}: {pct:.1f}%") 193 | 194 | # Check for cancellation on each update 195 | check_interruption() 196 | 197 | def close(self): 198 | """Close the progress bar.""" 199 | if self.pbar is not None: 200 | self.pbar.update(self.total_steps - self.current_step) 201 | 202 | 203 | def crossfade_audio(audio1, audio2, crossfade_samples: int = 1200): 204 | """ 205 | Crossfade two audio arrays for smooth transitions. 
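    The overlap is mixed with complementary linear ramps,
    out[i] = a1[i] * (1 - t[i]) + a2[i] * t[i], where t rises from 0 to 1
    across the crossfade window. A quick sketch with dummy signals:

        import numpy as np
        a = np.ones(24000, dtype=np.float32)     # 1 second at 24kHz
        b = np.zeros(24000, dtype=np.float32)
        out = crossfade_audio(a, b)              # len(out) == 48000 - 1200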
206 | 207 | Args: 208 | audio1: First audio array (numpy or torch) 209 | audio2: Second audio array (numpy or torch) 210 | crossfade_samples: Number of samples to crossfade (default 1200 = 50ms at 24kHz) 211 | 212 | Returns: 213 | Crossfaded audio array 214 | """ 215 | import numpy as np 216 | import torch 217 | 218 | # Convert to numpy for processing 219 | is_torch = False 220 | if isinstance(audio1, torch.Tensor): 221 | is_torch = True 222 | audio1_np = audio1.cpu().numpy() 223 | audio2_np = audio2.cpu().numpy() 224 | else: 225 | audio1_np = audio1 226 | audio2_np = audio2 227 | 228 | # Handle different shapes: [batch, channels, samples] or [samples] 229 | if audio1_np.ndim == 3: 230 | # Shape: [batch, channels, samples] 231 | batch, channels, samples1 = audio1_np.shape 232 | samples2 = audio2_np.shape[2] 233 | 234 | # Ensure crossfade_samples doesn't exceed audio length 235 | crossfade_samples = min(crossfade_samples, samples1, samples2) 236 | 237 | if crossfade_samples > 0: 238 | # Create fade curves 239 | fade_out = np.linspace(1.0, 0.0, crossfade_samples).reshape(1, 1, -1) 240 | fade_in = np.linspace(0.0, 1.0, crossfade_samples).reshape(1, 1, -1) 241 | 242 | # Apply crossfade to overlapping region 243 | audio1_fade = audio1_np.copy() 244 | audio1_fade[:, :, -crossfade_samples:] *= fade_out 245 | 246 | audio2_fade = audio2_np.copy() 247 | audio2_fade[:, :, :crossfade_samples] *= fade_in 248 | 249 | # Combine: audio1 (minus fade region) + crossfade + audio2 (minus fade region) 250 | result = np.concatenate([ 251 | audio1_fade[:, :, :-crossfade_samples], 252 | audio1_fade[:, :, -crossfade_samples:] + audio2_fade[:, :, :crossfade_samples], 253 | audio2_fade[:, :, crossfade_samples:] 254 | ], axis=2) 255 | else: 256 | # No crossfade, just concatenate 257 | result = np.concatenate([audio1_np, audio2_np], axis=2) 258 | 259 | elif audio1_np.ndim == 1: 260 | # Shape: [samples] 261 | samples1 = len(audio1_np) 262 | samples2 = len(audio2_np) 263 | 264 | crossfade_samples = min(crossfade_samples, samples1, samples2) 265 | 266 | if crossfade_samples > 0: 267 | fade_out = np.linspace(1.0, 0.0, crossfade_samples) 268 | fade_in = np.linspace(0.0, 1.0, crossfade_samples) 269 | 270 | audio1_fade = audio1_np.copy() 271 | audio1_fade[-crossfade_samples:] *= fade_out 272 | 273 | audio2_fade = audio2_np.copy() 274 | audio2_fade[:crossfade_samples] *= fade_in 275 | 276 | result = np.concatenate([ 277 | audio1_fade[:-crossfade_samples], 278 | audio1_fade[-crossfade_samples:] + audio2_fade[:crossfade_samples], 279 | audio2_fade[crossfade_samples:] 280 | ]) 281 | else: 282 | result = np.concatenate([audio1_np, audio2_np]) 283 | 284 | else: 285 | raise ValueError(f"Unexpected audio shape: {audio1_np.shape}") 286 | 287 | # Convert back to torch if needed 288 | if is_torch: 289 | result = torch.from_numpy(result) 290 | 291 | return result 292 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2025 Saganaki22 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /example_workflow/Maya1_TTS-example_workflow.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "418a17f1-fce3-46a3-8e6d-035b3399740f", 3 | "revision": 0, 4 | "last_node_id": 9, 5 | "last_link_id": 2, 6 | "nodes": [ 7 | { 8 | "id": 4, 9 | "type": "SaveAudioMP3", 10 | "pos": [ 11 | 1105.8284912109375, 12 | 157.3507843017578 13 | ], 14 | "size": [ 15 | 270, 16 | 136 17 | ], 18 | "flags": { 19 | "pinned": true 20 | }, 21 | "order": 6, 22 | "mode": 4, 23 | "inputs": [ 24 | { 25 | "name": "audio", 26 | "type": "AUDIO", 27 | "link": 2 28 | } 29 | ], 30 | "outputs": [], 31 | "properties": { 32 | "cnr_id": "comfy-core", 33 | "ver": "0.3.64", 34 | "Node name for S&R": "SaveAudioMP3", 35 | "ue_properties": { 36 | "widget_ue_connectable": {}, 37 | "input_ue_unconnectable": {}, 38 | "version": "7.4.1" 39 | } 40 | }, 41 | "widgets_values": [ 42 | "audio/ComfyUI", 43 | "320k" 44 | ] 45 | }, 46 | { 47 | "id": 1, 48 | "type": "Maya1TTS_Combined", 49 | "pos": [ 50 | 547.344970703125, 51 | 154.19032287597656 52 | ], 53 | "size": [ 54 | 513.5526123046875, 55 | 935 56 | ], 57 | "flags": { 58 | "pinned": true 59 | }, 60 | "order": 0, 61 | "mode": 4, 62 | "inputs": [], 63 | "outputs": [ 64 | { 65 | "name": "audio", 66 | "type": "AUDIO", 67 | "links": [ 68 | 2 69 | ] 70 | } 71 | ], 72 | "properties": { 73 | "cnr_id": "ComfyUI-Maya1_TTS", 74 | "ver": "2152b6fc507e414bc8538059e1f228bfe7be2dec", 75 | "Node name for S&R": "Maya1TTS_Combined", 76 | "ue_properties": { 77 | "widget_ue_connectable": {}, 78 | "input_ue_unconnectable": {}, 79 | "version": "7.4.1" 80 | } 81 | }, 82 | "widgets_values": [ 83 | "maya1", 84 | "bfloat16", 85 | "sdpa", 86 | "cuda", 87 | "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.", 88 | "Hello! 
This is Maya1 the best open source voice AI model with emotions.", 89 | true, 90 | 0.4, 91 | 0.9, 92 | 4000, 93 | 1.1, 94 | 380055393262059, 95 | "randomize", 96 | false 97 | ] 98 | }, 99 | { 100 | "id": 9, 101 | "type": "MarkdownNote", 102 | "pos": [ 103 | 1109.0732421875, 104 | -32.40277099609375 105 | ], 106 | "size": [ 107 | 361.2737731933594, 108 | 126.04345703125 109 | ], 110 | "flags": { 111 | "pinned": true 112 | }, 113 | "order": 1, 114 | "mode": 0, 115 | "inputs": [], 116 | "outputs": [], 117 | "properties": {}, 118 | "widgets_values": [ 119 | "# If you are facing JS Loading / rendering issues with the bottom node use the top native node" 120 | ], 121 | "color": "#422342", 122 | "bgcolor": "rgba(24,24,27,.9)" 123 | }, 124 | { 125 | "id": 6, 126 | "type": "MarkdownNote", 127 | "pos": [ 128 | -16.587236404418945, 129 | 296.7104797363281 130 | ], 131 | "size": [ 132 | 504.536376953125, 133 | 795.1909790039062 134 | ], 135 | "flags": { 136 | "pinned": true 137 | }, 138 | "order": 2, 139 | "mode": 0, 140 | "inputs": [], 141 | "outputs": [], 142 | "title": "Voice Description examples", 143 | "properties": { 144 | "ue_properties": { 145 | "widget_ue_connectable": {}, 146 | "version": "7.4.1", 147 | "input_ue_unconnectable": {} 148 | } 149 | }, 150 | "widgets_values": [ 151 | "# Realistic Voices\n\n## 1. Professional Podcast Host\nRealistic male voice in his 30s with an American accent. Normal pitch, warm timbre, conversational pacing, neutral emotion at medium intensity. Podcast domain, podcast host role, formal register.\n\n## 2. Energetic Young Instructor\nRealistic female voice in her 20s with a British accent. High pitch, smooth timbre, brisk pacing, energetic emotion at high intensity. Education domain, elearning instructor role, neutral register.\n\n## 3. Customer Support Agent\nRealistic female voice in her 30s with an Indian accent. Normal pitch, warm timbre, conversational pacing, neutral emotion at medium intensity. Support domain, customer support agent role, neutral register.\n\n## 4. Corporate Explainer Voice\nRealistic male voice in his 40s with a Middle Eastern accent. Low pitch, deep timbre, slow pacing, neutral emotion at medium intensity. Corporate domain, explainer video voice role, formal register.\n\n## 5. Social Media Creator\nRealistic female voice in her 20s with an American accent. Normal pitch, smooth timbre, brisk pacing, energetic emotion at medium intensity. Social content domain, social media creator role, casual register.\n\n
\n\n# Creative Voices\n\n## 1. AI Robot Voice\nCreative, AI machine voice character. Male voice in his 20s with an American accent. Normal pitch, robotic timbre, conversational pacing, neutral emotion at medium intensity.\n\n## 2. Pirate\nCreative pirate character. Male voice in his 30s with a British accent. Low pitch, gravelly timbre, slow pacing, energetic emotion at high intensity.\n\n## 3. Mythical Godlike Voice\nCreative, mythical godlike magical character. Female voice in her 40s with an American accent. Low pitch, ethereal timbre, very slow pacing, neutral emotion at medium intensity.\n\n## 4. Flirty Anime Character\nCreative, anime and flirty character. Female voice in her 20s with an Asian American accent. High pitch, smooth timbre, slow pacing, energetic emotion at medium intensity.\n\n## 5. Dark Villain\nCreative dark villain character. Male voice in his 30s with a Middle Eastern accent. Low pitch, raspy timbre, conversational pacing, sarcastic emotion at high intensity.\n" 152 | ], 153 | "color": "#006691", 154 | "bgcolor": "rgba(24,24,27,.9)" 155 | }, 156 | { 157 | "id": 8, 158 | "type": "MarkdownNote", 159 | "pos": [ 160 | -24.692302703857422, 161 | -1134.520751953125 162 | ], 163 | "size": [ 164 | 1406.847900390625, 165 | 697.9292602539062 166 | ], 167 | "flags": { 168 | "pinned": true 169 | }, 170 | "order": 3, 171 | "mode": 0, 172 | "inputs": [], 173 | "outputs": [], 174 | "properties": { 175 | "ue_properties": { 176 | "widget_ue_connectable": {}, 177 | "version": "7.4.1", 178 | "input_ue_unconnectable": {} 179 | } 180 | }, 181 | "widgets_values": [ 182 | "# TTS Prompts with Full Emotion Tag Variety (30-Second Each)\n\n## 1. Professional Podcast Host\nWelcome to today’s episode! We’re diving deep into the latest tech trends, exploring breakthroughs in AI, robotics, and space exploration. Later, we’ll speak with an expert about how these technologies are shaping industries and impacting daily life. Stay tuned, because some of these developments might completely change the way you think about the future. By the end, you’ll feel inspired and ready to explore these innovations yourself.\n\n## 2. Energetic Young Instructor\nGood morning class! Today we’re exploring some of the most exciting physics concepts that can seem tricky at first. Don’t worry—I’ll break them down step by step, using simple experiments and examples you can relate to. We’ll cover motion, energy, and forces in ways that make sense. By the end of the session, you’ll understand why these principles govern everything around us, and you might even start seeing the world differently.\n\n## 3. Customer Support Agent\nHello, I’m here to help with your issue today. I know how frustrating it can be when things don’t work as expected, but we’ll figure it out together. Let’s carefully go through every step, from troubleshooting your settings to checking account details. If anything goes wrong, I’ll guide you patiently until it’s fixed. By the end of this call, everything should be running smoothly, and you’ll feel confident using the system again.\n\n## 4. Corporate Explainer Voice\nOur new platform enhances productivity and streamlines operations like never before. From scheduling and task management to analytics and reporting, every feature is designed to save time and reduce errors. You’ll notice a smoother workflow immediately, whether you’re collaborating with a team or managing projects individually. Trust me, once you start using this, you’ll wonder how you ever survived without it. 
By the end, your efficiency will be transformed, and your team will thank you.\n\n## 5. Casual YouTube Influencer\nHey everyone! Today I’m sharing my top 10 life hacks that have completely changed how I organize my day. Some of them are really unexpected, like using everyday items in creative ways. I’ll also give tips for saving money, boosting productivity, and making small routines fun. Stick around for the last hack—it’s a game-changer you’ll definitely want to try. By the end of this video, you’ll have at least a few new tricks to make life easier and more exciting.\n\n## 6. AI Robot Voice\nGreetings, human. I am your AI assistant, programmed to optimize your environment, manage tasks efficiently, and provide detailed data analysis. Today, I will guide you through scheduling, prioritizing important notifications, and organizing your workflow. Please provide your input carefully, as my systems will adapt to your preferences in real-time. By the end of this session, you will notice increased productivity and a fully organized digital environment.\n\n## 7. Pirate Captain\nAhoy, mateys! Today we set sail on treacherous seas in search of hidden treasure. The waves are rough, and the winds are fierce, but only the bravest shall prevail. Keep your eyes on the horizon, sharpen your swords, and be ready for anything. Along the journey, we’ll swap tales of old adventures, sing shanties to lift our spirits, and outwit rival crews. By the end of the voyage, either the treasure will be ours or we’ll have epic stories to tell.\n\n## 8. Mythical Godlike Voice\nListen carefully, mortals. The heavens speak through me, revealing ancient secrets hidden in the cosmos. Those who pay attention will gain wisdom, understanding the delicate balance of the universe. Ignore these warnings, and consequences may follow. Today, I will guide you through visions of stars, planets, and forces that govern existence. By the end of this message, you will feel both awe and responsibility for the knowledge granted to you.\n\n## 9. Flirty Anime Character\nOh my, you didn’t expect to see me here, did you? I’ve got a little surprise waiting just for you, something playful and fun. Let’s enjoy the moment together, teasing, laughing, and sharing a few secrets. I might even challenge you to a small game or dare to make it more exciting. By the end, I hope you’ll remember this encounter fondly, feeling entertained and charmed in equal measure.\n\n## 10. Dark Villain\nSo, you finally arrive. Did you really think it would be easy to find me? Every step you took has brought you straight into my plan. The traps, the misdirections, everything was set for this moment. Now, witness the full extent of my scheme and see whether you can survive. 
By the end of this encounter, either you’ll escape, or you’ll be part of my legacy forever.\n" 183 | ], 184 | "color": "#006691", 185 | "bgcolor": "rgba(24,24,27,.9)" 186 | }, 187 | { 188 | "id": 2, 189 | "type": "Maya1TTS_Barebones", 190 | "pos": [ 191 | 548.1511840820312, 192 | -377.1851806640625 193 | ], 194 | "size": [ 195 | 498.0000305175781, 196 | 468 197 | ], 198 | "flags": { 199 | "pinned": true 200 | }, 201 | "order": 4, 202 | "mode": 0, 203 | "inputs": [], 204 | "outputs": [ 205 | { 206 | "name": "audio", 207 | "type": "AUDIO", 208 | "links": [ 209 | 1 210 | ] 211 | } 212 | ], 213 | "properties": { 214 | "cnr_id": "ComfyUI-Maya1_TTS", 215 | "ver": "2152b6fc507e414bc8538059e1f228bfe7be2dec", 216 | "Node name for S&R": "Maya1TTS_Barebones", 217 | "ue_properties": { 218 | "widget_ue_connectable": {}, 219 | "input_ue_unconnectable": {}, 220 | "version": "7.4.1" 221 | } 222 | }, 223 | "widgets_values": [ 224 | "Creative pirate character. Male voice in his 30s with a British accent. Low pitch, gravelly timbre, slow pacing, energetic emotion at high intensity.", 225 | "Ahoy, mateys! Today we set sail on treacherous seas in search of hidden treasure. The waves are rough, and the winds are fierce, but only the bravest shall prevail. Keep your eyes on the horizon, sharpen your swords, and be ready for anything. Along the journey, we’ll swap tales of old adventures, sing shanties to lift our spirits, and outwit rival crews. By the end of the voyage, either the treasure will be ours or we’ll have epic stories to tell.", 226 | "maya1", 227 | "bfloat16", 228 | "sdpa", 229 | "cuda", 230 | true, 231 | false, 232 | 4000, 233 | 0.4, 234 | 0.9, 235 | 1.1, 236 | 477370007007039, 237 | "randomize" 238 | ] 239 | }, 240 | { 241 | "id": 3, 242 | "type": "SaveAudioMP3", 243 | "pos": [ 244 | 1106.2691650390625, 245 | -376.3995666503906 246 | ], 247 | "size": [ 248 | 270, 249 | 136 250 | ], 251 | "flags": { 252 | "pinned": true 253 | }, 254 | "order": 7, 255 | "mode": 0, 256 | "inputs": [ 257 | { 258 | "name": "audio", 259 | "type": "AUDIO", 260 | "link": 1 261 | } 262 | ], 263 | "outputs": [], 264 | "properties": { 265 | "cnr_id": "comfy-core", 266 | "ver": "0.3.64", 267 | "Node name for S&R": "SaveAudioMP3", 268 | "ue_properties": { 269 | "widget_ue_connectable": {}, 270 | "input_ue_unconnectable": {}, 271 | "version": "7.4.1" 272 | } 273 | }, 274 | "widgets_values": [ 275 | "audio/ComfyUI", 276 | "320k" 277 | ] 278 | }, 279 | { 280 | "id": 5, 281 | "type": "MarkdownNote", 282 | "pos": [ 283 | -21.270532608032227, 284 | -378.1773376464844 285 | ], 286 | "size": [ 287 | 508.7115173339844, 288 | 613.2767333984375 289 | ], 290 | "flags": { 291 | "pinned": true 292 | }, 293 | "order": 5, 294 | "mode": 0, 295 | "inputs": [], 296 | "outputs": [], 297 | "title": "Maya1 Model dir instruction", 298 | "properties": { 299 | "ue_properties": { 300 | "widget_ue_connectable": {}, 301 | "version": "7.4.1", 302 | "input_ue_unconnectable": {} 303 | } 304 | }, 305 | "widgets_values": [ 306 | "# [Maya1 Huggingface](https://huggingface.co/maya-research/maya1/tree/main)\n\n\n\n\n### Model Location\n\nModels go in: `ComfyUI/models/maya1-TTS/`\n\n\n### Expected Folder Structure\n\nAfter downloading, your model folder should look like this:\n\n```\nComfyUI/\n└── models/\n └── maya1-TTS/\n └── maya1/ # Model name (can be anything)\n ├── chat_template.jinja # Chat template\n ├── config.json # Model configuration\n ├── generation_config.json # Generation settings\n ├── model-00001-of-00002.safetensors # Model weights (shard 1)\n 
├── model-00002-of-00002.safetensors # Model weights (shard 2)\n ├── model.safetensors.index.json # Weight index\n ├── special_tokens_map.json # Special tokens\n └── tokenizer/ # Tokenizer subfolder\n ├── chat_template.jinja # Chat template (duplicate)\n ├── special_tokens_map.json # Special tokens (duplicate)\n ├── tokenizer.json # Tokenizer vocabulary (22.9 MB)\n └── tokenizer_config.json # Tokenizer config\n\n```\n\n# Install HF CLI\n```pip install huggingface-hub```\n\n# Create directory\n```cd ComfyUI```
\n```mkdir -p models/maya1-TTS```\n\n# Download model\n```hf download maya-research/maya1 --local-dir models/maya1-TTS/maya1```\n\n" 307 | ], 308 | "color": "#c09430", 309 | "bgcolor": "rgba(24,24,27,.9)" 310 | } 311 | ], 312 | "links": [ 313 | [ 314 | 1, 315 | 2, 316 | 0, 317 | 3, 318 | 0, 319 | "AUDIO" 320 | ], 321 | [ 322 | 2, 323 | 1, 324 | 0, 325 | 4, 326 | 0, 327 | "AUDIO" 328 | ] 329 | ], 330 | "groups": [], 331 | "config": {}, 332 | "extra": { 333 | "ue_links": [], 334 | "ds": { 335 | "scale": 0.5131581182307068, 336 | "offset": [ 337 | 850.4592685668841, 338 | 1247.5450982664793 339 | ] 340 | }, 341 | "links_added_by_ue": [], 342 | "frontendVersion": "1.27.10", 343 | "VHS_latentpreview": false, 344 | "VHS_latentpreviewrate": 0, 345 | "VHS_MetadataImage": true, 346 | "VHS_KeepIntermediate": true 347 | }, 348 | "version": 0.4 349 | } 350 | -------------------------------------------------------------------------------- /core/model_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model loading and management for Maya1 TTS. 3 | Supports multiple attention mechanisms: SDPA, Flash Attention 2, Sage Attention. 4 | """ 5 | 6 | import torch 7 | from pathlib import Path 8 | from typing import Optional, Dict, Any 9 | import gc 10 | 11 | 12 | class Maya1Model: 13 | """ 14 | Wrapper class for Maya1 model with tokenizer and attention mechanism support. 15 | """ 16 | 17 | def __init__( 18 | self, 19 | model, 20 | tokenizer, 21 | model_name: str, 22 | attention_type: str, 23 | dtype: str, 24 | device: str 25 | ): 26 | self.model = model 27 | self.tokenizer = tokenizer 28 | self.model_name = model_name 29 | self.attention_type = attention_type 30 | self.dtype = dtype 31 | self.device = device 32 | 33 | def __repr__(self): 34 | return (f"Maya1Model(name={self.model_name}, " 35 | f"attention={self.attention_type}, " 36 | f"dtype={self.dtype}, " 37 | f"device={self.device})") 38 | 39 | 40 | class Maya1ModelLoader: 41 | """ 42 | Model loader with caching and attention mechanism configuration. 43 | """ 44 | 45 | # Cache for loaded models 46 | _model_cache: Dict[str, Maya1Model] = {} 47 | 48 | @staticmethod 49 | def _get_cache_key(model_path: str, attention_type: str, dtype: str) -> str: 50 | """Generate a unique cache key for a model configuration.""" 51 | return f"{model_path}|{attention_type}|{dtype}" 52 | 53 | @classmethod 54 | def load_model( 55 | cls, 56 | model_path: Path, 57 | attention_type: str = "sdpa", 58 | dtype: str = "bfloat16", 59 | device: str = "cuda" 60 | ) -> Maya1Model: 61 | """ 62 | Load Maya1 model with specified configuration. 
63 | 64 | Args: 65 | model_path: Path to model directory 66 | attention_type: Attention mechanism ("sdpa", "flash_attention_2", "sage_attention") 67 | dtype: Data type ("bfloat16", "float16", "float32", "8bit", "4bit") 68 | device: Device to load on ("cuda", "cpu") 69 | 70 | Returns: 71 | Maya1Model wrapper with model and tokenizer 72 | """ 73 | # Check if dtype OR attention changed from cached model 74 | # If either changed, clear cache to reload with new settings 75 | model_path_str = str(model_path) 76 | for cached_key, cached_model in list(cls._model_cache.items()): 77 | if model_path_str in cached_key: 78 | dtype_changed = cached_model.dtype != dtype 79 | attention_changed = cached_model.attention_type != attention_type 80 | 81 | if dtype_changed or attention_changed: 82 | if dtype_changed: 83 | print(f"🔄 Dtype changed: {cached_model.dtype} → {dtype}") 84 | if attention_changed: 85 | print(f"🔄 Attention changed: {cached_model.attention_type} → {attention_type}") 86 | 87 | print(f"🗑️ Clearing VRAM and reloading model with new settings...") 88 | cls.clear_cache(force=True) 89 | print(f"✅ VRAM cleared, loading fresh model...") 90 | break 91 | 92 | # Check cache 93 | cache_key = cls._get_cache_key(str(model_path), attention_type, dtype) 94 | if cache_key in cls._model_cache: 95 | print(f"✅ Using cached Maya1 model: {model_path.name}") 96 | return cls._model_cache[cache_key] 97 | 98 | print(f"📦 Loading Maya1 model: {model_path.name}") 99 | print(f" Attention: {attention_type}") 100 | print(f" Dtype: {dtype}") 101 | print(f" Device: {device}") 102 | 103 | # Import required libraries 104 | try: 105 | from transformers import AutoModelForCausalLM, AutoTokenizer 106 | except ImportError: 107 | raise ImportError( 108 | "Transformers library not found. Install with:\n" 109 | "pip install transformers" 110 | ) 111 | 112 | # Check if using bitsandbytes quantization 113 | use_quantization = dtype in ["8bit", "4bit"] 114 | 115 | if use_quantization: 116 | # Bitsandbytes quantization 117 | torch_dtype = torch.bfloat16 # Base dtype for quantization 118 | print(f"🔧 Quantization requested: {dtype}") 119 | else: 120 | # Standard dtype 121 | torch_dtype = getattr(torch, dtype) 122 | 123 | # Configure attention mechanism 124 | attn_kwargs = cls._configure_attention(attention_type) 125 | 126 | # Load tokenizer 127 | tokenizer = cls._load_tokenizer(model_path) 128 | 129 | # Load model 130 | model = cls._load_model_with_attention( 131 | model_path, 132 | torch_dtype, 133 | device, 134 | attn_kwargs, 135 | quantization=dtype if use_quantization else None 136 | ) 137 | 138 | # Apply Sage Attention if selected 139 | if attention_type == "sage_attention": 140 | model = cls._apply_sage_attention(model) 141 | 142 | # Create wrapper 143 | maya1_model = Maya1Model( 144 | model=model, 145 | tokenizer=tokenizer, 146 | model_name=model_path.name, 147 | attention_type=attention_type, 148 | dtype=dtype, 149 | device=device 150 | ) 151 | 152 | # Cache the model 153 | cls._model_cache[cache_key] = maya1_model 154 | 155 | # Verify actual settings applied 156 | print(f"✅ Maya1 model loaded successfully!") 157 | cls._verify_model_config(model, attention_type, dtype) 158 | 159 | return maya1_model 160 | 161 | @staticmethod 162 | def _verify_model_config(model, expected_attention: str, expected_dtype: str): 163 | """Verify that the model is actually using the requested configuration.""" 164 | print("🔍 Verifying model configuration:") 165 | 166 | # Check actual dtype 167 | actual_dtype = next(model.parameters()).dtype 168 | 
print(f" ✓ Dtype: {actual_dtype} (requested: {expected_dtype})") 169 | 170 | # Check attention implementation 171 | if hasattr(model.config, '_attn_implementation'): 172 | actual_attn = model.config._attn_implementation 173 | 174 | # Special handling for Sage Attention 175 | if expected_attention == "sage_attention": 176 | # Sage Attention uses eager as base, so this is expected 177 | if actual_attn == "eager": 178 | print(f" ✓ Attention: sage_attention (base: eager) ✅") 179 | else: 180 | print(f" ✓ Attention: {actual_attn} (requested: {expected_attention})") 181 | else: 182 | # For other attention types, show normally 183 | print(f" ✓ Attention: {actual_attn} (requested: {expected_attention})") 184 | else: 185 | # For Sage Attention, check if hooks are registered 186 | if expected_attention == "sage_attention": 187 | # Check if forward hooks exist (Sage adds hooks) 188 | has_hooks = any( 189 | hasattr(module, '_forward_hooks') and len(module._forward_hooks) > 0 190 | for module in model.modules() 191 | ) 192 | if has_hooks: 193 | print(f" ✓ Attention: sage_attention hooks applied ✅") 194 | else: 195 | print(f" ⚠ Attention: sage_attention hooks may not be applied") 196 | else: 197 | print(f" ⚠ Attention: Unable to verify (config._attn_implementation not found)") 198 | 199 | @staticmethod 200 | def _configure_attention(attention_type: str) -> Dict[str, Any]: 201 | """ 202 | Configure attention mechanism parameters. 203 | 204 | Args: 205 | attention_type: Type of attention mechanism 206 | 207 | Returns: 208 | Dictionary of kwargs for model loading 209 | """ 210 | if attention_type == "sdpa": 211 | # PyTorch's scaled_dot_product_attention (default, most compatible) 212 | return {"attn_implementation": "sdpa"} 213 | 214 | elif attention_type == "flash_attention_2": 215 | # Flash Attention 2 (fastest, requires flash-attn package) 216 | try: 217 | import flash_attn 218 | return {"attn_implementation": "flash_attention_2"} 219 | except ImportError: 220 | print("⚠️ flash-attn not found, falling back to SDPA") 221 | print(" Install with: pip install flash-attn") 222 | return {"attn_implementation": "sdpa"} 223 | 224 | elif attention_type == "sage_attention": 225 | # Sage Attention (memory efficient, requires sageattention package) 226 | # Use eager mode first, then apply Sage Attention manually 227 | return {"attn_implementation": "eager"} 228 | 229 | elif attention_type == "eager": 230 | # Standard PyTorch eager attention (slowest but most compatible) 231 | return {"attn_implementation": "eager"} 232 | 233 | else: 234 | print(f"⚠️ Unknown attention type: {attention_type}, using SDPA") 235 | return {"attn_implementation": "sdpa"} 236 | 237 | @staticmethod 238 | def _load_tokenizer(model_path: Path): 239 | """ 240 | Load tokenizer from model path. 241 | Handles both root and tokenizer/ subdirectory structures. 
242 | 243 | Args: 244 | model_path: Path to model directory 245 | 246 | Returns: 247 | Loaded tokenizer 248 | """ 249 | from transformers import AutoTokenizer 250 | 251 | # Check if tokenizer is in a subdirectory 252 | if (model_path / "tokenizer").exists(): 253 | print(" Loading tokenizer from tokenizer/ subdirectory...") 254 | tokenizer = AutoTokenizer.from_pretrained( 255 | str(model_path), 256 | subfolder="tokenizer", 257 | trust_remote_code=True 258 | ) 259 | else: 260 | print(" Loading tokenizer from root...") 261 | tokenizer = AutoTokenizer.from_pretrained( 262 | str(model_path), 263 | trust_remote_code=True 264 | ) 265 | 266 | return tokenizer 267 | 268 | @staticmethod 269 | def _load_model_with_attention( 270 | model_path: Path, 271 | torch_dtype, 272 | device: str, 273 | attn_kwargs: Dict[str, Any], 274 | quantization: Optional[str] = None 275 | ): 276 | """ 277 | Load the model with specified attention configuration. 278 | 279 | Args: 280 | model_path: Path to model directory 281 | torch_dtype: PyTorch data type 282 | device: Device to load on 283 | attn_kwargs: Attention configuration kwargs 284 | quantization: Quantization type ("8bit", "4bit", None) 285 | 286 | Returns: 287 | Loaded model 288 | """ 289 | from transformers import AutoModelForCausalLM 290 | 291 | # Prepare loading kwargs 292 | load_kwargs = { 293 | "torch_dtype": torch_dtype, 294 | "device_map": "auto" if device == "cuda" else device, 295 | "trust_remote_code": True, 296 | **attn_kwargs 297 | } 298 | 299 | # Add bitsandbytes quantization if requested 300 | if quantization == "8bit": 301 | try: 302 | import bitsandbytes 303 | print(f" Using 8-bit quantization (bitsandbytes)") 304 | load_kwargs["load_in_8bit"] = True 305 | # Remove device_map incompatibility 306 | if device == "cpu": 307 | print(f" ⚠️ 8-bit quantization requires CUDA, ignoring device=cpu") 308 | load_kwargs["device_map"] = "auto" 309 | except ImportError: 310 | print(f"⚠️ bitsandbytes not found, loading in bfloat16 instead") 311 | print(f" Install with: pip install bitsandbytes") 312 | quantization = None 313 | 314 | elif quantization == "4bit": 315 | try: 316 | import bitsandbytes 317 | from transformers import BitsAndBytesConfig 318 | print(f" Using 4-bit quantization (bitsandbytes NF4)") 319 | 320 | bnb_config = BitsAndBytesConfig( 321 | load_in_4bit=True, 322 | bnb_4bit_compute_dtype=torch_dtype, 323 | bnb_4bit_use_double_quant=True, # Nested quantization for better quality 324 | bnb_4bit_quant_type="nf4" # NormalFloat4 - best quality 325 | ) 326 | load_kwargs["quantization_config"] = bnb_config 327 | # Remove incompatible parameters 328 | load_kwargs.pop("torch_dtype", None) 329 | if device == "cpu": 330 | print(f" ⚠️ 4-bit quantization requires CUDA, ignoring device=cpu") 331 | load_kwargs["device_map"] = "auto" 332 | except ImportError: 333 | print(f"⚠️ bitsandbytes not found, loading in bfloat16 instead") 334 | print(f" Install with: pip install bitsandbytes") 335 | quantization = None 336 | 337 | model = AutoModelForCausalLM.from_pretrained( 338 | str(model_path), 339 | **load_kwargs 340 | ) 341 | 342 | model.eval() # Set to evaluation mode 343 | 344 | if quantization: 345 | print(f"✅ Model quantized to {quantization}") 346 | 347 | return model 348 | 349 | @staticmethod 350 | def _apply_sage_attention(model): 351 | """ 352 | Apply Sage Attention to the model. 353 | Supports both Sage Attention v1.x and v2.x APIs. 
354 | 355 | Args: 356 | model: Loaded model 357 | 358 | Returns: 359 | Model with Sage Attention applied 360 | """ 361 | try: 362 | # Try Sage Attention v1.x API first 363 | try: 364 | from sageattention import apply_sage_attn 365 | print(" Applying Sage Attention (v1.x)...") 366 | model = apply_sage_attn(model) 367 | print(" ✅ Sage Attention v1.x applied successfully") 368 | return model 369 | except ImportError: 370 | # Try Sage Attention v2.x API 371 | from sageattention import sageattn 372 | print(" Applying Sage Attention (v2.x)...") 373 | # For v2.x, we need to replace attention in each layer 374 | for name, module in model.named_modules(): 375 | if hasattr(module, 'self_attn') or 'attention' in name.lower(): 376 | # Sage Attention v2+ auto-replaces attention when imported 377 | pass 378 | print(" ✅ Sage Attention v2.x detected and enabled") 379 | return model 380 | 381 | except ImportError: 382 | print("⚠️ sageattention not found, using standard eager attention") 383 | print(" Install with: pip install sageattention") 384 | return model 385 | except Exception as e: 386 | print(f"⚠️ Failed to apply Sage Attention: {e}") 387 | print(" Continuing with standard eager attention") 388 | return model 389 | 390 | @classmethod 391 | def clear_cache(cls, force: bool = False): 392 | """ 393 | Clear the model cache and free VRAM using ComfyUI's native memory management. 394 | This actually removes models from VRAM, not just moves them to CPU. 395 | """ 396 | if not cls._model_cache: 397 | return # Nothing to clear 398 | 399 | try: 400 | # Import ComfyUI's model management 401 | import comfy.model_management as mm 402 | 403 | # Step 1: Delete model references from our cache 404 | # This removes the Python references to the models 405 | for cache_key, maya1_model in list(cls._model_cache.items()): 406 | try: 407 | # Delete the model object to free references 408 | if maya1_model.model is not None: 409 | del maya1_model.model 410 | if maya1_model.tokenizer is not None: 411 | del maya1_model.tokenizer 412 | except Exception as e: 413 | print(f" ⚠ Warning: Failed to delete {maya1_model.model_name}: {e}") 414 | 415 | # Step 2: Clear our cache dictionary 416 | cls._model_cache.clear() 417 | 418 | # Step 3: Use ComfyUI's native VRAM cleanup 419 | # This unloads ALL models from VRAM (including ours) 420 | mm.unload_all_models() 421 | 422 | # Step 4: Clear ComfyUI's internal cache 423 | mm.soft_empty_cache() 424 | 425 | # Step 5: Python garbage collection 426 | gc.collect() 427 | 428 | # Step 6: Clear CUDA caches 429 | if torch.cuda.is_available(): 430 | torch.cuda.empty_cache() 431 | torch.cuda.ipc_collect() 432 | 433 | except ImportError: 434 | # Fallback if comfy.model_management is not available 435 | print(" ⚠ Warning: ComfyUI model_management not available, using fallback cleanup") 436 | 437 | # Fallback: Just clear the cache and force GC 438 | cls._model_cache.clear() 439 | gc.collect() 440 | if torch.cuda.is_available(): 441 | torch.cuda.empty_cache() 442 | torch.cuda.ipc_collect() 443 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI-Maya1_TTS 2 | 3 | **Expressive Voice Generation with Emotions for ComfyUI** 4 | 5 | A ComfyUI node pack for [Maya1](https://huggingface.co/maya-research/maya1), a 3B-parameter speech model built for expressive voice generation with rich human emotion and precise voice design. 
6 | 7 | ![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg) 8 | ![Python](https://img.shields.io/badge/python-3.11+-blue.svg) 9 | ![ComfyUI](https://img.shields.io/badge/ComfyUI-compatible-green.svg) 10 | 11 | https://github.com/user-attachments/assets/1be0c2a0-22fb-4890-9147-d20abeb2e067 12 | 13 | 14 | --- 15 | 16 | ## ✨ Features 17 | 18 | ### Core Features 19 | - 🎭 **Voice Design** through natural language descriptions 20 | - 😊 **16 Emotion Tags**: laugh, cry, whisper, angry, sigh, gasp, scream, and more 21 | - ⚡ **Real-time Generation** with SNAC neural codec (24kHz audio) 22 | - 🔧 **Multiple Attention Mechanisms**: SDPA, eager, Flash Attention 2, Sage Attention (1/2) 23 | - 💾 **Quantization Support**: 4-bit and 8-bit for memory-constrained GPUs 24 | - 🛑 **Native ComfyUI Cancel**: Stop generation anytime 25 | - 📊 **Progress Tracking**: Real-time token generation speed (it/s) 26 | - 🔄 **Model Caching**: Fast subsequent generations 27 | - 🎯 **Smart VRAM Management**: Auto-clears on dtype changes 28 | 29 | ### Custom Canvas UI 30 | - 🎨 **Beautiful Dark Theme** with purple accents and smooth animations 31 | - 👤 **5 Character Presets**: Quick-load voice templates (Male US, Female UK, Announcer, Robot, Demon) 32 | - 🎭 **16 Visual Emotion Buttons**: One-click emotion tag insertion at cursor position 33 | - ⛶ **Professional HTML Modal Editor**: Fullscreen text editor with native textarea for longform content 34 | - 🔤 **Font Size Controls**: Adjustable 12-20px font size with visual slider 35 | - ⌨️ **Advanced Keyboard Shortcuts**: Ctrl+A, Ctrl+C, Ctrl+V, Ctrl+X, Ctrl+Enter to save, ESC to cancel 36 | - 🔔 **Toast Notifications**: Visual feedback for save success and validation errors 37 | - 📝 **Inline Text Editing**: Click-to-edit with cursor positioning and drag-to-select 38 | - 🖱️ **Scroll Support**: Custom themed scrollbars with mouse wheel scrolling 39 | - 📱 **Responsive Design**: Modal adapts to all screen sizes 40 | - 💡 **Contextual Tooltips**: Helpful hints on every control 41 | - 🎬 **Collapsible Sections**: Clean, organized interface 42 | - 🔄 **Smart Audio Processing**: Auto-chunking for long text with crossfade blending for seamless output 43 | 44 | --- 45 | 46 | ## 📦 Installation 47 | 48 |
49 | Quick Install (Click to expand) 50 | 51 | ### 1. Clone the Repository 52 | 53 | ```bash 54 | cd ComfyUI/custom_nodes/ 55 | git clone https://github.com/Saganaki22/ComfyUI-Maya1_TTS.git 56 | cd ComfyUI-Maya1_TTS 57 | ``` 58 | 59 | ### 2. Install Dependencies 60 | 61 | **Core dependencies** (required): 62 | ```bash 63 | pip install "torch>=2.0.0" "transformers>=4.50.0" "numpy>=1.21.0" "snac>=1.0.0" 64 | ``` 65 | 66 | **Or install from requirements.txt:** 67 | ```bash 68 | pip install -r requirements.txt 69 | ``` 70 | 71 |
72 | 73 |
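After installing, you can optionally sanity-check that the core imports resolve before restarting ComfyUI. This one-liner is purely illustrative and not part of the node pack:

```python
# Verify the core dependencies import cleanly (illustrative check only).
import torch, transformers, snac, soundfile
print(f"torch {torch.__version__} | CUDA available: {torch.cuda.is_available()}")
```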
74 | Optional: Enhanced Performance (Click to expand) 75 | 76 | ### Quantization (Memory Savings) 77 | 78 | For 4-bit/8-bit quantization support: 79 | ```bash 80 | pip install "bitsandbytes>=0.41.0" 81 | ``` 82 | 83 | **Memory footprint:** 84 | - 4-bit: ~8-9GB → ~6GB VRAM (slight quality loss) 85 | - 8-bit: ~8-9GB → ~7GB VRAM (minimal quality loss) 86 | 87 | ### Accelerated Attention 88 | 89 | **Flash Attention 2** (CUDA only): 90 | ```bash 91 | pip install "flash-attn>=2.0.0" 92 | ``` 93 | 94 | **Sage Attention** (memory efficient for batch): 95 | ```bash 96 | pip install "sageattention>=1.0.0" 97 | ``` 98 | 99 | ### Install All Optional Dependencies 100 | 101 | ```bash 102 | pip install bitsandbytes flash-attn sageattention 103 | ``` 104 | 105 |
106 | 107 |
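If you are unsure which optional packages actually installed, a small probe like the one below mirrors the graceful-fallback behavior in `core/model_wrapper.py` (the node drops back to SDPA or bfloat16 when a package is missing):

```python
# Probe the optional acceleration packages; the node falls back gracefully
# when any of these are absent (see core/model_wrapper.py).
for pkg in ("flash_attn", "sageattention", "bitsandbytes"):
    try:
        __import__(pkg)
        print(f"{pkg}: available")
    except ImportError:
        print(f"{pkg}: not installed (node will fall back)")
```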
108 | Download Maya1 Model (Click to expand) 109 | 110 | ### Model Location 111 | 112 | Models go in: `ComfyUI/models/maya1-TTS/` 113 | 114 | ### Expected Folder Structure 115 | 116 | After downloading, your model folder should look like this: 117 | 118 | ``` 119 | ComfyUI/ 120 | └── models/ 121 | └── maya1-TTS/ 122 | └── maya1/ # Model name (can be anything) 123 | ├── chat_template.jinja # Chat template 124 | ├── config.json # Model configuration 125 | ├── generation_config.json # Generation settings 126 | ├── model-00001-of-00002.safetensors # Model weights (shard 1) 127 | ├── model-00002-of-00002.safetensors # Model weights (shard 2) 128 | ├── model.safetensors.index.json # Weight index 129 | ├── special_tokens_map.json # Special tokens 130 | └── tokenizer/ # Tokenizer subfolder 131 | ├── chat_template.jinja # Chat template (duplicate) 132 | ├── special_tokens_map.json # Special tokens (duplicate) 133 | ├── tokenizer.json # Tokenizer vocabulary (22.9 MB) 134 | └── tokenizer_config.json # Tokenizer config 135 | ``` 136 | 137 | **Critical files required:** 138 | - `config.json` - Model architecture configuration 139 | - `generation_config.json` - Default generation parameters 140 | - `model-00001-of-00002.safetensors` & `model-00002-of-00002.safetensors` - Model weights (2 shards) 141 | - `model.safetensors.index.json` - Weight index mapping 142 | - `chat_template.jinja` & `special_tokens_map.json` - In root folder 143 | - `tokenizer/` folder with all 4 tokenizer files 144 | 145 | **Note:** You can have multiple models by creating separate folders like `maya1`, `maya1-finetuned`, etc. 146 | 147 | ### Option 1: Hugging Face CLI (Recommended) 148 | 149 | ```bash 150 | # Install HF CLI 151 | pip install huggingface-hub 152 | 153 | # Create directory 154 | cd ComfyUI 155 | mkdir -p models/maya1-TTS 156 | 157 | # Download model 158 | hf download maya-research/maya1 --local-dir models/maya1-TTS/maya1 159 | ``` 160 | 161 | ### Option 2: Python Script 162 | 163 | ```python 164 | from huggingface_hub import snapshot_download 165 | 166 | snapshot_download( 167 | repo_id="maya-research/maya1", 168 | local_dir="ComfyUI/models/maya1-TTS/maya1", 169 | local_dir_use_symlinks=False 170 | ) 171 | ``` 172 | 173 | ### Option 3: Manual Download 174 | 175 | 1. Go to [Maya1 on HuggingFace](https://huggingface.co/maya-research/maya1) 176 | 2. Download all files to `ComfyUI/models/maya1-TTS/maya1/` 177 | 178 |
179 | 180 |
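Before restarting ComfyUI, you can verify that the critical files landed in the right place. This sketch mirrors the validation the node itself runs at load time; adjust `model_dir` if your install lives elsewhere:

```python
# Check the critical model files listed above (illustrative; the node performs
# a similar validation before loading).
from pathlib import Path

model_dir = Path("ComfyUI/models/maya1-TTS/maya1")
for name in [
    "config.json",
    "generation_config.json",
    "model.safetensors.index.json",
    "model-00001-of-00002.safetensors",
    "model-00002-of-00002.safetensors",
    "tokenizer/tokenizer.json",
]:
    print(f"{name}: {'ok' if (model_dir / name).exists() else 'MISSING'}")
```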
181 | Restart ComfyUI 182 | 183 | Restart ComfyUI to load the new nodes. The node will appear under: 184 | 185 | **Add Node → audio → Maya1 TTS (AIO) / Maya1 TTS (AIO) Barebones** 186 | 187 |
188 | 189 | --- 190 | 191 | ## 🎮 Usage 192 | 193 | ### Two Node Options 194 | 195 | **Maya1 TTS (AIO)** - Full custom UI with visual controls (recommended) 196 | - Beautiful dark theme with character presets, emotion buttons, and modal editor 197 | - Best user experience with visual feedback and tooltips 198 | 199 | **Maya1 TTS (AIO) Barebones** - Standard ComfyUI widgets only 200 | - For users experiencing JavaScript rendering issues (black box) 201 | - Same functionality, simpler interface 202 | - All inputs stacked vertically with standard dropdowns and text boxes 203 | 204 | --- 205 | 206 | ### Node: Maya1 TTS (AIO) 207 | 208 | All-in-one node for loading models and generating speech with a beautiful custom canvas UI. 209 | 210 | | Maya1 TTS (AIO) | Maya1 TTS (AIO) Barebones | 211 | |:---:|:---:| 212 | | Screenshot 2025-11-07 084153 | image | 213 | 214 | 215 | 216 | ### ✨ Custom Canvas Interface 217 | 218 | The node features a completely custom-built interface with: 219 | 220 | **Character Presets** (Top Row) 221 | - Click any preset to instantly load a pre-configured voice description 222 | - 5 presets: ♂️ Male US, ♀️ Female UK, 🎙️ Announcer, 🤖 Robot, 😈 Demon 223 | 224 | **Text Fields** 225 | - **Voice Description**: Describe your desired voice characteristics 226 | - **Text**: Your script with optional emotion tags 227 | - Click inside to edit with full keyboard support 228 | - Press **Enter** for new line, **Ctrl+Enter** to save, **Escape** to cancel 229 | 230 | **Emotion Tags** (Collapsible Grid) 231 | - 16 emotion buttons in 4×4 grid 232 | - Click any emotion to insert tag at cursor position 233 | - Tags insert where you're typing, not just at the end 234 | - Click header to collapse/expand section 235 | 236 | **⛶ Professional HTML Modal** (Bottom right of Text field) 237 | - Click the expand button (⛶) for fullscreen text editing 238 | - Native HTML textarea with proper newline and whitespace support 239 | - **Font Size Slider**: Adjust text size from 12px to 20px with visual A/A controls 240 | - All 16 emotion buttons available inside modal for quick tag insertion 241 | - **Custom Themed Scrollbar**: Purple accents matching the node design 242 | - **Toast Notifications**: Green checkmark for "Text Saved", red X for validation errors 243 | - **Empty Text Validation**: Prevents saving blank text with helpful error message 244 | - **Keyboard Shortcuts**: 245 | - **Ctrl+Enter**: Save and close 246 | - **ESC**: Cancel without saving 247 | - Full text selection and clipboard support (Ctrl+A, C, V, X) 248 | - **Responsive Design**: Modal adapts to small and large screens, buttons always visible 249 | - **Visual Hints**: Subtle grey text under buttons showing keyboard shortcuts 250 | 251 | **Keyboard Shortcuts** (Inline Editing & Modal) 252 | - `Enter`: New line (in multiline text fields) 253 | - `Ctrl+Enter`: Save and apply changes 254 | - `Escape`: Cancel editing without saving 255 | - `Ctrl+A`: Select all text 256 | - `Ctrl+C/V/X`: Copy, paste, cut selected text 257 | - Click outside field: Auto-save (inline editing only) 258 | 259 |
260 | Model Settings 261 | 262 | **model_name** (dropdown) 263 | - Select from models in `ComfyUI/models/maya1-TTS/` 264 | - Models are auto-discovered on startup 265 | 266 | **dtype** (dropdown) 267 | - `4bit`: NF4 quantization (~6GB VRAM, requires bitsandbytes, **SLOWER**) 268 | - `8bit`: INT8 quantization (~7GB VRAM, requires bitsandbytes, **SLOWER**) 269 | - `float16`: 16-bit half precision (~8-9GB VRAM, **FAST**, good quality) 270 | - `bfloat16`: 16-bit brain float (~8-9GB VRAM, **FAST**, recommended) 271 | - `float32`: 32-bit full precision (~16GB VRAM, highest quality, slower) 272 | 273 | ⚠️ **IMPORTANT:** Quantization (4-bit/8-bit) is **SLOWER** than float16/bfloat16! 274 | - Only use quantization if you have **limited VRAM** (<10GB) 275 | - If you have **10GB+ VRAM**, use **float16** or **bfloat16** for best speed 276 | 277 | **attention_mechanism** (dropdown) 278 | - `sdpa`: PyTorch SDPA (**default**, fastest for single TTS) 279 | - `flash_attention_2`: Flash Attention 2 (batch inference) 280 | - `sage_attention`: Sage Attention (memory efficient); `eager` is also available as a maximum-compatibility fallback 281 | 282 | **device** (dropdown) 283 | - `cuda`: Use GPU (recommended) 284 | - `cpu`: Use CPU (slower) 285 | 286 |
287 | 288 |
289 | Voice & Text Settings 290 | 291 | **voice_description** 292 | 293 | Describe the voice using natural language. Click inside to edit or use character presets. 294 | 295 | **Example:** 296 | ``` 297 | Realistic male voice in the 30s with American accent. Normal pitch, warm timbre, conversational pacing. 298 | ``` 299 | 300 | **Voice Components:** 301 | - **Age**: `in their 20s`, `30s`, `40s`, `50s` 302 | - **Gender**: `Male voice`, `Female voice` 303 | - **Accent**: `American`, `British`, `Australian`, `Indian`, `Middle Eastern` 304 | - **Pitch**: `high pitch`, `normal pitch`, `low pitch` 305 | - **Timbre**: `warm`, `gravelly`, `smooth`, `raspy` 306 | - **Pacing**: `fast pacing`, `conversational`, `slow pacing` 307 | - **Tone**: `happy`, `angry`, `curious`, `energetic`, `calm` 308 | 309 | **💡 Tip**: Use character presets for quick voice templates! 310 | 311 | **text** 312 | 313 | Text to synthesize with optional emotion tags. Click emotion buttons to insert tags at cursor. 314 | 315 | **Example:** 316 | ``` 317 | Hello! This is Maya1 the best open source voice AI! 318 | ``` 319 | 320 | **💡 Tip**: Click ⛶ expand button for longform text editing in fullscreen modal! 321 | 322 |
323 | 324 |
325 | Generation Settings 326 | 327 | **keep_model_in_vram** (boolean) 328 | - `True`: Keep model loaded for faster repeated generations 329 | - `False`: Clear VRAM after generation (saves memory) 330 | - Auto-clears when dtype changes 331 | 332 | **chunk_longform** (boolean) ⚠️ EXPERIMENTAL 333 | - `True`: Auto-split long text (>80 words) at sentences, combines audio 334 | - `False`: Generate entire text at once (may fail if too long) 335 | - **Note**: This feature is experimental and may have quality/timing issues 336 | 337 | **temperature** (0.1-2.0, default: 0.4) 338 | - Lower = more consistent 339 | - Higher = more varied/creative 340 | 341 | **top_p** (0.1-1.0, default: 0.9) 342 | - Nucleus sampling parameter 343 | - 0.9 recommended for natural speech 344 | 345 | **max_tokens** (100-16000, default: 4000) 346 | - Maximum audio tokens to generate per chunk 347 | - Higher = longer audio 348 | 349 | **repetition_penalty** (1.0-2.0, default: 1.1) 350 | - Reduces repetitive speech 351 | - 1.1 is good default 352 | 353 | **seed** (integer, default: 0) 354 | - `0` = random seed; any other value gives reproducible results 355 | - Use ComfyUI's control_after_generate for random/increment 356 | 357 |
358 | 359 |
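For intuition, here is roughly how these widgets map onto a Hugging Face `generate()` call. This is a simplified sketch, not the node's actual code path: the real prompt is built from the voice description and text (see `core/utils.py`), and the resulting SNAC tokens still need decoding into audio afterwards:

```python
# Simplified sketch of the generation settings as transformers kwargs.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "ComfyUI/models/maya1-TTS/maya1"   # default install path
tokenizer = AutoTokenizer.from_pretrained(model_dir, subfolder="tokenizer")
model = AutoModelForCausalLM.from_pretrained(
    model_dir, torch_dtype=torch.bfloat16, device_map="auto"
)

torch.manual_seed(1337)                        # "seed" widget (0 = random)
inputs = tokenizer("Hello! This is Maya1.", return_tensors="pt").to(model.device)
output_ids = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.4,                           # "temperature"
    top_p=0.9,                                 # "top_p"
    repetition_penalty=1.1,                    # "repetition_penalty"
    max_new_tokens=4000,                       # "max_tokens"
)
```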
360 | Outputs 361 | 362 | **audio** (ComfyUI AUDIO type) 363 | - 24kHz mono audio 364 | - Compatible with all ComfyUI audio nodes 365 | - Connect to PreviewAudio, SaveAudio, etc. 366 | 367 |
368 | 369 | --- 370 | 371 | ### Node: Maya1 TTS (AIO) Barebones 372 | 373 | Standard ComfyUI widgets version for users experiencing JavaScript rendering issues. 374 | 375 | **When to use Barebones:** 376 | - Custom UI shows as a black box 377 | - Browser console shows JavaScript errors 378 | - You prefer simple, standard ComfyUI widgets 379 | - Working with older ComfyUI versions 380 | 381 | **Inputs (in order):** 382 | 383 | 1. **voice_description** (multiline text) 384 | - Describe voice characteristics in natural language 385 | - Same as main node, just standard text box 386 | 387 | 2. **text** (multiline text) 388 | - Your script with manual emotion tags like `<laugh>` or `<whisper>` 389 | - Type emotion tags manually (no visual buttons in barebones version) 390 | 391 | 3. **model_name** (dropdown) 392 | - Select Maya1 model from `ComfyUI/models/maya1-TTS/` 393 | 394 | 4. **dtype** (dropdown) 395 | - `4bit (BNB)`, `8bit (BNB)`, `float16`, `bfloat16`, `float32` 396 | 397 | 5. **attention_mechanism** (dropdown) 398 | - `sdpa` (default), `eager`, `flash_attention_2`, `sage_attention` 399 | 400 | 6. **device** (dropdown) 401 | - `cuda` (GPU) or `cpu` 402 | 403 | 7. **keep_model_in_vram** (boolean toggle) 404 | - Keep model loaded for faster subsequent generations 405 | 406 | 8. **chunk_longform** (boolean toggle) 407 | - Split long text with crossfading for unlimited length 408 | 409 | 9. **max_tokens** (integer) 410 | - Max SNAC tokens per chunk (default: 4000) 411 | 412 | 10. **temperature** (float) 413 | - Generation randomness (default: 0.4) 414 | 415 | 11. **top_p** (float) 416 | - Nucleus sampling (default: 0.9) 417 | 418 | 12. **repetition_penalty** (float) 419 | - Reduce repetition (default: 1.1) 420 | 421 | 13. **seed** (integer) 422 | - 0 = random, or set specific seed for reproducibility 423 | - Use control_after_generate widget for seed management 424 | 425 | **All other features (model loading, VRAM management, chunking, progress tracking) work identically to the main node.** 426 | 427 | --- 428 | 429 | ## 🎭 Emotion Tags 430 | 431 | Add emotions anywhere in your text using `<emotion>` tag syntax, or click the visual emotion buttons in the UI! 432 | 433 | **Examples:** 434 | ``` 435 | Hello! <laugh> This is amazing I can't believe it! 436 | ``` 437 | 438 | ``` 439 | After all we went through <sigh> I can't believe he was the traitor. 440 | ``` 441 | 442 | ``` 443 | Wow! <gasp> This place looks incredible! 444 | ``` 445 | 446 |
447 | All 16 Available Emotions (Click to expand) 448 | 449 | **Laughter & Joy:** 450 | - `<laugh>` - Normal laugh 451 | - `<laugh_harder>` - Intense laughing 452 | - `<giggle>` - Light giggling 453 | - `<chuckle>` - Soft chuckle 454 | 455 | **Sadness & Sighs:** 456 | - `<cry>` - Crying 457 | - `<sigh>` - Sighing 458 | 459 | **Surprise & Breath:** 460 | - `<gasp>` - Surprised gasp 461 | - `<excited>` - Excited tone 462 | 463 | **Intensity & Emotion:** 464 | - `<whisper>` - Whispering 465 | - `<angry>` - Angry tone 466 | - `<scream>` - Screaming 467 | - `<sarcastic>` - Sarcastic delivery 468 | 469 | **Natural Sounds:** 470 | - `<snort>` - Snorting 471 | - `<exhale>` - Exhaling 472 | - `<gulp>` - Gulping 473 | - `<sing>` - Singing 474 | 475 |
476 | 477 | **💡 Tip:** Click emotion buttons in the node UI to insert tags at cursor position! 478 | 479 | --- 480 | 481 | ## 🎬 Example Character Speeches 482 | 483 |
484 | Generative AI & ComfyUI Examples (Click to expand) 485 | 486 | ### Example 1: Excited AI Researcher 487 | 488 | **Voice Description:** 489 | ``` 490 | Female voice in her 30s with American accent. High pitch, energetic tone at high intensity, fast pacing. 491 | ``` 492 | 493 | **Text:** 494 | ``` 495 | Oh my god! Have you seen the new Stable Diffusion model in ComfyUI? The quality is absolutely incredible! I just generated a photorealistic portrait in like 20 seconds. This is game-changing for our workflow! 496 | ``` 497 | 498 | --- 499 | 500 | ### Example 2: Skeptical Developer 501 | 502 | **Voice Description:** 503 | ``` 504 | Male voice in his 40s with British accent. Low pitch, calm tone, conversational pacing. 505 | ``` 506 | 507 | **Text:** 508 | ``` 509 | I've been testing this new node pack in ComfyUI and honestly, I'm impressed. At first I was skeptical about the whole generative AI hype, but the control you get with custom nodes is remarkable. This changes everything. 510 | ``` 511 | 512 | --- 513 | 514 | ### Example 3: Enthusiastic Tutorial Creator 515 | 516 | **Voice Description:** 517 | ``` 518 | Female voice in her 20s with Australian accent. Normal pitch, warm timbre, energetic tone at medium intensity. 519 | ``` 520 | 521 | **Text:** 522 | ``` 523 | Hey everyone! Welcome back to my ComfyUI tutorial series! Today we're diving into the most powerful image generation workflow I've ever seen. You're not gonna believe how easy this is! Let's get started! 524 | ``` 525 | 526 | --- 527 | 528 | ### Example 4: Frustrated Beginner 529 | 530 | **Voice Description:** 531 | ``` 532 | Male voice in his 30s with American accent. Normal pitch, stressed tone at medium intensity, fast pacing. 533 | ``` 534 | 535 | **Text:** 536 | ``` 537 | Why won't this workflow run? I've connected all the nodes exactly like the tutorial showed! Wait... Oh no. I forgot to load the checkpoint model. Classic beginner mistake! Okay, let's try this again. 538 | ``` 539 | 540 | --- 541 | 542 | ### Example 5: Amazed AI Artist 543 | 544 | **Voice Description:** 545 | ``` 546 | Female voice in her 40s with Indian accent. Normal pitch, curious tone, slow pacing, dramatic delivery. 547 | ``` 548 | 549 | **Text:** 550 | ``` 551 | When I first discovered ComfyUI I thought it was just another image generator. But then I realized you can chain workflows together, use custom models, and even generate animations! This is the future of digital art! 552 | ``` 553 | 554 | --- 555 | 556 | ### Example 6: Confident AI Entrepreneur 557 | 558 | **Voice Description:** 559 | ``` 560 | Male voice in his 50s with Middle Eastern accent. Low pitch, gravelly timbre, slow pacing, confident tone at high intensity. 561 | ``` 562 | 563 | **Text:** 564 | ``` 565 | The generative AI revolution is here. ComfyUI gives us the tools to build production-ready workflows. While others are still playing with web UIs, we're automating entire creative pipelines. This is how you stay ahead of the curve. 566 | ``` 567 | 568 |
569 | 570 | --- 571 | 572 | ## ⚙️ Advanced Configuration 573 | 574 |
575 | Attention Mechanisms Comparison 576 | 577 | | Mechanism | Speed | Memory | Best For | Requirements | 578 | |-----------|-------|--------|----------|--------------| 579 | | **SDPA** | ⚡⚡⚡ | Good | Single TTS generation | PyTorch ≥2.0 | 580 | | **Flash Attention 2** | ⚡⚡ | Good | Batch processing | flash-attn, CUDA | 581 | | **Sage Attention** | ⚡⚡ | Excellent | Long sequences | sageattention | 582 | 583 | **Why is SDPA fastest for TTS?** 584 | - Optimized for single-sequence autoregressive generation 585 | - Lower kernel launch overhead (~20μs vs 50-60μs) 586 | - Flash/Sage Attention shine with batch size ≥8 587 | 588 | **Recommendation:** Use **SDPA** (default) for single audio generation. 589 | 590 |
591 | 592 |
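Under the hood, the dropdown maps onto transformers' `attn_implementation` flag (see `core/model_wrapper.py`). A minimal equivalent load looks like this, with the model path shown as the default install location:

```python
# Minimal sketch of the attention selection performed in core/model_wrapper.py.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "ComfyUI/models/maya1-TTS/maya1",
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",   # or "flash_attention_2" / "eager"
    device_map="auto",
)
```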
593 | Quantization Details 594 | 595 | ⚠️ **CRITICAL: Quantization is SLOWER than fp16/bf16!** 596 | 597 | ### Memory Usage (Maya1 3B Model) 598 | 599 | | Dtype | VRAM Usage | Speed | Quality | 600 | |-------|------------|-------|---------| 601 | | **4-bit NF4** | ~6GB | Slow ⚡ | Good (slight loss) | 602 | | **8-bit INT8** | ~7GB | Slow ⚡ | Excellent (minimal loss) | 603 | | **float16** | ~8-9GB | **Fast** ⚡⚡⚡ | Excellent | 604 | | **bfloat16** | ~8-9GB | **Fast** ⚡⚡⚡ | Excellent | 605 | | **float32** | ~16GB | Medium ⚡⚡ | Perfect | 606 | 607 | ### 4-bit NF4 Quantization 608 | 609 | **Features:** 610 | - Uses NormalFloat4 (NF4) for best 4-bit quality 611 | - Double quantization (nested) for better accuracy 612 | - Memory savings: ~6GB (vs ~8-9GB for fp16) 613 | 614 | **When to use:** 615 | - You have **limited VRAM** (8GB or less GPU) 616 | - Speed is not critical (inference is slower due to dequantization) 617 | - Need to fit model in smaller VRAM 618 | 619 | **When NOT to use:** 620 | - You have 10GB+ VRAM → Use float16/bfloat16 instead for better speed! 621 | 622 | ### 8-bit INT8 Quantization 623 | 624 | **Features:** 625 | - Standard 8-bit integer quantization 626 | - Memory savings: ~7GB (vs ~8-9GB for fp16) 627 | - Minimal quality impact 628 | 629 | **When to use:** 630 | - You have moderate VRAM constraints (8-10GB GPU) 631 | - Want good quality with some memory savings 632 | - Speed is not critical 633 | 634 | **When NOT to use:** 635 | - You have 10GB+ VRAM → Use float16/bfloat16 instead for better speed! 636 | 637 | ### Why is Quantization Slower? 638 | 639 | Quantized models require **dequantization** on every forward pass: 640 | 1. Model weights stored in 4-bit/8-bit 641 | 2. Weights dequantized to fp16 for computation 642 | 3. Computation happens in fp16 643 | 4. Extra overhead = slower inference 644 | 645 | **Recommendation:** Only use quantization if you truly need the memory savings! 646 | 647 | ### Automatic Dtype Switching 648 | 649 | The node automatically clears VRAM when you switch dtypes: 650 | 651 | ``` 652 | 🔄 Dtype changed from bfloat16 to 4bit 653 | Clearing cache to reload model... 654 | ``` 655 | 656 | This prevents dtype mismatch errors and ensures correct quantization. 657 | 658 |
659 | 660 |
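As a rough sanity check on the table above, a weights-only estimate for a 3B-parameter model explains most of the gap between dtypes; the remainder is KV cache, activations, and CUDA overhead:

```python
# Weights-only VRAM estimate for a 3B-parameter model. Real usage is higher
# (KV cache, activations, CUDA context), which is consistent with the table.
params = 3e9
for name, bytes_per_param in [("4-bit", 0.5), ("8-bit", 1.0),
                              ("fp16/bf16", 2.0), ("fp32", 4.0)]:
    print(f"{name:10s} ≈ {params * bytes_per_param / 1024**3:.1f} GB of weights")
```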
661 | Console Progress Output 662 | 663 | Real-time generation statistics in the console: 664 | 665 | ``` 666 | 🎲 Seed: 1337 667 | 🎵 Generating speech (max 2000 tokens)... 668 | Tokens: 500/2000 | Speed: 12.45 it/s | Elapsed: 40.2s 669 | ✅ Generated 1500 tokens in 120.34s (12.47 it/s) 670 | ``` 671 | 672 | **it/s** = iterations per second (tokens/second) 673 | 674 |
675 | 676 | --- 677 | 678 | ## 🐛 Troubleshooting 679 | 680 |
681 | Node Shows as Black Box (JavaScript Issues) 682 | 683 | **Issue:** Maya1 TTS (AIO) node appears completely black with no widgets visible. 684 | 685 | **Quick Fix:** 686 | Use **Maya1 TTS (AIO) Barebones** instead! 687 | - Same functionality, standard ComfyUI widgets only 688 | - No custom JavaScript required 689 | - Find it under: Add Node → audio → Maya1 TTS (AIO) Barebones 690 | 691 | **Debugging Steps:** 692 | 1. Open browser DevTools (F12) → Console tab 693 | 2. Look for JavaScript errors mentioning "maya1" or "Unexpected token" 694 | 3. Try hard refresh: Ctrl+Shift+R (Windows/Linux) or Cmd+Shift+R (Mac) 695 | 4. Clear browser cache completely 696 | 5. Test in incognito/private window 697 | 6. Check if maya1_tts.js loads in Network tab (should be 200 status) 698 | 7. Disable browser extensions (ad blockers, script blockers) 699 | 8. Update ComfyUI to latest version 700 | 701 | **Note:** The barebones version is specifically designed for this issue! 702 | 703 |
704 | 705 |
706 | Model Not Found 707 | 708 | **Error:** `No valid Maya1 models found` 709 | 710 | **Solutions:** 711 | 1. Check model location: `ComfyUI/models/maya1-TTS/` 712 | 2. Download model (see Installation section) 713 | 3. Restart ComfyUI 714 | 4. Check console for model discovery messages 715 | 716 |
717 | 718 |
719 | Out of Memory (OOM) 720 | 721 | **Error:** `CUDA out of memory` 722 | 723 | **Memory requirements:** 724 | - 4-bit: ~6GB VRAM (slower) 725 | - 8-bit: ~7GB VRAM (slower) 726 | - float16/bfloat16: ~8-9GB VRAM (fast, recommended) 727 | - float32: ~16GB VRAM 728 | 729 | **Solutions (try in order):** 730 | 1. Use **4-bit** dtype if you have ≤8GB VRAM (~6GB usage) 731 | 2. Use **8-bit** dtype if you have ~8-10GB VRAM (~7GB usage) 732 | 3. Use **float16** if you have 10GB+ VRAM (faster than quantization!) 733 | 4. Enable `keep_model_in_vram=False` to free VRAM after generation 734 | 5. Reduce `max_tokens` to 1000-1500 735 | 6. Close other VRAM-heavy applications 736 | 7. Use CPU (much slower but works) 737 | 738 | **Note:** If you have 10GB+ VRAM, use float16/bfloat16 for best speed! 739 | 740 |
741 | 742 |
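To choose between quantization and fp16 up front, you can check free VRAM with standard PyTorch calls:

```python
# Report free/total VRAM so you can pick a dtype before loading the model.
import torch

if torch.cuda.is_available():
    free, total = torch.cuda.mem_get_info()
    print(f"Free VRAM: {free / 1024**3:.1f} GB of {total / 1024**3:.1f} GB")
else:
    print("No CUDA device detected; CPU generation works but is much slower.")
```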
743 | Quantization Errors 744 | 745 | **Error:** `bitsandbytes not found` 746 | 747 | **Solution:** 748 | ```bash 749 | pip install "bitsandbytes>=0.41.0" 750 | ``` 751 | 752 | **Error:** `Quantization requires CUDA` 753 | 754 | **Solution:** 755 | - 4-bit/8-bit only work on CUDA 756 | - Switch to `float16`/`bfloat16` for CPU 757 | 758 |
759 | 760 |
761 | No Audio Generated 762 | 763 | **Error:** `No SNAC audio tokens generated!` 764 | 765 | **Solutions:** 766 | 1. Increase `max_tokens` to 2000-4000 767 | 2. Adjust `temperature` to 0.3-0.5 768 | 3. Simplify voice description 769 | 4. Check text isn't too long 770 | 5. Try different seed value 771 | 772 |
773 | 774 |
775 | Flash Attention Installation Failed 776 | 777 | **Error:** `flash-attn` won't install 778 | 779 | **Solution:** 780 | - Flash Attention requires CUDA and specific setup 781 | - Just use **SDPA** instead (works great, actually faster for TTS!) 782 | - SDPA is the recommended default 783 | 784 |
785 | 786 |
787 | Info Button Not Visible 788 | 789 | **Issue:** Can't see the "?" or "i" icon, only hover tooltip 790 | 791 | **Answer:** This is **normal** and working correctly! 792 | 793 | - ComfyUI's `DESCRIPTION` creates a hover tooltip 794 | - Some ComfyUI versions show no visible icon 795 | - Just hover over the node title area to see help 796 | - Contains all emotion tags and usage examples 797 | 798 |
799 | 800 | --- 801 | 802 | ## 📊 Performance Tips 803 | 804 | 1. **Use float16/bfloat16** if you have 10GB+ VRAM (fastest!) 805 | 2. **Use quantization (4-bit/8-bit)** ONLY if limited VRAM (<10GB) - slower but fits in memory 806 | 3. **Keep SDPA** as attention mechanism (fastest for single TTS) 807 | 4. **Enable model caching** (`keep_model_in_vram=True`) for multiple generations 808 | 5. **Optimize max_tokens**: Start with 1500-2000 809 | 6. **Batch similar requests** with same voice description for efficiency 810 | 811 | ⚠️ **Speed ranking:** float16/bfloat16 (fastest) > float32 > 8-bit > 4-bit (slowest) 812 | 813 | --- 814 | 815 | ## 🏗️ Technical Details 816 | 817 |
818 | Architecture 819 | 820 | - **Model**: 3B-parameter Llama-based transformer 821 | - **Audio Codec**: SNAC (Multi-Scale Neural Audio Codec) 822 | - **Sample Rate**: 24kHz mono 823 | - **Frame Structure**: 7 tokens per frame (3 hierarchical levels) 824 | - **Token Ranges**: 825 |   - SNAC tokens: 128266-156937 826 |   - Text EOS: 128009 827 |   - SNAC EOS: 128258 828 | - **Compression**: ~0.98 kbps streaming 829 | 830 |
831 | 832 |
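To make the frame structure concrete, the sketch below groups generated token ids into 7-token SNAC frames using the ranges listed above. It is illustrative only; the actual unframing lives in `core/snac_decoder.py` and may differ in detail:

```python
# Illustrative grouping of generated ids into 7-token SNAC frames.
# Constants come from the token ranges above; the real decoder may differ.
SNAC_TOKEN_START = 128266   # first SNAC audio token id
TOKENS_PER_FRAME = 7        # 3 hierarchical SNAC levels per frame

def group_frames(token_ids):
    audio = [t - SNAC_TOKEN_START for t in token_ids if t >= SNAC_TOKEN_START]
    usable = len(audio) - (len(audio) % TOKENS_PER_FRAME)
    return [audio[i:i + TOKENS_PER_FRAME] for i in range(0, usable, TOKENS_PER_FRAME)]

print(len(group_frames(range(128266, 128266 + 70))))  # -> 10 frames
```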
833 | File Structure 834 | 835 | ``` 836 | ComfyUI-Maya1_TTS/ 837 | ├── __init__.py # Node registration 838 | ├── nodes/ 839 | │ ├── __init__.py 840 | │ └── maya1_tts_combined.py # AIO node (backend) 841 | ├── js/ 842 | │ ├── maya1_tts.js # Custom canvas UI (1800+ lines) 843 | │ └── config.js # UI config (presets, emotions, tooltips) 844 | ├── core/ 845 | │ ├── model_wrapper.py # Model loading & quantization 846 | │ ├── snac_decoder.py # SNAC audio decoding 847 | │ └── utils.py # Utilities & cancel support 848 | ├── resources/ 849 | │ ├── emotions.txt # 16 emotion tags 850 | │ └── prompt_examples.txt # Voice description examples 851 | ├── pyproject.toml # Package metadata 852 | ├── requirements.txt # Dependencies 853 | └── README.md # This file 854 | ``` 855 | 856 |
857 | 858 |
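For reference, node registration in `__init__.py` typically follows the standard ComfyUI pattern below. The mapping keys and display strings here are illustrative, not the pack's exact values:

```python
# Typical ComfyUI registration pattern for __init__.py (illustrative names).
from .nodes.maya1_tts_combined import Maya1TTSCombinedNode

NODE_CLASS_MAPPINGS = {"Maya1TTS_AIO": Maya1TTSCombinedNode}
NODE_DISPLAY_NAME_MAPPINGS = {"Maya1TTS_AIO": "Maya1 TTS (AIO)"}
WEB_DIRECTORY = "./js"  # serves js/maya1_tts.js and js/config.js

__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS", "WEB_DIRECTORY"]
```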
859 | ComfyUI Integration 860 | 861 | - **Custom Canvas UI**: Full JavaScript UI with LiteGraph.js canvas API 862 | - **Cancel Support**: Native `execution.interruption_requested()` 863 | - **Progress Bars**: `comfy.utils.ProgressBar` 864 | - **Audio Format**: ComfyUI AUDIO type (24kHz mono) 865 | - **Model Caching**: Automatic with dtype change detection 866 | - **VRAM Management**: Manual control via toggle 867 | - **Event Handling**: Document-level keyboard/mouse capture for proper text editing 868 | - **Visual Feedback**: Real-time tooltips, animations, and hover states 869 | 870 |
871 | 872 | 873 | 874 | 875 | --- 876 | 877 | ## 📝 Credits 878 | 879 | - **Maya1 Model**: [Maya Research](https://www.mayaresearch.ai/) 880 | - **HuggingFace**: [maya-research/maya1](https://huggingface.co/maya-research/maya1) 881 | - **SNAC Codec**: [hubertsiuzdak/snac](https://github.com/hubertsiuzdak/snac) 882 | - **ComfyUI**: [comfyanonymous/ComfyUI](https://github.com/comfyanonymous/ComfyUI) 883 | 884 | --- 885 | 886 | ## 📄 License 887 | 888 | Apache 2.0 - See [LICENSE](LICENSE) 889 | 890 | Maya1 model is also licensed under Apache 2.0 by Maya Research. 891 | 892 | --- 893 | 894 | ## 🔗 Links 895 | 896 | - **Issues**: [GitHub Issues](https://github.com/Saganaki22/-ComfyUI-Maya1_TTS/issues) 897 | - **Maya Research**: [Website](https://www.mayaresearch.ai/) | [Twitter](https://twitter.com/mayaresearch_ai) 898 | - **Model Page**: [HuggingFace](https://huggingface.co/maya-research/maya1) 899 | 900 | --- 901 | 902 | ## 📖 Citation 903 | 904 | If you use Maya1 in your research, please cite: 905 | 906 | ```bibtex 907 | @misc{maya1voice2025, 908 | title={Maya1: Open Source Voice AI with Emotional Intelligence}, 909 | author={Maya Research}, 910 | year={2025}, 911 | publisher={Hugging Face}, 912 | howpublished={\url{https://huggingface.co/maya-research/maya1}}, 913 | } 914 | ``` 915 | 916 | --- 917 | 918 | *Bringing expressive voice AI to everyone through open source.* 919 | -------------------------------------------------------------------------------- /nodes/maya1_tts_combined.py: -------------------------------------------------------------------------------- 1 | """ 2 | Maya1 TTS Combined Node for ComfyUI. 3 | All-in-one node with model loading and TTS generation. 4 | """ 5 | 6 | import torch 7 | import numpy as np 8 | import random 9 | import re 10 | import gc 11 | from typing import Tuple, List 12 | import comfy.model_management as mm 13 | 14 | from ..core import ( 15 | Maya1ModelLoader, 16 | SNACDecoder, 17 | discover_maya1_models, 18 | get_model_path, 19 | get_maya1_models_dir, 20 | format_prompt, 21 | load_emotions_list, 22 | crossfade_audio 23 | ) 24 | 25 | 26 | def create_progress_bar(current: int, total: int, width: int = 12, show_numbers: bool = True) -> str: 27 | """ 28 | Create a visual progress bar like ComfyUI's native one. 29 | 30 | Args: 31 | current: Current progress value 32 | total: Total value 33 | width: Width of the progress bar in characters 34 | show_numbers: Whether to show the numbers after the bar 35 | 36 | Returns: 37 | Formatted progress bar string 38 | """ 39 | if total == 0: 40 | percent = 0 41 | else: 42 | percent = min(current / total, 1.0) 43 | 44 | filled = int(width * percent) 45 | empty = width - filled 46 | 47 | bar = '█' * filled + '░' * empty 48 | 49 | if show_numbers: 50 | return f"[{bar}] {current}/{total}" 51 | else: 52 | return f"[{bar}]" 53 | 54 | def split_text_smartly(text: str, max_words_per_chunk: int = 100) -> List[str]: 55 | """ 56 | Split text into chunks at sentence boundaries, keeping emotion tags intact. 57 | Improved to NEVER cut words mid-sentence. 58 | 59 | Args: 60 | text: Input text to split 61 | max_words_per_chunk: Maximum words per chunk (default 100) 62 | 63 | Returns: 64 | List of text chunks 65 | """ 66 | # Better sentence boundary detection that handles emotion tags 67 | # Split on: . ! ? 
followed by whitespace (and optionally capital letter or end of string) 68 | # This regex keeps the punctuation with the sentence 69 | sentence_pattern = r'(?<=[.!?])\s+(?=[A-Z<]|$)' 70 | sentences = re.split(sentence_pattern, text.strip()) 71 | 72 | # Clean up empty sentences 73 | sentences = [s.strip() for s in sentences if s.strip()] 74 | 75 | # Group sentences into chunks 76 | chunks = [] 77 | current_chunk = [] 78 | current_word_count = 0 79 | 80 | for sentence in sentences: 81 | # Count words (emotion tags don't count as words) 82 | # Remove emotion tags temporarily for word count 83 | text_without_tags = re.sub(r'<[^>]+>', '', sentence) 84 | word_count = len(text_without_tags.split()) 85 | 86 | # If single sentence exceeds max, split on commas or semicolons 87 | if word_count > max_words_per_chunk: 88 | # Split long sentence on commas, keeping punctuation 89 | parts = re.split(r'(,\s+|;\s+)', sentence) 90 | 91 | for i, part in enumerate(parts): 92 | if not part.strip(): 93 | continue 94 | 95 | # For delimiters (commas/semicolons), add to previous chunk 96 | if part.strip() in [',', ';']: 97 | if current_chunk: 98 | current_chunk[-1] += part 99 | continue 100 | 101 | # Count words in this part 102 | part_text = re.sub(r'<[^>]+>', '', part) 103 | part_words = len(part_text.split()) 104 | 105 | if current_word_count + part_words > max_words_per_chunk and current_chunk: 106 | # Start new chunk 107 | chunks.append(''.join(current_chunk)) 108 | current_chunk = [part] 109 | current_word_count = part_words 110 | else: 111 | # Add to current chunk 112 | if current_chunk and not current_chunk[-1].endswith((' ', ',', ';')): 113 | current_chunk.append(' ') 114 | current_chunk.append(part) 115 | current_word_count += part_words 116 | else: 117 | # Normal sentence handling 118 | if current_word_count + word_count > max_words_per_chunk and current_chunk: 119 | # Save current chunk and start new one 120 | chunks.append(''.join(current_chunk)) 121 | current_chunk = [sentence] 122 | current_word_count = word_count 123 | else: 124 | # Add to current chunk with space 125 | if current_chunk: 126 | current_chunk.append(' ') 127 | current_chunk.append(sentence) 128 | current_word_count += word_count 129 | 130 | # Add remaining chunk 131 | if current_chunk: 132 | chunks.append(''.join(current_chunk)) 133 | 134 | return chunks if chunks else [text] 135 | 136 | 137 | class Maya1TTSCombinedNode: 138 | """ 139 | Combined Maya1 TTS node - loads model and generates speech in one node. 
140 | 141 | Features: 142 | - Model loading with caching 143 | - Voice design through natural language 144 | - 20+ emotion tags with clickable buttons 145 | - Native ComfyUI cancel support 146 | - Real-time progress tracking 147 | - VRAM management 148 | """ 149 | 150 | DESCRIPTION = "" 151 | 152 | @classmethod 153 | def INPUT_TYPES(cls): 154 | """Define input parameters for the node.""" 155 | return { 156 | "required": { 157 | # Model settings 158 | "model_name": (discover_maya1_models(), { 159 | "default": discover_maya1_models()[0] if discover_maya1_models() else None 160 | }), 161 | "dtype": (["4bit (BNB)", "8bit (BNB)", "float16", "bfloat16", "float32"], { 162 | "default": "bfloat16" 163 | }), 164 | "attention_mechanism": (["sdpa", "eager", "flash_attention_2", "sage_attention"], { 165 | "default": "sdpa" 166 | }), 167 | "device": (["cuda", "cpu"], { 168 | "default": "cuda" 169 | }), 170 | 171 | # Voice and text 172 | "voice_description": ("STRING", { 173 | "multiline": True, 174 | "default": "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.", 175 | "dynamicPrompts": False 176 | }), 177 | "text": ("STRING", { 178 | "multiline": True, 179 | "default": "Hello! This is Maya1 the best open source voice AI model with emotions.", 180 | "dynamicPrompts": False 181 | }), 182 | 183 | # Generation settings 184 | "keep_model_in_vram": ("BOOLEAN", { 185 | "default": True 186 | }), 187 | "temperature": ("FLOAT", { 188 | "default": 0.4, # Official Maya1 recommendation (from transformers_inference.py) 189 | "min": 0.1, 190 | "max": 2.0, 191 | "step": 0.05 192 | }), 193 | "top_p": ("FLOAT", { 194 | "default": 0.9, 195 | "min": 0.1, 196 | "max": 1.0, 197 | "step": 0.05 198 | }), 199 | "max_new_tokens": ("INT", { 200 | "default": 4000, 201 | "min": 100, 202 | "max": 16000, 203 | "step": 100, 204 | "tooltip": "Maximum NEW SNAC tokens to generate per chunk (excludes input prompt tokens). Higher = longer audio per chunk (~50 tokens/word). 4000 tokens ≈ 30-40s audio" 205 | }), 206 | "repetition_penalty": ("FLOAT", { 207 | "default": 1.1, 208 | "min": 1.0, 209 | "max": 2.0, 210 | "step": 0.05 211 | }), 212 | "seed": ("INT", { 213 | "default": 0, 214 | "min": 0, 215 | "max": 0xffffffffffffffff 216 | }), 217 | "chunk_longform": ("BOOLEAN", { 218 | "default": False, 219 | "tooltip": "Split long text into chunks at sentence boundaries with smooth crossfading. Enables unlimited audio length beyond the 18-20s limit" 220 | }), 221 | } 222 | } 223 | 224 | RETURN_TYPES = ("AUDIO",) 225 | RETURN_NAMES = ("audio",) 226 | FUNCTION = "generate_speech" 227 | CATEGORY = "audio/maya1" 228 | 229 | def cleanup_vram(self): 230 | """ 231 | Native ComfyUI VRAM cleanup - unloads all models and clears cache. 232 | Follows best practices from ComfyUI's memory management system. 
233 | """ 234 | print("🗑️ Cleaning up VRAM...") 235 | 236 | # Step 1: Unload all models from VRAM 237 | mm.unload_all_models() 238 | 239 | # Step 2: Clear ComfyUI's internal cache 240 | mm.soft_empty_cache() 241 | 242 | # Step 3: Python garbage collection 243 | gc.collect() 244 | 245 | # Step 4: Clear CUDA caches (if available) 246 | if torch.cuda.is_available(): 247 | torch.cuda.empty_cache() 248 | torch.cuda.ipc_collect() 249 | 250 | print("✅ VRAM cleanup complete") 251 | 252 | def generate_speech( 253 | self, 254 | model_name: str, 255 | dtype: str, 256 | attention_mechanism: str, 257 | device: str, 258 | voice_description: str, 259 | text: str, 260 | keep_model_in_vram: bool, 261 | temperature: float, 262 | top_p: float, 263 | max_new_tokens: int, 264 | repetition_penalty: float, 265 | seed: int, 266 | chunk_longform: bool, 267 | emotion_tag_insert: str = "(none)", 268 | chunk_index: int = None, 269 | total_chunks: int = None 270 | ) -> Tuple[dict]: 271 | """ 272 | Load model (if needed) and generate expressive speech. 273 | 274 | Returns: 275 | Tuple containing audio dictionary for ComfyUI 276 | """ 277 | # Import ComfyUI utilities for progress and cancellation 278 | import comfy.utils 279 | import comfy.model_management as mm 280 | 281 | # Check for cancellation before starting 282 | mm.throw_exception_if_processing_interrupted() 283 | 284 | # Simple seed logic: if seed is 0, randomize; otherwise use the provided seed 285 | # This way seed=0 is always random, and you can set a specific seed for reproducibility 286 | if seed == 0: 287 | actual_seed = random.randint(1, 0xffffffffffffffff) 288 | else: 289 | actual_seed = seed 290 | 291 | print("=" * 70) 292 | print("🎤 Maya1 TTS Generation") 293 | print("=" * 70) 294 | print(f"🎲 Seed: {actual_seed}") 295 | print(f"💾 VRAM setting: {'Keep in VRAM' if keep_model_in_vram else 'Offload after generation'}") 296 | 297 | # ========== MODEL LOADING ========== 298 | # Get the expected models directory 299 | models_dir = get_maya1_models_dir() 300 | 301 | # Validate model name 302 | if model_name.startswith("(No"): 303 | raise ValueError( 304 | f"No valid Maya1 models found!\n\n" 305 | f"Expected location: {models_dir}\n\n" 306 | f"Please download a model:\n" 307 | f" 1. Create models directory:\n" 308 | f" mkdir -p {models_dir}\n\n" 309 | f" 2. Download Maya1:\n" 310 | f" huggingface-cli download maya-research/maya1 \\\n" 311 | f" --local-dir {models_dir}/maya1\n\n" 312 | f" 3. Restart ComfyUI to refresh the dropdown." 
313 | ) 314 | 315 | # Get full model path 316 | model_path = get_model_path(model_name) 317 | 318 | if not model_path.exists(): 319 | raise FileNotFoundError( 320 | f"Model not found: {model_path}\n\n" 321 | f"Make sure the model is properly downloaded to:\n" 322 | f" {model_path}" 323 | ) 324 | 325 | # Check device availability 326 | if device == "cuda" and not torch.cuda.is_available(): 327 | print("⚠️ CUDA not available, falling back to CPU") 328 | device = "cpu" 329 | 330 | # ========== MODEL LOADING ========== 331 | print(f"🔍 Validating model files in: {model_path}") 332 | 333 | critical_files = { 334 | "config.json": model_path / "config.json", 335 | "generation_config.json": model_path / "generation_config.json", 336 | "tokenizer_config.json": model_path / "tokenizer" / "tokenizer_config.json", 337 | "tokenizer.json": model_path / "tokenizer" / "tokenizer.json", 338 | "model weights": model_path / "model-00001-of-00002.safetensors", 339 | } 340 | 341 | missing_files = [] 342 | for file_name, file_path in critical_files.items(): 343 | if file_path.exists(): 344 | print(f" ✅ {file_name}") 345 | else: 346 | print(f" ❌ {file_name} - MISSING!") 347 | missing_files.append(file_name) 348 | 349 | if missing_files: 350 | raise FileNotFoundError( 351 | f"Missing critical model files: {', '.join(missing_files)}\n\n" 352 | f"Model directory: {model_path}\n\n" 353 | f"Please re-download the complete model:\n" 354 | f" huggingface-cli download maya-research/maya1 \\\n" 355 | f" --local-dir {model_path}" 356 | ) 357 | 358 | # Strip "(BNB)" suffix from dtype labels if present 359 | dtype_clean = dtype.replace(" (BNB)", "") 360 | 361 | # Load model using the wrapper (with caching) 362 | try: 363 | maya1_model = Maya1ModelLoader.load_model( 364 | model_path=model_path, 365 | attention_type=attention_mechanism, 366 | dtype=dtype_clean, 367 | device=device 368 | ) 369 | except Exception as e: 370 | raise RuntimeError( 371 | f"Failed to load Maya1 model:\n{str(e)}\n\n" 372 | f"Model: {model_name}\n" 373 | f"Attention: {attention_mechanism}\n" 374 | f"Dtype: {dtype_clean}\n" 375 | f"Device: {device}" 376 | ) 377 | 378 | mm.throw_exception_if_processing_interrupted() 379 | 380 | # ========== SPEECH GENERATION ========== 381 | print(f"Keep in VRAM: {keep_model_in_vram}") 382 | print(f"Voice: {voice_description[:60]}...") 383 | print(f"Text: {text[:60]}...") 384 | print(f"Temperature: {temperature}, Top-p: {top_p}") 385 | print(f"Max tokens: {max_new_tokens}") 386 | print("=" * 70) 387 | 388 | # ========== LONGFORM CHUNKING ========== 389 | # Check if text should be chunked (enabled + text is reasonably long) 390 | word_count = len(text.split()) 391 | if chunk_longform and word_count > 80: # Only chunk if >80 words 392 | print(f"📚 Longform mode enabled: {word_count} words detected") 393 | print(f"🔪 Splitting text into chunks at sentence boundaries...") 394 | 395 | # Calculate words per chunk based on max_new_tokens 396 | # Empirical data: 1 word ≈ 50-55 SNAC tokens 397 | # Leave some headroom (80%) to avoid exceeding max_new_tokens 398 | estimated_words_per_chunk = int((max_new_tokens * 0.8) / 50) 399 | estimated_words_per_chunk = max(50, min(estimated_words_per_chunk, 300)) # Clamp between 50-300 400 | 401 | print(f"📏 Max tokens: {max_new_tokens} → ~{estimated_words_per_chunk} words per chunk (~{estimated_words_per_chunk / 150:.1f}min per chunk)") 402 | 403 | text_chunks = split_text_smartly(text, max_words_per_chunk=estimated_words_per_chunk) 404 | print(f"📦 Split into {len(text_chunks)} chunks") 405 | 
print("=" * 70) 406 | 407 | # Create outer progress bar for chunks (layered progress) 408 | import comfy.utils 409 | chunk_progress = comfy.utils.ProgressBar(len(text_chunks)) 410 | 411 | all_audio_data = [] 412 | sample_rate = None 413 | 414 | for i, chunk_text in enumerate(text_chunks): 415 | # Create visual progress display for chunks 416 | chunk_bar = create_progress_bar(i + 1, len(text_chunks), width=6) 417 | print(f"\n🎤 Chunk Progress: {chunk_bar}") 418 | print(f"📝 Text: {chunk_text[:60]}...") 419 | print("=" * 70) 420 | 421 | # Check for cancellation before each chunk 422 | mm.throw_exception_if_processing_interrupted() 423 | 424 | # Recursively call generate_speech for this chunk with chunk_longform=False 425 | # to avoid infinite recursion 426 | chunk_audio = self.generate_speech( 427 | model_name=model_name, 428 | dtype=dtype, 429 | attention_mechanism=attention_mechanism, 430 | device=device, 431 | voice_description=voice_description, 432 | text=chunk_text, 433 | keep_model_in_vram=True, # Keep in VRAM between chunks 434 | temperature=temperature, 435 | top_p=top_p, 436 | max_new_tokens=max_new_tokens, 437 | repetition_penalty=repetition_penalty, 438 | seed=actual_seed, # Use same seed for all chunks 439 | chunk_longform=False, # Disable chunking for recursive calls 440 | emotion_tag_insert=emotion_tag_insert, 441 | chunk_index=i + 1, # Pass chunk context for layered progress 442 | total_chunks=len(text_chunks) 443 | ) 444 | 445 | # Extract audio data (returns tuple, get first element) 446 | chunk_audio_dict = chunk_audio[0] 447 | chunk_waveform = chunk_audio_dict["waveform"] 448 | sample_rate = chunk_audio_dict["sample_rate"] 449 | 450 | # Update chunk progress (outer progress bar) 451 | chunk_progress.update(1) 452 | all_audio_data.append(chunk_waveform) 453 | 454 | mm.throw_exception_if_processing_interrupted() 455 | 456 | print(f"\n{'=' * 70}") 457 | print(f"🔗 Combining {len(all_audio_data)} audio chunks with crossfading...") 458 | 459 | # Combine audio chunks with crossfading for smooth transitions 460 | # Crossfade duration: 50ms = 1200 samples at 24kHz 461 | combined_waveform_np = all_audio_data[0] 462 | 463 | for i in range(1, len(all_audio_data)): 464 | # Crossfade between chunks (1200 samples = 50ms at 24kHz) 465 | combined_waveform_np = crossfade_audio( 466 | combined_waveform_np, 467 | all_audio_data[i], 468 | crossfade_samples=1200 469 | ) 470 | 471 | # Ensure it's a torch tensor 472 | if not isinstance(combined_waveform_np, torch.Tensor): 473 | combined_waveform = torch.from_numpy(combined_waveform_np) 474 | else: 475 | combined_waveform = combined_waveform_np 476 | 477 | print(f"✅ Generated {combined_waveform.shape[-1] / sample_rate:.2f}s of audio from {len(text_chunks)} chunks") 478 | print("=" * 70) 479 | 480 | # Handle VRAM cleanup if requested 481 | if not keep_model_in_vram: 482 | print("🗑️ Offloading model from VRAM...") 483 | Maya1ModelLoader.clear_cache(force=True) 484 | print("✅ Model offloaded from VRAM") 485 | 486 | return ({ 487 | "waveform": combined_waveform, 488 | "sample_rate": sample_rate 489 | },) 490 | 491 | # ========== SINGLE GENERATION (NO CHUNKING) ========== 492 | # Set seed for reproducibility 493 | torch.manual_seed(actual_seed) 494 | if torch.cuda.is_available(): 495 | torch.cuda.manual_seed_all(actual_seed) 496 | 497 | # Format prompt using Maya1's OFFICIAL format (from transformers_inference.py) 498 | print("🔤 Formatting prompt with control tokens...") 499 | 500 | # Official Maya1 control token IDs 501 | SOH_ID = 128259 # Start of 
Header 502 | EOH_ID = 128260 # End of Header 503 | SOA_ID = 128261 # Start of Audio 504 | CODE_START_TOKEN_ID = 128257 # Start of Speech codes 505 | TEXT_EOT_ID = 128009 # End of Text 506 | 507 | # Decode control tokens 508 | soh_token = maya1_model.tokenizer.decode([SOH_ID]) 509 | eoh_token = maya1_model.tokenizer.decode([EOH_ID]) 510 | soa_token = maya1_model.tokenizer.decode([SOA_ID]) 511 | sos_token = maya1_model.tokenizer.decode([CODE_START_TOKEN_ID]) 512 | eot_token = maya1_model.tokenizer.decode([TEXT_EOT_ID]) 513 | bos_token = maya1_model.tokenizer.bos_token 514 | 515 | # Build formatted text: voice description tag + script 516 | formatted_text = f'<description="{voice_description}"> {text}' 517 | 518 | # Construct full prompt with all control tokens (CRITICAL for avoiding garbling!) 519 | prompt = ( 520 | soh_token + bos_token + formatted_text + eot_token + 521 | eoh_token + soa_token + sos_token 522 | ) 523 | 524 | # Debug: Print formatted prompt 525 | print(f"📝 Formatted text: {formatted_text[:100]}...") 526 | print(f"📝 Full prompt preview (first 200 chars): {repr(prompt[:200])}...") 527 | 528 | # Tokenize input 529 | inputs = maya1_model.tokenizer( 530 | prompt, 531 | return_tensors="pt" 532 | ) 533 | print(f"📊 Input token count: {inputs['input_ids'].shape[1]}") 534 | 535 | # Move to device 536 | inputs = {k: v.to(device) for k, v in inputs.items()} 537 | 538 | # Check for cancellation 539 | mm.throw_exception_if_processing_interrupted() 540 | 541 | # Generate with progress tracking and cancellation checks 542 | print(f"🎵 Generating speech (max {max_new_tokens} tokens)...") 543 | 544 | try: 545 | # Setup progress tracking 546 | from comfy.utils import ProgressBar 547 | progress_bar = ProgressBar(max_new_tokens) 548 | 549 | # Create stopping criteria for cancellation support 550 | from transformers import StoppingCriteria, StoppingCriteriaList 551 | 552 | class InterruptionStoppingCriteria(StoppingCriteria): 553 | """Custom stopping criteria that checks for ComfyUI cancellation.""" 554 | def __init__(self, progress_bar, chunk_index=None, total_chunks=None): 555 | self.progress_bar = progress_bar 556 | self.current_tokens = 0 557 | self.input_length = 0 558 | self.start_time = None 559 | self.last_print_time = None 560 | self.print_interval = 0.5 # Print progress every 0.5 seconds 561 | self.chunk_index = chunk_index 562 | self.total_chunks = total_chunks 563 | 564 | def __call__(self, input_ids, scores, **kwargs): 565 | import time 566 | 567 | # Store input length and start time on first call 568 | if self.input_length == 0: 569 | self.input_length = input_ids.shape[1] 570 | self.start_time = time.time() 571 | self.last_print_time = self.start_time 572 | 573 | # Update progress 574 | new_tokens = input_ids.shape[1] - self.input_length 575 | if new_tokens > self.current_tokens: 576 | self.progress_bar.update(new_tokens - self.current_tokens) 577 | self.current_tokens = new_tokens 578 | 579 | # Print progress with visual bar and it/s to console 580 | current_time = time.time() 581 | if current_time - self.last_print_time >= self.print_interval: 582 | elapsed = current_time - self.start_time 583 | it_per_sec = new_tokens / elapsed if elapsed > 0 else 0 584 | 585 | # Create visual progress bar for tokens 586 | token_bar = create_progress_bar(new_tokens, max_new_tokens, width=12) 587 | 588 | # Show layered progress if in chunked mode 589 | if self.chunk_index is not None and self.total_chunks is not None: 590 | chunk_bar = create_progress_bar(self.chunk_index, self.total_chunks, width=6, show_numbers=False) 591 | print(f" Chunk {chunk_bar} → Token
Progress: {token_bar} | Speed: {it_per_sec:.2f} it/s", end='\r') 592 | else: 593 | print(f" Progress: {token_bar} | Speed: {it_per_sec:.2f} it/s | Elapsed: {elapsed:.1f}s", end='\r') 594 | 595 | self.last_print_time = current_time 596 | 597 | # Check for cancellation using ComfyUI's native method 598 | try: 599 | mm.throw_exception_if_processing_interrupted() 600 | except: 601 | # If interrupted, stop generation gracefully 602 | print("\n🛑 Generation cancelled by user") 603 | return True # Stop generation 604 | 605 | return False # Continue generation 606 | 607 | stopping_criteria = StoppingCriteriaList([ 608 | InterruptionStoppingCriteria(progress_bar, chunk_index=chunk_index, total_chunks=total_chunks) 609 | ]) 610 | 611 | # Generate tokens with cancellation support 612 | # CRITICAL: Maya1 has TWO EOS tokens in generation_config.json: 613 | # - 128009 (<|eot_id|>) - Text completion token 614 | # - 128258 - SNAC audio completion token 615 | # We need to ONLY stop on 128258 (SNAC done), not 128009 (text done) 616 | # Otherwise the model generates text, hits 128009, and stops before SNAC codes! 617 | 618 | print("🎵 Generation settings:") 619 | print(f" Using EOS token: 128258 (SNAC completion only)") 620 | print(f" Ignoring EOS token: 128009 (text completion)") 621 | 622 | import time 623 | generation_start = time.time() 624 | 625 | with torch.inference_mode(): 626 | outputs = maya1_model.model.generate( 627 | **inputs, 628 | max_new_tokens=max_new_tokens, 629 | min_new_tokens=28, # At least 4 SNAC frames (4 frames × 7 tokens = 28) 630 | temperature=temperature, 631 | top_p=top_p, 632 | do_sample=True, 633 | repetition_penalty=repetition_penalty, 634 | pad_token_id=maya1_model.tokenizer.pad_token_id, 635 | eos_token_id=128258, # CODE_END_TOKEN_ID - Stop at end of speech 636 | stopping_criteria=stopping_criteria, 637 | use_cache=True, # Enable KV cache for faster generation 638 | ) 639 | 640 | generation_time = time.time() - generation_start 641 | 642 | # Check for cancellation after generation 643 | mm.throw_exception_if_processing_interrupted() 644 | 645 | # Extract generated tokens (remove input tokens) 646 | generated_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist() 647 | 648 | # Print final generation statistics 649 | final_speed = len(generated_ids) / generation_time if generation_time > 0 else 0 650 | print(f"\n✅ Generated {len(generated_ids)} tokens in {generation_time:.2f}s ({final_speed:.2f} it/s)") 651 | 652 | # Debug: Print first few generated token IDs 653 | print(f"🔍 First 10 generated token IDs: {generated_ids[:10]}") 654 | 655 | # Debug: Decode generated tokens to see what was generated 656 | generated_text = maya1_model.tokenizer.decode(generated_ids, skip_special_tokens=False) 657 | print(f"🔍 Generated text (first 100 chars): {generated_text[:100]}...") 658 | 659 | # Filter SNAC tokens 660 | from ..core.snac_decoder import filter_snac_tokens 661 | snac_tokens = filter_snac_tokens(generated_ids) 662 | 663 | if len(snac_tokens) == 0: 664 | raise ValueError( 665 | "No SNAC audio tokens generated!\n" 666 | "The model may have only generated text tokens.\n" 667 | "Try adjusting the prompt or generation parameters." 
668 | ) 669 | 670 | print(f"🎵 Found {len(snac_tokens)} SNAC tokens ({len(snac_tokens) // 7} frames)") 671 | 672 | # Check for cancellation before decoding 673 | mm.throw_exception_if_processing_interrupted() 674 | 675 | # Decode SNAC tokens to audio 676 | print("🔊 Decoding to audio...") 677 | audio_waveform = SNACDecoder.decode(snac_tokens, device=device) 678 | 679 | # Check for cancellation after decoding 680 | mm.throw_exception_if_processing_interrupted() 681 | 682 | # Convert to ComfyUI audio format 683 | audio_tensor = torch.from_numpy(audio_waveform).float() 684 | 685 | # Add batch and channel dimensions: [samples] -> [1, 1, samples] 686 | if audio_tensor.dim() == 1: 687 | audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(0) 688 | elif audio_tensor.dim() == 2: 689 | audio_tensor = audio_tensor.unsqueeze(0) 690 | 691 | audio_output = { 692 | "waveform": audio_tensor, 693 | "sample_rate": 24000 694 | } 695 | 696 | print(f"✅ Generated {len(audio_waveform) / 24000:.2f}s of audio") 697 | print("=" * 70) 698 | 699 | # Handle VRAM management based on toggle 700 | if not keep_model_in_vram: 701 | print("🗑️ Offloading model from VRAM...") 702 | Maya1ModelLoader.clear_cache(force=True) 703 | print("✅ Model offloaded from VRAM") 704 | else: 705 | print("💾 Model kept in VRAM for faster next generation") 706 | 707 | return (audio_output,) 708 | 709 | except InterruptedError as e: 710 | # User cancelled the generation 711 | print(f"\n{str(e)}") 712 | print("=" * 70) 713 | # Note: VRAM cleanup handled by ComfyUI hooks 714 | raise 715 | 716 | except Exception as e: 717 | # Other errors 718 | print(f"\n❌ Generation failed: {str(e)}") 719 | print("=" * 70) 720 | # Note: VRAM cleanup handled by ComfyUI hooks 721 | raise 722 | 723 | 724 | # ComfyUI node mappings 725 | NODE_CLASS_MAPPINGS = { 726 | "Maya1TTS_Combined": Maya1TTSCombinedNode 727 | } 728 | 729 | NODE_DISPLAY_NAME_MAPPINGS = { 730 | "Maya1TTS_Combined": "Maya1 TTS (AIO)" 731 | } 732 | -------------------------------------------------------------------------------- /nodes/maya1_tts_barebones.py: -------------------------------------------------------------------------------- 1 | """ 2 | Maya1 TTS Barebones Node for ComfyUI. 3 | All-in-one node with standard ComfyUI widgets (no custom JavaScript UI). 4 | Use this version if you have issues with the custom JavaScript rendering. 5 | """ 6 | 7 | import torch 8 | import numpy as np 9 | import random 10 | import re 11 | import gc 12 | from typing import Tuple, List 13 | import comfy.model_management as mm 14 | 15 | from ..core import ( 16 | Maya1ModelLoader, 17 | SNACDecoder, 18 | discover_maya1_models, 19 | get_model_path, 20 | get_maya1_models_dir, 21 | format_prompt, 22 | check_interruption, 23 | load_emotions_list, 24 | crossfade_audio 25 | ) 26 | 27 | 28 | def create_progress_bar(current: int, total: int, width: int = 12, show_numbers: bool = True) -> str: 29 | """ 30 | Create a visual progress bar like ComfyUI's native one. 
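    For example (illustrative, matching the logic below): create_progress_bar(3, 10, width=12) returns "[███░░░░░░░░░] 3/10", since int(12 * 0.3) = 3 filled cells and 9 empty ones.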
31 | 32 | Args: 33 | current: Current progress value 34 | total: Total value 35 | width: Width of the progress bar in characters 36 | show_numbers: Whether to show the numbers after the bar 37 | 38 | Returns: 39 | Formatted progress bar string 40 | """ 41 | if total == 0: 42 | percent = 0 43 | else: 44 | percent = min(current / total, 1.0) 45 | 46 | filled = int(width * percent) 47 | empty = width - filled 48 | 49 | bar = '█' * filled + '░' * empty 50 | 51 | if show_numbers: 52 | return f"[{bar}] {current}/{total}" 53 | else: 54 | return f"[{bar}]" 55 | 56 | def split_text_smartly(text: str, max_words_per_chunk: int = 100) -> List[str]: 57 | """ 58 | Split text into chunks at sentence boundaries, keeping emotion tags intact. 59 | Improved to NEVER cut words mid-sentence. 60 | 61 | Args: 62 | text: Input text to split 63 | max_words_per_chunk: Maximum words per chunk (default 100) 64 | 65 | Returns: 66 | List of text chunks 67 | """ 68 | # Better sentence boundary detection that handles emotion tags 69 | # Split on: . ! ? followed by whitespace (and optionally capital letter or end of string) 70 | # This regex keeps the punctuation with the sentence 71 | sentence_pattern = r'(?<=[.!?])\s+(?=[A-Z<]|$)' 72 | sentences = re.split(sentence_pattern, text.strip()) 73 | 74 | # Clean up empty sentences 75 | sentences = [s.strip() for s in sentences if s.strip()] 76 | 77 | # Group sentences into chunks 78 | chunks = [] 79 | current_chunk = [] 80 | current_word_count = 0 81 | 82 | for sentence in sentences: 83 | # Count words (emotion tags don't count as words) 84 | # Remove emotion tags temporarily for word count 85 | text_without_tags = re.sub(r'<[^>]+>', '', sentence) 86 | word_count = len(text_without_tags.split()) 87 | 88 | # If single sentence exceeds max, split on commas or semicolons 89 | if word_count > max_words_per_chunk: 90 | # Split long sentence on commas, keeping punctuation 91 | parts = re.split(r'(,\s+|;\s+)', sentence) 92 | 93 | for i, part in enumerate(parts): 94 | if not part.strip(): 95 | continue 96 | 97 | # For delimiters (commas/semicolons), add to previous chunk 98 | if part.strip() in [',', ';']: 99 | if current_chunk: 100 | current_chunk[-1] += part 101 | continue 102 | 103 | # Count words in this part 104 | part_text = re.sub(r'<[^>]+>', '', part) 105 | part_words = len(part_text.split()) 106 | 107 | if current_word_count + part_words > max_words_per_chunk and current_chunk: 108 | # Start new chunk 109 | chunks.append(''.join(current_chunk)) 110 | current_chunk = [part] 111 | current_word_count = part_words 112 | else: 113 | # Add to current chunk 114 | if current_chunk and not current_chunk[-1].endswith((' ', ',', ';')): 115 | current_chunk.append(' ') 116 | current_chunk.append(part) 117 | current_word_count += part_words 118 | else: 119 | # Normal sentence handling 120 | if current_word_count + word_count > max_words_per_chunk and current_chunk: 121 | # Save current chunk and start new one 122 | chunks.append(''.join(current_chunk)) 123 | current_chunk = [sentence] 124 | current_word_count = word_count 125 | else: 126 | # Add to current chunk with space 127 | if current_chunk: 128 | current_chunk.append(' ') 129 | current_chunk.append(sentence) 130 | current_word_count += word_count 131 | 132 | # Add remaining chunk 133 | if current_chunk: 134 | chunks.append(''.join(current_chunk)) 135 | 136 | return chunks if chunks else [text] 137 | 138 | 139 | class Maya1TTSBarebonesNode: 140 | """ 141 | Barebones Maya1 TTS node - standard ComfyUI widgets only (no custom 
JavaScript). 142 | 143 | Use this version if you're experiencing issues with the custom UI rendering. 144 | 145 | Features: 146 | - Model loading with caching 147 | - Voice design through natural language 148 | - Emotion tags support (manually type <laugh>, <sigh>, etc.) 149 | - Native ComfyUI cancel support 150 | - Real-time progress tracking 151 | - VRAM management 152 | """ 153 | 154 | DESCRIPTION = "Maya1 TTS with standard widgets (for users with JavaScript rendering issues)" 155 | 156 | @classmethod 157 | def INPUT_TYPES(cls): 158 | """Define input parameters for the node.""" 159 | return { 160 | "required": { 161 | # Voice and text (top) 162 | "voice_description": ("STRING", { 163 | "multiline": True, 164 | "default": "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.", 165 | "dynamicPrompts": False, 166 | "tooltip": "Describe your desired voice using natural language. Include: age (20s-50s), gender (male/female), accent (American/British/etc), pitch (low/normal/high), timbre (warm/gravelly/smooth), pacing (fast/conversational/slow), tone (happy/calm/energetic)" 167 | }), 168 | "text": ("STRING", { 169 | "multiline": True, 170 | "default": "Hello! This is Maya1 the best open source voice AI model with emotions.", 171 | "dynamicPrompts": False, 172 | "tooltip": "Your script text to synthesize. Add emotion tags anywhere in the text (type manually - no visual buttons in barebones mode). All 17 available tags: <laugh>, <laugh_harder>, <giggle>, <chuckle>, <cry>, <sigh>, <gasp>, <whisper>, <angry>, <scream>, <snort>, <yawn>, <cough>, <sneeze>, <breathing>, <humming>, <throat_clearing>" 173 | }), 174 | 175 | # Model settings 176 | "model_name": (discover_maya1_models(), { 177 | "default": discover_maya1_models()[0] if discover_maya1_models() else None, 178 | "tooltip": "Select Maya1 model from ComfyUI/models/maya1-TTS/ folder. Models are auto-discovered on startup. Download from: huggingface.co/maya-research/maya1" 179 | }), 180 | "dtype": (["4bit (BNB)", "8bit (BNB)", "float16", "bfloat16", "float32"], { 181 | "default": "bfloat16", 182 | "tooltip": "Model precision. 4bit/8bit save VRAM but are SLOWER. Use float16/bfloat16 if you have 10GB+ VRAM for best speed. 4bit≈6GB, 8bit≈7GB, float16/bfloat16≈8-9GB, float32≈16GB" 183 | }), 184 | "attention_mechanism": (["sdpa", "eager", "flash_attention_2", "sage_attention"], { 185 | "default": "sdpa", 186 | "tooltip": "Attention algorithm. SDPA (default) is fastest for single TTS. Eager is standard PyTorch (slower). Flash Attention 2 helps with batch processing. Sage Attention is memory efficient" 187 | }), 188 | "device": (["cuda", "cpu"], { 189 | "default": "cuda", 190 | "tooltip": "Processing device. CUDA (GPU) is recommended for speed. CPU works but is much slower. Will auto-fallback to CPU if CUDA unavailable" 191 | }), 192 | 193 | # Generation settings 194 | "keep_model_in_vram": ("BOOLEAN", { 195 | "default": True, 196 | "tooltip": "Keep model loaded in VRAM after generation. True = faster repeated generations but uses VRAM. False = frees VRAM after each generation but slower" 197 | }), 198 | "chunk_longform": ("BOOLEAN", { 199 | "default": False, 200 | "tooltip": "Split long text into chunks at sentence boundaries with smooth crossfading. Enables unlimited audio length beyond the 18-20s limit. EXPERIMENTAL - may have quality/timing issues" 201 | }), 202 | "max_new_tokens": ("INT", { 203 | "default": 4000, 204 | "min": 100, 205 | "max": 16000, 206 | "step": 100, 207 | "tooltip": "Maximum NEW SNAC tokens to generate per chunk (excludes input prompt tokens). Higher = longer audio per chunk (~50 tokens/word). 4000 tokens ≈ 30-40s audio.
Increase if audio cuts off too early" 208 | }), 209 | "temperature": ("FLOAT", { 210 | "default": 0.4, 211 | "min": 0.1, 212 | "max": 2.0, 213 | "step": 0.05, 214 | "tooltip": "Controls randomness/creativity. Lower (0.1-0.3) = more consistent/predictable. Higher (0.5-1.0) = more varied/creative. 0.4 is official Maya1 recommendation" 215 | }), 216 | "top_p": ("FLOAT", { 217 | "default": 0.9, 218 | "min": 0.1, 219 | "max": 1.0, 220 | "step": 0.05, 221 | "tooltip": "Nucleus sampling - controls diversity of token selection. 0.9 (default) works well for natural speech. Lower = more focused, higher = more diverse. Keep at 0.9 unless experimenting" 222 | }), 223 | "repetition_penalty": ("FLOAT", { 224 | "default": 1.1, 225 | "min": 1.0, 226 | "max": 2.0, 227 | "step": 0.05, 228 | "tooltip": "Reduces repetitive speech patterns. 1.0 = no penalty, higher = stronger penalty against repetition. 1.1 is a good default. Increase to 1.2-1.3 if speech sounds too repetitive" 229 | }), 230 | "seed": ("INT", { 231 | "default": 0, 232 | "min": 0, 233 | "max": 0xffffffffffffffff, 234 | "tooltip": "Random seed for reproducibility. 0 = random seed each time. Set specific number (1-999999) for same result every time. Use control_after_generate widget to auto-increment/randomize" 235 | }), 236 | }, 237 | "hidden": { 238 | "control_after_generate": "COMBO" 239 | } 240 | } 241 | 242 | RETURN_TYPES = ("AUDIO",) 243 | RETURN_NAMES = ("audio",) 244 | FUNCTION = "generate_speech" 245 | CATEGORY = "audio/maya1" 246 | 247 | def cleanup_vram(self): 248 | """ 249 | Native ComfyUI VRAM cleanup - unloads all models and clears cache. 250 | Follows best practices from ComfyUI's memory management system. 251 | """ 252 | print("🗑️ Cleaning up VRAM...") 253 | 254 | # Step 1: Unload all models from VRAM 255 | mm.unload_all_models() 256 | 257 | # Step 2: Clear ComfyUI's internal cache 258 | mm.soft_empty_cache() 259 | 260 | # Step 3: Python garbage collection 261 | gc.collect() 262 | 263 | # Step 4: Clear CUDA caches (if available) 264 | if torch.cuda.is_available(): 265 | torch.cuda.empty_cache() 266 | torch.cuda.ipc_collect() 267 | 268 | print("✅ VRAM cleanup complete") 269 | 270 | def generate_speech( 271 | self, 272 | voice_description: str, 273 | text: str, 274 | model_name: str, 275 | dtype: str, 276 | attention_mechanism: str, 277 | device: str, 278 | keep_model_in_vram: bool, 279 | chunk_longform: bool, 280 | max_new_tokens: int, 281 | temperature: float, 282 | top_p: float, 283 | repetition_penalty: float, 284 | seed: int, 285 | control_after_generate: str = "randomize", 286 | chunk_index: int = None, 287 | total_chunks: int = None 288 | ) -> Tuple[dict]: 289 | """ 290 | Load model (if needed) and generate expressive speech. 
291 | 292 | Returns: 293 | Tuple containing audio dictionary for ComfyUI 294 | """ 295 | # Import ComfyUI utilities for progress and cancellation 296 | import comfy.utils 297 | import comfy.model_management as mm 298 | 299 | # Check for cancellation before starting 300 | mm.throw_exception_if_processing_interrupted() 301 | 302 | # Simple seed logic: if seed is 0, randomize; otherwise use the provided seed 303 | # This way seed=0 is always random, and you can set a specific seed for reproducibility 304 | if seed == 0: 305 | actual_seed = random.randint(1, 0xffffffffffffffff) 306 | else: 307 | actual_seed = seed 308 | 309 | print("=" * 70) 310 | print("🎤 Maya1 TTS Generation") 311 | print("=" * 70) 312 | print(f"🎲 Seed: {actual_seed}") 313 | print(f"💾 VRAM setting: {'Keep in VRAM' if keep_model_in_vram else 'Offload after generation'}") 314 | 315 | # ========== MODEL LOADING ========== 316 | # Get the expected models directory 317 | models_dir = get_maya1_models_dir() 318 | 319 | # Validate model name 320 | if model_name.startswith("(No"): 321 | raise ValueError( 322 | f"No valid Maya1 models found!\n\n" 323 | f"Expected location: {models_dir}\n\n" 324 | f"Please download a model:\n" 325 | f" 1. Create models directory:\n" 326 | f" mkdir -p {models_dir}\n\n" 327 | f" 2. Download Maya1:\n" 328 | f" huggingface-cli download maya-research/maya1 \\\n" 329 | f" --local-dir {models_dir}/maya1\n\n" 330 | f" 3. Restart ComfyUI to refresh the dropdown." 331 | ) 332 | 333 | # Get full model path 334 | model_path = get_model_path(model_name) 335 | 336 | if not model_path.exists(): 337 | raise FileNotFoundError( 338 | f"Model not found: {model_path}\n\n" 339 | f"Make sure the model is properly downloaded to:\n" 340 | f" {model_path}" 341 | ) 342 | 343 | # Check device availability 344 | if device == "cuda" and not torch.cuda.is_available(): 345 | print("⚠️ CUDA not available, falling back to CPU") 346 | device = "cpu" 347 | 348 | # ========== MODEL LOADING ========== 349 | print(f"🔍 Validating model files in: {model_path}") 350 | 351 | critical_files = { 352 | "config.json": model_path / "config.json", 353 | "generation_config.json": model_path / "generation_config.json", 354 | "tokenizer_config.json": model_path / "tokenizer" / "tokenizer_config.json", 355 | "tokenizer.json": model_path / "tokenizer" / "tokenizer.json", 356 | "model weights": model_path / "model-00001-of-00002.safetensors", 357 | } 358 | 359 | missing_files = [] 360 | for file_name, file_path in critical_files.items(): 361 | if file_path.exists(): 362 | print(f" ✅ {file_name}") 363 | else: 364 | print(f" ❌ {file_name} - MISSING!") 365 | missing_files.append(file_name) 366 | 367 | if missing_files: 368 | raise FileNotFoundError( 369 | f"Missing critical model files: {', '.join(missing_files)}\n\n" 370 | f"Model directory: {model_path}\n\n" 371 | f"Please re-download the complete model:\n" 372 | f" huggingface-cli download maya-research/maya1 \\\n" 373 | f" --local-dir {model_path}" 374 | ) 375 | 376 | # Strip "(BNB)" suffix from dtype labels if present 377 | dtype_clean = dtype.replace(" (BNB)", "") 378 | 379 | # Load model using the wrapper (with caching) 380 | try: 381 | maya1_model = Maya1ModelLoader.load_model( 382 | model_path=model_path, 383 | attention_type=attention_mechanism, 384 | dtype=dtype_clean, 385 | device=device 386 | ) 387 | except Exception as e: 388 | raise RuntimeError( 389 | f"Failed to load Maya1 model:\n{str(e)}\n\n" 390 | f"Model: {model_name}\n" 391 | f"Attention: {attention_mechanism}\n" 392 | f"Dtype: 
{dtype_clean}\n" 393 | f"Device: {device}" 394 | ) 395 | 396 | mm.throw_exception_if_processing_interrupted() 397 | 398 | # ========== SPEECH GENERATION ========== 399 | print(f"Keep in VRAM: {keep_model_in_vram}") 400 | print(f"Voice: {voice_description[:60]}...") 401 | print(f"Text: {text[:60]}...") 402 | print(f"Temperature: {temperature}, Top-p: {top_p}") 403 | print(f"Max tokens: {max_new_tokens}") 404 | print("=" * 70) 405 | 406 | # ========== LONGFORM CHUNKING ========== 407 | # Check if text should be chunked (enabled + text is reasonably long) 408 | word_count = len(text.split()) 409 | if chunk_longform and word_count > 80: # Only chunk if >80 words 410 | print(f"📚 Longform mode enabled: {word_count} words detected") 411 | print(f"🔪 Splitting text into chunks at sentence boundaries...") 412 | 413 | # Calculate words per chunk based on max_new_tokens 414 | # Empirical data: 1 word ≈ 50-55 SNAC tokens 415 | # Leave some headroom (80%) to avoid exceeding max_new_tokens 416 | estimated_words_per_chunk = int((max_new_tokens * 0.8) / 50) 417 | estimated_words_per_chunk = max(50, min(estimated_words_per_chunk, 300)) # Clamp between 50-300 418 | 419 | print(f"📏 Max tokens: {max_new_tokens} → ~{estimated_words_per_chunk} words per chunk (~{estimated_words_per_chunk / 150:.1f}min per chunk)") 420 | 421 | text_chunks = split_text_smartly(text, max_words_per_chunk=estimated_words_per_chunk) 422 | print(f"📦 Split into {len(text_chunks)} chunks") 423 | print("=" * 70) 424 | 425 | # Create outer progress bar for chunks (layered progress) 426 | chunk_progress = comfy.utils.ProgressBar(len(text_chunks)) 427 | 428 | all_audio_data = [] 429 | sample_rate = None 430 | 431 | for i, chunk_text in enumerate(text_chunks): 432 | # Create visual progress display for chunks 433 | chunk_bar = create_progress_bar(i + 1, len(text_chunks), width=6) 434 | print(f"\n🎤 Chunk Progress: {chunk_bar}") 435 | print(f"📝 Text: {chunk_text[:60]}...") 436 | print("=" * 70) 437 | 438 | # Check for cancellation before each chunk 439 | mm.throw_exception_if_processing_interrupted() 440 | 441 | # Recursively call generate_speech for this chunk with chunk_longform=False 442 | # to avoid infinite recursion 443 | chunk_audio = self.generate_speech( 444 | voice_description=voice_description, 445 | text=chunk_text, 446 | model_name=model_name, 447 | dtype=dtype, 448 | attention_mechanism=attention_mechanism, 449 | device=device, 450 | keep_model_in_vram=True, # Keep in VRAM between chunks 451 | chunk_longform=False, # Disable chunking for recursive calls 452 | max_new_tokens=max_new_tokens, 453 | temperature=temperature, 454 | top_p=top_p, 455 | repetition_penalty=repetition_penalty, 456 | seed=actual_seed, # Use same seed for all chunks 457 | control_after_generate=control_after_generate, 458 | chunk_index=i + 1, # Pass chunk context for layered progress 459 | total_chunks=len(text_chunks) 460 | ) 461 | 462 | # Extract audio data (returns tuple, get first element) 463 | chunk_audio_dict = chunk_audio[0] 464 | chunk_waveform = chunk_audio_dict["waveform"] 465 | sample_rate = chunk_audio_dict["sample_rate"] 466 | all_audio_data.append(chunk_waveform) 467 | 468 | # Update chunk progress (outer progress bar) 469 | chunk_progress.update(1) 470 | 471 | mm.throw_exception_if_processing_interrupted() 472 | 473 | print(f"\n{'=' * 70}") 474 | print(f"🔗 Combining {len(all_audio_data)} audio chunks with crossfading...") 475 | 476 | # Combine audio chunks with crossfading for smooth transitions 477 | # Crossfade duration: 50ms = 1200 samples 
at 24kHz 478 | combined_waveform_np = all_audio_data[0] 479 | 480 | for i in range(1, len(all_audio_data)): 481 | # Crossfade between chunks (1200 samples = 50ms at 24kHz) 482 | combined_waveform_np = crossfade_audio( 483 | combined_waveform_np, 484 | all_audio_data[i], 485 | crossfade_samples=1200 486 | ) 487 | 488 | # Ensure it's a torch tensor 489 | if not isinstance(combined_waveform_np, torch.Tensor): 490 | combined_waveform = torch.from_numpy(combined_waveform_np) 491 | else: 492 | combined_waveform = combined_waveform_np 493 | 494 | print(f"✅ Generated {combined_waveform.shape[-1] / sample_rate:.2f}s of audio from {len(text_chunks)} chunks") 495 | print("=" * 70) 496 | 497 | # Handle VRAM cleanup if requested 498 | if not keep_model_in_vram: 499 | print("🗑️ Offloading model from VRAM...") 500 | Maya1ModelLoader.clear_cache(force=True) 501 | print("✅ Model offloaded from VRAM") 502 | 503 | return ({ 504 | "waveform": combined_waveform, 505 | "sample_rate": sample_rate 506 | },) 507 | 508 | # ========== SINGLE GENERATION (NO CHUNKING) ========== 509 | # Set seed for reproducibility 510 | torch.manual_seed(actual_seed) 511 | if torch.cuda.is_available(): 512 | torch.cuda.manual_seed_all(actual_seed) 513 | 514 | # Format prompt using Maya1's OFFICIAL format (from transformers_inference.py) 515 | print("🔤 Formatting prompt with control tokens...") 516 | 517 | # Official Maya1 control token IDs 518 | SOH_ID = 128259 # Start of Header 519 | EOH_ID = 128260 # End of Header 520 | SOA_ID = 128261 # Start of Audio 521 | CODE_START_TOKEN_ID = 128257 # Start of Speech codes 522 | TEXT_EOT_ID = 128009 # End of Text 523 | 524 | # Decode control tokens 525 | soh_token = maya1_model.tokenizer.decode([SOH_ID]) 526 | eoh_token = maya1_model.tokenizer.decode([EOH_ID]) 527 | soa_token = maya1_model.tokenizer.decode([SOA_ID]) 528 | sos_token = maya1_model.tokenizer.decode([CODE_START_TOKEN_ID]) 529 | eot_token = maya1_model.tokenizer.decode([TEXT_EOT_ID]) 530 | bos_token = maya1_model.tokenizer.bos_token 531 | 532 | # Build formatted text: voice description tag + script 533 | formatted_text = f'<description="{voice_description}"> {text}' 534 | 535 | # Construct full prompt with all control tokens (CRITICAL for avoiding garbling!)
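        # Illustrative layout of the assembled prompt (a sketch, not literal
        # strings; the real marker text comes from the tokenizer.decode() calls above):
        #   <SOH> <BOS> <description="...voice..."> script text <EOT> <EOH> <SOA> <SOS>
        # The header block carries the voice description and script as plain text,
        # and the trailing start-of-audio / start-of-speech markers cue the model
        # to emit SNAC audio codes next rather than more text.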
536 | prompt = ( 537 | soh_token + bos_token + formatted_text + eot_token + 538 | eoh_token + soa_token + sos_token 539 | ) 540 | 541 | # Debug: Print formatted prompt 542 | print(f"📝 Formatted text: {formatted_text[:100]}...") 543 | print(f"📝 Full prompt preview (first 200 chars): {repr(prompt[:200])}...") 544 | 545 | # Tokenize input 546 | inputs = maya1_model.tokenizer( 547 | prompt, 548 | return_tensors="pt" 549 | ) 550 | print(f"📊 Input token count: {inputs['input_ids'].shape[1]}") 551 | 552 | # Move to device 553 | inputs = {k: v.to(device) for k, v in inputs.items()} 554 | 555 | # Check for cancellation 556 | mm.throw_exception_if_processing_interrupted() 557 | 558 | # Generate with progress tracking and cancellation checks 559 | print(f"🎵 Generating speech (max {max_new_tokens} tokens)...") 560 | 561 | try: 562 | # Setup progress tracking (inner progress bar for token generation) 563 | progress_bar = comfy.utils.ProgressBar(max_new_tokens) 564 | 565 | # Create stopping criteria for cancellation support 566 | from transformers import StoppingCriteria, StoppingCriteriaList 567 | 568 | class InterruptionStoppingCriteria(StoppingCriteria): 569 | """Custom stopping criteria that checks for ComfyUI cancellation.""" 570 | def __init__(self, progress_bar, chunk_index=None, total_chunks=None): 571 | self.progress_bar = progress_bar 572 | self.current_tokens = 0 573 | self.input_length = 0 574 | self.start_time = None 575 | self.last_print_time = None 576 | self.print_interval = 0.5 # Print progress every 0.5 seconds 577 | self.chunk_index = chunk_index 578 | self.total_chunks = total_chunks 579 | 580 | def __call__(self, input_ids, scores, **kwargs): 581 | import time 582 | 583 | # Store input length and start time on first call 584 | if self.input_length == 0: 585 | self.input_length = input_ids.shape[1] 586 | self.start_time = time.time() 587 | self.last_print_time = self.start_time 588 | 589 | # Update progress 590 | new_tokens = input_ids.shape[1] - self.input_length 591 | if new_tokens > self.current_tokens: 592 | self.progress_bar.update(new_tokens - self.current_tokens) 593 | self.current_tokens = new_tokens 594 | 595 | # Print progress with visual bar and it/s to console 596 | current_time = time.time() 597 | if current_time - self.last_print_time >= self.print_interval: 598 | elapsed = current_time - self.start_time 599 | it_per_sec = new_tokens / elapsed if elapsed > 0 else 0 600 | 601 | # Create visual progress bar for tokens 602 | token_bar = create_progress_bar(new_tokens, max_new_tokens, width=12) 603 | 604 | # Show layered progress if in chunked mode 605 | if self.chunk_index is not None and self.total_chunks is not None: 606 | chunk_bar = create_progress_bar(self.chunk_index, self.total_chunks, width=6, show_numbers=False) 607 | print(f" Chunk {chunk_bar} → Token Progress: {token_bar} | Speed: {it_per_sec:.2f} it/s", end='\r') 608 | else: 609 | print(f" Progress: {token_bar} | Speed: {it_per_sec:.2f} it/s | Elapsed: {elapsed:.1f}s", end='\r') 610 | 611 | self.last_print_time = current_time 612 | 613 | # Check for cancellation using ComfyUI's native method 614 | try: 615 | mm.throw_exception_if_processing_interrupted() 616 | except: 617 | # If interrupted, stop generation gracefully 618 | print("\n🛑 Generation cancelled by user") 619 | return True # Stop generation 620 | 621 | return False # Continue generation 622 | 623 | stopping_criteria = StoppingCriteriaList([ 624 | InterruptionStoppingCriteria(progress_bar, chunk_index=chunk_index, total_chunks=total_chunks) 625 | ]) 
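            # Back-of-envelope budget (illustrative, derived from the constants used
            # in this file): SNAC packs 7 tokens per frame and ~50 tokens cover one
            # spoken word, so at a conversational ~150 words/min the model emits
            # roughly 50 * 150 / 60 = 125 audio tokens per second of speech. The
            # default max_new_tokens=4000 therefore buys about 32s of audio,
            # consistent with the "4000 tokens ≈ 30-40s audio" tooltip above.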
626 | 627 | # Generate tokens with cancellation support 628 | # CRITICAL: Maya1 has TWO EOS tokens in generation_config.json: 629 | # - 128009 (<|eot_id|>) - Text completion token 630 | # - 128258 - SNAC audio completion token 631 | # We need to ONLY stop on 128258 (SNAC done), not 128009 (text done) 632 | # Otherwise the model generates text, hits 128009, and stops before SNAC codes! 633 | 634 | print("🎵 Generation settings:") 635 | print(f" Using EOS token: 128258 (SNAC completion only)") 636 | print(f" Ignoring EOS token: 128009 (text completion)") 637 | 638 | import time 639 | generation_start = time.time() 640 | 641 | with torch.inference_mode(): 642 | outputs = maya1_model.model.generate( 643 | **inputs, 644 | max_new_tokens=max_new_tokens, 645 | min_new_tokens=28, # At least 4 SNAC frames (4 frames × 7 tokens = 28) 646 | temperature=temperature, 647 | top_p=top_p, 648 | do_sample=True, 649 | repetition_penalty=repetition_penalty, 650 | pad_token_id=maya1_model.tokenizer.pad_token_id, 651 | eos_token_id=128258, # CODE_END_TOKEN_ID - Stop at end of speech 652 | stopping_criteria=stopping_criteria, 653 | use_cache=True, # Enable KV cache for faster generation 654 | ) 655 | 656 | generation_time = time.time() - generation_start 657 | 658 | # Check for cancellation after generation 659 | mm.throw_exception_if_processing_interrupted() 660 | 661 | # Extract generated tokens (remove input tokens) 662 | generated_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist() 663 | 664 | # Print final generation statistics 665 | final_speed = len(generated_ids) / generation_time if generation_time > 0 else 0 666 | print(f"\n✅ Generated {len(generated_ids)} tokens in {generation_time:.2f}s ({final_speed:.2f} it/s)") 667 | 668 | # Debug: Print first few generated token IDs 669 | print(f"🔍 First 10 generated token IDs: {generated_ids[:10]}") 670 | 671 | # Debug: Decode generated tokens to see what was generated 672 | generated_text = maya1_model.tokenizer.decode(generated_ids, skip_special_tokens=False) 673 | print(f"🔍 Generated text (first 100 chars): {generated_text[:100]}...") 674 | 675 | # Filter SNAC tokens 676 | from ..core.snac_decoder import filter_snac_tokens 677 | snac_tokens = filter_snac_tokens(generated_ids) 678 | 679 | if len(snac_tokens) == 0: 680 | raise ValueError( 681 | "No SNAC audio tokens generated!\n" 682 | "The model may have only generated text tokens.\n" 683 | "Try adjusting the prompt or generation parameters." 
684 | ) 685 | 686 | print(f"🎵 Found {len(snac_tokens)} SNAC tokens ({len(snac_tokens) // 7} frames)") 687 | 688 | # Check for cancellation before decoding 689 | mm.throw_exception_if_processing_interrupted() 690 | 691 | # Decode SNAC tokens to audio 692 | print("🔊 Decoding to audio...") 693 | audio_waveform = SNACDecoder.decode(snac_tokens, device=device) 694 | 695 | # Check for cancellation after decoding 696 | mm.throw_exception_if_processing_interrupted() 697 | 698 | # Convert to ComfyUI audio format 699 | audio_tensor = torch.from_numpy(audio_waveform).float() 700 | 701 | # Add batch and channel dimensions: [samples] -> [1, 1, samples] 702 | if audio_tensor.dim() == 1: 703 | audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(0) 704 | elif audio_tensor.dim() == 2: 705 | audio_tensor = audio_tensor.unsqueeze(0) 706 | 707 | audio_output = { 708 | "waveform": audio_tensor, 709 | "sample_rate": 24000 710 | } 711 | 712 | print(f"✅ Generated {len(audio_waveform) / 24000:.2f}s of audio") 713 | print("=" * 70) 714 | 715 | # Handle VRAM management based on toggle 716 | if not keep_model_in_vram: 717 | print("🗑️ Offloading model from VRAM...") 718 | Maya1ModelLoader.clear_cache(force=True) 719 | print("✅ Model offloaded from VRAM") 720 | else: 721 | print("💾 Model kept in VRAM for faster next generation") 722 | 723 | return (audio_output,) 724 | 725 | except InterruptedError as e: 726 | # User cancelled the generation 727 | print(f"\n{str(e)}") 728 | print("=" * 70) 729 | # Note: VRAM cleanup handled by ComfyUI hooks 730 | raise 731 | 732 | except Exception as e: 733 | # Other errors 734 | print(f"\n❌ Generation failed: {str(e)}") 735 | print("=" * 70) 736 | # Note: VRAM cleanup handled by ComfyUI hooks 737 | raise 738 | 739 | 740 | # ComfyUI node mappings 741 | NODE_CLASS_MAPPINGS = { 742 | "Maya1TTS_Barebones": Maya1TTSBarebonesNode 743 | } 744 | 745 | NODE_DISPLAY_NAME_MAPPINGS = { 746 | "Maya1TTS_Barebones": "Maya1 TTS (AIO) Barebones" 747 | } 748 | --------------------------------------------------------------------------------
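Note on crossfade_audio: both nodes import it from the core package, but its implementation is not included in this excerpt. A minimal sketch consistent with how it is called above (two chunk waveforms blended over crossfade_samples=1200, i.e. 50ms at 24kHz) could be a linear crossfade along the last axis; the shipped helper may differ, for example it appears to also accept torch tensors, given the isinstance check at the call sites:

import numpy as np

def crossfade_audio(a, b, crossfade_samples=1200):
    """Blend the tail of chunk `a` into the head of chunk `b` along the last axis (sketch)."""
    n = min(crossfade_samples, a.shape[-1], b.shape[-1])
    if n == 0:
        # Nothing to blend; just butt-join the chunks
        return np.concatenate([a, b], axis=-1)
    fade_out = np.linspace(1.0, 0.0, n)  # gain ramp applied to the end of `a`
    fade_in = 1.0 - fade_out             # complementary ramp for the start of `b`
    blended = a[..., -n:] * fade_out + b[..., :n] * fade_in
    return np.concatenate([a[..., :-n], blended, b[..., n:]], axis=-1)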