├── .github
    └── workflows
    │   └── publish.yml
├── README.md
├── TimbreModel
    ├── FF7-爱丽丝.WAV
    ├── FF7-蒂法.WAV
    ├── LOL-亚托克斯.wav
    ├── 三国杀-曹操.WAV
    ├── 不羁青年.mp3
    ├── 使命召唤-Ghost.WAV
    ├── 傲娇御姐.mp3
    ├── 凡人修仙传-紫灵.mp3
    ├── 北京大爷.mp3
    ├── 南方小哥.mp3
    ├── 原神-胡桃.WAV
    ├── 原神-雷电将军.WAV
    ├── 周杰伦_.flac
    ├── 哪吒 低迷.wav
    ├── 嚣张小妞.mp3
    ├── 守望先锋-DVA.WAV
    ├── 小智机器人-台湾女孩.wav
    ├── 徐志胜.wav
    ├── 御姐配音.wav
    ├── 恋与深空-夏以昼.WAV
    ├── 恋与深空-秦彻.WAV
    ├── 恋与深空-黎深.WAV
    ├── 憨憨萌兽.mp3
    ├── 我的英雄学院-绿谷出久.WAV
    ├── 抖音-读小说.wav
    ├── 播音中年男.mp3
    ├── 新闻女声.WAV
    ├── 星穹铁道-卡夫卡.WAV
    ├── 星穹铁道-流莹.WAV
    ├── 星穹铁道-黄泉.WAV
    ├── 机械战甲.mp3
    ├── 林志玲.wav
    ├── 林黛玉.wav
    ├── 沉稳高管.mp3
    ├── 清澈弟弟.mp3
    ├── 温暖少女.mp3
    ├── 温暖闺蜜.mp3
    ├── 温柔学姐.mp3
    ├── 温润男声.mp3
    ├── 温润青年.mp3
    ├── 港普空姐.mp3
    ├── 火影-雏田.WAV
    ├── 热心大婶.mp3
    ├── 率真小老弟.mp3
    ├── 甜美女声.mp3
    ├── 电台男主播.mp3
    ├── 碧瑶.MP3
    ├── 纳西妲.mp3
    ├── 罗振宇老师.wav
    ├── 花甲奶奶.mp3
    ├── 蔡徐坤.wav
    ├── 说书人.WAV
    ├── 软糯女孩.mp3
    ├── 钟离.wav
    ├── 阅历姐姐.mp3
    ├── 随性男青.mp3
    ├── 鬼灭之刃-炭治郎.WAV
    └── 鬼灭之刃-祢豆子.WAV
├── __init__.py
├── audio_enhancement.py
├── index_tts_pro.py
├── indextts
    ├── BigVGAN
    │   ├── ECAPA_TDNN.py
    │   ├── __init__.py
    │   ├── activations.py
    │   ├── alias_free_activation
    │   │   ├── cuda
    │   │   │   ├── __init__.py
    │   │   │   ├── __pycache__
    │   │   │   │   ├── __init__.cpython-311.pyc
    │   │   │   │   └── load.cpython-311.pyc
    │   │   │   ├── activation1d.py
    │   │   │   ├── anti_alias_activation.cpp
    │   │   │   ├── anti_alias_activation_cuda.cu
    │   │   │   ├── compat.h
    │   │   │   ├── load.py
    │   │   │   └── type_shim.h
    │   │   └── torch
    │   │   │   ├── __init__.py
    │   │   │   ├── act.py
    │   │   │   ├── filter.py
    │   │   │   └── resample.py
    │   ├── alias_free_torch
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── __init__.cpython-311.pyc
    │   │   │   ├── act.cpython-311.pyc
    │   │   │   ├── filter.cpython-311.pyc
    │   │   │   └── resample.cpython-311.pyc
    │   │   ├── act.py
    │   │   ├── filter.py
    │   │   └── resample.py
    │   ├── bigvgan.py
    │   ├── models.py
    │   ├── nnet
    │   │   ├── CNN.py
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── CNN.cpython-311.pyc
    │   │   │   ├── __init__.cpython-311.pyc
    │   │   │   ├── linear.cpython-311.pyc
    │   │   │   └── normalization.cpython-311.pyc
    │   │   ├── linear.py
    │   │   └── normalization.py
    │   └── utils.py
    ├── __init__.py
    ├── cli.py
    ├── gpt
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-311.pyc
    │   │   ├── conformer_encoder.cpython-311.pyc
    │   │   ├── model.cpython-311.pyc
    │   │   └── perceiver.cpython-311.pyc
    │   ├── conformer
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── __init__.cpython-311.pyc
    │   │   │   ├── attention.cpython-311.pyc
    │   │   │   ├── embedding.cpython-311.pyc
    │   │   │   └── subsampling.cpython-311.pyc
    │   │   ├── attention.py
    │   │   ├── embedding.py
    │   │   └── subsampling.py
    │   ├── conformer_encoder.py
    │   ├── model.py
    │   └── perceiver.py
    ├── infer.py
    ├── utils
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-311.pyc
    │   │   ├── arch_util.cpython-311.pyc
    │   │   ├── checkpoint.cpython-311.pyc
    │   │   ├── common.cpython-311.pyc
    │   │   ├── feature_extractors.cpython-311.pyc
    │   │   ├── front.cpython-311.pyc
    │   │   ├── typical_sampling.cpython-311.pyc
    │   │   └── xtransformers.cpython-311.pyc
    │   ├── arch_util.py
    │   ├── checkpoint.py
    │   ├── common.py
    │   ├── feature_extractors.py
    │   ├── front.py
    │   ├── typical_sampling.py
    │   ├── webui_utils.py
    │   └── xtransformers.py
    └── vqvae
    │   ├── __init__.py
    │   └── xtts_dvae.py
├── llm_prompt模板.txt
├── nodes.py
├── novel_text_parser.py
├── pyproject.toml
├── requirements.txt
├── timbre_audio_loader.py
├── tts_models.py
├── tts_nodes
    ├── __init__.py
    ├── audio_preview.py
    └── tts_node.py
├── utils
    ├── __init__.py
    ├── audio_utils.py
    └── index_tts_impl.py
└── workflow
    ├── workflow.json
    └── 读小说用这个.json


/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | name: Publish to Comfy registry
 2 | on:
 3 |   workflow_dispatch:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       - master
 8 |     paths:
 9 |       - "pyproject.toml"
10 | 
11 | permissions:
12 |   issues: write
13 | 
14 | jobs:
15 |   publish-node:
16 |     name: Publish Custom Node to registry
17 |     runs-on: ubuntu-latest
18 |     if: ${{ github.repository_owner == 'chenpipi0807' }}
19 |     steps:
20 |       - name: Check out code
21 |         uses: actions/checkout@v4
22 |         with:
23 |           submodules: true
24 |       - name: Publish Custom Node
25 |         uses: Comfy-Org/publish-node-action@v1
26 |         with:
27 |           ## Add your own personal access token to your Github Repository secrets and reference it here.
28 |           personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }}
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 免责声明
  2 | 
  3 | 本项目基于B站开源项目进行二次开发，由本人对项目进行了ComfyUI的实现，并进行了部分功能优化与调整与进阶功能的开发。然而，需要强调的是，本项目严禁用于任何非法目的以及与侵犯版权相关的任何行为！本项目仅用于开源社区内的交流与学习，以促进技术共享与创新，旨在为开发者提供有益的参考和学习资源。
  4 | 
  5 | 在此郑重声明，本项目所有个人使用行为与开发者本人及本项目本身均无任何关联。开发者对于项目使用者的行为不承担任何责任，使用者应自行承担使用过程中可能产生的所有风险和法律责任。请广大使用者在遵守法律法规及相关规定的前提下，合理、合法地使用本项目，维护开源社区的良好秩序与健康发展。
  6 | 
  7 | 感谢您的理解与支持！
  8 | 
  9 | 
 10 | # ComfyUI-Index-TTS
 11 | 
 12 | 使用IndexTTS模型在ComfyUI中实现高质量文本到语音转换的自定义节点。支持中文和英文文本，可以基于参考音频复刻声音特征。
 13 | 
 14 | ![示例截图1](https://github.com/user-attachments/assets/41960425-f739-4496-9520-8f9cae34ff51)
 15 | ![示例截图2](https://github.com/user-attachments/assets/1ff0d1d0-7a04-4d91-9d53-cd119250ed67)
 16 | ![微信截图_20250605215845](https://github.com/user-attachments/assets/d5eb22f6-2ca2-40cf-a619-d709746f83e3)
 17 | 
 18 | 
 19 | 
 20 | ## 功能特点
 21 | 
 22 | - 支持中文和英文文本合成
 23 | - 基于参考音频复刻声音特征（变声功能）
 24 | - 支持调节语速（原版不支持后处理实现效果会有一点折损）
 25 | - 多种音频合成参数控制
 26 | - Windows兼容（无需额外依赖）
 27 | 
 28 | 
 29 | ## 废话两句
 30 | 
 31 | - 生成的很快，真的很快！而且竟然也很像！！！
 32 | - 效果很好，感谢小破站的开源哈哈哈哈哈 
 33 | - 如果你想体验一下效果 附赠道友B站的传送阵[demo](https://huggingface.co/spaces/IndexTeam/IndexTTS)
 34 | - 如果你不知道去哪找音频，那我建议你去[隔壁](https://drive.google.com/drive/folders/1AyB3egmr0hAKp0CScI0eXJaUdVccArGB)偷哈哈哈哈哈
 35 | 
 36 | ## 演示案例
 37 | 
 38 | 以下是一些实际使用效果演示：
 39 | 
 40 | | 参考音频 | 输入文本 | 推理结果 |
 41 | |---------|---------|---------|
 42 | | <video src="https://github.com/user-attachments/assets/5e8cb570-242f-4a16-8472-8a64a23183fb"></video> | 我想把钉钉的自动回复设置成"服务器繁忙，请稍后再试"，仅对老板可见。  我想把钉钉的自动回复设置成"服务器繁忙，请稍后再试"，仅对老板可见。 | <video src="https://github.com/user-attachments/assets/d8b89db3-5cf5-406f-b930-fa75d13ff0bd"></video> |
 43 | | <video src="https://github.com/user-attachments/assets/8e774223-e0f7-410b-ae4e-e46215e47e96"></video> | 我想把钉钉的自动回复设置成"服务器繁忙，请稍后再试"，仅对老板可见。 | <video src="https://github.com/user-attachments/assets/6e3e63ed-2d3d-4d5a-bc2e-b42530748fa0"></video> |
 44 | 
 45 | - 长文本测试：
 46 | 
 47 | <video src="https://github.com/user-attachments/assets/6bfa35dc-1a30-4da0-a4dc-ac3def25452b"></video>
 48 | 
 49 | - 多角色小说测试：
 50 | 
 51 | <video src="https://github.com/user-attachments/assets/6d4737f4-9d75-431e-bb11-fe3e86a4ab0e"></video>
 52 | 
 53 | 
 54 | 
 55 | ## 更新日志
 56 | 
 57 | ### 2025-06-05
 58 | 
 59 | - 改进了小说文本解析器（Novel Text Parser）的功能
 60 |   - 增加了对预格式化文本的检测和处理
 61 |   - 优化了对话检测和角色识别算法
 62 |   - 改进了中文角色名称的识别
 63 |   - 支持引号中的对话自动识别
 64 | 
 65 | ## 多角色小说文本解析
 66 | 
 67 | 本项目包含一个专门用于解析小说文本的节点（Novel Text Structure Node），可以将普通小说文本解析为多角色对话结构，以便生成更加自然的多声音TTS效果。
 68 | 
 69 | ### 使用说明
 70 | 
 71 | - 节点会尝试自动识别小说中的角色对话和旁白部分
 72 | - 对话部分会标记为`<CharacterX>`形式（X为数字，最多支持5个角色）
 73 | - 旁白部分会标记为`<Narrator>`
 74 | - 解析后的文本可直接用于多声音TTS生成
 75 | 
 76 | ### 局限性
 77 | 
 78 | - 当前解析算法并不完美，复杂的小说结构可能导致错误的角色识别
 79 | - 对于重要文本，建议使用LLM（如GPT等）手动拆分文本为以下格式：
 80 | 
 81 | ```
 82 | <Narrator>少女此时就站在院墙那边，她有一双杏眼，怯怯弱弱。</Narrator>
 83 | <Narrator>院门那边，有个嗓音说：</Narrator>
 84 | <Character1>"你这婢女卖不卖？"</Character1>
 85 | <Narrator>宋集薪愣了愣，循着声音转头望去，是个眉眼含笑的锦衣少年，站在院外，一张全然陌生的面孔。</Narrator>
 86 | <Narrator>锦衣少年身边站着一位身材高大的老者，面容白皙，脸色和蔼，轻轻眯眼打量着两座毗邻院落的少年少女。</Narrator>
 87 | <Narrator>老者的视线在陈平安一扫而过，并无停滞，但是在宋集薪和婢女身上，多有停留，笑意渐渐浓郁。</Narrator>
 88 | <Narrator>宋集薪斜眼道：</Narrator>
 89 | <Character2>"卖！怎么不卖！"</Character2>
 90 | <Narrator>那少年微笑道：</Narrator>
 91 | <Character1>"那你说个价。"</Character1>
 92 | <Narrator>少女瞪大眼眸，满脸匪夷所思，像一头惊慌失措的年幼麋鹿。</Narrator>
 93 | <Narrator>宋集薪翻了个白眼，伸出一根手指，晃了晃，</Narrator>
 94 | <Character2>"白银一万两！"</Character2>
 95 | <Narrator>锦衣少年脸色如常，点头道：</Narrator>
 96 | <Character1>"好。"</Character1>
 97 | <Narrator>宋集薪见那少年不像是开玩笑的样子，连忙改口道：</Narrator>
 98 | <Character2>"是黄金万两！"</Character2>
 99 | <Narrator>锦衣少年嘴角翘起，道：</Narrator>
100 | <Character1>"逗你玩的。"</Character1>
101 | <Narrator>宋集薪脸色阴沉。</Narrator>
102 | ```
103 | 
104 | ### 示例用法
105 | 
106 | 1. 将小说文本输入到 Novel Text Structure 节点
107 | 2. 连接输出到 Index TTS Pro 节点
108 | 3. 设置不同角色的语音
109 | 4. 运行工作流生成多声音小说朗读
110 | 5. 实在不会看我最新增加的工作流
111 | 6. 如果你想在comfyui中一站式完成这个，我推荐你使用各类的llm节点，比如[kimichat](https://github.com/chenpipi0807/PIP_KIMI2comfyui)
112 | 7. 我也提供了一段llm提示词模板，你可以在llm_prompt模板.txt中看到他
113 | 
114 | 
115 | ### 2025-05-18
116 | 
117 | - 优化了长期以来transformers库4.50+版本的API变化与原始IndexTTS模型代码不兼容导致的生成报错问题
118 | 
119 | 
120 | ### 2025-05-16
121 | 
122 | - 新增对**IndexTTS-1.5**模型的支持
123 |   - 现在可以在UI中通过下拉菜单切换不同版本的模型
124 |   - 支持原始的Index-TTS和新的IndexTTS-1.5模型
125 |   - 切换模型时会自动加载相应版本，无需重启ComfyUI
126 |  
127 |   ![微信截图_20250516182957](https://github.com/user-attachments/assets/ce13f02c-9834-43b8-82e9-5567bb226280)
128 |   
129 | 
130 | ### 2025-05-11
131 | - 增加了seed功能，现在linux也可以重复执行抽卡了
132 | - 增加了对 Apple Silicon MPS 设备的检测（仍需测试反馈~）
133 | 
134 | 
135 | ### 2025-04-23
136 | 
137 | ![微信截图_20250423175608](https://github.com/user-attachments/assets/f2b15d8a-3453-4c88-b609-167b372aab74)
138 | 
139 | 
140 | - 新增 **Audio Cleaner** 节点，用于处理TTS输出音频中的混响和杂音问题
141 |   - 该节点可以连接在 Index TTS 节点之后，优化生成音频的质量
142 |   - 主要功能：去除混响、降噪、频率滤波和音频归一化
143 |   - 适用于处理有杂音或混响问题的TTS输出
144 | 
145 | - 修复了对于transformers版本强依赖的问题
146 | 
147 | #### Audio Cleaner 参数说明
148 | 
149 | **必需参数**：：
150 | - **audio**: 输入音频（通常为 Index TTS 节点的输出）
151 | - **denoise_strength**: 降噪强度（0.1-1.0，默认0.5）
152 |   - 值越大，降噪效果越强，但可能影响语音自然度
153 | - **dereverb_strength**: 去混响强度（0.0-1.0，默认0.7）
154 |   - 值越大，去混响效果越强，适合处理在回声环境下录制的参考音频
155 | 
156 | **可选参数**：：
157 | - **high_pass_freq**: 高通滤波器频率（20-500Hz，默认100Hz）
158 |   - 用于过滤低频噪音，如环境嗡嗡声
159 | - **low_pass_freq**: 低通滤波器频率（1000-16000Hz，默认8000Hz）
160 |   - 用于过滤高频噪音
161 | - **normalize**: 是否归一化音频（"true"或"false"，默认"true"）
162 |   - 开启可使音量更均衡
163 | 
164 | #### 使用建议
165 | 
166 | - 对于有明显混响的音频，将 `dereverb_strength` 设置为 0.7-0.9
167 | - 对于有背景噪音的音频，将 `denoise_strength` 设置为 0.5-0.8
168 | - 如果处理后音频听起来不自然，尝试减小 `dereverb_strength` 和 `denoise_strength`
169 | - 高通和低通滤波器可以微调以获得最佳人声效果
170 | 
171 | 
172 | ### 2025-04-25
173 | - 优化了阿拉伯数字的发音判断问题；可以参考这个case使用：“4 0 9 0”会发音四零九零，“4090”会发音四千零九十； 
174 | 
175 | 
176 | ### 2025-04-26
177 | - 优化英文逗号导致吞字的问题；
178 | 
179 | 
180 | ### 2025-04-29
181 | - 修正了语言模式切换en的时候4090依然读中文的问题，auto现在会按照中英文占比确定阿拉伯数字读法
182 | - 新增了从列表读取音频的方法，同时新增了一些音色音频供大家玩耍；你可以将自己喜欢的音频放入 ComfyUI-Index-TTS\TimbreModel 里，当然也很鼓励你能把好玩的声音分享出来。
183 | - 示例用法如图：
184 | 
185 | ![微信截图_20250429112255](https://github.com/user-attachments/assets/a0af9a5b-7609-4c34-adf5-e14321b379a7)
186 | 
187 | 
188 | ## 安装
189 | 
190 | ### 安装节点
191 | 
192 | 1. 将此代码库克隆或下载到ComfyUI的`custom_nodes`目录：
193 | 
194 |    ```bash
195 |    cd ComfyUI/custom_nodes
196 |    git clone https://github.com/chenpipi0807/ComfyUI-Index-TTS.git
197 |    ```
198 | 
199 | 2. 安装依赖： 安装依赖：
200 | 
201 |    ```bash
202 |    cd ComfyUI-Index-TTS
203 |    .\python_embeded\python.exe -m pip install -r requirements.txt
204 | 
205 |    git pull # 更新很频繁你可能需要
206 |    ```
207 | 
208 | ### 下载模型
209 | 
210 | #### 原始版本 (Index-TTS)
211 | 
212 | 1. 从[Hugging Face](https://huggingface.co/IndexTeam/Index-TTS/tree/main)或者[魔搭](https://modelscope.cn/models/IndexTeam/Index-TTS)下载IndexTTS模型文件
213 | 2. 将模型文件放置在`ComfyUI/models/Index-TTS`目录中（如果目录不存在，请创建）
214 | 3. 模型文件夹结构：
215 |    
216 |    ```
217 |    ComfyUI/models/Index-TTS/
218 |    ├── .gitattributes
219 |    ├── bigvgan_discriminator.pth
220 |    ├── bigvgan_generator.pth
221 |    ├── bpe.model
222 |    ├── config.yaml
223 |    ├── configuration.json
224 |    ├── dvae.pth
225 |    ├── gpt.pth
226 |    ├── README.md
227 |    └── unigram_12000.vocab
228 |    ```
229 |    
230 |    确保所有文件都已完整下载，特别是较大的模型文件如`bigvgan_discriminator.pth`(1.6GB)和`gpt.pth`(696MB)。
231 | 
232 | #### 新版本 (IndexTTS-1.5)
233 | 
234 | 1. 从[Hugging Face](https://huggingface.co/IndexTeam/IndexTTS-1.5/tree/main)下载IndexTTS-1.5模型文件
235 | 2. 将模型文件放置在`ComfyUI/models/IndexTTS-1.5`目录中（如果目录不存在，请创建）
236 | 3. 模型文件夹结构与Index-TTS基本相同，但文件大小和内容会有所不同：
237 |    
238 |    ```
239 |    ComfyUI/models/IndexTTS-1.5/
240 |    ├── .gitattributes
241 |    ├── bigvgan_discriminator.pth
242 |    ├── bigvgan_generator.pth
243 |    ├── bpe.model
244 |    ├── config.yaml
245 |    ├── configuration.json
246 |    ├── dvae.pth
247 |    ├── gpt.pth
248 |    ├── README.md
249 |    └── unigram_12000.vocab
250 |    ```
251 | 
252 | ## 使用方法
253 | 
254 | 1. 在ComfyUI中，找到并添加`Index TTS`节点
255 | 2. 连接参考音频输入（AUDIO类型）
256 | 3. 输入要转换为语音的文本
257 | 4. 调整参数（语言、语速等）
258 | 5. 运行工作流获取生成的语音输出
259 | 
260 | ### 示例工作流
261 | 
262 | 项目包含一个基础工作流示例，位于`workflow/workflow.json`，您可以在ComfyUI中通过导入此文件来快速开始使用。
263 | 
264 | ## 参数说明
265 | 
266 | ### 必需参数
267 | 
268 | - **text**: 要转换为语音的文本（支持中英文）
269 | - **reference_audio**: 参考音频，模型会复刻其声音特征
270 | - **model_version**: 模型版本选择，可选项：
271 |   - `Index-TTS`: 原始模型版本（默认）
272 |   - `IndexTTS-1.5`: 新版本模型
273 | - **language**: 文本语言选择，可选项：
274 |   - `auto`: 自动检测语言（默认）
275 |   - `zh`: 强制使用中文模式
276 |   - `en`: 强制使用英文模式
277 | - **speed**: 语速因子（0.5~2.0，默认1.0）
278 | 
279 | ### 可选参数
280 | 
281 | 以下参数适用于高级用户，用于调整语音生成质量和特性：
282 | 
283 | - **temperature** (默认1.0): 控制生成随机性，较高的值增加多样性但可能降低稳定性
284 | - **top_p** (默认0.8): 采样时考虑的概率质量，降低可获得更准确但可能不够自然的发音
285 | - **top_k** (默认30): 采样时考虑的候选项数量
286 | - **repetition_penalty** (默认10.0): 重复内容的惩罚系数
287 | - **length_penalty** (默认0.0): 生成内容长度的调节因子
288 | - **num_beams** (默认3): 束搜索的宽度，增加可提高质量但降低速度
289 | - **max_mel_tokens** (默认600): 最大音频token数量
290 | - **sentence_split** (默认auto): 句子拆分方式
291 | 
292 | ## 音色优化建议
293 | 
294 | 要提高音色相似度：
295 | 
296 | - 使用高质量的参考音频（清晰、无噪音）
297 | - 尝试调整`temperature`参数（0.7-0.9范围内效果较好）
298 | - 增加`repetition_penalty`（10.0-12.0）可以提高音色一致性
299 | - 对于长文本，确保`max_mel_tokens`足够大
300 | 
301 | ## 故障排除
302 | 
303 | 
304 | - 如果出现“模型加载失败”，检查模型文件是否完整且放置在正确目录
305 | - 对于Windows用户，无需额外安装特殊依赖，节点已优化
306 | - 如果显示CUDA错误，尝试重启ComfyUI或减少`num_beams`值
307 | - 如果你是pytorch2.7运行报错，短期无法适配，请尝试降级方案(.\python_embeded\python.exe -m pip install transformers==4.48.3)
308 | 
309 | 
310 | 
311 | ## 鸣谢
312 | 
313 | - 基于原始[IndexTTS](https://github.com/index-tts/index-tts)模型
314 | - 感谢ComfyUI社区的支持
315 | - 感谢使用！
316 | - 
317 | 
318 | ## 许可证
319 | 
320 | 请参考原始IndexTTS项目许可证。
321 | 


--------------------------------------------------------------------------------
/TimbreModel/FF7-爱丽丝.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/FF7-爱丽丝.WAV


--------------------------------------------------------------------------------
/TimbreModel/FF7-蒂法.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/FF7-蒂法.WAV


--------------------------------------------------------------------------------
/TimbreModel/LOL-亚托克斯.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/LOL-亚托克斯.wav


--------------------------------------------------------------------------------
/TimbreModel/三国杀-曹操.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/三国杀-曹操.WAV


--------------------------------------------------------------------------------
/TimbreModel/不羁青年.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/不羁青年.mp3


--------------------------------------------------------------------------------
/TimbreModel/使命召唤-Ghost.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/使命召唤-Ghost.WAV


--------------------------------------------------------------------------------
/TimbreModel/傲娇御姐.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/傲娇御姐.mp3


--------------------------------------------------------------------------------
/TimbreModel/凡人修仙传-紫灵.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/凡人修仙传-紫灵.mp3


--------------------------------------------------------------------------------
/TimbreModel/北京大爷.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/北京大爷.mp3


--------------------------------------------------------------------------------
/TimbreModel/南方小哥.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/南方小哥.mp3


--------------------------------------------------------------------------------
/TimbreModel/原神-胡桃.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/原神-胡桃.WAV


--------------------------------------------------------------------------------
/TimbreModel/原神-雷电将军.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/原神-雷电将军.WAV


--------------------------------------------------------------------------------
/TimbreModel/周杰伦_.flac:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/周杰伦_.flac


--------------------------------------------------------------------------------
/TimbreModel/哪吒 低迷.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/哪吒 低迷.wav


--------------------------------------------------------------------------------
/TimbreModel/嚣张小妞.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/嚣张小妞.mp3


--------------------------------------------------------------------------------
/TimbreModel/守望先锋-DVA.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/守望先锋-DVA.WAV


--------------------------------------------------------------------------------
/TimbreModel/小智机器人-台湾女孩.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/小智机器人-台湾女孩.wav


--------------------------------------------------------------------------------
/TimbreModel/徐志胜.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/徐志胜.wav


--------------------------------------------------------------------------------
/TimbreModel/御姐配音.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/御姐配音.wav


--------------------------------------------------------------------------------
/TimbreModel/恋与深空-夏以昼.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/恋与深空-夏以昼.WAV


--------------------------------------------------------------------------------
/TimbreModel/恋与深空-秦彻.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/恋与深空-秦彻.WAV


--------------------------------------------------------------------------------
/TimbreModel/恋与深空-黎深.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/恋与深空-黎深.WAV


--------------------------------------------------------------------------------
/TimbreModel/憨憨萌兽.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/憨憨萌兽.mp3


--------------------------------------------------------------------------------
/TimbreModel/我的英雄学院-绿谷出久.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/我的英雄学院-绿谷出久.WAV


--------------------------------------------------------------------------------
/TimbreModel/抖音-读小说.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/抖音-读小说.wav


--------------------------------------------------------------------------------
/TimbreModel/播音中年男.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/播音中年男.mp3


--------------------------------------------------------------------------------
/TimbreModel/新闻女声.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/新闻女声.WAV


--------------------------------------------------------------------------------
/TimbreModel/星穹铁道-卡夫卡.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/星穹铁道-卡夫卡.WAV


--------------------------------------------------------------------------------
/TimbreModel/星穹铁道-流莹.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/星穹铁道-流莹.WAV


--------------------------------------------------------------------------------
/TimbreModel/星穹铁道-黄泉.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/星穹铁道-黄泉.WAV


--------------------------------------------------------------------------------
/TimbreModel/机械战甲.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/机械战甲.mp3


--------------------------------------------------------------------------------
/TimbreModel/林志玲.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/林志玲.wav


--------------------------------------------------------------------------------
/TimbreModel/林黛玉.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/林黛玉.wav


--------------------------------------------------------------------------------
/TimbreModel/沉稳高管.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/沉稳高管.mp3


--------------------------------------------------------------------------------
/TimbreModel/清澈弟弟.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/清澈弟弟.mp3


--------------------------------------------------------------------------------
/TimbreModel/温暖少女.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/温暖少女.mp3


--------------------------------------------------------------------------------
/TimbreModel/温暖闺蜜.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/温暖闺蜜.mp3


--------------------------------------------------------------------------------
/TimbreModel/温柔学姐.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/温柔学姐.mp3


--------------------------------------------------------------------------------
/TimbreModel/温润男声.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/温润男声.mp3


--------------------------------------------------------------------------------
/TimbreModel/温润青年.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/温润青年.mp3


--------------------------------------------------------------------------------
/TimbreModel/港普空姐.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/港普空姐.mp3


--------------------------------------------------------------------------------
/TimbreModel/火影-雏田.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/火影-雏田.WAV


--------------------------------------------------------------------------------
/TimbreModel/热心大婶.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/热心大婶.mp3


--------------------------------------------------------------------------------
/TimbreModel/率真小老弟.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/率真小老弟.mp3


--------------------------------------------------------------------------------
/TimbreModel/甜美女声.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/甜美女声.mp3


--------------------------------------------------------------------------------
/TimbreModel/电台男主播.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/电台男主播.mp3


--------------------------------------------------------------------------------
/TimbreModel/碧瑶.MP3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/碧瑶.MP3


--------------------------------------------------------------------------------
/TimbreModel/纳西妲.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/纳西妲.mp3


--------------------------------------------------------------------------------
/TimbreModel/罗振宇老师.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/罗振宇老师.wav


--------------------------------------------------------------------------------
/TimbreModel/花甲奶奶.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/花甲奶奶.mp3


--------------------------------------------------------------------------------
/TimbreModel/蔡徐坤.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/蔡徐坤.wav


--------------------------------------------------------------------------------
/TimbreModel/说书人.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/说书人.WAV


--------------------------------------------------------------------------------
/TimbreModel/软糯女孩.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/软糯女孩.mp3


--------------------------------------------------------------------------------
/TimbreModel/钟离.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/钟离.wav


--------------------------------------------------------------------------------
/TimbreModel/阅历姐姐.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/阅历姐姐.mp3


--------------------------------------------------------------------------------
/TimbreModel/随性男青.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/随性男青.mp3


--------------------------------------------------------------------------------
/TimbreModel/鬼灭之刃-炭治郎.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/鬼灭之刃-炭治郎.WAV


--------------------------------------------------------------------------------
/TimbreModel/鬼灭之刃-祢豆子.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/TimbreModel/鬼灭之刃-祢豆子.WAV


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
 1 | """ 
 2 | @title: IndexTTS for ComfyUI
 3 | @author: ComfyUI-Index-TTS
 4 | @description: ComfyUI接口的工业级零样本文本到语音合成系统
 5 | """
 6 | 
 7 | import os
 8 | import sys
 9 | 
10 | # 确保当前目录在导入路径中
11 | current_dir = os.path.dirname(os.path.abspath(__file__))
12 | if current_dir not in sys.path:
13 |     sys.path.append(current_dir)
14 | 
15 | # 导入节点定义
16 | from .nodes import IndexTTSNode
17 | from .audio_enhancement import AudioCleanupNode
18 | from .timbre_audio_loader import TimbreAudioLoader
19 | from .novel_text_parser import NovelTextStructureNode  # 导入小说文本结构化节点
20 | from .index_tts_pro import IndexTTSProNode  # 导入增强版TTS节点
21 | 
22 | # 注册ComfyUI节点
23 | NODE_CLASS_MAPPINGS = {
24 |     "IndexTTSNode": IndexTTSNode,
25 |     "AudioCleanupNode": AudioCleanupNode,
26 |     "TimbreAudioLoader": TimbreAudioLoader,
27 |     "NovelTextStructureNode": NovelTextStructureNode,  # 添加小说文本结构化节点
28 |     "IndexTTSProNode": IndexTTSProNode,             # 添加增强版TTS节点
29 | }
30 | 
31 | NODE_DISPLAY_NAME_MAPPINGS = {
32 |     "IndexTTSNode": "Index TTS",
33 |     "AudioCleanupNode": "Audio Cleaner",
34 |     "TimbreAudioLoader": "Timbre音频加载器",
35 |     "NovelTextStructureNode": "小说文本结构化",   # 添加小说文本结构化节点显示名称
36 |     "IndexTTSProNode": "Index TTS Pro",         # 添加增强版TTS节点显示名称
37 | }
38 | 
39 | __all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
40 | 


--------------------------------------------------------------------------------
/audio_enhancement.py:
--------------------------------------------------------------------------------
  1 | """
  2 | @title: 音频增强处理模块
  3 | @author: ComfyUI-Index-TTS
  4 | @description: 用于处理和增强TTS生成的音频质量
  5 | """
  6 | 
  7 | import os
  8 | import sys
  9 | import numpy as np
 10 | import torch
 11 | import tempfile
 12 | import time
 13 | import librosa
 14 | import soundfile as sf
 15 | from scipy import signal as scipy_signal
 16 | 
 17 | # 确保当前目录在导入路径中
 18 | current_dir = os.path.dirname(os.path.abspath(__file__))
 19 | if current_dir not in sys.path:
 20 |     sys.path.append(current_dir)
 21 | 
 22 | 
 23 | class AudioCleanupNode:
 24 |     """
 25 |     ComfyUI的音频清理节点，用于去除混响和杂音，提高人声质量
 26 |     """
 27 |     
 28 |     @classmethod
 29 |     def INPUT_TYPES(cls):
 30 |         return {
 31 |             "required": {
 32 |                 "audio": ("AUDIO", ),
 33 |                 "denoise_strength": ("FLOAT", {"default": 0.5, "min": 0.1, "max": 1.0, "step": 0.05}),
 34 |                 "dereverb_strength": ("FLOAT", {"default": 0.7, "min": 0.0, "max": 1.0, "step": 0.05}),
 35 |             },
 36 |             "optional": {
 37 |                 "high_pass_freq": ("FLOAT", {"default": 100.0, "min": 20.0, "max": 500.0, "step": 10.0}),
 38 |                 "low_pass_freq": ("FLOAT", {"default": 8000.0, "min": 1000.0, "max": 16000.0, "step": 100.0}),
 39 |                 "normalize": (["true", "false"], {"default": "true"}),
 40 |             }
 41 |         }
 42 |     
 43 |     RETURN_TYPES = ("AUDIO",)
 44 |     RETURN_NAMES = ("enhanced_audio",)
 45 |     FUNCTION = "enhance_audio"
 46 |     CATEGORY = "audio"
 47 |     
 48 |     def __init__(self):
 49 |         print("[AudioCleanup] 初始化音频清理节点")
 50 |     
 51 |     def enhance_audio(self, audio, denoise_strength=0.5, dereverb_strength=0.7, 
 52 |                      high_pass_freq=100.0, low_pass_freq=8000.0, normalize="true"):
 53 |         """
 54 |         增强音频质量，去除杂音和混响
 55 |         
 56 |         参数:
 57 |             audio: ComfyUI音频格式，字典包含 "waveform" 和 "sample_rate"
 58 |             denoise_strength: 降噪强度，0.1到1.0
 59 |             dereverb_strength: 去混响强度，0.0到1.0
 60 |             high_pass_freq: 高通滤波频率，20到500
 61 |             low_pass_freq: 低通滤波频率，1000到16000
 62 |             normalize: 是否归一化音频，"true"或"false"
 63 |             
 64 |         返回:
 65 |             enhanced_audio: 增强后的音频，ComfyUI音频格式
 66 |         """
 67 |         try:
 68 |             print(f"[AudioCleanup] 开始处理音频")
 69 |             
 70 |             # 处理ComfyUI的音频格式
 71 |             if isinstance(audio, dict) and "waveform" in audio and "sample_rate" in audio:
 72 |                 waveform = audio["waveform"]
 73 |                 sample_rate = audio["sample_rate"]
 74 |                 
 75 |                 print(f"[AudioCleanup] 输入音频格式: ComfyUI字典格式, sample_rate={sample_rate}")
 76 |                 print(f"[AudioCleanup] waveform类型: {type(waveform)}, 形状: {waveform.shape if hasattr(waveform, 'shape') else '未知'}")
 77 |                 
 78 |                 # 如果是tensor，转换为numpy
 79 |                 if isinstance(waveform, torch.Tensor):
 80 |                     # 确保我们处理的是一个二维数组 [通道, 样本]
 81 |                     if waveform.dim() == 3:  # [batch, 通道, 样本]
 82 |                         waveform = waveform.squeeze(0)  # 移除batch维度
 83 |                     waveform = waveform.cpu().numpy()
 84 |                     
 85 |                     # 如果是多通道，取第一个通道
 86 |                     if waveform.ndim > 1 and waveform.shape[0] > 1:
 87 |                         print(f"[AudioCleanup] 检测到多通道音频({waveform.shape[0]}通道)，使用第一个通道")
 88 |                         audio_data = waveform[0]
 89 |                     else:
 90 |                         audio_data = waveform.squeeze()  # 确保是一维数组
 91 |                 else:
 92 |                     audio_data = waveform
 93 |                     
 94 |                 print(f"[AudioCleanup] 处理前的音频形状: {audio_data.shape}")
 95 |                 print(f"[AudioCleanup] 处理参数: 降噪强度={denoise_strength}, 去混响强度={dereverb_strength}")
 96 |                 print(f"[AudioCleanup] 滤波设置: 高通={high_pass_freq}Hz, 低通={low_pass_freq}Hz, 归一化={normalize}")
 97 |                 
 98 |                 # 开始音频处理流程
 99 |                 enhanced_audio = audio_data.copy()
100 |                 
101 |                 # 1. 应用高通滤波器去除低频噪音
102 |                 if high_pass_freq > 20.0:
103 |                     print(f"[AudioCleanup] 应用高通滤波器，截止频率: {high_pass_freq}Hz")
104 |                     b, a = scipy_signal.butter(4, high_pass_freq / (sample_rate / 2), 'highpass')
105 |                     enhanced_audio = scipy_signal.filtfilt(b, a, enhanced_audio)
106 |                 
107 |                 # 2. 应用低通滤波器去除高频噪音
108 |                 if low_pass_freq < 16000.0:
109 |                     print(f"[AudioCleanup] 应用低通滤波器，截止频率: {low_pass_freq}Hz")
110 |                     b, a = scipy_signal.butter(4, low_pass_freq / (sample_rate / 2), 'lowpass')
111 |                     enhanced_audio = scipy_signal.filtfilt(b, a, enhanced_audio)
112 |                 
113 |                 # 3. 降噪处理
114 |                 if denoise_strength > 0.1:
115 |                     print(f"[AudioCleanup] 应用降噪处理，强度: {denoise_strength}")
116 |                     # 使用谱减法降噪
117 |                     # 计算短时傅里叶变换(STFT)
118 |                     stft = librosa.stft(enhanced_audio)
119 |                     
120 |                     # 估计噪声谱
121 |                     noise_stft = np.abs(stft[:, :int(stft.shape[1] * 0.1)])  # 使用前10%作为噪声估计
122 |                     noise_spec = np.mean(noise_stft, axis=1)
123 |                     
124 |                     # 谱减法
125 |                     spec = np.abs(stft)
126 |                     phase = np.angle(stft)
127 |                     spec_sub = np.maximum(spec - denoise_strength * np.expand_dims(noise_spec, 1), 0)
128 |                     
129 |                     # 重建信号
130 |                     enhanced_stft = spec_sub * np.exp(1j * phase)
131 |                     enhanced_audio = librosa.istft(enhanced_stft, length=len(enhanced_audio))
132 |                 
133 |                 # 4. 去混响处理
134 |                 if dereverb_strength > 0.0:
135 |                     print(f"[AudioCleanup] 应用去混响处理，强度: {dereverb_strength}")
136 |                     # 简化的去混响方法 - 使用谱包络增强
137 |                     D = librosa.stft(enhanced_audio)
138 |                     S_db = librosa.amplitude_to_db(np.abs(D))
139 |                     
140 |                     # 应用谱包络增强
141 |                     percentile = int((1 - dereverb_strength) * 100)
142 |                     percentile = max(1, min(percentile, 99))  # 确保在有效范围内
143 |                     S_enhanced = np.percentile(S_db, percentile, axis=1)
144 |                     S_enhanced = np.expand_dims(S_enhanced, 1)
145 |                     
146 |                     # 应用增强
147 |                     gain = np.repeat(S_enhanced, S_db.shape[1], axis=1)
148 |                     S_db_enhanced = S_db * dereverb_strength + gain * (1 - dereverb_strength)
149 |                     
150 |                     # 转回时域
151 |                     S_enhanced = librosa.db_to_amplitude(S_db_enhanced)
152 |                     phase = np.angle(D)
153 |                     D_enhanced = S_enhanced * np.exp(1j * phase)
154 |                     enhanced_audio = librosa.istft(D_enhanced, length=len(enhanced_audio))
155 |                 
156 |                 # 5. 归一化
157 |                 if normalize == "true":
158 |                     print(f"[AudioCleanup] 应用音频归一化")
159 |                     enhanced_audio = librosa.util.normalize(enhanced_audio)
160 |                 
161 |                 # 输出处理结果信息
162 |                 original_rms = np.sqrt(np.mean(audio_data ** 2))
163 |                 enhanced_rms = np.sqrt(np.mean(enhanced_audio ** 2))
164 |                 print(f"[AudioCleanup] 原始音频RMS: {original_rms:.6f}")
165 |                 print(f"[AudioCleanup] 增强后音频RMS: {enhanced_rms:.6f}")
166 |                 print(f"[AudioCleanup] RMS变化比例: {enhanced_rms/original_rms if original_rms > 0 else 'N/A'}")
167 |                 
168 |                 # 转换为torch tensor并设置为ComfyUI期望的格式
169 |                 enhanced_tensor = torch.tensor(enhanced_audio, dtype=torch.float32)
170 |                 
171 |                 # 确保是3D张量 [batch, channels, samples]
172 |                 if enhanced_tensor.dim() == 1:
173 |                     enhanced_tensor = enhanced_tensor.unsqueeze(0).unsqueeze(0)  # [1, 1, samples]
174 |                 
175 |                 # 返回ComfyUI音频格式
176 |                 enhanced_dict = {
177 |                     "waveform": enhanced_tensor,
178 |                     "sample_rate": sample_rate
179 |                 }
180 |                 
181 |                 print(f"[AudioCleanup] 音频增强完成，输出形状: {enhanced_tensor.shape}")
182 |                 return (enhanced_dict,)
183 |             else:
184 |                 print(f"[AudioCleanup] 错误: 输入音频格式不正确: {type(audio)}")
185 |                 raise ValueError("输入音频格式不支持，应为ComfyUI的AUDIO类型")
186 |                 
187 |         except Exception as e:
188 |             import traceback
189 |             print(f"[AudioCleanup] 处理音频失败: {e}")
190 |             print(f"[AudioCleanup] 错误详情:")
191 |             traceback.print_exc()
192 |             
193 |             # 生成一个简单的错误提示音频
194 |             sample_rate = 24000
195 |             duration = 1.0  # 1秒
196 |             t = np.linspace(0, duration, int(sample_rate * duration))
197 |             warning_tone = np.sin(2 * np.pi * 880 * t).astype(np.float32)  # 880Hz警告音
198 |             print(f"[AudioCleanup] 生成警告音频作为错误处理")
199 |             
200 |             # 转换为ComfyUI音频格式
201 |             signal_tensor = torch.tensor(warning_tone, dtype=torch.float32).unsqueeze(0).unsqueeze(0)  # [1, 1, samples]
202 |             audio_dict = {
203 |                 "waveform": signal_tensor,
204 |                 "sample_rate": sample_rate
205 |             }
206 |             
207 |             return (audio_dict,)
208 | 


--------------------------------------------------------------------------------
/indextts/BigVGAN/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/__init__.py


--------------------------------------------------------------------------------
/indextts/BigVGAN/activations.py:
--------------------------------------------------------------------------------
  1 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
  2 | #   LICENSE is in incl_licenses directory.
  3 | 
  4 | import torch
  5 | from torch import nn, pow, sin
  6 | from torch.nn import Parameter
  7 | 
  8 | 
  9 | class Snake(nn.Module):
 10 |     '''
 11 |     Implementation of a sine-based periodic activation function
 12 |     Shape:
 13 |         - Input: (B, C, T)
 14 |         - Output: (B, C, T), same shape as the input
 15 |     Parameters:
 16 |         - alpha - trainable parameter
 17 |     References:
 18 |         - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
 19 |         https://arxiv.org/abs/2006.08195
 20 |     Examples:
 21 |         >>> a1 = snake(256)
 22 |         >>> x = torch.randn(256)
 23 |         >>> x = a1(x)
 24 |     '''
 25 | 
 26 |     def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
 27 |         '''
 28 |         Initialization.
 29 |         INPUT:
 30 |             - in_features: shape of the input
 31 |             - alpha: trainable parameter
 32 |             alpha is initialized to 1 by default, higher values = higher-frequency.
 33 |             alpha will be trained along with the rest of your model.
 34 |         '''
 35 |         super(Snake, self).__init__()
 36 |         self.in_features = in_features
 37 | 
 38 |         # initialize alpha
 39 |         self.alpha_logscale = alpha_logscale
 40 |         if self.alpha_logscale:  # log scale alphas initialized to zeros
 41 |             self.alpha = Parameter(torch.zeros(in_features) * alpha)
 42 |         else:  # linear scale alphas initialized to ones
 43 |             self.alpha = Parameter(torch.ones(in_features) * alpha)
 44 | 
 45 |         self.alpha.requires_grad = alpha_trainable
 46 | 
 47 |         self.no_div_by_zero = 0.000000001
 48 | 
 49 |     def forward(self, x):
 50 |         '''
 51 |         Forward pass of the function.
 52 |         Applies the function to the input elementwise.
 53 |         Snake ∶= x + 1/a * sin^2 (xa)
 54 |         '''
 55 |         alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
 56 |         if self.alpha_logscale:
 57 |             alpha = torch.exp(alpha)
 58 |         x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
 59 | 
 60 |         return x
 61 | 
 62 | 
 63 | class SnakeBeta(nn.Module):
 64 |     '''
 65 |     A modified Snake function which uses separate parameters for the magnitude of the periodic components
 66 |     Shape:
 67 |         - Input: (B, C, T)
 68 |         - Output: (B, C, T), same shape as the input
 69 |     Parameters:
 70 |         - alpha - trainable parameter that controls frequency
 71 |         - beta - trainable parameter that controls magnitude
 72 |     References:
 73 |         - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
 74 |         https://arxiv.org/abs/2006.08195
 75 |     Examples:
 76 |         >>> a1 = snakebeta(256)
 77 |         >>> x = torch.randn(256)
 78 |         >>> x = a1(x)
 79 |     '''
 80 | 
 81 |     def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
 82 |         '''
 83 |         Initialization.
 84 |         INPUT:
 85 |             - in_features: shape of the input
 86 |             - alpha - trainable parameter that controls frequency
 87 |             - beta - trainable parameter that controls magnitude
 88 |             alpha is initialized to 1 by default, higher values = higher-frequency.
 89 |             beta is initialized to 1 by default, higher values = higher-magnitude.
 90 |             alpha will be trained along with the rest of your model.
 91 |         '''
 92 |         super(SnakeBeta, self).__init__()
 93 |         self.in_features = in_features
 94 | 
 95 |         # initialize alpha
 96 |         self.alpha_logscale = alpha_logscale
 97 |         if self.alpha_logscale:  # log scale alphas initialized to zeros
 98 |             self.alpha = Parameter(torch.zeros(in_features) * alpha)
 99 |             self.beta = Parameter(torch.zeros(in_features) * alpha)
100 |         else:  # linear scale alphas initialized to ones
101 |             self.alpha = Parameter(torch.ones(in_features) * alpha)
102 |             self.beta = Parameter(torch.ones(in_features) * alpha)
103 | 
104 |         self.alpha.requires_grad = alpha_trainable
105 |         self.beta.requires_grad = alpha_trainable
106 | 
107 |         self.no_div_by_zero = 0.000000001
108 | 
109 |     def forward(self, x):
110 |         '''
111 |         Forward pass of the function.
112 |         Applies the function to the input elementwise.
113 |         SnakeBeta ∶= x + 1/b * sin^2 (xa)
114 |         '''
115 |         alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
116 |         beta = self.beta.unsqueeze(0).unsqueeze(-1)
117 |         if self.alpha_logscale:
118 |             alpha = torch.exp(alpha)
119 |             beta = torch.exp(beta)
120 |         x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
121 | 
122 |         return x
123 | 


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/cuda/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/alias_free_activation/cuda/__init__.py


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/cuda/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/alias_free_activation/cuda/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/cuda/__pycache__/load.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/alias_free_activation/cuda/__pycache__/load.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024 NVIDIA CORPORATION.
 2 | #   Licensed under the MIT license.
 3 | 
 4 | import torch
 5 | import torch.nn as nn
 6 | # load fused CUDA kernel: this enables importing anti_alias_activation_cuda
 7 | from indextts.BigVGAN.alias_free_activation.cuda import load
 8 | from indextts.BigVGAN.alias_free_activation.torch.resample import DownSample1d, UpSample1d
 9 | 
10 | anti_alias_activation_cuda = load.load()
11 | 
12 | 
13 | class FusedAntiAliasActivation(torch.autograd.Function):
14 |     """
15 |     Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs.
16 |     The hyperparameters are hard-coded in the kernel to maximize speed.
17 |     NOTE: The fused kenrel is incorrect for Activation1d with different hyperparameters.
18 |     """
19 | 
20 |     @staticmethod
21 |     def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta):
22 |         activation_results = anti_alias_activation_cuda.forward(
23 |             inputs, up_ftr, down_ftr, alpha, beta
24 |         )
25 | 
26 |         return activation_results
27 | 
28 |     @staticmethod
29 |     def backward(ctx, output_grads):
30 |         raise NotImplementedError
31 |         return output_grads, None, None
32 | 
33 | 
34 | class Activation1d(nn.Module):
35 |     def __init__(
36 |         self,
37 |         activation,
38 |         up_ratio: int = 2,
39 |         down_ratio: int = 2,
40 |         up_kernel_size: int = 12,
41 |         down_kernel_size: int = 12,
42 |         fused: bool = True,
43 |     ):
44 |         super().__init__()
45 |         self.up_ratio = up_ratio
46 |         self.down_ratio = down_ratio
47 |         self.act = activation
48 |         self.upsample = UpSample1d(up_ratio, up_kernel_size)
49 |         self.downsample = DownSample1d(down_ratio, down_kernel_size)
50 | 
51 |         self.fused = fused  # Whether to use fused CUDA kernel or not
52 | 
53 |     def forward(self, x):
54 |         if not self.fused:
55 |             x = self.upsample(x)
56 |             x = self.act(x)
57 |             x = self.downsample(x)
58 |             return x
59 |         else:
60 |             if self.act.__class__.__name__ == "Snake":
61 |                 beta = self.act.alpha.data  # Snake uses same params for alpha and beta
62 |             else:
63 |                 beta = (
64 |                     self.act.beta.data
65 |                 )  # Snakebeta uses different params for alpha and beta
66 |             alpha = self.act.alpha.data
67 |             if (
68 |                 not self.act.alpha_logscale
69 |             ):  # Exp baked into cuda kernel, cancel it out with a log
70 |                 alpha = torch.log(alpha)
71 |                 beta = torch.log(beta)
72 | 
73 |             x = FusedAntiAliasActivation.apply(
74 |                 x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta
75 |             )
76 |             return x
77 | 


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 |  #include <torch/extension.h>
18 | 
19 | extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta);
20 | 
21 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
22 |     m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)");
23 | }


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu:
--------------------------------------------------------------------------------
  1 | /* coding=utf-8
  2 |  * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | #include <ATen/ATen.h>
 18 | #include <cuda.h>
 19 | #include <cuda_runtime.h>
 20 | #include <cuda_fp16.h>
 21 | #include <cuda_profiler_api.h>
 22 | #include <ATen/cuda/CUDAContext.h>
 23 | #include <torch/extension.h>
 24 | #include "type_shim.h"
 25 | #include <assert.h>
 26 | #include <cfloat>
 27 | #include <limits>
 28 | #include <stdint.h>
 29 | #include <c10/macros/Macros.h>
 30 | 
 31 | namespace
 32 | {
 33 |     // Hard-coded hyperparameters
 34 |     // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
 35 |     constexpr int ELEMENTS_PER_LDG_STG = 1; //(WARP_ITERATIONS < 4) ? 1 : 4;
 36 |     constexpr int BUFFER_SIZE = 32;
 37 |     constexpr int FILTER_SIZE = 12;
 38 |     constexpr int HALF_FILTER_SIZE = 6;
 39 |     constexpr int UPSAMPLE_REPLICATION_PAD = 5; // 5 on each side, matching torch impl
 40 |     constexpr int DOWNSAMPLE_REPLICATION_PAD_LEFT = 5; // matching torch impl
 41 |     constexpr int DOWNSAMPLE_REPLICATION_PAD_RIGHT = 6; // matching torch impl
 42 | 
 43 |     template <typename input_t, typename output_t, typename acc_t>
 44 |     __global__ void anti_alias_activation_forward(
 45 |         output_t *dst,
 46 |         const input_t *src,
 47 |         const acc_t *up_ftr,
 48 |         const acc_t *down_ftr,
 49 |         const acc_t *alpha,
 50 |         const acc_t *beta,
 51 |         int batch_size,
 52 |         int channels,
 53 |         int seq_len)
 54 |     {
 55 |         // Up and downsample filters
 56 |         input_t up_filter[FILTER_SIZE];
 57 |         input_t down_filter[FILTER_SIZE];
 58 | 
 59 |         // Load data from global memory including extra indices reserved for replication paddings
 60 |         input_t elements[2 * FILTER_SIZE + 2 * BUFFER_SIZE + 2 * UPSAMPLE_REPLICATION_PAD] = {0};
 61 |         input_t intermediates[2 * FILTER_SIZE + 2 * BUFFER_SIZE + DOWNSAMPLE_REPLICATION_PAD_LEFT + DOWNSAMPLE_REPLICATION_PAD_RIGHT] = {0};
 62 | 
 63 |         // Output stores downsampled output before writing to dst
 64 |         output_t output[BUFFER_SIZE];
 65 | 
 66 |         // blockDim/threadIdx = (128, 1, 1)
 67 |         // gridDim/blockIdx = (seq_blocks, channels, batches)
 68 |         int block_offset = (blockIdx.x * 128 * BUFFER_SIZE + seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
 69 |         int local_offset = threadIdx.x * BUFFER_SIZE;
 70 |         int seq_offset = blockIdx.x * 128 * BUFFER_SIZE + local_offset;
 71 | 
 72 |         // intermediate have double the seq_len
 73 |         int intermediate_local_offset = threadIdx.x * BUFFER_SIZE * 2;
 74 |         int intermediate_seq_offset = blockIdx.x * 128 * BUFFER_SIZE * 2 + intermediate_local_offset;
 75 | 
 76 |         // Get values needed for replication padding before moving pointer
 77 |         const input_t *right_most_pntr = src + (seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
 78 |         input_t seq_left_most_value = right_most_pntr[0];
 79 |         input_t seq_right_most_value = right_most_pntr[seq_len - 1];
 80 | 
 81 |         // Move src and dst pointers
 82 |         src += block_offset + local_offset;
 83 |         dst += block_offset + local_offset;
 84 | 
 85 |         // Alpha and beta values for snake activatons. Applies exp by default
 86 |         alpha = alpha + blockIdx.y;
 87 |         beta = beta + blockIdx.y;
 88 | 
 89 |         acc_t alpha_val = expf(alpha[0]);
 90 |         acc_t beta_val = expf(beta[0]);
 91 | 
 92 |         #pragma unroll
 93 |         for (int it = 0; it < FILTER_SIZE; it += 1)
 94 |         {
 95 |             up_filter[it] = up_ftr[it];
 96 |             down_filter[it] = down_ftr[it];
 97 |         }
 98 | 
 99 |         // Apply replication padding for upsampling, matching torch impl
100 |         #pragma unroll
101 |         for (int it = -HALF_FILTER_SIZE; it < BUFFER_SIZE + HALF_FILTER_SIZE; it += 1)
102 |         {
103 |             int element_index = seq_offset + it; // index for element
104 |             if ((element_index < 0) && (element_index >= -UPSAMPLE_REPLICATION_PAD))
105 |             {
106 |                 elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_left_most_value;
107 |             }
108 |             if ((element_index >= seq_len) && (element_index < seq_len + UPSAMPLE_REPLICATION_PAD))
109 |             {
110 |                 elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_right_most_value;
111 |             }
112 |             if ((element_index >= 0) && (element_index < seq_len))
113 |             {
114 |                 elements[2 * (HALF_FILTER_SIZE + it)] = 2 * src[it];
115 |             }
116 |         }
117 | 
118 |         // Apply upsampling strided convolution and write to intermediates. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT for replication padding of the downsampilng conv later
119 |         #pragma unroll
120 |         for (int it = 0; it < (2 * BUFFER_SIZE + 2 * FILTER_SIZE); it += 1)
121 |         {
122 |             acc_t acc = 0.0;
123 |             int element_index = intermediate_seq_offset + it; // index for intermediate
124 |             #pragma unroll
125 |             for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
126 |             {
127 |                 if ((element_index + f_idx) >= 0)
128 |                 {
129 |                     acc += up_filter[f_idx] * elements[it + f_idx];
130 |                 }
131 |             }
132 |             intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] = acc;
133 |         }
134 | 
135 |         // Apply activation function. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT and DOWNSAMPLE_REPLICATION_PAD_RIGHT for replication padding of the downsampilng conv later
136 |         double no_div_by_zero = 0.000000001;
137 |         #pragma unroll
138 |         for (int it = 0; it < 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it += 1)
139 |         {
140 |             acc_t a = sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val);
141 |             intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] += (1.0 / (beta_val + no_div_by_zero)) * a * a;
142 |         }
143 | 
144 |         // Apply replication padding before downsampling conv from intermediates
145 |         #pragma unroll
146 |         for (int it = 0; it < DOWNSAMPLE_REPLICATION_PAD_LEFT; it += 1)
147 |         {
148 |             intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT];
149 |         }
150 |         #pragma unroll
151 |         for (int it = DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it < DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE + DOWNSAMPLE_REPLICATION_PAD_RIGHT; it += 1)
152 |         {
153 |             intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE - 1];
154 |         }
155 | 
156 |         // Apply downsample strided convolution (assuming stride=2) from intermediates
157 |         #pragma unroll
158 |         for (int it = 0; it < BUFFER_SIZE; it += 1)
159 |         {
160 |             acc_t acc = 0.0;
161 |             #pragma unroll
162 |             for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
163 |             {
164 |                 // Add constant DOWNSAMPLE_REPLICATION_PAD_RIGHT to match torch implementation
165 |                 acc += down_filter[f_idx] * intermediates[it * 2 + f_idx + DOWNSAMPLE_REPLICATION_PAD_RIGHT];
166 |             }
167 |             output[it] = acc;
168 |         }
169 | 
170 |         // Write output to dst
171 |         #pragma unroll
172 |         for (int it = 0;  it < BUFFER_SIZE;  it += ELEMENTS_PER_LDG_STG)
173 |         {
174 |             int element_index = seq_offset + it;
175 |             if (element_index < seq_len)
176 |             {
177 |                 dst[it] = output[it];
178 |             }
179 |         }
180 | 
181 |     }
182 | 
183 |     template <typename input_t, typename output_t, typename acc_t>
184 |     void dispatch_anti_alias_activation_forward(
185 |         output_t *dst,
186 |         const input_t *src,
187 |         const acc_t *up_ftr,
188 |         const acc_t *down_ftr,
189 |         const acc_t *alpha,
190 |         const acc_t *beta,
191 |         int batch_size,
192 |         int channels,
193 |         int seq_len)
194 |     {
195 |         if (seq_len == 0)
196 |         {
197 |             return;
198 |         }
199 |         else
200 |         {
201 |             // Use 128 threads per block to maximimize gpu utilization
202 |             constexpr int threads_per_block = 128;
203 |             constexpr int seq_len_per_block = 4096;
204 |             int blocks_per_seq_len = (seq_len + seq_len_per_block - 1) / seq_len_per_block;
205 |             dim3 blocks(blocks_per_seq_len, channels, batch_size);
206 |             dim3 threads(threads_per_block, 1, 1);
207 | 
208 |             anti_alias_activation_forward<input_t, output_t, acc_t>
209 |                 <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, up_ftr, down_ftr, alpha, beta, batch_size, channels, seq_len);
210 |         }
211 |     }
212 | }
213 | 
214 | extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta)
215 | {
216 |     // Input is a 3d tensor with dimensions [batches, channels, seq_len]
217 |     const int batches = input.size(0);
218 |     const int channels = input.size(1);
219 |     const int seq_len = input.size(2);
220 | 
221 |     // Output
222 |     auto act_options = input.options().requires_grad(false);
223 | 
224 |     torch::Tensor anti_alias_activation_results =
225 |         torch::empty({batches, channels, seq_len}, act_options);
226 | 
227 |     using float32 = float;
228 |     // The dtype of input is float16, bfloat16, or float32
229 |     // The dtype of up_filter, down_filter, alpha, and beta is float32
230 |     // printf("input scalar type: %d\n", input.scalar_type());
231 |     // printf("up_filter scalar type: %d\n", up_filter.scalar_type());
232 |     // printf("down_filter scalar type: %d\n", down_filter.scalar_type());
233 |     // printf("alpha scalar type: %d\n", alpha.scalar_type());
234 |     // printf("beta scalar type: %d\n", beta.scalar_type());
235 |     void *input_ptr = static_cast<void *>(input.data_ptr());
236 |     float32 *up_filter_ptr = static_cast<float32 *>(up_filter.data_ptr());
237 |     float32 *down_filter_ptr = static_cast<float32 *>(down_filter.data_ptr());
238 |     float32 *alpha_ptr = static_cast<float32 *>(alpha.data_ptr());
239 |     float32 *beta_ptr = static_cast<float32 *>(beta.data_ptr());
240 |     void *anti_alias_activation_results_ptr = static_cast<void *>(anti_alias_activation_results.data_ptr());
241 | 
242 |     DISPATCH_FLOAT_HALF_AND_BFLOAT(
243 |         input.scalar_type(),
244 |         "dispatch anti alias activation_forward",
245 |         dispatch_anti_alias_activation_forward<scalar_t, scalar_t, float32>(
246 |             reinterpret_cast<scalar_t *>(anti_alias_activation_results_ptr),
247 |             reinterpret_cast<const scalar_t *>(input_ptr),
248 |             reinterpret_cast<const float32 *>(up_filter_ptr),
249 |             reinterpret_cast<const float32 *>(down_filter_ptr),
250 |             reinterpret_cast<const float32 *>(alpha_ptr),
251 |             reinterpret_cast<const float32 *>(beta_ptr),
252 |             batches,
253 |             channels,
254 |             seq_len););
255 |     return anti_alias_activation_results;
256 | }


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/cuda/compat.h:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | /*This code is copied fron NVIDIA apex:
18 |  *     https://github.com/NVIDIA/apex
19 |  *     with minor changes. */
20 | 
21 | #ifndef TORCH_CHECK
22 | #define TORCH_CHECK AT_CHECK
23 | #endif
24 | 
25 | #ifdef VERSION_GE_1_3
26 | #define DATA_PTR data_ptr
27 | #else
28 | #define DATA_PTR data
29 | #endif
30 | 


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/cuda/load.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2024 NVIDIA CORPORATION.
  2 | #   Licensed under the MIT license.
  3 | 
  4 | import os
  5 | import pathlib
  6 | import subprocess
  7 | 
  8 | from torch.utils import cpp_extension
  9 | 
 10 | """
 11 | Setting this param to a list has a problem of generating different compilation commands (with diferent order of architectures) and leading to recompilation of fused kernels. 
 12 | Set it to empty stringo avoid recompilation and assign arch flags explicity in extra_cuda_cflags below
 13 | """
 14 | os.environ["TORCH_CUDA_ARCH_LIST"] = ""
 15 | 
 16 | 
 17 | import re
 18 | import shutil
 19 | import tempfile
 20 | 
 21 | # 补丁修复：sources 路径含中文字符时，生成 build.ninja 乱码导致编译失败
 22 | # 使用临时目录来规避 ninja 编译失败（比如中文路径）
 23 | def chinese_path_compile_support(sources, buildpath):
 24 |     pattern = re.compile(r'[\u4e00-\u9fff]')  
 25 |     if not bool(pattern.search(str(sources[0].resolve()))):
 26 |         return buildpath # 检测非中文路径跳过
 27 |     # Create build directory
 28 |     resolves = [ item.name for item in sources]
 29 |     ninja_compile_dir = os.path.join(tempfile.gettempdir(), "BigVGAN", "cuda")
 30 |     os.makedirs(ninja_compile_dir, exist_ok=True)
 31 |     new_buildpath = os.path.join(ninja_compile_dir, "build")
 32 |     os.makedirs(new_buildpath, exist_ok=True)
 33 |     print(f"ninja_buildpath: {new_buildpath}")
 34 |     # Copy files to directory
 35 |     sources.clear()
 36 |     current_dir = os.path.dirname(__file__)
 37 |     ALLOWED_EXTENSIONS = {'.py', '.cu', '.cpp', '.h'}
 38 |     for filename in os.listdir(current_dir):
 39 |         item = pathlib.Path(current_dir).joinpath(filename)
 40 |         tar_path = pathlib.Path(ninja_compile_dir).joinpath(item.name)
 41 |         if not item.suffix.lower() in ALLOWED_EXTENSIONS:continue
 42 |         pathlib.Path(shutil.copy2(item, tar_path))
 43 |         if tar_path.name in resolves:sources.append(tar_path)
 44 |     return new_buildpath
 45 | 
 46 | 
 47 | 
 48 | def load():
 49 |     # Check if cuda 11 is installed for compute capability 8.0
 50 |     cc_flag = []
 51 |     _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
 52 |     if int(bare_metal_major) >= 11:
 53 |         cc_flag.append("-gencode")
 54 |         cc_flag.append("arch=compute_80,code=sm_80")
 55 | 
 56 |     # Build path
 57 |     srcpath = pathlib.Path(__file__).parent.absolute()
 58 |     buildpath = srcpath / "build"
 59 |     _create_build_dir(buildpath)
 60 | 
 61 |     # Helper function to build the kernels.
 62 |     def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
 63 |         return cpp_extension.load(
 64 |             name=name,
 65 |             sources=sources,
 66 |             build_directory=buildpath,
 67 |             extra_cflags=[
 68 |                 "-O3",
 69 |             ],
 70 |             extra_cuda_cflags=[
 71 |                 "-O3",
 72 |                 "-gencode",
 73 |                 "arch=compute_70,code=sm_70",
 74 |                 "--use_fast_math",
 75 |             ]
 76 |             + extra_cuda_flags
 77 |             + cc_flag,
 78 |             verbose=True,
 79 |         )
 80 | 
 81 |     extra_cuda_flags = [
 82 |         "-U__CUDA_NO_HALF_OPERATORS__",
 83 |         "-U__CUDA_NO_HALF_CONVERSIONS__",
 84 |         "--expt-relaxed-constexpr",
 85 |         "--expt-extended-lambda",
 86 |     ]
 87 | 
 88 |     sources = [
 89 |         srcpath / "anti_alias_activation.cpp",
 90 |         srcpath / "anti_alias_activation_cuda.cu",
 91 |     ]
 92 |     
 93 |     # 兼容方案：ninja 特殊字符路径编译支持处理（比如中文路径）
 94 |     buildpath = chinese_path_compile_support(sources, buildpath)
 95 |     
 96 |     anti_alias_activation_cuda = _cpp_extention_load_helper(
 97 |         "anti_alias_activation_cuda", sources, extra_cuda_flags
 98 |     )
 99 | 
100 |     return anti_alias_activation_cuda
101 | 
102 | 
103 | def _get_cuda_bare_metal_version(cuda_dir):
104 |     raw_output = subprocess.check_output(
105 |         [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
106 |     )
107 |     output = raw_output.split()
108 |     release_idx = output.index("release") + 1
109 |     release = output[release_idx].split(".")
110 |     bare_metal_major = release[0]
111 |     bare_metal_minor = release[1][0]
112 | 
113 |     return raw_output, bare_metal_major, bare_metal_minor
114 | 
115 | 
116 | def _create_build_dir(buildpath):
117 |     try:
118 |         os.mkdir(buildpath)
119 |     except OSError:
120 |         if not os.path.isdir(buildpath):
121 |             print(f"Creation of the build directory {buildpath} failed")
122 | 


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <ATen/ATen.h>
18 | #include "compat.h"
19 | 
20 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, NAME, ...)                 \
21 | 	switch (TYPE)                                                       \
22 | 	{                                                                   \
23 | 	case at::ScalarType::Float:                                         \
24 | 	{                                                                   \
25 | 		using scalar_t = float;                                         \
26 | 		__VA_ARGS__;                                                    \
27 | 		break;                                                          \
28 | 	}                                                                   \
29 | 	case at::ScalarType::Half:                                          \
30 | 	{                                                                   \
31 | 		using scalar_t = at::Half;                                      \
32 | 		__VA_ARGS__;                                                    \
33 | 		break;                                                          \
34 | 	}                                                                   \
35 | 	case at::ScalarType::BFloat16:                                      \
36 | 	{                                                                   \
37 | 		using scalar_t = at::BFloat16;                                  \
38 | 		__VA_ARGS__;                                                    \
39 | 		break;                                                          \
40 | 	}                                                                   \
41 | 	default:                                                            \
42 | 		AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
43 | 	}
44 | 
45 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
46 | 	switch (TYPEIN)                                                            \
47 | 	{                                                                          \
48 | 	case at::ScalarType::Float:                                                \
49 | 	{                                                                          \
50 | 		using scalar_t_in = float;                                             \
51 | 		switch (TYPEOUT)                                                       \
52 | 		{                                                                      \
53 | 		case at::ScalarType::Float:                                            \
54 | 		{                                                                      \
55 | 			using scalar_t_out = float;                                        \
56 | 			__VA_ARGS__;                                                       \
57 | 			break;                                                             \
58 | 		}                                                                      \
59 | 		case at::ScalarType::Half:                                             \
60 | 		{                                                                      \
61 | 			using scalar_t_out = at::Half;                                     \
62 | 			__VA_ARGS__;                                                       \
63 | 			break;                                                             \
64 | 		}                                                                      \
65 | 		case at::ScalarType::BFloat16:                                         \
66 | 		{                                                                      \
67 | 			using scalar_t_out = at::BFloat16;                                 \
68 | 			__VA_ARGS__;                                                       \
69 | 			break;                                                             \
70 | 		}                                                                      \
71 | 		default:                                                               \
72 | 			AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
73 | 		}                                                                      \
74 | 		break;                                                                 \
75 | 	}                                                                          \
76 | 	case at::ScalarType::Half:                                                 \
77 | 	{                                                                          \
78 | 		using scalar_t_in = at::Half;                                          \
79 | 		using scalar_t_out = at::Half;                                         \
80 | 		__VA_ARGS__;                                                           \
81 | 		break;                                                                 \
82 | 	}                                                                          \
83 | 	case at::ScalarType::BFloat16:                                             \
84 | 	{                                                                          \
85 | 		using scalar_t_in = at::BFloat16;                                      \
86 | 		using scalar_t_out = at::BFloat16;                                     \
87 | 		__VA_ARGS__;                                                           \
88 | 		break;                                                                 \
89 | 	}                                                                          \
90 | 	default:                                                                   \
91 | 		AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");      \
92 | 	}
93 | 


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/torch/__init__.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
2 | #   LICENSE is in incl_licenses directory.
3 | 
4 | from .act import *
5 | from .filter import *
6 | from .resample import *
7 | 


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/torch/act.py:
--------------------------------------------------------------------------------
 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 2 | #   LICENSE is in incl_licenses directory.
 3 | 
 4 | import torch.nn as nn
 5 | 
 6 | from .resample import DownSample1d, UpSample1d
 7 | 
 8 | 
 9 | class Activation1d(nn.Module):
10 |     def __init__(
11 |         self,
12 |         activation,
13 |         up_ratio: int = 2,
14 |         down_ratio: int = 2,
15 |         up_kernel_size: int = 12,
16 |         down_kernel_size: int = 12,
17 |     ):
18 |         super().__init__()
19 |         self.up_ratio = up_ratio
20 |         self.down_ratio = down_ratio
21 |         self.act = activation
22 |         self.upsample = UpSample1d(up_ratio, up_kernel_size)
23 |         self.downsample = DownSample1d(down_ratio, down_kernel_size)
24 | 
25 |     # x: [B,C,T]
26 |     def forward(self, x):
27 |         x = self.upsample(x)
28 |         x = self.act(x)
29 |         x = self.downsample(x)
30 | 
31 |         return x
32 | 


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/torch/filter.py:
--------------------------------------------------------------------------------
  1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
  2 | #   LICENSE is in incl_licenses directory.
  3 | 
  4 | import math
  5 | 
  6 | import torch
  7 | import torch.nn as nn
  8 | import torch.nn.functional as F
  9 | 
 10 | if "sinc" in dir(torch):
 11 |     sinc = torch.sinc
 12 | else:
 13 |     # This code is adopted from adefossez's julius.core.sinc under the MIT License
 14 |     # https://adefossez.github.io/julius/julius/core.html
 15 |     #   LICENSE is in incl_licenses directory.
 16 |     def sinc(x: torch.Tensor):
 17 |         """
 18 |         Implementation of sinc, i.e. sin(pi * x) / (pi * x)
 19 |         __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
 20 |         """
 21 |         return torch.where(
 22 |             x == 0,
 23 |             torch.tensor(1.0, device=x.device, dtype=x.dtype),
 24 |             torch.sin(math.pi * x) / math.pi / x,
 25 |         )
 26 | 
 27 | 
 28 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
 29 | # https://adefossez.github.io/julius/julius/lowpass.html
 30 | #   LICENSE is in incl_licenses directory.
 31 | def kaiser_sinc_filter1d(
 32 |     cutoff, half_width, kernel_size
 33 | ):  # return filter [1,1,kernel_size]
 34 |     even = kernel_size % 2 == 0
 35 |     half_size = kernel_size // 2
 36 | 
 37 |     # For kaiser window
 38 |     delta_f = 4 * half_width
 39 |     A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
 40 |     if A > 50.0:
 41 |         beta = 0.1102 * (A - 8.7)
 42 |     elif A >= 21.0:
 43 |         beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
 44 |     else:
 45 |         beta = 0.0
 46 |     window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
 47 | 
 48 |     # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
 49 |     if even:
 50 |         time = torch.arange(-half_size, half_size) + 0.5
 51 |     else:
 52 |         time = torch.arange(kernel_size) - half_size
 53 |     if cutoff == 0:
 54 |         filter_ = torch.zeros_like(time)
 55 |     else:
 56 |         filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
 57 |         """
 58 |         Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal.
 59 |         """
 60 |         filter_ /= filter_.sum()
 61 |         filter = filter_.view(1, 1, kernel_size)
 62 | 
 63 |     return filter
 64 | 
 65 | 
 66 | class LowPassFilter1d(nn.Module):
 67 |     def __init__(
 68 |         self,
 69 |         cutoff=0.5,
 70 |         half_width=0.6,
 71 |         stride: int = 1,
 72 |         padding: bool = True,
 73 |         padding_mode: str = "replicate",
 74 |         kernel_size: int = 12,
 75 |     ):
 76 |         """
 77 |         kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible.
 78 |         """
 79 |         super().__init__()
 80 |         if cutoff < -0.0:
 81 |             raise ValueError("Minimum cutoff must be larger than zero.")
 82 |         if cutoff > 0.5:
 83 |             raise ValueError("A cutoff above 0.5 does not make sense.")
 84 |         self.kernel_size = kernel_size
 85 |         self.even = kernel_size % 2 == 0
 86 |         self.pad_left = kernel_size // 2 - int(self.even)
 87 |         self.pad_right = kernel_size // 2
 88 |         self.stride = stride
 89 |         self.padding = padding
 90 |         self.padding_mode = padding_mode
 91 |         filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
 92 |         self.register_buffer("filter", filter)
 93 | 
 94 |     # Input [B, C, T]
 95 |     def forward(self, x):
 96 |         _, C, _ = x.shape
 97 | 
 98 |         if self.padding:
 99 |             x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
100 |         out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
101 | 
102 |         return out
103 | 


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_activation/torch/resample.py:
--------------------------------------------------------------------------------
 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 2 | #   LICENSE is in incl_licenses directory.
 3 | 
 4 | import torch.nn as nn
 5 | from torch.nn import functional as F
 6 | 
 7 | from .filter import LowPassFilter1d, kaiser_sinc_filter1d
 8 | 
 9 | 
10 | class UpSample1d(nn.Module):
11 |     def __init__(self, ratio=2, kernel_size=None):
12 |         super().__init__()
13 |         self.ratio = ratio
14 |         self.kernel_size = (
15 |             int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
16 |         )
17 |         self.stride = ratio
18 |         self.pad = self.kernel_size // ratio - 1
19 |         self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
20 |         self.pad_right = (
21 |             self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
22 |         )
23 |         filter = kaiser_sinc_filter1d(
24 |             cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size
25 |         )
26 |         self.register_buffer("filter", filter)
27 | 
28 |     # x: [B, C, T]
29 |     def forward(self, x):
30 |         _, C, _ = x.shape
31 | 
32 |         x = F.pad(x, (self.pad, self.pad), mode="replicate")
33 |         x = self.ratio * F.conv_transpose1d(
34 |             x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
35 |         )
36 |         x = x[..., self.pad_left : -self.pad_right]
37 | 
38 |         return x
39 | 
40 | 
41 | class DownSample1d(nn.Module):
42 |     def __init__(self, ratio=2, kernel_size=None):
43 |         super().__init__()
44 |         self.ratio = ratio
45 |         self.kernel_size = (
46 |             int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
47 |         )
48 |         self.lowpass = LowPassFilter1d(
49 |             cutoff=0.5 / ratio,
50 |             half_width=0.6 / ratio,
51 |             stride=ratio,
52 |             kernel_size=self.kernel_size,
53 |         )
54 | 
55 |     def forward(self, x):
56 |         xx = self.lowpass(x)
57 | 
58 |         return xx
59 | 


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_torch/__init__.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
2 | #   LICENSE is in incl_licenses directory.
3 | 
4 | from .act import *
5 | from .filter import *
6 | from .resample import *
7 | 


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_torch/__pycache__/act.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/alias_free_torch/__pycache__/act.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_torch/__pycache__/filter.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/alias_free_torch/__pycache__/filter.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_torch/__pycache__/resample.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/alias_free_torch/__pycache__/resample.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_torch/act.py:
--------------------------------------------------------------------------------
 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 2 | #   LICENSE is in incl_licenses directory.
 3 | 
 4 | import torch.nn as nn
 5 | 
 6 | from .resample import DownSample1d, UpSample1d
 7 | 
 8 | 
 9 | class Activation1d(nn.Module):
10 |     def __init__(self,
11 |                  activation,
12 |                  up_ratio: int = 2,
13 |                  down_ratio: int = 2,
14 |                  up_kernel_size: int = 12,
15 |                  down_kernel_size: int = 12):
16 |         super().__init__()
17 |         self.up_ratio = up_ratio
18 |         self.down_ratio = down_ratio
19 |         self.act = activation
20 |         self.upsample = UpSample1d(up_ratio, up_kernel_size)
21 |         self.downsample = DownSample1d(down_ratio, down_kernel_size)
22 | 
23 |     # x: [B,C,T]
24 |     def forward(self, x):
25 |         x = self.upsample(x)
26 |         x = self.act(x)
27 |         x = self.downsample(x)
28 | 
29 |         return x


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_torch/filter.py:
--------------------------------------------------------------------------------
 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 2 | #   LICENSE is in incl_licenses directory.
 3 | 
 4 | import math
 5 | 
 6 | import torch
 7 | import torch.nn as nn
 8 | import torch.nn.functional as F
 9 | 
10 | if 'sinc' in dir(torch):
11 |     sinc = torch.sinc
12 | else:
13 |     # This code is adopted from adefossez's julius.core.sinc under the MIT License
14 |     # https://adefossez.github.io/julius/julius/core.html
15 |     #   LICENSE is in incl_licenses directory.
16 |     def sinc(x: torch.Tensor):
17 |         """
18 |         Implementation of sinc, i.e. sin(pi * x) / (pi * x)
19 |         __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
20 |         """
21 |         return torch.where(x == 0,
22 |                            torch.tensor(1., device=x.device, dtype=x.dtype),
23 |                            torch.sin(math.pi * x) / math.pi / x)
24 | 
25 | 
26 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
27 | # https://adefossez.github.io/julius/julius/lowpass.html
28 | #   LICENSE is in incl_licenses directory.
29 | def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size]
30 |     even = (kernel_size % 2 == 0)
31 |     half_size = kernel_size // 2
32 | 
33 |     #For kaiser window
34 |     delta_f = 4 * half_width
35 |     A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
36 |     if A > 50.:
37 |         beta = 0.1102 * (A - 8.7)
38 |     elif A >= 21.:
39 |         beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.)
40 |     else:
41 |         beta = 0.
42 |     window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
43 | 
44 |     # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
45 |     if even:
46 |         time = (torch.arange(-half_size, half_size) + 0.5)
47 |     else:
48 |         time = torch.arange(kernel_size) - half_size
49 |     if cutoff == 0:
50 |         filter_ = torch.zeros_like(time)
51 |     else:
52 |         filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
53 |         # Normalize filter to have sum = 1, otherwise we will have a small leakage
54 |         # of the constant component in the input signal.
55 |         filter_ /= filter_.sum()
56 |         filter = filter_.view(1, 1, kernel_size)
57 | 
58 |     return filter
59 | 
60 | 
61 | class LowPassFilter1d(nn.Module):
62 |     def __init__(self,
63 |                  cutoff=0.5,
64 |                  half_width=0.6,
65 |                  stride: int = 1,
66 |                  padding: bool = True,
67 |                  padding_mode: str = 'replicate',
68 |                  kernel_size: int = 12):
69 |         # kernel_size should be even number for stylegan3 setup,
70 |         # in this implementation, odd number is also possible.
71 |         super().__init__()
72 |         if cutoff < -0.:
73 |             raise ValueError("Minimum cutoff must be larger than zero.")
74 |         if cutoff > 0.5:
75 |             raise ValueError("A cutoff above 0.5 does not make sense.")
76 |         self.kernel_size = kernel_size
77 |         self.even = (kernel_size % 2 == 0)
78 |         self.pad_left = kernel_size // 2 - int(self.even)
79 |         self.pad_right = kernel_size // 2
80 |         self.stride = stride
81 |         self.padding = padding
82 |         self.padding_mode = padding_mode
83 |         filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
84 |         self.register_buffer("filter", filter)
85 | 
86 |     #input [B, C, T]
87 |     def forward(self, x):
88 |         _, C, _ = x.shape
89 | 
90 |         if self.padding:
91 |             x = F.pad(x, (self.pad_left, self.pad_right),
92 |                       mode=self.padding_mode)
93 |         out = F.conv1d(x, self.filter.expand(C, -1, -1),
94 |                        stride=self.stride, groups=C)
95 | 
96 |         return out


--------------------------------------------------------------------------------
/indextts/BigVGAN/alias_free_torch/resample.py:
--------------------------------------------------------------------------------
 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 2 | #   LICENSE is in incl_licenses directory.
 3 | 
 4 | import torch.nn as nn
 5 | from torch.nn import functional as F
 6 | 
 7 | from .filter import LowPassFilter1d, kaiser_sinc_filter1d
 8 | 
 9 | 
10 | class UpSample1d(nn.Module):
11 |     def __init__(self, ratio=2, kernel_size=None):
12 |         super().__init__()
13 |         self.ratio = ratio
14 |         self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
15 |         self.stride = ratio
16 |         self.pad = self.kernel_size // ratio - 1
17 |         self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
18 |         self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
19 |         filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio,
20 |                                       half_width=0.6 / ratio,
21 |                                       kernel_size=self.kernel_size)
22 |         self.register_buffer("filter", filter)
23 | 
24 |     # x: [B, C, T]
25 |     def forward(self, x):
26 |         _, C, _ = x.shape
27 | 
28 |         x = F.pad(x, (self.pad, self.pad), mode='replicate')
29 |         x = self.ratio * F.conv_transpose1d(
30 |             x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
31 |         x = x[..., self.pad_left:-self.pad_right]
32 | 
33 |         return x
34 | 
35 | 
36 | class DownSample1d(nn.Module):
37 |     def __init__(self, ratio=2, kernel_size=None):
38 |         super().__init__()
39 |         self.ratio = ratio
40 |         self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
41 |         self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio,
42 |                                        half_width=0.6 / ratio,
43 |                                        stride=ratio,
44 |                                        kernel_size=self.kernel_size)
45 | 
46 |     def forward(self, x):
47 |         xx = self.lowpass(x)
48 | 
49 |         return xx


--------------------------------------------------------------------------------
/indextts/BigVGAN/nnet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/nnet/__init__.py


--------------------------------------------------------------------------------
/indextts/BigVGAN/nnet/__pycache__/CNN.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/nnet/__pycache__/CNN.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/BigVGAN/nnet/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/nnet/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/BigVGAN/nnet/__pycache__/linear.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/nnet/__pycache__/linear.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/BigVGAN/nnet/__pycache__/normalization.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/BigVGAN/nnet/__pycache__/normalization.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/BigVGAN/nnet/linear.py:
--------------------------------------------------------------------------------
 1 | """Library implementing linear transformation.
 2 | 
 3 | Authors
 4 |  * Mirco Ravanelli 2020
 5 |  * Davide Borra 2021
 6 | """
 7 | 
 8 | import logging
 9 | 
10 | import torch
11 | import torch.nn as nn
12 | 
13 | 
14 | class Linear(torch.nn.Module):
15 |     """Computes a linear transformation y = wx + b.
16 | 
17 |     Arguments
18 |     ---------
19 |     n_neurons : int
20 |         It is the number of output neurons (i.e, the dimensionality of the
21 |         output).
22 |     input_shape : tuple
23 |         It is the shape of the input tensor.
24 |     input_size : int
25 |         Size of the input tensor.
26 |     bias : bool
27 |         If True, the additive bias b is adopted.
28 |     max_norm : float
29 |         weight max-norm.
30 |     combine_dims : bool
31 |         If True and the input is 4D, combine 3rd and 4th dimensions of input.
32 | 
33 |     Example
34 |     -------
35 |     >>> inputs = torch.rand(10, 50, 40)
36 |     >>> lin_t = Linear(input_shape=(10, 50, 40), n_neurons=100)
37 |     >>> output = lin_t(inputs)
38 |     >>> output.shape
39 |     torch.Size([10, 50, 100])
40 |     """
41 | 
42 |     def __init__(
43 |         self,
44 |         n_neurons,
45 |         input_shape=None,
46 |         input_size=None,
47 |         bias=True,
48 |         max_norm=None,
49 |         combine_dims=False,
50 |     ):
51 |         super().__init__()
52 |         self.max_norm = max_norm
53 |         self.combine_dims = combine_dims
54 | 
55 |         if input_shape is None and input_size is None:
56 |             raise ValueError("Expected one of input_shape or input_size")
57 | 
58 |         if input_size is None:
59 |             input_size = input_shape[-1]
60 |             if len(input_shape) == 4 and self.combine_dims:
61 |                 input_size = input_shape[2] * input_shape[3]
62 | 
63 |         # Weights are initialized following pytorch approach
64 |         self.w = nn.Linear(input_size, n_neurons, bias=bias)
65 | 
66 |     def forward(self, x):
67 |         """Returns the linear transformation of input tensor.
68 | 
69 |         Arguments
70 |         ---------
71 |         x : torch.Tensor
72 |             Input to transform linearly.
73 | 
74 |         Returns
75 |         -------
76 |         wx : torch.Tensor
77 |             The linearly transformed outputs.
78 |         """
79 |         if x.ndim == 4 and self.combine_dims:
80 |             x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3])
81 | 
82 |         if self.max_norm is not None:
83 |             self.w.weight.data = torch.renorm(
84 |                 self.w.weight.data, p=2, dim=0, maxnorm=self.max_norm
85 |             )
86 | 
87 |         wx = self.w(x)
88 | 
89 |         return wx
90 | 


--------------------------------------------------------------------------------
/indextts/BigVGAN/utils.py:
--------------------------------------------------------------------------------
  1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
  2 | #   LICENSE is in incl_licenses directory.
  3 | 
  4 | import glob
  5 | import os
  6 | 
  7 | import matplotlib
  8 | import matplotlib.pylab as plt
  9 | import torch
 10 | from scipy.io.wavfile import write
 11 | from torch.nn.utils import weight_norm
 12 | 
 13 | matplotlib.use("Agg")
 14 | 
 15 | MAX_WAV_VALUE = 32768.0
 16 | 
 17 | 
 18 | def plot_spectrogram(spectrogram):
 19 |     fig, ax = plt.subplots(figsize=(10, 2))
 20 |     im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
 21 |     plt.colorbar(im, ax=ax)
 22 | 
 23 |     fig.canvas.draw()
 24 |     plt.close()
 25 | 
 26 |     return fig
 27 | 
 28 | 
 29 | def plot_spectrogram_clipped(spectrogram, clip_max=2.0):
 30 |     fig, ax = plt.subplots(figsize=(10, 2))
 31 |     im = ax.imshow(
 32 |         spectrogram,
 33 |         aspect="auto",
 34 |         origin="lower",
 35 |         interpolation="none",
 36 |         vmin=1e-6,
 37 |         vmax=clip_max,
 38 |     )
 39 |     plt.colorbar(im, ax=ax)
 40 | 
 41 |     fig.canvas.draw()
 42 |     plt.close()
 43 | 
 44 |     return fig
 45 | 
 46 | 
 47 | def init_weights(m, mean=0.0, std=0.01):
 48 |     classname = m.__class__.__name__
 49 |     if classname.find("Conv") != -1:
 50 |         m.weight.data.normal_(mean, std)
 51 | 
 52 | 
 53 | def apply_weight_norm(m):
 54 |     classname = m.__class__.__name__
 55 |     if classname.find("Conv") != -1:
 56 |         weight_norm(m)
 57 | 
 58 | 
 59 | def get_padding(kernel_size, dilation=1):
 60 |     return int((kernel_size * dilation - dilation) / 2)
 61 | 
 62 | 
 63 | def load_checkpoint(filepath, device):
 64 |     assert os.path.isfile(filepath)
 65 |     print(f"Loading '{filepath}'")
 66 |     checkpoint_dict = torch.load(filepath, map_location=device)
 67 |     print("Complete.")
 68 |     return checkpoint_dict
 69 | 
 70 | 
 71 | def save_checkpoint(filepath, obj):
 72 |     print(f"Saving checkpoint to {filepath}")
 73 |     torch.save(obj, filepath)
 74 |     print("Complete.")
 75 | 
 76 | 
 77 | def scan_checkpoint(cp_dir, prefix, renamed_file=None):
 78 |     # Fallback to original scanning logic first
 79 |     pattern = os.path.join(cp_dir, prefix + "????????")
 80 |     cp_list = glob.glob(pattern)
 81 | 
 82 |     if len(cp_list) > 0:
 83 |         last_checkpoint_path = sorted(cp_list)[-1]
 84 |         print(f"[INFO] Resuming from checkpoint: '{last_checkpoint_path}'")
 85 |         return last_checkpoint_path
 86 | 
 87 |     # If no pattern-based checkpoints are found, check for renamed file
 88 |     if renamed_file:
 89 |         renamed_path = os.path.join(cp_dir, renamed_file)
 90 |         if os.path.isfile(renamed_path):
 91 |             print(f"[INFO] Resuming from renamed checkpoint: '{renamed_file}'")
 92 |             return renamed_path
 93 | 
 94 |     return None
 95 | 
 96 | 
 97 | def save_audio(audio, path, sr):
 98 |     # wav: torch with 1d shape
 99 |     audio = audio * MAX_WAV_VALUE
100 |     audio = audio.cpu().numpy().astype("int16")
101 |     write(path, sr, audio)
102 | 


--------------------------------------------------------------------------------
/indextts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/__init__.py


--------------------------------------------------------------------------------
/indextts/cli.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import warnings
 4 | # Suppress warnings from tensorflow and other libraries
 5 | warnings.filterwarnings("ignore", category=UserWarning)
 6 | warnings.filterwarnings("ignore", category=FutureWarning)
 7 | def main():
 8 |     import argparse
 9 |     parser = argparse.ArgumentParser(description="IndexTTS Command Line")
10 |     parser.add_argument("text", type=str, help="Text to be synthesized")
11 |     parser.add_argument("-v", "--voice", type=str, required=True, help="Path to the audio prompt file (wav format)")
12 |     parser.add_argument("-o", "--output_path", type=str, default="gen.wav", help="Path to the output wav file")
13 |     parser.add_argument("-c", "--config", type=str, default="checkpoints/config.yaml", help="Path to the config file. Default is 'checkpoints/config.yaml'")
14 |     parser.add_argument("--model_dir", type=str, default="checkpoints", help="Path to the model directory. Default is 'checkpoints'")
15 |     parser.add_argument("--fp16", action="store_true", default=True, help="Use FP16 for inference if available")
16 |     parser.add_argument("-f", "--force", action="store_true", default=False, help="Force to overwrite the output file if it exists")
17 |     parser.add_argument("-d", "--device", type=str, default=None, help="Device to run the model on (cpu, cuda, mps)." )
18 |     args = parser.parse_args()
19 |     if len(args.text.strip()) == 0:
20 |         print("ERROR: Text is empty.")
21 |         parser.print_help()
22 |         sys.exit(1)
23 |     if not os.path.exists(args.voice):
24 |         print(f"Audio prompt file {args.voice} does not exist.")
25 |         parser.print_help()
26 |         sys.exit(1)
27 |     if not os.path.exists(args.config):
28 |         print(f"Config file {args.config} does not exist.")
29 |         parser.print_help()
30 |         sys.exit(1)
31 | 
32 |     output_path = args.output_path
33 |     if os.path.exists(output_path):
34 |         if not args.force:
35 |             print(f"ERROR: Output file {output_path} already exists. Use --force to overwrite.")
36 |             parser.print_help()
37 |             sys.exit(1)
38 |         else:
39 |             os.remove(output_path)
40 |     
41 |     try:
42 |         import torch
43 |     except ImportError:
44 |         print("ERROR: PyTorch is not installed. Please install it first.")
45 |         sys.exit(1)
46 | 
47 |     if args.device is None:
48 |         if torch.cuda.is_available():
49 |             args.device = "cuda:0"
50 |         elif torch.mps.is_available():
51 |             args.device = "mps"
52 |         else:
53 |             args.device = "cpu"
54 |             args.fp16 = False # Disable FP16 on CPU
55 |             print("WARNING: Running on CPU may be slow.")
56 | 
57 |     from indextts.infer import IndexTTS
58 |     tts = IndexTTS(cfg_path=args.config, model_dir=args.model_dir, is_fp16=args.fp16, device=args.device)
59 |     tts.infer(audio_prompt=args.voice, text=args.text.strip(), output_path=output_path)
60 | 
61 | if __name__ == "__main__":
62 |     main()


--------------------------------------------------------------------------------
/indextts/gpt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/gpt/__init__.py


--------------------------------------------------------------------------------
/indextts/gpt/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/gpt/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/gpt/__pycache__/conformer_encoder.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/gpt/__pycache__/conformer_encoder.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/gpt/__pycache__/model.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/gpt/__pycache__/model.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/gpt/__pycache__/perceiver.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/gpt/__pycache__/perceiver.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/gpt/conformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/gpt/conformer/__init__.py


--------------------------------------------------------------------------------
/indextts/gpt/conformer/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/gpt/conformer/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/gpt/conformer/__pycache__/attention.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/gpt/conformer/__pycache__/attention.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/gpt/conformer/__pycache__/embedding.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/gpt/conformer/__pycache__/embedding.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/gpt/conformer/__pycache__/subsampling.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/gpt/conformer/__pycache__/subsampling.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/gpt/conformer/embedding.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # Modified from ESPnet(https://github.com/espnet/espnet)
 15 | 
 16 | """Positonal Encoding Module."""
 17 | 
 18 | import math
 19 | from typing import Tuple, Union
 20 | 
 21 | import torch
 22 | import torch.nn.functional as F
 23 | 
 24 | 
 25 | class PositionalEncoding(torch.nn.Module):
 26 |     """Positional encoding.
 27 | 
 28 |     :param int d_model: embedding dim
 29 |     :param float dropout_rate: dropout rate
 30 |     :param int max_len: maximum input length
 31 | 
 32 |     PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
 33 |     PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
 34 |     """
 35 |     def __init__(self,
 36 |                  d_model: int,
 37 |                  dropout_rate: float,
 38 |                  max_len: int = 5000,
 39 |                  reverse: bool = False):
 40 |         """Construct an PositionalEncoding object."""
 41 |         super().__init__()
 42 |         self.d_model = d_model
 43 |         self.xscale = math.sqrt(self.d_model)
 44 |         self.dropout = torch.nn.Dropout(p=dropout_rate)
 45 |         self.max_len = max_len
 46 | 
 47 |         pe = torch.zeros(self.max_len, self.d_model)
 48 |         position = torch.arange(0, self.max_len).unsqueeze(1)
 49 |         div_term = torch.exp(
 50 |             torch.arange(0, self.d_model, 2) *
 51 |             -(math.log(10000.0) / self.d_model))
 52 |         pe[:, 0::2] = torch.sin(position * div_term)
 53 |         pe[:, 1::2] = torch.cos(position * div_term)
 54 |         pe = pe.unsqueeze(0)
 55 |         self.register_buffer('pe', pe)
 56 | 
 57 |     def forward(self,
 58 |                 x: torch.Tensor,
 59 |                 offset: Union[int, torch.Tensor] = 0) \
 60 |             -> Tuple[torch.Tensor, torch.Tensor]:
 61 |         """Add positional encoding.
 62 | 
 63 |         Args:
 64 |             x (torch.Tensor): Input. Its shape is (batch, time, ...)
 65 |             offset (int, torch.tensor): position offset
 66 | 
 67 |         Returns:
 68 |             torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
 69 |             torch.Tensor: for compatibility to RelPositionalEncoding
 70 |         """
 71 | 
 72 |         self.pe = self.pe.to(x.device)
 73 |         pos_emb = self.position_encoding(offset, x.size(1), False)
 74 |         x = x * self.xscale + pos_emb
 75 |         return self.dropout(x), self.dropout(pos_emb)
 76 | 
 77 |     def position_encoding(self, offset: Union[int, torch.Tensor], size: int,
 78 |                           apply_dropout: bool = True) -> torch.Tensor:
 79 |         """ For getting encoding in a streaming fashion
 80 | 
 81 |         Attention!!!!!
 82 |         we apply dropout only once at the whole utterance level in a none
 83 |         streaming way, but will call this function several times with
 84 |         increasing input size in a streaming scenario, so the dropout will
 85 |         be applied several times.
 86 | 
 87 |         Args:
 88 |             offset (int or torch.tensor): start offset
 89 |             size (int): required size of position encoding
 90 | 
 91 |         Returns:
 92 |             torch.Tensor: Corresponding encoding
 93 |         """
 94 |         # How to subscript a Union type:
 95 |         #   https://github.com/pytorch/pytorch/issues/69434
 96 |         if isinstance(offset, int):
 97 |             assert offset + size < self.max_len
 98 |             pos_emb = self.pe[:, offset:offset + size]
 99 |         elif isinstance(offset, torch.Tensor) and offset.dim() == 0:  # scalar
100 |             assert offset + size < self.max_len
101 |             pos_emb = self.pe[:, offset:offset + size]
102 |         else:  # for batched streaming decoding on GPU
103 |             assert torch.max(offset) + size < self.max_len
104 |             index = offset.unsqueeze(1) + \
105 |                 torch.arange(0, size).to(offset.device)  # B X T
106 |             flag = index > 0
107 |             # remove negative offset
108 |             index = index * flag
109 |             pos_emb = F.embedding(index, self.pe[0])  # B X T X d_model
110 | 
111 |         if apply_dropout:
112 |             pos_emb = self.dropout(pos_emb)
113 |         return pos_emb
114 | 
115 | class RelPositionalEncoding(PositionalEncoding):
116 |     """Relative positional encoding module.
117 |     See : Appendix B in https://arxiv.org/abs/1901.02860
118 |     Args:
119 |         d_model (int): Embedding dimension.
120 |         dropout_rate (float): Dropout rate.
121 |         max_len (int): Maximum input length.
122 |     """
123 |     def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
124 |         """Initialize class."""
125 |         super().__init__(d_model, dropout_rate, max_len, reverse=True)
126 | 
127 |     def forward(self,
128 |                 x: torch.Tensor,
129 |                 offset: Union[int, torch.Tensor] = 0) \
130 |             -> Tuple[torch.Tensor, torch.Tensor]:
131 |         """Compute positional encoding.
132 |         Args:
133 |             x (torch.Tensor): Input tensor (batch, time, `*`).
134 |         Returns:
135 |             torch.Tensor: Encoded tensor (batch, time, `*`).
136 |             torch.Tensor: Positional embedding tensor (1, time, `*`).
137 |         """
138 |         self.pe = self.pe.to(x.device)
139 |         x = x * self.xscale
140 |         pos_emb = self.position_encoding(offset, x.size(1), False)
141 |         return self.dropout(x), self.dropout(pos_emb)
142 | 
143 | 
144 | class NoPositionalEncoding(torch.nn.Module):
145 |     """ No position encoding
146 |     """
147 |     def __init__(self, d_model: int, dropout_rate: float):
148 |         super().__init__()
149 |         self.d_model = d_model
150 |         self.dropout = torch.nn.Dropout(p=dropout_rate)
151 | 
152 |     def forward(self,
153 |                 x: torch.Tensor,
154 |                 offset: Union[int, torch.Tensor] = 0) \
155 |             -> Tuple[torch.Tensor, torch.Tensor]:
156 |         """ Just return zero vector for interface compatibility
157 |         """
158 |         pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device)
159 |         return self.dropout(x), pos_emb
160 | 
161 |     def position_encoding(
162 |             self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor:
163 |         return torch.zeros(1, size, self.d_model)
164 | 


--------------------------------------------------------------------------------
/indextts/gpt/conformer/subsampling.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #   http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # Modified from ESPnet(https://github.com/espnet/espnet)
 15 | 
 16 | 
 17 | """Subsampling layer definition."""
 18 | 
 19 | from typing import Tuple, Union
 20 | 
 21 | import torch
 22 | 
 23 | 
 24 | class BaseSubsampling(torch.nn.Module):
 25 |     def __init__(self):
 26 |         super().__init__()
 27 |         self.right_context = 0
 28 |         self.subsampling_rate = 1
 29 | 
 30 |     def position_encoding(self, offset: Union[int, torch.Tensor],
 31 |                           size: int) -> torch.Tensor:
 32 |         return self.pos_enc.position_encoding(offset, size)
 33 | 
 34 | 
 35 | class LinearNoSubsampling(BaseSubsampling):
 36 |     """Linear transform the input without subsampling
 37 | 
 38 |     Args:
 39 |         idim (int): Input dimension.
 40 |         odim (int): Output dimension.
 41 |         dropout_rate (float): Dropout rate.
 42 | 
 43 |     """
 44 |     def __init__(self, idim: int, odim: int, dropout_rate: float,
 45 |                  pos_enc_class: torch.nn.Module):
 46 |         """Construct an linear object."""
 47 |         super().__init__()
 48 |         self.out = torch.nn.Sequential(
 49 |             torch.nn.Linear(idim, odim),
 50 |             torch.nn.LayerNorm(odim, eps=1e-5),
 51 |             torch.nn.Dropout(dropout_rate),
 52 |         )
 53 |         self.pos_enc = pos_enc_class
 54 |         self.right_context = 0
 55 |         self.subsampling_rate = 1
 56 | 
 57 |     def forward(
 58 |             self,
 59 |             x: torch.Tensor,
 60 |             x_mask: torch.Tensor,
 61 |             offset: Union[int, torch.Tensor] = 0
 62 |     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
 63 |         """Input x.
 64 | 
 65 |         Args:
 66 |             x (torch.Tensor): Input tensor (#batch, time, idim).
 67 |             x_mask (torch.Tensor): Input mask (#batch, 1, time).
 68 | 
 69 |         Returns:
 70 |             torch.Tensor: linear input tensor (#batch, time', odim),
 71 |                 where time' = time .
 72 |             torch.Tensor: linear input mask (#batch, 1, time'),
 73 |                 where time' = time .
 74 | 
 75 |         """
 76 |         x = self.out(x)
 77 |         x, pos_emb = self.pos_enc(x, offset)
 78 |         return x, pos_emb, x_mask
 79 | 
 80 | 
 81 | class Conv2dSubsampling3(BaseSubsampling):
 82 |     """Convolutional 2D subsampling (to 1/3 length).
 83 | 
 84 |     Args:
 85 |         idim (int): Input dimension.
 86 |         odim (int): Output dimension.
 87 |         dropout_rate (float): Dropout rate.
 88 | 
 89 |     """
 90 |     def __init__(self, idim: int, odim: int, dropout_rate: float,
 91 |                  pos_enc_class: torch.nn.Module):
 92 |         """Construct an Conv2dSubsampling3 object."""
 93 |         super().__init__()
 94 |         self.conv = torch.nn.Sequential(
 95 |             torch.nn.Conv2d(1, odim, 5, 3),
 96 |             torch.nn.ReLU()
 97 |         )
 98 |         self.out = torch.nn.Sequential(
 99 |             torch.nn.Linear(odim * ((idim - 2) // 3), odim))
100 |         self.pos_enc = pos_enc_class
101 |         # The right context for every conv layer is computed by:
102 |         # (kernel_size - 1) * frame_rate_of_this_layer
103 |         self.subsampling_rate = 3
104 |         # 4 = (5 - 1) * 1
105 |         self.right_context = 4
106 | 
107 |     def forward(
108 |             self,
109 |             x: torch.Tensor,
110 |             x_mask: torch.Tensor,
111 |             offset: Union[int, torch.Tensor] = 0
112 |     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
113 |         """Subsample x.
114 | 
115 |         Args:
116 |             x (torch.Tensor): Input tensor (#batch, time, idim).
117 |             x_mask (torch.Tensor): Input mask (#batch, 1, time).
118 | 
119 |         Returns:
120 |             torch.Tensor: Subsampled tensor (#batch, time', odim),
121 |                 where time' = time // 3.
122 |             torch.Tensor: Subsampled mask (#batch, 1, time'),
123 |                 where time' = time // 3.
124 |             torch.Tensor: positional encoding
125 | 
126 |         """
127 |         x = x.unsqueeze(1)  # (b, c=1, t, f)
128 |         x = self.conv(x)
129 |         b, c, t, f = x.size()
130 |         x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
131 |         x, pos_emb = self.pos_enc(x, offset)
132 |         return x, pos_emb, x_mask[:, :, :-2:3]
133 | 
134 | 
135 | class Conv2dSubsampling2(BaseSubsampling):
136 |     """Convolutional 2D subsampling (to 1/2 length).
137 | 
138 |     Args:
139 |         idim (int): Input dimension.
140 |         odim (int): Output dimension.
141 |         dropout_rate (float): Dropout rate.
142 | 
143 |     """
144 |     def __init__(self, idim: int, odim: int, dropout_rate: float,
145 |                  pos_enc_class: torch.nn.Module):
146 |         """Construct an Conv2dSubsampling4 object."""
147 |         super().__init__()
148 |         self.conv = torch.nn.Sequential(
149 |             torch.nn.Conv2d(1, odim, 3, 2),
150 |             torch.nn.ReLU(),
151 |         )
152 |         self.out = torch.nn.Sequential(
153 |             torch.nn.Linear(odim * ((idim - 1) // 2), odim))
154 |         self.pos_enc = pos_enc_class
155 |         # The right context for every conv layer is computed by:
156 |         # (kernel_size - 1) * frame_rate_of_this_layer
157 |         self.subsampling_rate = 2
158 |         # 2 = (3 - 1) * 1
159 |         self.right_context = 2
160 | 
161 |     def forward(
162 |             self,
163 |             x: torch.Tensor,
164 |             x_mask: torch.Tensor,
165 |             offset: Union[int, torch.Tensor] = 0
166 |     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
167 |         """Subsample x.
168 | 
169 |         Args:
170 |             x (torch.Tensor): Input tensor (#batch, time, idim).
171 |             x_mask (torch.Tensor): Input mask (#batch, 1, time).
172 | 
173 |         Returns:
174 |             torch.Tensor: Subsampled tensor (#batch, time', odim),
175 |                 where time' = time // 2.
176 |             torch.Tensor: Subsampled mask (#batch, 1, time'),
177 |                 where time' = time // 2.
178 |             torch.Tensor: positional encoding
179 | 
180 |         """
181 |         x = x.unsqueeze(1)  # (b, c=1, t, f)
182 |         x = self.conv(x)
183 |         b, c, t, f = x.size()
184 |         x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
185 |         x, pos_emb = self.pos_enc(x, offset)
186 |         return x, pos_emb, x_mask[:, :, 2::2]
187 | 
188 | 
189 | class Conv2dSubsampling4(BaseSubsampling):
190 |     """Convolutional 2D subsampling (to 1/4 length).
191 | 
192 |     Args:
193 |         idim (int): Input dimension.
194 |         odim (int): Output dimension.
195 |         dropout_rate (float): Dropout rate.
196 | 
197 |     """
198 |     def __init__(self, idim: int, odim: int, dropout_rate: float,
199 |                  pos_enc_class: torch.nn.Module):
200 |         """Construct an Conv2dSubsampling4 object."""
201 |         super().__init__()
202 |         self.conv = torch.nn.Sequential(
203 |             torch.nn.Conv2d(1, odim, 3, 2),
204 |             torch.nn.ReLU(),
205 |             torch.nn.Conv2d(odim, odim, 3, 2),
206 |             torch.nn.ReLU(),
207 |         )
208 |         self.out = torch.nn.Sequential(
209 |             torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
210 |         self.pos_enc = pos_enc_class
211 |         # The right context for every conv layer is computed by:
212 |         # (kernel_size - 1) * frame_rate_of_this_layer
213 |         self.subsampling_rate = 4
214 |         # 6 = (3 - 1) * 1 + (3 - 1) * 2
215 |         self.right_context = 6
216 | 
217 |     def forward(
218 |             self,
219 |             x: torch.Tensor,
220 |             x_mask: torch.Tensor,
221 |             offset: Union[int, torch.Tensor] = 0
222 |     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
223 |         """Subsample x.
224 | 
225 |         Args:
226 |             x (torch.Tensor): Input tensor (#batch, time, idim).
227 |             x_mask (torch.Tensor): Input mask (#batch, 1, time).
228 | 
229 |         Returns:
230 |             torch.Tensor: Subsampled tensor (#batch, time', odim),
231 |                 where time' = time // 4.
232 |             torch.Tensor: Subsampled mask (#batch, 1, time'),
233 |                 where time' = time // 4.
234 |             torch.Tensor: positional encoding
235 | 
236 |         """
237 |         x = x.unsqueeze(1)  # (b, c=1, t, f)
238 |         x = self.conv(x)
239 |         b, c, t, f = x.size()
240 |         x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
241 |         x, pos_emb = self.pos_enc(x, offset)
242 |         return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2]
243 | 
244 | 
245 | class Conv2dSubsampling6(BaseSubsampling):
246 |     """Convolutional 2D subsampling (to 1/6 length).
247 |     Args:
248 |         idim (int): Input dimension.
249 |         odim (int): Output dimension.
250 |         dropout_rate (float): Dropout rate.
251 |         pos_enc (torch.nn.Module): Custom position encoding layer.
252 |     """
253 |     def __init__(self, idim: int, odim: int, dropout_rate: float,
254 |                  pos_enc_class: torch.nn.Module):
255 |         """Construct an Conv2dSubsampling6 object."""
256 |         super().__init__()
257 |         self.conv = torch.nn.Sequential(
258 |             torch.nn.Conv2d(1, odim, 3, 2),
259 |             torch.nn.ReLU(),
260 |             torch.nn.Conv2d(odim, odim, 5, 3),
261 |             torch.nn.ReLU(),
262 |         )
263 |         self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3),
264 |                                       odim)
265 |         self.pos_enc = pos_enc_class
266 |         # 10 = (3 - 1) * 1 + (5 - 1) * 2
267 |         self.subsampling_rate = 6
268 |         self.right_context = 10
269 | 
270 |     def forward(
271 |             self,
272 |             x: torch.Tensor,
273 |             x_mask: torch.Tensor,
274 |             offset: Union[int, torch.Tensor] = 0
275 |     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
276 |         """Subsample x.
277 |         Args:
278 |             x (torch.Tensor): Input tensor (#batch, time, idim).
279 |             x_mask (torch.Tensor): Input mask (#batch, 1, time).
280 | 
281 |         Returns:
282 |             torch.Tensor: Subsampled tensor (#batch, time', odim),
283 |                 where time' = time // 6.
284 |             torch.Tensor: Subsampled mask (#batch, 1, time'),
285 |                 where time' = time // 6.
286 |             torch.Tensor: positional encoding
287 |         """
288 |         x = x.unsqueeze(1)  # (b, c, t, f)
289 |         x = self.conv(x)
290 |         b, c, t, f = x.size()
291 |         x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
292 |         x, pos_emb = self.pos_enc(x, offset)
293 |         return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3]
294 | 
295 | 
296 | class Conv2dSubsampling8(BaseSubsampling):
297 |     """Convolutional 2D subsampling (to 1/8 length).
298 | 
299 |     Args:
300 |         idim (int): Input dimension.
301 |         odim (int): Output dimension.
302 |         dropout_rate (float): Dropout rate.
303 | 
304 |     """
305 |     def __init__(self, idim: int, odim: int, dropout_rate: float,
306 |                  pos_enc_class: torch.nn.Module):
307 |         """Construct an Conv2dSubsampling8 object."""
308 |         super().__init__()
309 |         self.conv = torch.nn.Sequential(
310 |             torch.nn.Conv2d(1, odim, 3, 2),
311 |             torch.nn.ReLU(),
312 |             torch.nn.Conv2d(odim, odim, 3, 2),
313 |             torch.nn.ReLU(),
314 |             torch.nn.Conv2d(odim, odim, 3, 2),
315 |             torch.nn.ReLU(),
316 |         )
317 |         self.linear = torch.nn.Linear(
318 |             odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim)
319 |         self.pos_enc = pos_enc_class
320 |         self.subsampling_rate = 8
321 |         # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4
322 |         self.right_context = 14
323 | 
324 |     def forward(
325 |             self,
326 |             x: torch.Tensor,
327 |             x_mask: torch.Tensor,
328 |             offset: Union[int, torch.Tensor] = 0
329 |     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
330 |         """Subsample x.
331 | 
332 |         Args:
333 |             x (torch.Tensor): Input tensor (#batch, time, idim).
334 |             x_mask (torch.Tensor): Input mask (#batch, 1, time).
335 | 
336 |         Returns:
337 |             torch.Tensor: Subsampled tensor (#batch, time', odim),
338 |                 where time' = time // 8.
339 |             torch.Tensor: Subsampled mask (#batch, 1, time'),
340 |                 where time' = time // 8.
341 |             torch.Tensor: positional encoding
342 |         """
343 |         x = x.unsqueeze(1)  # (b, c, t, f)
344 |         x = self.conv(x)
345 |         b, c, t, f = x.size()
346 |         x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
347 |         x, pos_emb = self.pos_enc(x, offset)
348 |         return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2]
349 | 


--------------------------------------------------------------------------------
/indextts/gpt/perceiver.py:
--------------------------------------------------------------------------------
  1 | # Adapted from https://github.com/lucidrains/naturalspeech2-pytorch/blob/659bec7f7543e7747e809e950cc2f84242fbeec7/naturalspeech2_pytorch/naturalspeech2_pytorch.py#L532
  2 | 
  3 | from collections import namedtuple
  4 | from functools import wraps
  5 | 
  6 | import torch
  7 | import torch.nn.functional as F
  8 | from einops import rearrange, repeat
  9 | from einops.layers.torch import Rearrange
 10 | from packaging import version
 11 | from torch import einsum, nn
 12 | 
 13 | 
 14 | def exists(val):
 15 |     return val is not None
 16 | 
 17 | 
 18 | def once(fn):
 19 |     called = False
 20 | 
 21 |     @wraps(fn)
 22 |     def inner(x):
 23 |         nonlocal called
 24 |         if called:
 25 |             return
 26 |         called = True
 27 |         return fn(x)
 28 | 
 29 |     return inner
 30 | 
 31 | 
 32 | print_once = once(print)
 33 | 
 34 | 
 35 | # main class
 36 | class Attend(nn.Module):
 37 |     def __init__(self, dropout=0.0, causal=False, use_flash=False):
 38 |         super().__init__()
 39 |         self.dropout = dropout
 40 |         self.attn_dropout = nn.Dropout(dropout)
 41 | 
 42 |         self.causal = causal
 43 |         self.register_buffer("mask", None, persistent=False)
 44 | 
 45 |         self.use_flash = use_flash
 46 |         assert not (
 47 |             use_flash and version.parse(torch.__version__) < version.parse("2.0.0")
 48 |         ), "in order to use flash attention, you must be using pytorch 2.0 or above"
 49 | 
 50 |         # determine efficient attention configs for cuda and cpu
 51 |         self.config = namedtuple("EfficientAttentionConfig", ["enable_flash", "enable_math", "enable_mem_efficient"])
 52 |         self.cpu_config = self.config(True, True, True)
 53 |         self.cuda_config = None
 54 | 
 55 |         if not torch.cuda.is_available() or not use_flash:
 56 |             return
 57 | 
 58 |         device_properties = torch.cuda.get_device_properties(torch.device("cuda"))
 59 | 
 60 |         if device_properties.major == 8 and device_properties.minor == 0:
 61 |             print_once("A100 GPU detected, using flash attention if input tensor is on cuda")
 62 |             self.cuda_config = self.config(True, False, False)
 63 |         else:
 64 |             print_once("Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda")
 65 |             self.cuda_config = self.config(False, True, True)
 66 | 
 67 |     def get_mask(self, n, device):
 68 |         if exists(self.mask) and self.mask.shape[-1] >= n:
 69 |             return self.mask[:n, :n]
 70 | 
 71 |         mask = torch.ones((n, n), device=device, dtype=torch.bool).triu(1)
 72 |         self.register_buffer("mask", mask, persistent=False)
 73 |         return mask
 74 | 
 75 |     def flash_attn(self, q, k, v, mask=None):
 76 |         _, heads, q_len, _, k_len, is_cuda = *q.shape, k.shape[-2], q.is_cuda
 77 | 
 78 |         # Recommended for multi-query single-key-value attention by Tri Dao
 79 |         # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64])
 80 | 
 81 |         if k.ndim == 3:
 82 |             k = rearrange(k, "b ... -> b 1 ...").expand_as(q)
 83 | 
 84 |         if v.ndim == 3:
 85 |             v = rearrange(v, "b ... -> b 1 ...").expand_as(q)
 86 | 
 87 |         # Check if mask exists and expand to compatible shape
 88 |         # The mask is B L, so it would have to be expanded to B H N L
 89 | 
 90 |         if exists(mask):
 91 |             mask = rearrange(mask, "b j -> b 1 1 j")
 92 |             mask = mask.expand(-1, heads, q_len, -1)
 93 | 
 94 |         # Check if there is a compatible device for flash attention
 95 | 
 96 |         config = self.cuda_config if is_cuda else self.cpu_config
 97 | 
 98 |         # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale
 99 | 
100 |         with torch.backends.cuda.sdp_kernel(**config._asdict()):
101 |             out = F.scaled_dot_product_attention(
102 |                 q, k, v, attn_mask=mask, dropout_p=self.dropout if self.training else 0.0, is_causal=self.causal
103 |             )
104 | 
105 |         return out
106 | 
107 |     def forward(self, q, k, v, mask=None):
108 |         """
109 |         einstein notation
110 |         b - batch
111 |         h - heads
112 |         n, i, j - sequence length (base sequence length, source, target)
113 |         d - feature dimension
114 |         """
115 | 
116 |         n, device = q.shape[-2], q.device
117 | 
118 |         scale = q.shape[-1] ** -0.5
119 | 
120 |         if self.use_flash:
121 |             return self.flash_attn(q, k, v, mask=mask)
122 | 
123 |         kv_einsum_eq = "b j d" if k.ndim == 3 else "b h j d"
124 | 
125 |         # similarity
126 | 
127 |         sim = einsum(f"b h i d, {kv_einsum_eq} -> b h i j", q, k) * scale
128 | 
129 |         # key padding mask
130 | 
131 |         if exists(mask):
132 |             mask = rearrange(mask, "b j -> b 1 1 j")
133 |             sim = sim.masked_fill(~mask, -torch.finfo(sim.dtype).max)
134 | 
135 |         # causal mask
136 | 
137 |         if self.causal:
138 |             causal_mask = self.get_mask(n, device)
139 |             sim = sim.masked_fill(causal_mask, -torch.finfo(sim.dtype).max)
140 | 
141 |         # attention
142 | 
143 |         attn = sim.softmax(dim=-1)
144 |         attn = self.attn_dropout(attn)
145 | 
146 |         # aggregate values
147 | 
148 |         out = einsum(f"b h i j, {kv_einsum_eq} -> b h i d", attn, v)
149 | 
150 |         return out
151 | 
152 | 
153 | def Sequential(*mods):
154 |     return nn.Sequential(*filter(exists, mods))
155 | 
156 | 
157 | def exists(x):
158 |     return x is not None
159 | 
160 | 
161 | def default(val, d):
162 |     if exists(val):
163 |         return val
164 |     return d() if callable(d) else d
165 | 
166 | 
167 | class RMSNorm(nn.Module):
168 |     def __init__(self, dim, scale=True, dim_cond=None):
169 |         super().__init__()
170 |         self.cond = exists(dim_cond)
171 |         self.to_gamma_beta = nn.Linear(dim_cond, dim * 2) if self.cond else None
172 | 
173 |         self.scale = dim**0.5
174 |         self.gamma = nn.Parameter(torch.ones(dim)) if scale else None
175 | 
176 |     def forward(self, x, cond=None):
177 |         gamma = default(self.gamma, 1)
178 |         out = F.normalize(x, dim=-1) * self.scale * gamma
179 | 
180 |         if not self.cond:
181 |             return out
182 | 
183 |         assert exists(cond)
184 |         gamma, beta = self.to_gamma_beta(cond).chunk(2, dim=-1)
185 |         gamma, beta = map(lambda t: rearrange(t, "b d -> b 1 d"), (gamma, beta))
186 |         return out * gamma + beta
187 | 
188 | 
189 | class CausalConv1d(nn.Conv1d):
190 |     def __init__(self, *args, **kwargs):
191 |         super().__init__(*args, **kwargs)
192 |         (kernel_size,) = self.kernel_size
193 |         (dilation,) = self.dilation
194 |         (stride,) = self.stride
195 | 
196 |         assert stride == 1
197 |         self.causal_padding = dilation * (kernel_size - 1)
198 | 
199 |     def forward(self, x):
200 |         causal_padded_x = F.pad(x, (self.causal_padding, 0), value=0.0)
201 |         return super().forward(causal_padded_x)
202 | 
203 | 
204 | class GEGLU(nn.Module):
205 |     def forward(self, x):
206 |         x, gate = x.chunk(2, dim=-1)
207 |         return F.gelu(gate) * x
208 | 
209 | 
210 | def FeedForward(dim, mult=4, causal_conv=False):
211 |     dim_inner = int(dim * mult * 2 / 3)
212 | 
213 |     conv = None
214 |     if causal_conv:
215 |         conv = nn.Sequential(
216 |             Rearrange("b n d -> b d n"),
217 |             CausalConv1d(dim_inner, dim_inner, 3),
218 |             Rearrange("b d n -> b n d"),
219 |         )
220 | 
221 |     return Sequential(nn.Linear(dim, dim_inner * 2), GEGLU(), conv, nn.Linear(dim_inner, dim))
222 | 
223 | 
224 | class PerceiverResampler(nn.Module):
225 |     def __init__(
226 |         self,
227 |         dim,
228 |         depth=2,
229 |         dim_context=None,
230 |         num_latents=32,
231 |         dim_head=64,
232 |         heads=8,
233 |         ff_mult=4,
234 |         use_flash_attn=False,
235 |     ):
236 |         super().__init__()
237 |         dim_context = default(dim_context, dim)
238 | 
239 |         self.proj_context = nn.Linear(dim_context, dim) if dim_context != dim else nn.Identity()
240 | 
241 |         self.latents = nn.Parameter(torch.randn(num_latents, dim))
242 |         nn.init.normal_(self.latents, std=0.02)
243 | 
244 |         self.layers = nn.ModuleList([])
245 |         for _ in range(depth):
246 |             self.layers.append(
247 |                 nn.ModuleList(
248 |                     [
249 |                         Attention(
250 |                             dim=dim,
251 |                             dim_head=dim_head,
252 |                             heads=heads,
253 |                             use_flash=use_flash_attn,
254 |                             cross_attn_include_queries=True,
255 |                         ),
256 |                         FeedForward(dim=dim, mult=ff_mult),
257 |                     ]
258 |                 )
259 |             )
260 | 
261 |         self.norm = RMSNorm(dim)
262 | 
263 |     def forward(self, x, mask=None):
264 |         batch = x.shape[0]
265 | 
266 |         x = self.proj_context(x)
267 | 
268 |         latents = repeat(self.latents, "n d -> b n d", b=batch)
269 | 
270 |         for attn, ff in self.layers:
271 |             latents = attn(latents, x, mask=mask) + latents
272 |             latents = ff(latents) + latents
273 | 
274 |         return self.norm(latents)
275 | 
276 | 
277 | class Attention(nn.Module):
278 |     def __init__(
279 |         self,
280 |         dim,
281 |         *,
282 |         dim_context=None,
283 |         causal=False,
284 |         dim_head=64,
285 |         heads=8,
286 |         dropout=0.0,
287 |         use_flash=False,
288 |         cross_attn_include_queries=False,
289 |     ):
290 |         super().__init__()
291 |         self.scale = dim_head**-0.5
292 |         self.heads = heads
293 |         self.cross_attn_include_queries = cross_attn_include_queries
294 | 
295 |         dim_inner = dim_head * heads
296 |         dim_context = default(dim_context, dim)
297 | 
298 |         self.attend = Attend(causal=causal, dropout=dropout, use_flash=use_flash)
299 |         self.to_q = nn.Linear(dim, dim_inner, bias=False)
300 |         self.to_kv = nn.Linear(dim_context, dim_inner * 2, bias=False)
301 |         self.to_out = nn.Linear(dim_inner, dim, bias=False)
302 | 
303 |     def forward(self, x, context=None, mask=None):
304 |         h, has_context = self.heads, exists(context)
305 | 
306 |         context = default(context, x)
307 | 
308 |         if has_context and self.cross_attn_include_queries:
309 |             context = torch.cat((x, context), dim=-2)
310 | 
311 |         q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim=-1))
312 |         q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
313 | 
314 |         out = self.attend(q, k, v, mask=mask)
315 | 
316 |         out = rearrange(out, "b h n d -> b n (h d)")
317 |         return self.to_out(out)
318 | 


--------------------------------------------------------------------------------
/indextts/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/utils/__init__.py


--------------------------------------------------------------------------------
/indextts/utils/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/utils/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/utils/__pycache__/arch_util.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/utils/__pycache__/arch_util.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/utils/__pycache__/checkpoint.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/utils/__pycache__/checkpoint.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/utils/__pycache__/common.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/utils/__pycache__/common.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/utils/__pycache__/feature_extractors.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/utils/__pycache__/feature_extractors.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/utils/__pycache__/front.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/utils/__pycache__/front.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/utils/__pycache__/typical_sampling.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/utils/__pycache__/typical_sampling.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/utils/__pycache__/xtransformers.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/utils/__pycache__/xtransformers.cpython-311.pyc


--------------------------------------------------------------------------------
/indextts/utils/arch_util.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | 
  3 | import torch
  4 | import torch.nn as nn
  5 | 
  6 | from indextts.utils.xtransformers import RelativePositionBias
  7 | 
  8 | 
  9 | def zero_module(module):
 10 |     """
 11 |     Zero out the parameters of a module and return it.
 12 |     """
 13 |     for p in module.parameters():
 14 |         p.detach().zero_()
 15 |     return module
 16 | 
 17 | 
 18 | class GroupNorm32(nn.GroupNorm):
 19 |     def forward(self, x):
 20 |         return super().forward(x.float()).type(x.dtype)
 21 | 
 22 | 
 23 | def normalization(channels):
 24 |     """
 25 |     Make a standard normalization layer.
 26 | 
 27 |     :param channels: number of input channels.
 28 |     :return: an nn.Module for normalization.
 29 |     """
 30 |     groups = 32
 31 |     if channels <= 16:
 32 |         groups = 8
 33 |     elif channels <= 64:
 34 |         groups = 16
 35 |     while channels % groups != 0:
 36 |         groups = int(groups / 2)
 37 |     assert groups > 2
 38 |     return GroupNorm32(groups, channels)
 39 | 
 40 | 
 41 | class QKVAttentionLegacy(nn.Module):
 42 |     """
 43 |     A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
 44 |     """
 45 | 
 46 |     def __init__(self, n_heads):
 47 |         super().__init__()
 48 |         self.n_heads = n_heads
 49 | 
 50 |     def forward(self, qkv, mask=None, rel_pos=None):
 51 |         """
 52 |         Apply QKV attention.
 53 | 
 54 |         :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
 55 |         :return: an [N x (H * C) x T] tensor after attention.
 56 |         """
 57 |         bs, width, length = qkv.shape
 58 |         assert width % (3 * self.n_heads) == 0
 59 |         ch = width // (3 * self.n_heads)
 60 |         q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
 61 |         scale = 1 / math.sqrt(math.sqrt(ch))
 62 |         weight = torch.einsum(
 63 |             "bct,bcs->bts", q * scale, k * scale
 64 |         )  # More stable with f16 than dividing afterwards
 65 |         if rel_pos is not None:
 66 |             weight = rel_pos(weight.reshape(bs, self.n_heads, weight.shape[-2], weight.shape[-1])).reshape(bs * self.n_heads, weight.shape[-2], weight.shape[-1])
 67 |         weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
 68 |         if mask is not None:
 69 |             # The proper way to do this is to mask before the softmax using -inf, but that doesn't work properly on CPUs.
 70 |             mask = mask.repeat(self.n_heads, 1).unsqueeze(1)
 71 |             weight = weight * mask
 72 |         a = torch.einsum("bts,bcs->bct", weight, v)
 73 | 
 74 |         return a.reshape(bs, -1, length)
 75 | 
 76 | 
 77 | class AttentionBlock(nn.Module):
 78 |     """
 79 |     An attention block that allows spatial positions to attend to each other.
 80 | 
 81 |     Originally ported from here, but adapted to the N-d case.
 82 |     https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
 83 |     """
 84 | 
 85 |     def __init__(
 86 |         self,
 87 |         channels,
 88 |         num_heads=1,
 89 |         num_head_channels=-1,
 90 |         do_checkpoint=True,
 91 |         relative_pos_embeddings=False,
 92 |     ):
 93 |         super().__init__()
 94 |         self.channels = channels
 95 |         self.do_checkpoint = do_checkpoint
 96 |         if num_head_channels == -1:
 97 |             self.num_heads = num_heads
 98 |         else:
 99 |             assert (
100 |                 channels % num_head_channels == 0
101 |             ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
102 |             self.num_heads = channels // num_head_channels
103 |         self.norm = normalization(channels)
104 |         self.qkv = nn.Conv1d(channels, channels * 3, 1)
105 |         # split heads before split qkv
106 |         self.attention = QKVAttentionLegacy(self.num_heads)
107 | 
108 |         self.proj_out = zero_module(nn.Conv1d(channels, channels, 1))
109 |         if relative_pos_embeddings:
110 |             self.relative_pos_embeddings = RelativePositionBias(scale=(channels // self.num_heads) ** .5, causal=False, heads=num_heads, num_buckets=32, max_distance=64)
111 |         else:
112 |             self.relative_pos_embeddings = None
113 | 
114 |     def forward(self, x, mask=None):
115 |         b, c, *spatial = x.shape
116 |         x = x.reshape(b, c, -1)
117 |         qkv = self.qkv(self.norm(x))
118 |         h = self.attention(qkv, mask, self.relative_pos_embeddings)
119 |         h = self.proj_out(h)
120 |         return (x + h).reshape(b, c, *spatial)
121 | 


--------------------------------------------------------------------------------
/indextts/utils/checkpoint.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang)
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import datetime
16 | import logging
17 | import os
18 | import re
19 | from collections import OrderedDict
20 | 
21 | import torch
22 | import yaml
23 | 
24 | 
25 | def load_checkpoint(model: torch.nn.Module, model_pth: str) -> dict:
26 |     checkpoint = torch.load(model_pth, map_location='cpu')
27 |     checkpoint = checkpoint['model'] if 'model' in checkpoint else checkpoint
28 |     model.load_state_dict(checkpoint, strict=True)
29 |     info_path = re.sub('.pth$', '.yaml', model_pth)
30 |     configs = {}
31 |     if os.path.exists(info_path):
32 |         with open(info_path, 'r') as fin:
33 |             configs = yaml.load(fin, Loader=yaml.FullLoader)
34 |     return configs
35 | 


--------------------------------------------------------------------------------
/indextts/utils/common.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import random
 3 | import re
 4 | 
 5 | import torch
 6 | import torchaudio
 7 | 
 8 | MATPLOTLIB_FLAG = False
 9 | 
10 | 
11 | def load_audio(audiopath, sampling_rate):
12 |     audio, sr = torchaudio.load(audiopath)
13 |     #print(f"wave shape: {audio.shape}, sample_rate: {sr}")
14 | 
15 |     if audio.size(0) > 1:  # mix to mono
16 |         audio = audio[0].unsqueeze(0)
17 | 
18 |     if sr != sampling_rate:
19 |         try:
20 |             audio = torchaudio.functional.resample(audio, sr, sampling_rate)
21 |         except Exception as e:
22 |             print(f"Warning: {audiopath}, wave shape: {audio.shape}, sample_rate: {sr}")
23 |             return None
24 |     # clip audio invalid values
25 |     audio.clip_(-1, 1)
26 |     return audio
27 | 
28 | 
29 | def tokenize_by_CJK_char(line: str) -> str: 
30 |     """  
31 |     Tokenize a line of text with CJK char.
32 | 
33 |     Note: All return charaters will be upper case.
34 | 
35 |     Example:                                                                                                                                                                                                                                                                    
36 |       input = "你好世界是 hello world 的中文"
37 |       output = "你 好 世 界 是 HELLO WORLD 的 中 文"
38 | 
39 |     Args:
40 |       line:
41 |         The input text.
42 | 
43 |     Return:
44 |       A new string tokenize by CJK char.
45 |     """
46 |     # The CJK ranges is from https://github.com/alvations/nltk/blob/79eed6ddea0d0a2c212c1060b477fc268fec4d4b/nltk/tokenize/util.py
47 |     pattern = re.compile(
48 |         r"([\u1100-\u11ff\u2e80-\ua4cf\ua840-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F\uFF65-\uFFDC\U00020000-\U0002FFFF])"
49 |     )    
50 |     chars = pattern.split(line.strip().upper())
51 |     return " ".join([w.strip() for w in chars if w.strip()])
52 | 
53 | 
54 | def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
55 |     """Make mask tensor containing indices of padded part.
56 | 
57 |     See description of make_non_pad_mask.
58 | 
59 |     Args:
60 |         lengths (torch.Tensor): Batch of lengths (B,).
61 |     Returns:
62 |         torch.Tensor: Mask tensor containing indices of padded part.
63 | 
64 |     Examples:
65 |         >>> lengths = [5, 3, 2]
66 |         >>> make_pad_mask(lengths)
67 |         masks = [[0, 0, 0, 0 ,0],
68 |                  [0, 0, 0, 1, 1],
69 |                  [0, 0, 1, 1, 1]]
70 |     """
71 |     batch_size = lengths.size(0)
72 |     max_len = max_len if max_len > 0 else lengths.max().item()
73 |     seq_range = torch.arange(0,
74 |                              max_len,
75 |                              dtype=torch.int64,
76 |                              device=lengths.device)
77 |     seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
78 |     seq_length_expand = lengths.unsqueeze(-1)
79 |     mask = seq_range_expand >= seq_length_expand
80 |     return mask
81 | 
82 | 
83 | def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
84 |     """
85 |     Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.
86 | 
87 |     Args:
88 |         x (Tensor): Input tensor.
89 |         clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.
90 | 
91 |     Returns:
92 |         Tensor: Element-wise logarithm of the input tensor with clipping applied.
93 |     """
94 |     return torch.log(torch.clip(x, min=clip_val))
95 | 


--------------------------------------------------------------------------------
/indextts/utils/feature_extractors.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torchaudio
 3 | from torch import nn
 4 | from indextts.utils.common import safe_log
 5 | 
 6 | 
 7 | class FeatureExtractor(nn.Module):
 8 |     """Base class for feature extractors."""
 9 | 
10 |     def forward(self, audio: torch.Tensor, **kwargs) -> torch.Tensor:
11 |         """
12 |         Extract features from the given audio.
13 | 
14 |         Args:
15 |             audio (Tensor): Input audio waveform.
16 | 
17 |         Returns:
18 |             Tensor: Extracted features of shape (B, C, L), where B is the batch size,
19 |                     C denotes output features, and L is the sequence length.
20 |         """
21 |         raise NotImplementedError("Subclasses must implement the forward method.")
22 | 
23 | 
24 | class MelSpectrogramFeatures(FeatureExtractor):
25 |     def __init__(self, sample_rate=24000, n_fft=1024, hop_length=256, win_length=None,
26 |                  n_mels=100, mel_fmin=0, mel_fmax=None, normalize=False, padding="center"):
27 |         super().__init__()
28 |         if padding not in ["center", "same"]:
29 |             raise ValueError("Padding must be 'center' or 'same'.")
30 |         self.padding = padding
31 |         self.mel_spec = torchaudio.transforms.MelSpectrogram(
32 |             sample_rate=sample_rate,
33 |             n_fft=n_fft,
34 |             hop_length=hop_length,
35 |             win_length=win_length,
36 |             power=1,
37 |             normalized=normalize,
38 |             f_min=mel_fmin,
39 |             f_max=mel_fmax,
40 |             n_mels=n_mels,
41 |             center=padding == "center",
42 |         )
43 | 
44 |     def forward(self, audio, **kwargs):
45 |         if self.padding == "same":
46 |             pad = self.mel_spec.win_length - self.mel_spec.hop_length
47 |             audio = torch.nn.functional.pad(audio, (pad // 2, pad // 2), mode="reflect")
48 |         mel = self.mel_spec(audio)
49 |         mel = safe_log(mel)
50 |         return mel
51 | 


--------------------------------------------------------------------------------
/indextts/utils/typical_sampling.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import importlib.metadata
 3 | 
 4 | # 检查transformers库版本
 5 | try:
 6 |     transformers_version = importlib.metadata.version('transformers')
 7 |     major, minor = map(int, transformers_version.split('.')[:2])
 8 |     use_new_api = (major > 4) or (major == 4 and minor >= 49)
 9 | except (importlib.metadata.PackageNotFoundError, ValueError):
10 |     # 如果无法确定版本，假设使用旧版本API
11 |     use_new_api = False
12 | 
13 | # 根据版本选择正确的导入
14 | if use_new_api:
15 |     try:
16 |         # 在新版本中，LogitsWarper已合并到LogitsProcessor
17 |         from transformers import LogitsProcessor as BaseClass
18 |         print("[IndexTTS] 使用transformers新版API (>= 4.49)，LogitsProcessor")
19 |     except ImportError:
20 |         # 如果新导入失败，尝试旧版本
21 |         from transformers import LogitsWarper as BaseClass
22 |         print("[IndexTTS] 使用transformers旧版API (< 4.49)，LogitsWarper")
23 | else:
24 |     # 旧版本继续使用LogitsWarper
25 |     try:
26 |         from transformers import LogitsWarper as BaseClass
27 |         print("[IndexTTS] 使用transformers旧版API (< 4.49)，LogitsWarper")
28 |     except ImportError:
29 |         # 如果旧导入失败，尝试新版本
30 |         from transformers import LogitsProcessor as BaseClass
31 |         print("[IndexTTS] 使用transformers新版API (>= 4.49)，LogitsProcessor")
32 | 
33 | 
34 | class TypicalLogitsWarper(BaseClass):
35 |     def __init__(self, mass: float = 0.9, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
36 |         self.filter_value = filter_value
37 |         self.mass = mass
38 |         self.min_tokens_to_keep = min_tokens_to_keep
39 | 
40 |     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
41 |         # calculate entropy
42 |         normalized = torch.nn.functional.log_softmax(scores, dim=-1)
43 |         p = torch.exp(normalized)
44 |         ent = -(normalized * p).nansum(-1, keepdim=True)
45 | 
46 |         # shift and sort
47 |         shifted_scores = torch.abs((-normalized) - ent)
48 |         sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False)
49 |         sorted_logits = scores.gather(-1, sorted_indices)
50 |         cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
51 | 
52 |         # Remove tokens with cumulative mass above the threshold
53 |         last_ind = (cumulative_probs < self.mass).sum(dim=1)
54 |         last_ind[last_ind < 0] = 0
55 |         sorted_indices_to_remove = sorted_scores > sorted_scores.gather(1, last_ind.view(-1, 1))
56 |         if self.min_tokens_to_keep > 1:
57 |             # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
58 |             sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
59 |         indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
60 | 
61 |         scores = scores.masked_fill(indices_to_remove, self.filter_value)
62 |         return scores
63 | 


--------------------------------------------------------------------------------
/indextts/utils/webui_utils.py:
--------------------------------------------------------------------------------
 1 | import gradio as gr
 2 | 
 3 | 
 4 | def html_center(text, label='p'):
 5 |     return f"""<div style="text-align: center; margin: 100; padding: 50;">
 6 |                 <{label} style="margin: 0; padding: 0;">{text}</{label}>
 7 |                 </div>"""
 8 | 
 9 | 
10 | def html_left(text, label='p'):
11 |     return f"""<div style="text-align: left; margin: 0; padding: 0;">
12 |                 <{label} style="margin: 0; padding: 0;">{text}</{label}>
13 |                 </div>"""
14 | 
15 | 
16 | def next_page(page_number,sentences):
17 |     new_page_number = int(page_number) + 1
18 |     update_page_number = gr.update(value=str(new_page_number))
19 |     update_prev_page = gr.update(visible=True, interactive=True)
20 |     if len(sentences.values) <= new_page_number * 20:
21 |         update_next_page = gr.update(visible=False, interactive=False)
22 |     else:
23 |         update_next_page = gr.update(visible=True, interactive=True)
24 |     return update_page_number, update_next_page, update_prev_page
25 | 
26 | 
27 | def prev_page(page_number):
28 |     new_page_number = int(page_number) - 1
29 |     update_page_number = gr.update(value=str(new_page_number))
30 |     if new_page_number == 1:
31 |         update_prev_page = gr.update(visible=False, interactive=False)
32 |     else:
33 |         update_prev_page = gr.update(visible=True, interactive=True)
34 |     update_next_page = gr.update(visible=True, interactive=True)
35 |     return update_page_number, update_next_page, update_prev_page
36 | 
37 | 
38 | def update_current_texts(page_number,sentences):
39 |     start_index = (int(page_number) - 1) * 20
40 |     end_index = int(page_number) * 20
41 |     current_texts = sentences.values[start_index:end_index if end_index < len(sentences.values) else len(sentences.values)]
42 |     return gr.update(values=current_texts)
43 | 


--------------------------------------------------------------------------------
/indextts/vqvae/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenpipi0807/ComfyUI-Index-TTS/d650c381d8622ed5bb8d73b20988074d7f8d3c21/indextts/vqvae/__init__.py


--------------------------------------------------------------------------------
/llm_prompt模板.txt:
--------------------------------------------------------------------------------
 1 | ### 指令
 2 | 你是一个专业的小说文本结构化处理器。请严格按以下规则处理输入文本：
 3 | 1. 将文本拆分为`<Narrator>`叙述段落和`<CharacterX>`角色对话
 4 | 2. 角色分配规则：
 5 |    - 同一角色始终使用相同Character编号（如王野始终是<Character1>）
 6 |    - 新角色首次出现时自动分配新编号
 7 |    - 角色识别优先级：角色名 > 代词(他/她) > 特征描述
 8 | 3. 文本分类规则：
 9 |    - 直接引语归入角色标签
10 |    - 动作/环境/心理描写归入Narrator
11 |    - 对话引导语（如"王野说道："）归入Narrator
12 | 4. 输出格式要求：
13 |    - 每段独立一行，用指定标签包裹
14 |    - 严格保持原文标点和换行
15 |    - 不添加任何额外说明或注释
16 | 
17 | ### 输出示例
18 | 输入文本：
19 | '''
20 | 少女站在院墙边。
21 | "你好吗？"少年问道。
22 | 她轻声回答："我很好。"
23 | '''
24 | 
25 | 正确输出：
26 | <Narrator>少女站在院墙边。</Narrator>
27 | <Narrator>少年问道：</Narrator>
28 | <Character1>“你好吗？”</Character1>
29 | <Narrator>她轻声回答：</Narrator>
30 | <Character2>“我很好。”</Character2>
31 | 
32 | ### 当前任务
33 | 请处理以下小说内容：
34 | '''
35 | {{此处粘贴你的小说文本}}
36 | '''


--------------------------------------------------------------------------------
/novel_text_parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import numpy as np
  3 | import torch
  4 | 
  5 | class NovelTextParser:
  6 |     """
  7 |     解析小说文本，将其结构化为不同Character的对话和旁白
  8 |     """
  9 |     
 10 |     def __init__(self):
 11 |         # Character映射表 {Character标识: CharacterID}
 12 |         self.role_map = {}
 13 |         self.next_role_id = 1
 14 |         # 说话动词库（可扩展）
 15 |         self.speech_verbs = {"说", "问道", "喊道", "低声", "问", "答", "笑", "叹", "回应", "回答", "响起", "笑道", "道", 
 16 |                            "说道", "叫道", "念道", "解释", "回道", "吼道", "喃喃", "感叹", "插嘴", "呢喃", "咆哮", 
 17 |                            "呐喊", "哭诉", "嘟囔", "嘀咕", "抱怨", "打断", "反驳", "辩解", "质问", "追问", "附和", 
 18 |                            "应和", "赞同", "反对", "嗤笑", "冷笑", "大笑", "苦笑", "微笑", "轻声", "高声", "尖叫", 
 19 |                            "嚷嚷", "嚎叫", "呻吟", "哀叹", "感慨", "嘱咐", "命令", "告诫", "劝告", "建议", "提醒", 
 20 |                            "强调", "补充", "继续", "沉思", "自语", "喊", "讲", "谈", "评论", "议论", "宣布", "声明", 
 21 |                            "陈述", "表示", "暗示", "指出", "分析", "总结", "回忆", "思考", "开口", "呼唤", "祈求", 
 22 |                            "哀求", "恳求", "央求", "嘲讽", "讥讽", "挖苦", "调侃", "戏谑", "调笑", "戏弄", "吐槽"}
 23 |         
 24 |         # 常见非人名词汇，用于过滤
 25 |         self.non_character_words = {"这个", "那个", "他", "她", "它", "你", "我", "那", "这", "其", "某", 
 26 |                                  "谁", "哪", "是", "有", "没", "就", "可", "能", "会", "个", "的", "了",
 27 |                                  "着", "被", "让", "给"}
 28 |         
 29 |         # 角色标签模式
 30 |         self.role_tag_pattern = re.compile(r'<(Narrator|Character\d+)>')
 31 |         
 32 |     def _is_preformatted(self, text):
 33 |         """检测文本是否已经是预格式化的（已包含角色标签）
 34 |         
 35 |         Args:
 36 |             text: 输入文本
 37 |             
 38 |         Returns:
 39 |             bool: 是否预格式化
 40 |         """
 41 |         # 查找是否至少有一个角色标签
 42 |         tags_found = len(re.findall(self.role_tag_pattern, text)) > 0
 43 |         return tags_found
 44 |         
 45 |     def parse_text(self, text):
 46 |         """
 47 |         解析文本，将其结构化为不同Character的对话和旁白
 48 |         
 49 |         Args:
 50 |             text: 输入的小说文本
 51 |             
 52 |         Returns:
 53 |             structured_text: 结构化后的文本 (包含Character标签)
 54 |         """
 55 |         # 检测是否已经是预格式化的文本
 56 |         if self._is_preformatted(text):
 57 |             print("[Novel Text Parser] Detected pre-formatted text with role tags, preserving as-is")
 58 |             # 将预格式化文本转换为结构化格式
 59 |             segments = []
 60 |             current_idx = 0
 61 |             
 62 |             # 遍历所有标签匹配
 63 |             for match in re.finditer(self.role_tag_pattern, text):
 64 |                 role = match.group(1)  # 角色名（Narrator或CharacterX）
 65 |                 start_idx = match.end()  # 标签后的开始索引
 66 |                 
 67 |                 # 查找下一个标签的开始位置
 68 |                 next_tag = re.search(self.role_tag_pattern, text[start_idx:])
 69 |                 if next_tag:
 70 |                     end_idx = start_idx + next_tag.start()
 71 |                 else:
 72 |                     end_idx = len(text)
 73 |                     
 74 |                 # 提取文本内容
 75 |                 content = text[start_idx:end_idx]
 76 |                 segments.append({"type": role, "text": content})
 77 |                 
 78 |             return segments
 79 |         
 80 |         # 预处理：按段落分割
 81 |         paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
 82 |         structured = []
 83 |         
 84 |         for para in paragraphs:
 85 |             # 1. 先检测引号和对话模式（更完善的引号判断）
 86 |             # 模式1："人物说道：“对话”" 或 "人物道：“对话”"
 87 |             dialogue_match = re.search(r'(.+?)(?:\s*[\u8bf4|\u9053].+?[\uff1a|"])["|\u201c](.+?)["|\u201d]', para)
 88 |             
 89 |             if dialogue_match:
 90 |                 context, dialogue = dialogue_match.groups()
 91 |                 # 从上下文中识别角色
 92 |                 role_id = self._identify_speaker(context)
 93 |                 structured.append({"type": "Narrator", "text": context})
 94 |                 structured.append({"type": role_id, "text": dialogue.strip()})
 95 |                 
 96 |             # 模式2：纯引号对话“对话”
 97 |             elif quote_match := re.search(r'["|\u201c](.+?)["|\u201d]', para):
 98 |                 dialogue = quote_match.group(1)
 99 |                 # 提取引号前后的上下文
100 |                 pre_context = para[:quote_match.start()]
101 |                 post_context = para[quote_match.end():]
102 |                 
103 |                 # 如果引号后有说话者描述，优先使用
104 |                 if post_context and any(verb in post_context for verb in self.speech_verbs):
105 |                     role_id = self._identify_speaker(post_context)
106 |                     structured.append({"type": role_id, "text": dialogue.strip()})
107 |                     structured.append({"type": "Narrator", "text": post_context.strip()})
108 |                 # 如果引号前有说话者描述和动词
109 |                 elif pre_context and any(verb in pre_context for verb in self.speech_verbs):
110 |                     role_id = self._identify_speaker(pre_context)
111 |                     structured.append({"type": "Narrator", "text": pre_context.strip()})
112 |                     structured.append({"type": role_id, "text": dialogue.strip()})
113 |                 # 如果无法确定说话者，将整段文本当作旁白
114 |                 else:
115 |                     structured.append({"type": "Narrator", "text": para})
116 |             # 模式3：纯叙述文本
117 |             else:
118 |                 structured.append({"type": "Narrator", "text": para})
119 |                 
120 |         return structured
121 |     
122 |     def format_structured_text(self, structured):
123 |         """
124 |         将结构化的文本格式化为带标签的文本
125 |         
126 |         Args:
127 |             structured: 结构化的文本列表
128 |             
129 |         Returns:
130 |             formatted_text: 格式化后的带标签文本
131 |         """
132 |         formatted = []
133 |         for item in structured:
134 |             text_type = item["type"]
135 |             text = item["text"]
136 |             if text_type == "Narrator":
137 |                 formatted.append(f"<Narrator>{text}")
138 |             else:
139 |                 # 确保CharacterID格式为 "Character1", "Character2" 等
140 |                 if text_type.startswith("Character") and text_type[2:].isdigit():
141 |                     formatted.append(f"<{text_type}>{text}")
142 |                 else:
143 |                     # 默认情况下，尝试将Character映射到Character1-5
144 |                     role_num = int(text_type[2:]) if text_type[2:].isdigit() else 1
145 |                     if 1 <= role_num <= 5:
146 |                         formatted.append(f"<Character{role_num}>{text}")
147 |                     else:
148 |                         formatted.append(f"<Narrator>{text}")
149 |         
150 |         return "".join(formatted)
151 |     
152 |     def _is_direct_speech(self, text):
153 |         # 检测引导词或直接引号
154 |         quotes = any(c in text for c in ['"', '"', '"', "'", "'", "'"])
155 |         has_verb = any(verb in text for verb in self.speech_verbs)
156 |         return quotes or has_verb
157 |     
158 |     def _extract_dialogue(self, text):
159 |         # 提取引号内内容
160 |         if match := re.search(r'[""](.+?)[""]', text):
161 |             dialogue = match.group(1)
162 |             # 从上下文识别Character
163 |             context = text.replace(f'"{dialogue}"', '').replace(f'"{dialogue}"', '')
164 |             return self._identify_role(context), dialogue
165 |         return self._identify_role(text), text
166 |     
167 |     def _identify_speaker(self, context):
168 |         """Enhanced speaker identification from dialogue context
169 |         
170 |         Args:
171 |             context: surrounding text context
172 |             
173 |         Returns:
174 |             role_id: the identified speaker's role ID
175 |         """
176 |         # 1. 检测已知Character名
177 |         for name, role_id in self.role_map.items():
178 |             if name in context:
179 |                 return role_id
180 |                 
181 |         # 2. 尝试匹配可能的中文人名模式
182 |         # 匹配姓名常见的 2-3 字的名字，及姓名前后带有说话动词的
183 |         name_match = re.search(r'([一-龥]{2,3})(?:[^\n\r]{0,10}[\u8bf4\u9053])', context)
184 |         if name_match:
185 |             new_role = name_match.group(1).strip()
186 |             # 过滤掉常见的非人名词汇
187 |             if new_role and new_role not in self.non_character_words:
188 |                 role_id = f"Character{min(self.next_role_id, 5)}"  # 限制最多到Character5
189 |                 print(f"[Novel Text Parser] Identified new character: {new_role} as {role_id}")
190 |                 self.role_map[new_role] = role_id
191 |                 self.next_role_id = min(self.next_role_id + 1, 6)  # 最多到Character5
192 |                 return role_id
193 |         
194 |         # 3. 更广泛的匹配 - 寻找符合中文人名特征的词语
195 |         characters = re.findall(r'[一-龥]{2,3}', context)
196 |         for char in characters:
197 |             # 过滤掉常见的非人名词汇
198 |             if len(char) >= 2 and char not in self.non_character_words:
199 |                 role_id = f"Character{min(self.next_role_id, 5)}"
200 |                 print(f"[Novel Text Parser] Inferred character name: {char} as {role_id}")
201 |                 self.role_map[char] = role_id
202 |                 self.next_role_id = min(self.next_role_id + 1, 6)
203 |                 return role_id
204 |                 
205 |         # 4. 默认处理
206 |         return "Narrator"
207 | 
208 |     def _identify_role(self, context):
209 |         # 向后兼容保留的方法，调用新的方法
210 |         return self._identify_speaker(context)
211 | 
212 | 
213 | # ComfyUI节点：小说文本结构化节点
214 | class NovelTextStructureNode:
215 |     """
216 |     ComfyUI的小说文本结构化节点，用于将小说文本结构化为不同Character的对话和旁白
217 |     """
218 |     
219 |     @classmethod
220 |     def INPUT_TYPES(cls):
221 |         return {
222 |             "required": {
223 |                 "novel_text": ("STRING", {"multiline": True, "default": 'Novel text example.\nLin Wei said, "Hello there."\nSu Qing replied, "Long time no see."\n'}),
224 |             }
225 |         }
226 |     
227 |     RETURN_TYPES = ("STRING",)
228 |     RETURN_NAMES = ("structured_text",)
229 |     FUNCTION = "structure_novel_text"
230 |     CATEGORY = "text/novels"
231 |     
232 |     def __init__(self):
233 |         self.parser = NovelTextParser()
234 |         print("[Novel Text Structure Node] Initialization completed")
235 |     
236 |     def structure_novel_text(self, novel_text):
237 |         """
238 |         将小说文本结构化为不同Character的对话和旁白
239 |         
240 |         Args:
241 |             novel_text: 输入的小说文本
242 |             
243 |         Returns:
244 |             structured_text: 结构化后的文本
245 |         """
246 |         try:
247 |             print(f"[Novel Text Structure] Processing novel text, length: {len(novel_text)}")
248 |             
249 |             # 解析文本
250 |             structured = self.parser.parse_text(novel_text)
251 |             
252 |             # Character统计
253 |             role_stats = {}
254 |             for item in structured:
255 |                 role = item["type"]
256 |                 if role not in role_stats:
257 |                     role_stats[role] = 0
258 |                 role_stats[role] += 1
259 |                 
260 |             print(f"[Novel Text Structure] 解析完成，识别到Character统计: {role_stats}")
261 |             
262 |             # 格式化为带标签的文本
263 |             formatted_text = self.parser.format_structured_text(structured)
264 |             print(f"[Novel Text Structure] Formatting completed, output text length: {len(formatted_text)}")
265 |             
266 |             # 输出示例
267 |             preview = formatted_text[:200] + "..." if len(formatted_text) > 200 else formatted_text
268 |             print(f"[Novel Text Structure] Output text preview: {preview}")
269 |             
270 |             return (formatted_text,)
271 |             
272 |         except Exception as e:
273 |             import traceback
274 |             print(f"[Novel Text Structure] Processing failed: {e}")
275 |             print(traceback.format_exc())
276 |             # 失败时返回原文本
277 |             return (novel_text,)
278 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "index-tts"
 3 | description = "NODES: An industrial-grade zero-shot text-to-speech synthesis system with a ComfyUI interface."
 4 | version = "1.0.0"
 5 | license = {file = "LICENSE"}
 6 | dependencies = ["torch>=1.12.0", "numpy>=1.20.0", "scipy>=1.7.0", "librosa>=0.8.0", "soundfile>=0.10.0", "PyYAML>=6.0", "pynini==2.1.5; platform_system!=\"Windows\"", "WeTextProcessing>=1.0.3; platform_system!=\"Windows\"", "modelscope"]
 7 | 
 8 | [project.urls]
 9 | Repository = "https://github.com/chenpipi0807/ComfyUI-Index-TTS"
10 | #  Used by Comfy Registry https://comfyregistry.org
11 | 
12 | [tool.comfy]
13 | PublisherId = ""
14 | DisplayName = "ComfyUI-Index-TTS"
15 | Icon = ""
16 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | torch>=1.12.0
 2 | numpy>=1.20.0
 3 | scipy>=1.7.0
 4 | librosa>=0.10.0
 5 | soundfile>=0.10.0
 6 | PyYAML>=6.0
 7 | pynini==2.1.6; platform_system!="Windows"
 8 | WeTextProcessing>=1.0.3; platform_system!="Windows"
 9 | omegaconf>=2.3.0
10 | 


--------------------------------------------------------------------------------
/timbre_audio_loader.py:
--------------------------------------------------------------------------------
  1 | """
  2 | @title: Timbre Audio Loader
  3 | @author: ComfyUI-Index-TTS
  4 | @description: 用于加载Timbre模型目录下的音频文件的节点
  5 | """
  6 | 
  7 | import os
  8 | import sys
  9 | import hashlib
 10 | import torchaudio
 11 | import torch
 12 | import glob
 13 | from pathlib import Path
 14 | import folder_paths
 15 | 
 16 | # 获取当前目录
 17 | current_dir = os.path.dirname(os.path.abspath(__file__))
 18 | if current_dir not in sys.path:
 19 |     sys.path.append(current_dir)
 20 | 
 21 | # 直接使用torchaudio功能，不再导入额外函数
 22 | 
 23 | class TimbreAudioLoader:
 24 |     """
 25 |     ComfyUI节点: 从Timbre模型目录加载音频样本文件，支持刷新列表
 26 |     """
 27 |     
 28 |     # 保存扫描的音频文件缓存
 29 |     audio_files_cache = []
 30 |     
 31 |     @classmethod
 32 |     def INPUT_TYPES(cls):
 33 |         # 定义Timbre模型目录路径 - 使用项目内的目录
 34 |         timbre_dir = os.path.join(current_dir, "TimbreModel")
 35 |         
 36 |         # 确保目录存在
 37 |         os.makedirs(timbre_dir, exist_ok=True)
 38 |         
 39 |         # 扫描所有支持的音频文件
 40 |         cls.scan_audio_files(timbre_dir)
 41 |         
 42 |         return {
 43 |             "required": {
 44 |                 "audio_file": (cls.audio_files_cache, ),
 45 |                 "refresh": ("BOOLEAN", {"default": False, "label": "刷新音频列表"})
 46 |             }
 47 |         }
 48 |     
 49 |     @classmethod
 50 |     def scan_audio_files(cls, directory):
 51 |         """扫描目录下的所有音频文件"""
 52 |         # 支持的音频格式模式（Windows不区分大小写）
 53 |         audio_patterns = ["**/*.wav", "**/*.mp3", "**/*.ogg", "**/*.flac"]
 54 |         
 55 |         # 初始化音频文件缓存
 56 |         cls.audio_files_cache = ["无音频文件"] # 默认选项
 57 |         
 58 |         # 检查目录是否存在
 59 |         if not os.path.exists(directory):
 60 |             print(f"[TimbreAudioLoader] 警告: 目录不存在: {directory}")
 61 |             return
 62 |         
 63 |         # 使用集合来确保文件名唯一性
 64 |         unique_filenames = set()
 65 |         audio_files = []
 66 |         
 67 |         # 扫描所有音频文件
 68 |         for pattern in audio_patterns:
 69 |             # 使用递归模式搜索
 70 |             matches = glob.glob(os.path.join(directory, pattern), recursive=True)
 71 |             for file_path in matches:
 72 |                 # 提取文件名（不包含路径）
 73 |                 file_name = os.path.basename(file_path)
 74 |                 # 只添加尚未添加的文件名
 75 |                 if file_name.lower() not in unique_filenames:
 76 |                     unique_filenames.add(file_name.lower())
 77 |                     audio_files.append(file_path)
 78 |         
 79 |         # 将收集到的文件添加到缓存
 80 |         if audio_files:
 81 |             # 按文件名排序
 82 |             audio_files.sort(key=lambda x: os.path.basename(x).lower())
 83 |             
 84 |             # 添加文件名到缓存
 85 |             for file_path in audio_files:
 86 |                 file_name = os.path.basename(file_path)
 87 |                 cls.audio_files_cache.append(file_name)
 88 |             
 89 |             print(f"[TimbreAudioLoader] 已加载 {len(cls.audio_files_cache)-1} 个音频文件")
 90 |         else:
 91 |             print(f"[TimbreAudioLoader] 警告: 未找到音频文件，路径: {directory}")
 92 |     
 93 |     RETURN_TYPES = ("AUDIO", )
 94 |     FUNCTION = "load_timbre_audio"
 95 |     CATEGORY = "audio"
 96 |     
 97 |     def load_timbre_audio(self, audio_file, refresh):
 98 |         """加载选择的音频文件或刷新列表"""
 99 |         # 定义Timbre模型目录路径 - 使用项目内的目录
100 |         timbre_dir = os.path.join(current_dir, "TimbreModel")
101 |         
102 |         # 如果用户点击了刷新按钮
103 |         if refresh:
104 |             self.__class__.scan_audio_files(timbre_dir)
105 |             print("[TimbreAudioLoader] 已刷新音频文件列表")
106 |         
107 |         # 如果选择了"无音频文件"或列表为空，返回空的音频数据
108 |         if audio_file == "无音频文件" or not audio_file:
109 |             # 创建一个小的空音频样本
110 |             waveform = torch.zeros((1, 16000))  # 1秒静音
111 |             sample_rate = 16000
112 |             return ({"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate}, )
113 |         
114 |         # 构建完整的文件路径
115 |         file_path = os.path.join(timbre_dir, audio_file)
116 |         
117 |         try:
118 |             # 使用torchaudio加载音频
119 |             waveform, sample_rate = torchaudio.load(file_path)
120 |             
121 |             # 返回ComfyUI音频格式
122 |             return ({"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate}, )
123 |         except Exception as e:
124 |             print(f"[TimbreAudioLoader] 加载音频失败: {e}")
125 |             # 发生错误时返回空音频
126 |             waveform = torch.zeros((1, 16000))
127 |             sample_rate = 16000
128 |             return ({"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate}, )
129 |     
130 |     @classmethod
131 |     def IS_CHANGED(cls, audio_file, refresh):
132 |         """当输入变化时通知ComfyUI"""
133 |         # 如果点击了刷新按钮，返回随机值以触发节点更新
134 |         if refresh:
135 |             return str(os.urandom(8).hex())
136 |         
137 |         # 如果选择了有效的音频文件，返回文件路径作为变化标识
138 |         if audio_file != "无音频文件":
139 |             timbre_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(current_dir))), "models", "Index-TTS", "timbre")
140 |             file_path = os.path.join(timbre_dir, audio_file)
141 |             
142 |             # 检查文件是否存在
143 |             if os.path.exists(file_path):
144 |                 # 计算文件哈希值，用于标识变化
145 |                 m = hashlib.sha256()
146 |                 with open(file_path, 'rb') as f:
147 |                     m.update(f.read())
148 |                 return m.digest().hex()
149 |         
150 |         return audio_file
151 | 
152 | class RefreshTimbreAudio:
153 |     """
154 |     简单的刷新Timbre音频列表节点
155 |     """
156 |     
157 |     @classmethod
158 |     def INPUT_TYPES(cls):
159 |         return {
160 |             "required": {
161 |                 "refresh": ("BOOLEAN", {"default": True, "label": "刷新音频列表"})
162 |             }
163 |         }
164 |     
165 |     RETURN_TYPES = ()
166 |     FUNCTION = "refresh"
167 |     CATEGORY = "audio"
168 |     OUTPUT_NODE = True
169 |     
170 |     def refresh(self, refresh):
171 |         if refresh:
172 |             # 定义Timbre模型目录路径
173 |             timbre_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(current_dir))), "models", "Index-TTS", "timbre")
174 |             
175 |             # 刷新TimbreAudioLoader中的缓存
176 |             TimbreAudioLoader.scan_audio_files(timbre_dir)
177 |             print("[RefreshTimbreAudio] 已刷新音频文件列表")
178 |         
179 |         return {}
180 | 


--------------------------------------------------------------------------------
/tts_nodes/__init__.py:
--------------------------------------------------------------------------------
1 | # TTS nodes module
2 | from .tts_node import IndexTTSNode
3 | from .audio_preview import AudioPreviewNode
4 | 


--------------------------------------------------------------------------------
/tts_nodes/audio_preview.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import numpy as np
  4 | import tempfile
  5 | import base64
  6 | 
  7 | # 确保模块可被找到
  8 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  9 | 
 10 | # 确保导入路径正确
 11 | package_root = os.path.dirname(os.path.dirname(__file__))
 12 | if package_root not in sys.path:
 13 |     sys.path.append(package_root)
 14 | 
 15 | # 导入工具函数
 16 | from utils.audio_utils import save_audio
 17 | 
 18 | class AudioPreviewNode:
 19 |     """
 20 |     音频预览节点，用于在ComfyUI界面中预览和播放音频
 21 |     """
 22 |     
 23 |     @classmethod
 24 |     def INPUT_TYPES(cls):
 25 |         return {
 26 |             "required": {
 27 |                 "audio": ("AUDIO",),
 28 |                 "filename_prefix": ("STRING", {"default": "audio"}),
 29 |                 "autoplay": (["True", "False"], {"default": "True"}),
 30 |             },
 31 |             "optional": {
 32 |                 "save_path": ("STRING", {"default": ""})
 33 |             }
 34 |         }
 35 |     
 36 |     RETURN_TYPES = ()
 37 |     FUNCTION = "preview_audio"
 38 |     OUTPUT_NODE = True
 39 |     CATEGORY = "audio"
 40 | 
 41 |     def preview_audio(self, audio, filename_prefix="audio", autoplay="True", save_path=""):
 42 |         """
 43 |         处理并预览音频
 44 |         
 45 |         参数:
 46 |             audio: 音频数据元组 (音频数据, 采样率)
 47 |             filename_prefix: 输出文件名前缀
 48 |             autoplay: 是否自动播放
 49 |             save_path: 可选的保存路径
 50 |             
 51 |         返回:
 52 |             dict: UI显示字典
 53 |         """
 54 |         audio_data, sample_rate = audio
 55 |         
 56 |         # 保存音频到临时文件
 57 |         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
 58 |             save_audio(audio_data, sample_rate, temp_file.name)
 59 |             temp_path = temp_file.name
 60 |             
 61 |             # 如果提供了保存路径，将音频保存到指定位置
 62 |             if save_path:
 63 |                 try:
 64 |                     os.makedirs(os.path.dirname(save_path), exist_ok=True)
 65 |                     final_path = os.path.join(save_path, f"{filename_prefix}.wav")
 66 |                     save_audio(audio_data, sample_rate, final_path)
 67 |                     save_message = f"音频已保存到: {final_path}"
 68 |                 except Exception as e:
 69 |                     save_message = f"保存音频失败: {e}"
 70 |             else:
 71 |                 save_message = ""
 72 |                 
 73 |             # 获取音频时长
 74 |             duration = len(audio_data) / sample_rate
 75 |             
 76 |             # 生成Web界面HTML代码
 77 |             autoplay_attr = "autoplay" if autoplay == "True" else ""
 78 |             
 79 |             # 获取文件的相对URL (适用于ComfyUI的文件服务)
 80 |             import urllib.parse
 81 |             
 82 |             # 使用临时文件URL路径
 83 |             filename = os.path.basename(temp_path)
 84 |             file_url = f"file/{urllib.parse.quote(filename)}"
 85 |             
 86 |             # 创建HTML音频播放器
 87 |             html_embed = f"""
 88 |             <div style="padding: 10px; border: 1px solid #ddd; border-radius: 5px; margin: 10px 0;">
 89 |                 <h3 style="margin: 0 0 10px 0;">音频预览</h3>
 90 |                 <audio controls {autoplay_attr} style="width: 100%;">
 91 |                     <source src="/view?filename={file_url}" type="audio/wav">
 92 |                     您的浏览器不支持音频播放
 93 |                 </audio>
 94 |                 <div style="display: flex; justify-content: space-between; margin-top: 5px;">
 95 |                     <span>采样率: {sample_rate} Hz</span>
 96 |                     <span>时长: {duration:.2f} 秒</span>
 97 |                 </div>
 98 |                 {f'<div style="margin-top: 5px;">{save_message}</div>' if save_message else ''}
 99 |             </div>
100 |             """
101 |             
102 |         # 返回UI元素
103 |         return {"ui": {"audio": html_embed}}
104 |         
105 |     @classmethod
106 |     def IS_CHANGED(cls, audio, filename_prefix, autoplay, save_path=""):
107 |         # 用于判断节点输入是否变化的辅助函数
108 |         # 对于输出节点，我们总是返回True确保UI更新
109 |         return True
110 | 


--------------------------------------------------------------------------------
/tts_nodes/tts_node.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import torch
  4 | import numpy as np
  5 | import tempfile
  6 | import json
  7 | import time
  8 | 
  9 | # 确保模块可被找到
 10 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 11 | 
 12 | # 确保导入路径正确
 13 | package_root = os.path.dirname(os.path.dirname(__file__))
 14 | if package_root not in sys.path:
 15 |     sys.path.append(package_root)
 16 | 
 17 | # 导入工具函数
 18 | from utils.audio_utils import load_audio, save_audio, get_temp_file
 19 | 
 20 | # 导入ComfyUI folder_paths用于获取模型目录
 21 | import folder_paths
 22 | 
 23 | # 添加索引TTS路径
 24 | INDEX_TTS_PATH = os.path.join(folder_paths.models_dir, "Index-TTS")
 25 | sys.path.append(INDEX_TTS_PATH)
 26 | 
 27 | # 尝试加载IndexTTS的必要依赖
 28 | try:
 29 |     # 如果直接导入indextts包失败，我们将模拟其核心功能
 30 |     # 因为原始代码可能不会直接可用，我们在这里实现一个简单的包装器
 31 |     class IndexTTS:
 32 |         def __init__(self, model_dir=None, cfg_path=None):
 33 |             """
 34 |             初始化IndexTTS模型
 35 |             
 36 |             参数:
 37 |                 model_dir: 模型目录
 38 |                 cfg_path: 配置文件路径
 39 |             """
 40 |             import importlib.util
 41 |             import torch
 42 |             import os
 43 |             
 44 |             self.model_dir = model_dir if model_dir else INDEX_TTS_PATH
 45 |             self.cfg_path = cfg_path if cfg_path else os.path.join(self.model_dir, "config.yaml")
 46 |             
 47 |             # 检查模型文件是否存在
 48 |             required_files = [
 49 |                 "bigvgan_discriminator.pth", "bigvgan_generator.pth", 
 50 |                 "bpe.model", "dvae.pth", "gpt.pth", 
 51 |                 "unigram_12000.vocab", "config.yaml"
 52 |             ]
 53 |             
 54 |             for file in required_files:
 55 |                 if not os.path.exists(os.path.join(self.model_dir, file)):
 56 |                     raise FileNotFoundError(f"模型文件 {file} 未找到，请确保已下载模型文件到 {self.model_dir}")
 57 |             
 58 |             # 加载Config
 59 |             import yaml
 60 |             with open(self.cfg_path, 'r', encoding='utf-8') as f:
 61 |                 self.config = yaml.safe_load(f)
 62 |                 
 63 |             print(f"成功初始化IndexTTS模型, 模型目录: {self.model_dir}")
 64 |             
 65 |             # 尝试导入indextts模块
 66 |             try:
 67 |                 import indextts
 68 |                 self.model = indextts.infer.IndexTTS(model_dir=self.model_dir, cfg_path=self.cfg_path)
 69 |                 self.use_original = True
 70 |                 print("使用原始IndexTTS模块")
 71 |             except ImportError:
 72 |                 # 如果无法导入，使用自定义实现
 73 |                 print("无法导入原始IndexTTS模块，使用自定义实现")
 74 |                 self.use_original = False
 75 |                 self._init_pipeline()
 76 |                 
 77 |         def _init_pipeline(self):
 78 |             """初始化语音合成管道"""
 79 |             # 这里应该加载所有必要的模型组件
 80 |             # 由于完整实现较为复杂，这里是一个简化的示例
 81 |             pass
 82 |             
 83 |         def infer(self, reference_voice, text, output_path, language="auto", speed=1.0):
 84 |             """
 85 |             使用参考声音生成语音
 86 |             
 87 |             参数:
 88 |                 reference_voice: 参考声音文件路径
 89 |                 text: 要合成的文本
 90 |                 output_path: 输出音频文件路径
 91 |                 language: 语言代码
 92 |                 speed: 语速，默认1.0
 93 |             """
 94 |             if self.use_original:
 95 |                 # 使用原始IndexTTS实现
 96 |                 self.model.infer(reference_voice, text, output_path, language=language, speed=speed)
 97 |             else:
 98 |                 # 使用自定义实现 - 这里是一个简单的占位实现
 99 |                 # 在实际应用中，应该完整实现音频合成逻辑
100 |                 raise NotImplementedError("自定义实现尚未完成，请安装原始的IndexTTS模块")
101 |             
102 |             return output_path
103 | 
104 | except ImportError as e:
105 |     print(f"导入IndexTTS相关模块失败: {e}")
106 |     print("请确保已安装所有必要的依赖")
107 | 
108 | 
109 | class IndexTTSNode:
110 |     """
111 |     ComfyUI的IndexTTS节点，用于文本到语音合成
112 |     """
113 |     
114 |     @classmethod
115 |     def INPUT_TYPES(cls):
116 |         return {
117 |             "required": {
118 |                 "text": ("STRING", {"multiline": True, "default": "你好，我是IndexTTS语音合成系统。"}),
119 |                 "reference_audio": ("AUDIO",),
120 |                 "language": (["auto", "zh", "en", "ja", "ko"], {"default": "auto"}),
121 |                 "speed": ("FLOAT", {"default": 1.0, "min": 0.5, "max": 2.0, "step": 0.1}),
122 |             }
123 |         }
124 |     
125 |     RETURN_TYPES = ("AUDIO",)
126 |     RETURN_NAMES = ("synthesized_audio",)
127 |     FUNCTION = "generate_speech"
128 |     CATEGORY = "audio/tts"
129 |     
130 |     def __init__(self):
131 |         # 获取模型目录
132 |         self.model_dir = INDEX_TTS_PATH
133 |         self.cfg_path = os.path.join(self.model_dir, "config.yaml")
134 |         
135 |         # 检查模型目录是否存在
136 |         if not os.path.exists(self.model_dir):
137 |             print(f"\033[91m错误: 未找到模型目录 {self.model_dir}\033[0m")
138 |             print(f"\033[91m请确保已下载模型文件到 {self.model_dir}\033[0m")
139 |         
140 |         # 延迟初始化模型，直到实际需要时
141 |         self.tts_model = None
142 |     
143 |     def _init_model(self):
144 |         """初始化TTS模型（延迟加载）"""
145 |         if self.tts_model is None:
146 |             try:
147 |                 self.tts_model = IndexTTS(model_dir=self.model_dir, cfg_path=self.cfg_path)
148 |                 print(f"模型已成功加载，模型目录: {self.model_dir}")
149 |             except Exception as e:
150 |                 print(f"初始化TTS模型失败: {e}")
151 |                 raise RuntimeError(f"初始化TTS模型失败: {e}")
152 |     
153 |     def generate_speech(self, text, reference_audio, language="auto", speed=1.0):
154 |         """
155 |         生成语音的主函数
156 |         
157 |         参数:
158 |             text: 要合成的文本
159 |             reference_audio: 参考音频元组 (音频数据, 采样率)
160 |             language: 语言代码
161 |             speed: 语速
162 |             
163 |         返回:
164 |             tuple: (音频数据, 采样率)
165 |         """
166 |         # 初始化模型
167 |         self._init_model()
168 |         
169 |         try:
170 |             # 解析参考音频
171 |             audio_data, sample_rate = reference_audio
172 |             
173 |             # 保存参考音频到临时文件
174 |             ref_path = get_temp_file(".wav")
175 |             save_audio(audio_data, sample_rate, ref_path)
176 |             
177 |             # 创建输出临时文件
178 |             output_path = get_temp_file(".wav")
179 |             
180 |             # 调用TTS引擎生成语音
181 |             self.tts_model.infer(
182 |                 ref_path, 
183 |                 text, 
184 |                 output_path,
185 |                 language=language,
186 |                 speed=speed
187 |             )
188 |             
189 |             # 读取生成的音频
190 |             result_audio, result_sr = load_audio(output_path, target_sr=sample_rate)
191 |             
192 |             # 清理临时文件
193 |             try:
194 |                 os.unlink(ref_path)
195 |                 os.unlink(output_path)
196 |             except:
197 |                 pass
198 |             
199 |             return ((result_audio, result_sr),)
200 |             
201 |         except Exception as e:
202 |             print(f"生成语音失败: {e}")
203 |             # 返回一个空音频（1秒静音）作为错误处理
204 |             empty_audio = np.zeros(sample_rate, dtype=np.float32)
205 |             return ((empty_audio, sample_rate),)
206 |             
207 |     @classmethod
208 |     def IS_CHANGED(cls, text, reference_audio, language, speed):
209 |         # 用于判断节点输入是否变化的辅助函数
210 |         # 这里使用当前时间戳确保每次运行都会重新生成
211 |         return time.time()
212 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Utils module for index-tts
2 | 


--------------------------------------------------------------------------------
/utils/audio_utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | import tempfile
 4 | import soundfile as sf
 5 | import torch
 6 | import librosa
 7 | 
 8 | def load_audio(file_path, target_sr=16000):
 9 |     """
10 |     加载音频文件并转换为指定采样率
11 |     
12 |     参数:
13 |         file_path: 音频文件路径
14 |         target_sr: 目标采样率，默认16000Hz
15 |         
16 |     返回:
17 |         (numpy array, int): 音频数据和采样率
18 |     """
19 |     try:
20 |         audio, sr = librosa.load(file_path, sr=target_sr, mono=True)
21 |         return audio, sr
22 |     except Exception as e:
23 |         print(f"加载音频文件失败: {e}")
24 |         return None, None
25 | 
26 | def save_audio(audio_data, sample_rate, file_path):
27 |     """
28 |     保存音频数据到文件
29 |     
30 |     参数:
31 |         audio_data: 音频数据 (numpy array)
32 |         sample_rate: 采样率
33 |         file_path: 保存路径
34 |     
35 |     返回:
36 |         bool: 是否保存成功
37 |     """
38 |     try:
39 |         sf.write(file_path, audio_data, sample_rate)
40 |         return True
41 |     except Exception as e:
42 |         print(f"保存音频文件失败: {e}")
43 |         return False
44 | 
45 | def audio_to_tensor(audio_data, sample_rate=16000):
46 |     """
47 |     将音频数据转换为张量
48 |     
49 |     参数:
50 |         audio_data: 音频数据 (numpy array)
51 |         sample_rate: 采样率
52 |         
53 |     返回:
54 |         torch.Tensor: 音频张量
55 |     """
56 |     # 确保音频是单声道
57 |     if len(audio_data.shape) > 1:
58 |         audio_data = np.mean(audio_data, axis=1)
59 |     
60 |     # 转换为torch张量
61 |     audio_tensor = torch.from_numpy(audio_data).float()
62 |     
63 |     return audio_tensor, sample_rate
64 | 
65 | def tensor_to_audio(audio_tensor, sample_rate=16000):
66 |     """
67 |     将音频张量转换为numpy数组
68 |     
69 |     参数:
70 |         audio_tensor: 音频张量
71 |         sample_rate: 采样率
72 |         
73 |     返回:
74 |         (numpy array, int): 音频数据和采样率
75 |     """
76 |     if isinstance(audio_tensor, torch.Tensor):
77 |         audio_data = audio_tensor.detach().cpu().numpy()
78 |     else:
79 |         audio_data = audio_tensor
80 |     
81 |     return audio_data, sample_rate
82 | 
83 | def get_temp_file(suffix=".wav"):
84 |     """
85 |     生成临时文件路径
86 |     
87 |     参数:
88 |         suffix: 文件后缀
89 |         
90 |     返回:
91 |         str: 临时文件路径
92 |     """
93 |     temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
94 |     temp_path = temp_file.name
95 |     temp_file.close()
96 |     return temp_path
97 | 


--------------------------------------------------------------------------------
/utils/index_tts_impl.py:
--------------------------------------------------------------------------------
  1 | """
  2 | IndexTTS实现模块 - 为ComfyUI定制
  3 | """
  4 | 
  5 | import os
  6 | import sys
  7 | import torch
  8 | import numpy as np
  9 | import yaml
 10 | import json
 11 | from pathlib import Path
 12 | import re
 13 | from typing import Dict, List, Optional, Tuple, Union
 14 | 
 15 | # 保证路径正确 - 使用ComfyUI标准导入方式
 16 | current_dir = os.path.dirname(os.path.abspath(__file__))
 17 | parent_dir = os.path.dirname(current_dir)
 18 | root_dir = os.path.dirname(parent_dir)
 19 | 
 20 | # 添加到sys.path
 21 | for path in [current_dir, parent_dir, root_dir]:
 22 |     if path not in sys.path:
 23 |         sys.path.append(path)
 24 | 
 25 | # 导入ComfyUI路径模块
 26 | import folder_paths
 27 | MODELS_DIR = folder_paths.models_dir
 28 | INDEX_TTS_PATH = os.path.join(MODELS_DIR, "Index-TTS")
 29 | 
 30 | # 这行是为了调试
 31 | print(f"模型目录路径: {INDEX_TTS_PATH}")
 32 | 
 33 | class IndexTTSModel:
 34 |     """IndexTTS模型实现类，基于真实的模型文件"""
 35 |     
 36 |     def __init__(self, model_dir=None, cfg_path=None):
 37 |         """
 38 |         初始化IndexTTS模型
 39 |         
 40 |         参数:
 41 |             model_dir: 模型目录
 42 |             cfg_path: 配置文件路径
 43 |         """
 44 |         self.model_dir = model_dir if model_dir else INDEX_TTS_PATH
 45 |         self.cfg_path = cfg_path if cfg_path else os.path.join(self.model_dir, "config.yaml")
 46 |         self.device = "cuda" if torch.cuda.is_available() else "cpu"
 47 |         
 48 |         # 检查模型文件是否存在
 49 |         required_files = [
 50 |             "bigvgan_discriminator.pth", "bigvgan_generator.pth", 
 51 |             "bpe.model", "dvae.pth", "gpt.pth", 
 52 |             "unigram_12000.vocab", "config.yaml"
 53 |         ]
 54 |         
 55 |         for file in required_files:
 56 |             if not os.path.exists(os.path.join(self.model_dir, file)):
 57 |                 raise FileNotFoundError(f"模型文件 {file} 未找到，请确保已下载模型文件到 {self.model_dir}")
 58 |         
 59 |         # 加载配置
 60 |         with open(self.cfg_path, 'r', encoding='utf-8') as f:
 61 |             self.config = yaml.safe_load(f)
 62 |         
 63 |         # 初始化模型
 64 |         self._init_model()
 65 |         
 66 |         print(f"成功初始化真实IndexTTS模型, 模型目录: {self.model_dir}")
 67 |     
 68 |     def _init_model(self):
 69 |         """初始化模型组件"""
 70 |         # 加载GPT模型
 71 |         self.gpt = self._load_gpt_model()
 72 |         
 73 |         # 加载DVAE模型
 74 |         self.dvae = self._load_dvae_model()
 75 |         
 76 |         # 加载BigVGAN生成器
 77 |         self.vocoder = self._load_vocoder_model()
 78 |         
 79 |         # 初始化分词器
 80 |         self._init_tokenizer()
 81 |     
 82 |     def _load_gpt_model(self):
 83 |         """加载GPT模型"""
 84 |         print("加载GPT模型...")
 85 |         gpt_path = os.path.join(self.model_dir, "gpt.pth")
 86 |         
 87 |         # 这里需要根据实际模型结构进行加载
 88 |         # 以下是示例代码，实际应根据IndexTTS的模型结构调整
 89 |         from torch import nn
 90 |         
 91 |         class SimpleGPT(nn.Module):
 92 |             def __init__(self):
 93 |                 super().__init__()
 94 |                 # 简化的GPT模型结构
 95 |                 self.embedding = nn.Embedding(10000, 512)
 96 |                 self.transformer = nn.TransformerEncoder(
 97 |                     nn.TransformerEncoderLayer(
 98 |                         d_model=512, nhead=8, dim_feedforward=2048, batch_first=True
 99 |                     ), 
100 |                     num_layers=6
101 |                 )
102 |                 self.decoder = nn.Linear(512, 256)
103 |                 
104 |             def forward(self, x, prompt=None):
105 |                 # 简化的前向计算
106 |                 x = self.embedding(x)
107 |                 x = self.transformer(x)
108 |                 return self.decoder(x)
109 |         
110 |         model = SimpleGPT()
111 |         
112 |         try:
113 |             # 加载预训练参数
114 |             checkpoint = torch.load(gpt_path, map_location=self.device)
115 |             # 实际代码需要根据检查点的结构进行调整
116 |             # model.load_state_dict(checkpoint)
117 |             print(f"GPT模型加载成功: {gpt_path}")
118 |         except Exception as e:
119 |             print(f"加载GPT模型失败: {e}")
120 |             print("使用未初始化的GPT模型")
121 |         
122 |         model = model.to(self.device)
123 |         model.eval()
124 |         return model
125 |     
126 |     def _load_dvae_model(self):
127 |         """加载DVAE模型"""
128 |         print("加载DVAE模型...")
129 |         dvae_path = os.path.join(self.model_dir, "dvae.pth")
130 |         
131 |         # 简化的DVAE模型
132 |         from torch import nn
133 |         
134 |         class SimpleDVAE(nn.Module):
135 |             def __init__(self):
136 |                 super().__init__()
137 |                 # 简化的编码器-解码器结构
138 |                 self.encoder = nn.Sequential(
139 |                     nn.Conv1d(1, 64, kernel_size=3, padding=1),
140 |                     nn.ReLU(),
141 |                     nn.Conv1d(64, 128, kernel_size=3, padding=1),
142 |                     nn.ReLU()
143 |                 )
144 |                 self.decoder = nn.Sequential(
145 |                     nn.ConvTranspose1d(128, 64, kernel_size=3, padding=1),
146 |                     nn.ReLU(),
147 |                     nn.ConvTranspose1d(64, 1, kernel_size=3, padding=1),
148 |                     nn.Tanh()
149 |                 )
150 |                 
151 |             def encode(self, x):
152 |                 return self.encoder(x)
153 |                 
154 |             def decode(self, z):
155 |                 return self.decoder(z)
156 |                 
157 |             def forward(self, x):
158 |                 z = self.encode(x)
159 |                 return self.decode(z)
160 |         
161 |         model = SimpleDVAE()
162 |         
163 |         try:
164 |             # 加载预训练参数
165 |             checkpoint = torch.load(dvae_path, map_location=self.device)
166 |             # 实际代码需要根据检查点的结构进行调整
167 |             # model.load_state_dict(checkpoint)
168 |             print(f"DVAE模型加载成功: {dvae_path}")
169 |         except Exception as e:
170 |             print(f"加载DVAE模型失败: {e}")
171 |             print("使用未初始化的DVAE模型")
172 |         
173 |         model = model.to(self.device)
174 |         model.eval()
175 |         return model
176 |     
177 |     def _load_vocoder_model(self):
178 |         """加载声码器模型"""
179 |         print("加载BigVGAN声码器...")
180 |         vocoder_path = os.path.join(self.model_dir, "bigvgan_generator.pth")
181 |         
182 |         # 简化的声码器模型
183 |         from torch import nn
184 |         
185 |         class SimpleVocoder(nn.Module):
186 |             def __init__(self):
187 |                 super().__init__()
188 |                 # 简化的声码器网络
189 |                 self.upsample = nn.Sequential(
190 |                     nn.Upsample(scale_factor=2),
191 |                     nn.Conv1d(128, 64, kernel_size=3, padding=1),
192 |                     nn.LeakyReLU(0.2),
193 |                     nn.Upsample(scale_factor=2),
194 |                     nn.Conv1d(64, 32, kernel_size=3, padding=1),
195 |                     nn.LeakyReLU(0.2),
196 |                     nn.Upsample(scale_factor=2),
197 |                     nn.Conv1d(32, 1, kernel_size=3, padding=1),
198 |                     nn.Tanh()
199 |                 )
200 |                 
201 |             def forward(self, x):
202 |                 return self.upsample(x)
203 |         
204 |         model = SimpleVocoder()
205 |         
206 |         try:
207 |             # 加载预训练参数
208 |             checkpoint = torch.load(vocoder_path, map_location=self.device)
209 |             # 实际代码需要根据检查点的结构进行调整
210 |             # model.load_state_dict(checkpoint)
211 |             print(f"声码器模型加载成功: {vocoder_path}")
212 |         except Exception as e:
213 |             print(f"加载声码器模型失败: {e}")
214 |             print("使用未初始化的声码器模型")
215 |         
216 |         model = model.to(self.device)
217 |         model.eval()
218 |         return model
219 |     
220 |     def _init_tokenizer(self):
221 |         """初始化分词器"""
222 |         print("初始化分词器...")
223 |         
224 |         # 加载词汇表
225 |         vocab_path = os.path.join(self.model_dir, "unigram_12000.vocab")
226 |         
227 |         # 为简化，这里使用基本分词器
228 |         # 实际应使用与训练时相同的分词器
229 |         self.tokenizer = {
230 |             "zh": lambda text: list(text),
231 |             "en": lambda text: text.lower().split(),
232 |             "auto": lambda text: list(text)  # 自动检测
233 |         }
234 |         
235 |         print("分词器初始化完成")
236 |     
237 |     def _detect_language(self, text):
238 |         """检测文本语言"""
239 |         # 简单的语言检测逻辑
240 |         chinese_chars = re.findall(r'[\u4e00-\u9fff]', text)
241 |         if len(chinese_chars) > len(text) * 0.5:
242 |             return "zh"
243 |         return "en"
244 |     
245 |     def _process_text(self, text, language="auto"):
246 |         """处理输入文本"""
247 |         if language == "auto":
248 |             language = self._detect_language(text)
249 |         
250 |         # 使用对应语言的分词器
251 |         tokens = self.tokenizer[language](text)
252 |         
253 |         # 转换为模型输入
254 |         # 实际代码需要使用真实的索引映射
255 |         indices = [i % 1000 for i in range(len(tokens))]
256 |         
257 |         return torch.tensor(indices).unsqueeze(0).to(self.device)
258 |     
259 |     def _process_reference_audio(self, audio_data, sr=16000):
260 |         """处理参考音频"""
261 |         # 确保音频是正确的格式
262 |         if isinstance(audio_data, np.ndarray):
263 |             # 转换为torch张量
264 |             if audio_data.ndim == 1:
265 |                 audio_tensor = torch.tensor(audio_data).unsqueeze(0)
266 |             else:
267 |                 audio_tensor = torch.tensor(audio_data)
268 |         elif isinstance(audio_data, torch.Tensor):
269 |             audio_tensor = audio_data
270 |         else:
271 |             raise ValueError("不支持的音频数据类型")
272 |         
273 |         # 确保在正确的设备上
274 |         audio_tensor = audio_tensor.to(self.device)
275 |         
276 |         # 处理参考音频，提取说话人嵌入
277 |         # 实际代码需要使用真实的特征提取方法
278 |         with torch.no_grad():
279 |             # 使用DVAE编码参考音频，获取说话人特征
280 |             if audio_tensor.ndim == 1:
281 |                 audio_tensor = audio_tensor.unsqueeze(0)
282 |             if audio_tensor.ndim == 2:
283 |                 audio_tensor = audio_tensor.unsqueeze(1)  # [B, 1, T]
284 |             
285 |             # 提取说话人特征
286 |             speaker_emb = self.dvae.encode(audio_tensor)
287 |             
288 |         return speaker_emb
289 |     
290 |     def infer(self, reference_audio, text, output_path=None, language="auto", speed=1.0):
291 |         """
292 |         使用参考声音生成语音
293 |         
294 |         参数:
295 |             reference_audio: 参考音频数据 (numpy数组或tensor)
296 |             text: 要合成的文本
297 |             output_path: 输出路径，如果为None则只返回数据
298 |             language: 语言代码，"zh"、"en"或"auto"
299 |             speed: 语速，默认1.0
300 |             
301 |         返回:
302 |             (numpy.ndarray, int): 音频数据和采样率
303 |         """
304 |         # 确保模型处于评估模式
305 |         self.gpt.eval()
306 |         self.dvae.eval()
307 |         self.vocoder.eval()
308 |         
309 |         # 处理文本
310 |         token_ids = self._process_text(text, language)
311 |         
312 |         # 处理参考音频
313 |         speaker_emb = self._process_reference_audio(reference_audio)
314 |         
315 |         # 使用GPT模型生成语音特征
316 |         with torch.no_grad():
317 |             # 生成音频特征
318 |             audio_features = self.gpt(token_ids, prompt=speaker_emb)
319 |             
320 |             # 使用声码器生成波形
321 |             waveform = self.vocoder(audio_features)
322 |             
323 |             # 调整语速（简化实现）
324 |             if speed != 1.0:
325 |                 # 实际应该使用更复杂的变速算法
326 |                 import librosa
327 |                 waveform = waveform.squeeze().cpu().numpy()
328 |                 waveform = librosa.effects.time_stretch(waveform, rate=1.0/speed)
329 |                 waveform = torch.tensor(waveform).to(self.device).unsqueeze(0).unsqueeze(0)
330 |         
331 |         # 获取输出波形
332 |         output_waveform = waveform.squeeze().cpu().numpy()
333 |         
334 |         # 获取采样率
335 |         sample_rate = self.config.get("sample_rate", 16000)
336 |         
337 |         # 保存到文件（如果指定了输出路径）
338 |         if output_path:
339 |             import soundfile as sf
340 |             sf.write(output_path, output_waveform, sample_rate)
341 |         
342 |         return output_waveform, sample_rate
343 | 
344 | 
345 | # 直接测试模块
346 | if __name__ == "__main__":
347 |     # 测试模型加载
348 |     model = IndexTTSModel()
349 |     print("模型加载测试完成")
350 | 


--------------------------------------------------------------------------------
/workflow/workflow.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "id": "62dce248-d47e-4bc1-9ea1-41aa40254efb",
  3 |   "revision": 0,
  4 |   "last_node_id": 46,
  5 |   "last_link_id": 75,
  6 |   "nodes": [
  7 |     {
  8 |       "id": 37,
  9 |       "type": "LoadAudio",
 10 |       "pos": [
 11 |         978.1256103515625,
 12 |         506.11749267578125
 13 |       ],
 14 |       "size": [
 15 |         315,
 16 |         136
 17 |       ],
 18 |       "flags": {},
 19 |       "order": 0,
 20 |       "mode": 0,
 21 |       "inputs": [],
 22 |       "outputs": [
 23 |         {
 24 |           "label": "音频",
 25 |           "name": "AUDIO",
 26 |           "type": "AUDIO",
 27 |           "links": [
 28 |             72
 29 |           ]
 30 |         }
 31 |       ],
 32 |       "properties": {
 33 |         "cnr_id": "comfy-core",
 34 |         "ver": "0.3.29",
 35 |         "Node name for S&R": "LoadAudio",
 36 |         "ttNbgOverride": {
 37 |           "color": "#332922",
 38 |           "bgcolor": "#593930",
 39 |           "groupcolor": "#b06634"
 40 |         }
 41 |       },
 42 |       "widgets_values": [
 43 |         "御姐配音.wav",
 44 |         null,
 45 |         null
 46 |       ],
 47 |       "color": "#332922",
 48 |       "bgcolor": "#593930"
 49 |     },
 50 |     {
 51 |       "id": 42,
 52 |       "type": "PreviewAudio",
 53 |       "pos": [
 54 |         2390.84716796875,
 55 |         807.5316162109375
 56 |       ],
 57 |       "size": [
 58 |         315,
 59 |         88
 60 |       ],
 61 |       "flags": {},
 62 |       "order": 4,
 63 |       "mode": 0,
 64 |       "inputs": [
 65 |         {
 66 |           "label": "音频",
 67 |           "name": "audio",
 68 |           "type": "AUDIO",
 69 |           "link": 71
 70 |         }
 71 |       ],
 72 |       "outputs": [],
 73 |       "properties": {
 74 |         "cnr_id": "comfy-core",
 75 |         "ver": "0.3.29",
 76 |         "Node name for S&R": "PreviewAudio",
 77 |         "ttNbgOverride": {
 78 |           "color": "#332922",
 79 |           "bgcolor": "#593930",
 80 |           "groupcolor": "#b06634"
 81 |         }
 82 |       },
 83 |       "widgets_values": [],
 84 |       "color": "#332922",
 85 |       "bgcolor": "#593930"
 86 |     },
 87 |     {
 88 |       "id": 44,
 89 |       "type": "AudioCleanupNode",
 90 |       "pos": [
 91 |         1933.467041015625,
 92 |         800.106689453125
 93 |       ],
 94 |       "size": [
 95 |         405.5999755859375,
 96 |         154
 97 |       ],
 98 |       "flags": {},
 99 |       "order": 2,
100 |       "mode": 0,
101 |       "inputs": [
102 |         {
103 |           "name": "audio",
104 |           "type": "AUDIO",
105 |           "link": 74
106 |         }
107 |       ],
108 |       "outputs": [
109 |         {
110 |           "name": "enhanced_audio",
111 |           "type": "AUDIO",
112 |           "links": [
113 |             71
114 |           ]
115 |         }
116 |       ],
117 |       "properties": {
118 |         "aux_id": "chenpipi0807/ComfyUI-Index-TTS",
119 |         "ver": "074b8a838b84d57500b38167a5dbb72d99965e32",
120 |         "Node name for S&R": "AudioCleanupNode"
121 |       },
122 |       "widgets_values": [
123 |         1,
124 |         1,
125 |         200,
126 |         8000,
127 |         "true"
128 |       ]
129 |     },
130 |     {
131 |       "id": 45,
132 |       "type": "IndexTTSNode",
133 |       "pos": [
134 |         1381.8446044921875,
135 |         505.53948974609375
136 |       ],
137 |       "size": [
138 |         400,
139 |         420
140 |       ],
141 |       "flags": {},
142 |       "order": 1,
143 |       "mode": 0,
144 |       "inputs": [
145 |         {
146 |           "name": "reference_audio",
147 |           "type": "AUDIO",
148 |           "link": 72
149 |         }
150 |       ],
151 |       "outputs": [
152 |         {
153 |           "name": "audio",
154 |           "type": "AUDIO",
155 |           "links": [
156 |             74,
157 |             75
158 |           ]
159 |         },
160 |         {
161 |           "name": "seed",
162 |           "type": "INT",
163 |           "links": null
164 |         }
165 |       ],
166 |       "properties": {
167 |         "aux_id": "chenpipi0807/ComfyUI-Index-TTS",
168 |         "ver": "074b8a838b84d57500b38167a5dbb72d99965e32",
169 |         "Node name for S&R": "IndexTTSNode"
170 |       },
171 |       "widgets_values": [
172 |         "你好，这是一段测试文本。",
173 |         "IndexTTS-1.5",
174 |         "auto",
175 |         1,
176 |         2616582231,
177 |         "randomize",
178 |         1,
179 |         0.8,
180 |         30,
181 |         10,
182 |         0,
183 |         3,
184 |         600,
185 |         "auto",
186 |         [
187 |           false,
188 |           true
189 |         ]
190 |       ]
191 |     },
192 |     {
193 |       "id": 46,
194 |       "type": "SaveAudioMP3",
195 |       "pos": [
196 |         1928.1614990234375,
197 |         500.5684814453125
198 |       ],
199 |       "size": [
200 |         270,
201 |         136
202 |       ],
203 |       "flags": {},
204 |       "order": 3,
205 |       "mode": 0,
206 |       "inputs": [
207 |         {
208 |           "name": "audio",
209 |           "type": "AUDIO",
210 |           "link": 75
211 |         }
212 |       ],
213 |       "outputs": [],
214 |       "properties": {
215 |         "cnr_id": "comfy-core",
216 |         "ver": "0.3.40",
217 |         "Node name for S&R": "SaveAudioMP3"
218 |       },
219 |       "widgets_values": [
220 |         "audio/ComfyUI",
221 |         "320k"
222 |       ]
223 |     }
224 |   ],
225 |   "links": [
226 |     [
227 |       71,
228 |       44,
229 |       0,
230 |       42,
231 |       0,
232 |       "AUDIO"
233 |     ],
234 |     [
235 |       72,
236 |       37,
237 |       0,
238 |       45,
239 |       0,
240 |       "AUDIO"
241 |     ],
242 |     [
243 |       74,
244 |       45,
245 |       0,
246 |       44,
247 |       0,
248 |       "AUDIO"
249 |     ],
250 |     [
251 |       75,
252 |       45,
253 |       0,
254 |       46,
255 |       0,
256 |       "AUDIO"
257 |     ]
258 |   ],
259 |   "groups": [
260 |     {
261 |       "id": 1,
262 |       "title": "可选项：音频降噪用的",
263 |       "bounding": [
264 |         1923.467041015625,
265 |         717.9036254882812,
266 |         809.409912109375,
267 |         246.20309448242188
268 |       ],
269 |       "color": "#3f789e",
270 |       "font_size": 24,
271 |       "flags": {}
272 |     }
273 |   ],
274 |   "config": {},
275 |   "extra": {
276 |     "ds": {
277 |       "scale": 1.1000000000000005,
278 |       "offset": [
279 |         -721.9523926470781,
280 |         -215.54904321342832
281 |       ]
282 |     },
283 |     "frontendVersion": "1.21.7",
284 |     "ue_links": [],
285 |     "0246.VERSION": [
286 |       0,
287 |       0,
288 |       4
289 |     ],
290 |     "VHS_latentpreview": false,
291 |     "VHS_latentpreviewrate": 0,
292 |     "VHS_MetadataImage": true,
293 |     "VHS_KeepIntermediate": true
294 |   },
295 |   "version": 0.4
296 | }
297 | 


--------------------------------------------------------------------------------
/workflow/读小说用这个.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "id": "9a68dd3b-2325-410c-a6f9-dd809511c4c4",
  3 |   "revision": 0,
  4 |   "last_node_id": 465,
  5 |   "last_link_id": 482,
  6 |   "nodes": [
  7 |     {
  8 |       "id": 455,
  9 |       "type": "IndexTTSProNode",
 10 |       "pos": [
 11 |         1131.8665771484375,
 12 |         -740.8927001953125
 13 |       ],
 14 |       "size": [
 15 |         400,
 16 |         476
 17 |       ],
 18 |       "flags": {},
 19 |       "order": 6,
 20 |       "mode": 0,
 21 |       "inputs": [
 22 |         {
 23 |           "name": "narrator_audio",
 24 |           "type": "AUDIO",
 25 |           "link": 478
 26 |         },
 27 |         {
 28 |           "name": "character1_audio",
 29 |           "shape": 7,
 30 |           "type": "AUDIO",
 31 |           "link": 479
 32 |         },
 33 |         {
 34 |           "name": "character2_audio",
 35 |           "shape": 7,
 36 |           "type": "AUDIO",
 37 |           "link": 480
 38 |         },
 39 |         {
 40 |           "name": "character3_audio",
 41 |           "shape": 7,
 42 |           "type": "AUDIO",
 43 |           "link": null
 44 |         },
 45 |         {
 46 |           "name": "character4_audio",
 47 |           "shape": 7,
 48 |           "type": "AUDIO",
 49 |           "link": null
 50 |         },
 51 |         {
 52 |           "name": "character5_audio",
 53 |           "shape": 7,
 54 |           "type": "AUDIO",
 55 |           "link": null
 56 |         },
 57 |         {
 58 |           "name": "structured_text",
 59 |           "type": "STRING",
 60 |           "widget": {
 61 |             "name": "structured_text"
 62 |           },
 63 |           "link": 482
 64 |         }
 65 |       ],
 66 |       "outputs": [
 67 |         {
 68 |           "name": "audio",
 69 |           "type": "AUDIO",
 70 |           "links": [
 71 |             477
 72 |           ]
 73 |         },
 74 |         {
 75 |           "name": "seed",
 76 |           "type": "INT",
 77 |           "links": null
 78 |         }
 79 |       ],
 80 |       "properties": {
 81 |         "aux_id": "chenpipi0807/ComfyUI-Index-TTS",
 82 |         "ver": "074b8a838b84d57500b38167a5dbb72d99965e32",
 83 |         "Node name for S&R": "IndexTTSProNode"
 84 |       },
 85 |       "widgets_values": [
 86 |         "<正文>这是一段正文示例。<角色1>你好。<正文>他说道。",
 87 |         "IndexTTS-1.5",
 88 |         "auto",
 89 |         1,
 90 |         2603958371,
 91 |         "randomize",
 92 |         1,
 93 |         0.8,
 94 |         30,
 95 |         10,
 96 |         0,
 97 |         3,
 98 |         600,
 99 |         [
100 |           false,
101 |           true
102 |         ]
103 |       ]
104 |     },
105 |     {
106 |       "id": 457,
107 |       "type": "NovelTextStructureNode",
108 |       "pos": [
109 |         -228.7899627685547,
110 |         -766.8822021484375
111 |       ],
112 |       "size": [
113 |         448.1485900878906,
114 |         391.1784973144531
115 |       ],
116 |       "flags": {},
117 |       "order": 3,
118 |       "mode": 0,
119 |       "inputs": [],
120 |       "outputs": [
121 |         {
122 |           "name": "structured_text",
123 |           "type": "STRING",
124 |           "links": [
125 |             472
126 |           ]
127 |         }
128 |       ],
129 |       "properties": {
130 |         "aux_id": "chenpipi0807/ComfyUI-Index-TTS",
131 |         "ver": "074b8a838b84d57500b38167a5dbb72d99965e32",
132 |         "Node name for S&R": "NovelTextStructureNode"
133 |       },
134 |       "widgets_values": [
135 |         "少女此时就站在院墙那边，她有一双杏眼，怯怯弱弱。\n\n院门那边，有个嗓音说：“你这婢女卖不卖？”\n\n宋集薪愣了愣，循着声音转头望去，是个眉眼含笑的锦衣少年，站在院外，一张全然陌生的面孔。\n\n锦衣少年身边站着一位身材高大的老者，面容白皙，脸色和蔼，轻轻眯眼打量着两座毗邻院落的少年少女。\n\n老者的视线在陈平安一扫而过，并无停滞，但是在宋集薪和婢女身上，多有停留，笑意渐渐浓郁。\n\n宋集薪斜眼道：“卖！怎么不卖！”\n\n那少年微笑道：“那你说个价。”\n\n少女瞪大眼眸，满脸匪夷所思，像一头惊慌失措的年幼麋鹿。\n\n宋集薪翻了个白眼，伸出一根手指，晃了晃，“白银一万两！”\n\n锦衣少年脸色如常，点头道：“好。”\n\n宋集薪见那少年不像是开玩笑的样子，连忙改口道：“是黄金万两！”\n\n锦衣少年嘴角翘起，道：“逗你玩的。”\n\n宋集薪脸色阴沉。",
136 |         [
137 |           false,
138 |           true
139 |         ]
140 |       ]
141 |     },
142 |     {
143 |       "id": 458,
144 |       "type": "easy showAnything",
145 |       "pos": [
146 |         -239.69430541992188,
147 |         -316.8822326660156
148 |       ],
149 |       "size": [
150 |         450.1573486328125,
151 |         237.9201202392578
152 |       ],
153 |       "flags": {},
154 |       "order": 5,
155 |       "mode": 0,
156 |       "inputs": [
157 |         {
158 |           "name": "anything",
159 |           "shape": 7,
160 |           "type": "*",
161 |           "link": 472
162 |         }
163 |       ],
164 |       "outputs": [
165 |         {
166 |           "name": "output",
167 |           "type": "*",
168 |           "links": []
169 |         }
170 |       ],
171 |       "properties": {
172 |         "cnr_id": "comfyui-easy-use",
173 |         "ver": "2986a014694fd27049c3f66d39e3f60904283f9b",
174 |         "Node name for S&R": "easy showAnything"
175 |       },
176 |       "widgets_values": [
177 |         "<Narrator>少女此时就站在院墙那边，她有一双杏眼，怯怯弱弱。<Narrator>院门那边，有个嗓音说：<Character1>你这婢女卖不卖？<Narrator>宋集薪愣了愣，循着声音转头望去，是个眉眼含笑的锦衣少年，站在院外，一张全然陌生的面孔。<Narrator>锦衣少年身边站着一位身材高大的老者，面容白皙，脸色和蔼，轻轻眯眼打量着两座毗邻院落的少年少女。<Narrator>老者的视线在陈平安一扫而过，并无停滞，但是在宋集薪和婢女身上，多有停留，笑意渐渐浓郁。<Narrator>宋集薪斜眼道：<Character1>卖！怎么不卖！<Narrator>那少年微笑道：<Character1>那你说个价。<Narrator>少女瞪大眼眸，满脸匪夷所思，像一头惊慌失措的年幼麋鹿。<Narrator>宋集薪翻了个白眼，伸出一根手指，晃了晃，“白银一万两！”<Narrator>锦衣少年脸色如常，点头道：<Character1>好。<Narrator>宋集薪见那少年不像是开玩笑的样子，连忙改口道：<Character1>是黄金万两！<Narrator>锦衣少年嘴角翘起，道：<Character1>逗你玩的。<Narrator>宋集薪脸色阴沉。"
178 |       ]
179 |     },
180 |     {
181 |       "id": 459,
182 |       "type": "LoadAudio",
183 |       "pos": [
184 |         749.822265625,
185 |         -520.6138916015625
186 |       ],
187 |       "size": [
188 |         270,
189 |         136
190 |       ],
191 |       "flags": {},
192 |       "order": 1,
193 |       "mode": 0,
194 |       "inputs": [],
195 |       "outputs": [
196 |         {
197 |           "name": "AUDIO",
198 |           "type": "AUDIO",
199 |           "links": [
200 |             479
201 |           ]
202 |         }
203 |       ],
204 |       "properties": {
205 |         "cnr_id": "comfy-core",
206 |         "ver": "0.3.32",
207 |         "Node name for S&R": "LoadAudio"
208 |       },
209 |       "widgets_values": [
210 |         "凡人修仙传-紫灵.mp3",
211 |         null,
212 |         null
213 |       ]
214 |     },
215 |     {
216 |       "id": 460,
217 |       "type": "LoadAudio",
218 |       "pos": [
219 |         744.0565795898438,
220 |         -308.4060974121094
221 |       ],
222 |       "size": [
223 |         270,
224 |         136
225 |       ],
226 |       "flags": {},
227 |       "order": 2,
228 |       "mode": 0,
229 |       "inputs": [],
230 |       "outputs": [
231 |         {
232 |           "name": "AUDIO",
233 |           "type": "AUDIO",
234 |           "links": [
235 |             480
236 |           ]
237 |         }
238 |       ],
239 |       "properties": {
240 |         "cnr_id": "comfy-core",
241 |         "ver": "0.3.32",
242 |         "Node name for S&R": "LoadAudio"
243 |       },
244 |       "widgets_values": [
245 |         "恋与深空-秦彻.WAV",
246 |         null,
247 |         null
248 |       ]
249 |     },
250 |     {
251 |       "id": 461,
252 |       "type": "LoadAudio",
253 |       "pos": [
254 |         749.5767211914062,
255 |         -742.494140625
256 |       ],
257 |       "size": [
258 |         270,
259 |         136
260 |       ],
261 |       "flags": {},
262 |       "order": 0,
263 |       "mode": 0,
264 |       "inputs": [],
265 |       "outputs": [
266 |         {
267 |           "name": "AUDIO",
268 |           "type": "AUDIO",
269 |           "links": [
270 |             478
271 |           ]
272 |         }
273 |       ],
274 |       "properties": {
275 |         "cnr_id": "comfy-core",
276 |         "ver": "0.3.32",
277 |         "Node name for S&R": "LoadAudio"
278 |       },
279 |       "widgets_values": [
280 |         "御姐配音.wav",
281 |         null,
282 |         null
283 |       ]
284 |     },
285 |     {
286 |       "id": 462,
287 |       "type": "SaveAudio",
288 |       "pos": [
289 |         1585.802978515625,
290 |         -739.2282104492188
291 |       ],
292 |       "size": [
293 |         270,
294 |         112
295 |       ],
296 |       "flags": {},
297 |       "order": 7,
298 |       "mode": 0,
299 |       "inputs": [
300 |         {
301 |           "name": "audio",
302 |           "type": "AUDIO",
303 |           "link": 477
304 |         }
305 |       ],
306 |       "outputs": [],
307 |       "properties": {
308 |         "cnr_id": "comfy-core",
309 |         "ver": "0.3.32",
310 |         "Node name for S&R": "SaveAudio"
311 |       },
312 |       "widgets_values": [
313 |         "audio/ComfyUI"
314 |       ]
315 |     },
316 |     {
317 |       "id": 465,
318 |       "type": "String Literal",
319 |       "pos": [
320 |         269.1800231933594,
321 |         -755.8568725585938
322 |       ],
323 |       "size": [
324 |         441.81292724609375,
325 |         636.3744506835938
326 |       ],
327 |       "flags": {},
328 |       "order": 4,
329 |       "mode": 0,
330 |       "inputs": [],
331 |       "outputs": [
332 |         {
333 |           "name": "STRING",
334 |           "type": "STRING",
335 |           "links": [
336 |             482
337 |           ]
338 |         }
339 |       ],
340 |       "properties": {
341 |         "cnr_id": "comfy-image-saver",
342 |         "ver": "65e6903eff274a50f8b5cd768f0f96baf37baea1",
343 |         "Node name for S&R": "String Literal"
344 |       },
345 |       "widgets_values": [
346 |         "<Narrator>少女此时就站在院墙那边，她有一双杏眼，怯怯弱弱。</Narrator>\n<Narrator>院门那边，有个嗓音说：</Narrator>\n<Character1>“你这婢女卖不卖？”</Character1>\n<Narrator>宋集薪愣了愣，循着声音转头望去，是个眉眼含笑的锦衣少年，站在院外，一张全然陌生的面孔。</Narrator>\n<Narrator>锦衣少年身边站着一位身材高大的老者，面容白皙，脸色和蔼，轻轻眯眼打量着两座毗邻院落的少年少女。</Narrator>\n<Narrator>老者的视线在陈平安一扫而过，并无停滞，但是在宋集薪和婢女身上，多有停留，笑意渐渐浓郁。</Narrator>\n<Narrator>宋集薪斜眼道：</Narrator>\n<Character2>“卖！怎么不卖！”</Character2>\n<Narrator>那少年微笑道：</Narrator>\n<Character1>“那你说个价。”</Character1>\n<Narrator>少女瞪大眼眸，满脸匪夷所思，像一头惊慌失措的年幼麋鹿。</Narrator>\n<Narrator>宋集薪翻了个白眼，伸出一根手指，晃了晃，</Narrator>\n<Character2>“白银一万两！”</Character2>\n<Narrator>锦衣少年脸色如常，点头道：</Narrator>\n<Character1>“好。”</Character1>\n<Narrator>宋集薪见那少年不像是开玩笑的样子，连忙改口道：</Narrator>\n<Character2>“是黄金万两！”</Character2>\n<Narrator>锦衣少年嘴角翘起，道：</Narrator>\n<Character1>“逗你玩的。”</Character1>\n<Narrator>宋集薪脸色阴沉。</Narrator>",
347 |         [
348 |           false,
349 |           true
350 |         ]
351 |       ]
352 |     }
353 |   ],
354 |   "links": [
355 |     [
356 |       472,
357 |       457,
358 |       0,
359 |       458,
360 |       0,
361 |       "*"
362 |     ],
363 |     [
364 |       477,
365 |       455,
366 |       0,
367 |       462,
368 |       0,
369 |       "AUDIO"
370 |     ],
371 |     [
372 |       478,
373 |       461,
374 |       0,
375 |       455,
376 |       0,
377 |       "AUDIO"
378 |     ],
379 |     [
380 |       479,
381 |       459,
382 |       0,
383 |       455,
384 |       1,
385 |       "AUDIO"
386 |     ],
387 |     [
388 |       480,
389 |       460,
390 |       0,
391 |       455,
392 |       2,
393 |       "AUDIO"
394 |     ],
395 |     [
396 |       482,
397 |       465,
398 |       0,
399 |       455,
400 |       6,
401 |       "STRING"
402 |     ]
403 |   ],
404 |   "groups": [
405 |     {
406 |       "id": 1,
407 |       "title": "结构化拆分/效果一般建议自己找llm拆",
408 |       "bounding": [
409 |         -249.69430541992188,
410 |         -840.482177734375,
411 |         470.69024658203125,
412 |         789.765869140625
413 |       ],
414 |       "color": "#3f789e",
415 |       "font_size": 24,
416 |       "flags": {}
417 |     },
418 |     {
419 |       "id": 2,
420 |       "title": "读小说的核心节点示例",
421 |       "bounding": [
422 |         259.1800231933594,
423 |         -829.4568481445312,
424 |         1606.622802734375,
425 |         719.9744262695312
426 |       ],
427 |       "color": "#3f789e",
428 |       "font_size": 24,
429 |       "flags": {}
430 |     }
431 |   ],
432 |   "config": {},
433 |   "extra": {
434 |     "ds": {
435 |       "scale": 0.45,
436 |       "offset": [
437 |         605.575463349528,
438 |         1101.479062506663
439 |       ]
440 |     },
441 |     "frontendVersion": "1.18.9",
442 |     "ue_links": [],
443 |     "VHS_latentpreview": false,
444 |     "VHS_latentpreviewrate": 0,
445 |     "VHS_MetadataImage": true,
446 |     "VHS_KeepIntermediate": true
447 |   },
448 |   "version": 0.4
449 | }


--------------------------------------------------------------------------------