├── LICENSE ├── README.md ├── app.py ├── asserts ├── HeartLink.gif ├── HeartLink.png ├── HeartLink_digitalhuman.gif ├── chart.png └── logo.jpg ├── datasets └── README.md ├── demo ├── TTS │ ├── GPT_SoVITS │ │ ├── AR │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ │ ├── bucket_sampler.cpython-310.pyc │ │ │ │ │ ├── data_module.cpython-310.pyc │ │ │ │ │ └── dataset.cpython-310.pyc │ │ │ │ ├── bucket_sampler.py │ │ │ │ ├── data_module.py │ │ │ │ └── dataset.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ │ ├── t2s_lightning_module.cpython-310.pyc │ │ │ │ │ ├── t2s_model.cpython-310.pyc │ │ │ │ │ └── utils.cpython-310.pyc │ │ │ │ ├── t2s_lightning_module.py │ │ │ │ ├── t2s_lightning_module_onnx.py │ │ │ │ ├── t2s_model.py │ │ │ │ ├── t2s_model_onnx.py │ │ │ │ └── utils.py │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ │ ├── activation.cpython-310.pyc │ │ │ │ │ ├── embedding.cpython-310.pyc │ │ │ │ │ ├── lr_schedulers.cpython-310.pyc │ │ │ │ │ ├── optim.cpython-310.pyc │ │ │ │ │ ├── patched_mha_with_cache.cpython-310.pyc │ │ │ │ │ ├── scaling.cpython-310.pyc │ │ │ │ │ └── transformer.cpython-310.pyc │ │ │ │ ├── activation.py │ │ │ │ ├── activation_onnx.py │ │ │ │ ├── embedding.py │ │ │ │ ├── embedding_onnx.py │ │ │ │ ├── lr_schedulers.py │ │ │ │ ├── optim.py │ │ │ │ ├── patched_mha_with_cache.py │ │ │ │ ├── patched_mha_with_cache_onnx.py │ │ │ │ ├── scaling.py │ │ │ │ ├── transformer.py │ │ │ │ └── transformer_onnx.py │ │ │ ├── text_processing │ │ │ │ ├── __init__.py │ │ │ │ ├── phonemizer.py │ │ │ │ └── symbols.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ └── io.cpython-310.pyc │ │ │ │ ├── initialize.py │ │ │ │ └── io.py │ │ ├── __init__.py │ │ ├── cankao.wav │ │ ├── cankao2.wav │ │ ├── feature_extractor │ │ │ ├── __init__.py │ │ │ ├── cnhubert.py │ │ │ └── whisper_enc.py │ │ ├── module │ │ │ ├── __init__.py │ │ │ ├── attentions.py │ │ │ ├── attentions_onnx.py │ │ │ ├── commons.py │ │ │ ├── core_vq.py │ │ │ ├── data_utils.py │ │ │ ├── losses.py │ │ │ ├── mel_processing.py │ │ │ ├── models.py │ │ │ ├── models_onnx.py │ │ │ ├── modules.py │ │ │ ├── mrte_model.py │ │ │ ├── quantize.py │ │ │ └── transforms.py │ │ ├── my_utils.py │ │ ├── output.wav │ │ ├── text │ │ │ ├── __init__.py │ │ │ ├── chinese.py │ │ │ ├── cleaner.py │ │ │ ├── cmudict-fast.rep │ │ │ ├── cmudict.rep │ │ │ ├── engdict-hot.rep │ │ │ ├── engdict_cache.pickle │ │ │ ├── english.py │ │ │ ├── japanese.py │ │ │ ├── namedict_cache.pickle │ │ │ ├── opencpop-strict.txt │ │ │ ├── symbols.py │ │ │ ├── tone_sandhi.py │ │ │ └── zh_normalization │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-310.pyc │ │ │ │ ├── char_convert.cpython-310.pyc │ │ │ │ ├── chronology.cpython-310.pyc │ │ │ │ ├── constants.cpython-310.pyc │ │ │ │ ├── num.cpython-310.pyc │ │ │ │ ├── phonecode.cpython-310.pyc │ │ │ │ ├── quantifier.cpython-310.pyc │ │ │ │ └── text_normlization.cpython-310.pyc │ │ │ │ ├── char_convert.py │ │ │ │ ├── chronology.py │ │ │ │ ├── constants.py │ │ │ │ ├── num.py │ │ │ │ ├── phonecode.py │ │ │ │ ├── quantifier.py │ │ │ │ └── text_normlization.py │ │ ├── tts.py │ │ └── utils.py │ ├── TEMP │ │ ├── jieba.cache │ │ └── tmp_s2.json │ ├── __init__.py │ ├── config.py │ ├── data_process.py │ ├── i18n │ │ └── locale │ │ │ 
├── en_US.json │ │ │ ├── es_ES.json │ │ │ ├── fr_FR.json │ │ │ ├── it_IT.json │ │ │ ├── ja_JP.json │ │ │ ├── ko_KR.json │ │ │ ├── pt_BR.json │ │ │ ├── ru_RU.json │ │ │ ├── tr_TR.json │ │ │ ├── zh_CN.json │ │ │ ├── zh_HK.json │ │ │ ├── zh_SG.json │ │ │ └── zh_TW.json │ └── webui.py ├── __init__.py ├── app.py ├── asserts │ └── logo.jpg └── config.py ├── finetune_config └── xtuner_config │ └── README.md ├── nltk_data ├── corpora │ ├── cmudict.zip │ └── cmudict │ │ ├── README │ │ └── cmudict └── taggers │ ├── averaged_perceptron_tagger.zip │ └── averaged_perceptron_tagger │ └── averaged_perceptron_tagger.pickle └── requirements.txt /README.md: -------------------------------------------------------------------------------- 1 | # HeartLink - 心理共情大模型 2 |
[logo image: HeartLink]
12 | 13 | [![license](https://img.shields.io/github/license/Nobody-ML/SoulStar.svg)](https://github.com/Nobody-ML/SoulStar/blob/main/LICENSE) 14 | 15 | 18 | 19 |
20 | 21 | ## 📖 目录 22 | - [HeartLink - 心理共情大模型](#heartlink---心理共情大模型) 23 | - [📖 目录](#-目录) 24 | - [🔄 架构图](#-架构图) 25 | - [🎉 更新](#-更新) 26 | - [📝 简介](#-简介) 27 | - [🛠️ 快速开始](#️-快速开始) 28 | - [1. 算力要求](#1-算力要求) 29 | - [2. 基于 transformers 使用模型](#2-基于-transformers-使用模型) 30 | - [3. 通过网页前端体验 demo](#3-通过网页前端体验-demo) 31 | - [4. 基于 LMDeploy 高性能部署](#4-基于-lmdeploy-高性能部署) 32 | - [🧾 数据构建](#-数据构建) 33 | - [🧑‍💻 微调指南](#-微调指南) 34 | - [📚 应用体验](#-应用体验) 35 | - [🎖️ 致谢](#️-致谢) 36 | - [开源许可证](#开源许可证) 37 | 38 | ## 🔄 架构图 39 |
[architecture diagram image]
42 | 43 | ## 🎉 更新 44 | - 【2024.6.20】基于 internlm2-chat 微调出 V1 版模型 45 | 46 | 47 | ## 📝 简介 48 | 49 | HeartLink 是一个心理共情大模型,通过 `Large Language Model` 在构建的大型共情问答数据集指令微调而来,能在对话过程中感知用户的情绪与此时用户的经历,通过丰富的心理学知识,给予共情回复,达到理解安慰、共情支持用户的目的。在回复中附有 emoji 表情以拉近与用户的距离,让用户在咨询中得到心理上的支持和帮助。 50 | 51 | 在此基座共情大模型上构建出了心理共情应用,支持语音合成,在每次回复后播放合成语音;同时支持数字人展示(未完善);此外还有用户情绪图表分析。 52 | 53 |
[demo GIF: HeartLink with Digital Human]

[demo GIF: HeartLink without Digital Human]
62 | 63 | 目前支持模型及微调方式列表如下: 64 | | 基座模型 | 微调方式 | 65 | | :-------------------: | :------: | 66 | | InternLM2-Chat-7B | qlora | 67 | | InternLM2-Chat-7B | full | 68 | | InternLM2-Chat-20B | qlora | 69 | | …… | …… | 70 | 71 | 项目持续开发中,欢迎 Star⭐、PR 和 Issue。 72 | 73 | ## 🛠️ 快速开始 74 | 75 | ### 1. 算力要求 76 | - 对于 7B 的模型推理要求显存至少16G 77 | - 对于 20B 的模型推理要求显存至少40G 78 | 79 | 80 | ### 2. 基于 transformers 使用模型 81 | ```python 82 | import torch 83 | from transformers import AutoTokenizer, AutoModelForCausalLM 84 | tokenizer = AutoTokenizer.from_pretrained("HeartLink", trust_remote_code=True) 85 | # 设置`torch_dtype=torch.float16`来将模型精度指定为torch.float16,否则可能会因为您的硬件原因造成显存不足的问题。 86 | model = AutoModelForCausalLM.from_pretrained("HeartLink", device_map="auto",trust_remote_code=True, torch_dtype=torch.float16) 87 | 88 | model = model.eval() 89 | response, history = model.chat(tokenizer, "请问你是谁呀?", history=[]) 90 | print(response) 91 | 92 | response, history = model.chat(tokenizer, "我最近真的好焦虑,课业上给我的作业总是错的,考试时好时坏,我压力真的好大,父母也老是因为学习上的事打骂我,我是不是该放弃学习了?我也没什么朋友,我也想和别人一起玩,一起学习,但是我感觉总是开不了口,一直都是一个人,我该怎么办才好啊,感觉我的人生真的很糟糕,看不到什么希望。", history=history) 93 | print(response) 94 | ``` 95 | ### 3. 通过网页前端体验 demo 96 | ```bash 97 | pip install streamlit 98 | pip install transformers 99 | python app.py 100 | ``` 101 | 102 | ### 4. 基于 LMDeploy 高性能部署 103 | ```shell 104 | # 使用命令行 105 | pip install lmdeploy 106 | lmdeploy chat /root/model/HeartLink --model-name internlm2 107 | ``` 108 | 109 | ## 🧾 数据构建 110 | 本项目的训练数据来源于真实心理咨询场景,第一版使用约 180k 轮问答对数据,数据持续 scale 中~ 111 | 112 | 数据涵盖场景丰富,包括了“爱情、婚恋、职场、生活、社会、学习、性、过往、情绪、教育、咨询、危机”等众多丰富的场景。大致划分为:情感、生活、社交、疾病、学习、职场、过往、性。 113 | 114 | 大致场景数据配比如下: 115 |
[scene distribution chart image]
118 | 119 | - 详情请见[数据构建](./datasets/README.md) 120 | 121 | 122 | ## 🧑‍💻 微调指南 123 | 模型使用 XTuner 框架进行微调,使用了 deepseed 进行训练加速。 124 | - 详情请见[微调指南](./finetune_config/xtuner_config/README.md) 125 | 126 | ## 📚 应用体验 127 | - 应用部署在 [OpenXLab 应用中心](https://openxlab.org.cn/apps/detail/Nobody-ML/HeartLink_7B_qlora_analyse),可前往体验 128 | 129 | ## 🎖️ 致谢 130 | - [OpenXLab](https://openxlab.org.cn/home) 131 | - [InternLM](https://github.com/InternLM/InternLM/tree/main) 132 | 133 | ## 开源许可证 134 | 135 | 该项目采用 [Apache License 2.0 开源许可证](LICENSE)。同时,请遵守所使用的模型与数据集的许可证。 136 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # conda install -c conda-forge 'ffmpeg<7' 4 | os.system("apt install ffmpeg") 5 | os.system("apt install libsox-dev") 6 | os.system('streamlit run demo/app.py --server.port 7860') 7 | -------------------------------------------------------------------------------- /asserts/HeartLink.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/asserts/HeartLink.gif -------------------------------------------------------------------------------- /asserts/HeartLink.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/asserts/HeartLink.png -------------------------------------------------------------------------------- /asserts/HeartLink_digitalhuman.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/asserts/HeartLink_digitalhuman.gif -------------------------------------------------------------------------------- /asserts/chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/asserts/chart.png -------------------------------------------------------------------------------- /asserts/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/asserts/logo.jpg -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # 数据构建 2 | > *本项目的训练数据来源于真实心理咨询场景,并使用截至到2024年6月18日最强的一系列大模型使用 CoT、ICL 进行构建,第一版约有180k对问答对数据,请勿用于非法用途。* 3 | 4 | ## 1、数据准备 5 | 首先需要在心理咨询网站上或是其他途径获取真实心理咨询问答数据,在此基础上处理为问答对的形式,可以以 json 进行存储。 6 | 7 | ## 2、模型准备 8 | 目前较为强大的模型为 GPT-4, Qwen2, DeepSeek, InternLM, GLM-4,可以自行对比各模型的性能,包括但不限于指令跟随能力,幻觉现象,生成文本质量等,以此选择合适的模型,并且准备好模型的 api。 9 | 10 | ## 3、数据生成 11 | 在这一步上编写好你需要让模型生成的回复的要求,比如心理咨询回复。然后从准备好的问答数据中取出问题,对应的答案作为模型输入参考,然后使用模型生成回复,或者不使用答案作为参考,直接使用问题作为模型输入,然后使用模型生成回复。 12 | 13 | 此外,可以以收集的问题场景作为参考,如“学习”、“工作”、“生活”,让大模型以此来生成问题,然后使用大模型生成回复。 14 | 15 | 将上述流程写为自动化脚本,转化为自动化流程,即可完成数据生成。 16 | 17 | ## 4、数据格式 18 | 训练数据格式有单轮和多轮之分。 19 | 20 | - 单轮对话数据格式为: 21 | ```python 22 | { 23 | "conversation":[ 24 | { 25 | "input": "xxx", 26 | "output": "xxx" 27 | } 28 | ] 29 | } 30 | ``` 31 | 32 | - 多轮对话数据格式为: 33 | ```python 34 | { 35 | "conversation":[ 36 | { 37 | "input": "xxx",# 第一轮对话 38 | "output": "xxx" 39 | }, 40 | { 41 | "input": 
"xxx",# 第二轮对话 42 | "output": "xxx" 43 | } 44 | ] 45 | } 46 | ``` -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/__init__.py -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/data/__init__.py -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/data/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/data/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/data/__pycache__/bucket_sampler.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/data/__pycache__/bucket_sampler.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/data/__pycache__/data_module.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/data/__pycache__/data_module.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/data/__pycache__/dataset.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/data/__pycache__/dataset.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/data/bucket_sampler.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/bucket_sampler.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import math 5 | import random 6 | from random import shuffle 7 | from typing import Iterator 8 | from typing import Optional 9 | from typing import TypeVar 10 | 11 | import torch 12 | import torch.distributed as dist 13 | from torch.utils.data import Dataset 14 | from torch.utils.data import Sampler 15 | 16 | __all__ = [ 17 | "DistributedBucketSampler", 18 | ] 19 | 20 | T_co = TypeVar("T_co", covariant=True) 21 | 22 | 23 | class DistributedBucketSampler(Sampler[T_co]): 24 | r""" 25 | sort the dataset wrt. 
input length 26 | divide samples into buckets 27 | sort within buckets 28 | divide buckets into batches 29 | sort batches 30 | """ 31 | 32 | def __init__( 33 | self, 34 | dataset: Dataset, 35 | num_replicas: Optional[int] = None, 36 | rank: Optional[int] = None, 37 | shuffle: bool = True, 38 | seed: int = 0, 39 | drop_last: bool = False, 40 | batch_size: int = 32, 41 | ) -> None: 42 | if num_replicas is None: 43 | if not dist.is_available(): 44 | raise RuntimeError("Requires distributed package to be available") 45 | num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1 46 | if rank is None: 47 | if not dist.is_available(): 48 | raise RuntimeError("Requires distributed package to be available") 49 | rank = dist.get_rank() if torch.cuda.is_available() else 0 50 | if torch.cuda.is_available(): 51 | torch.cuda.set_device(rank) 52 | if rank >= num_replicas or rank < 0: 53 | raise ValueError( 54 | "Invalid rank {}, rank should be in the interval" 55 | " [0, {}]".format(rank, num_replicas - 1) 56 | ) 57 | self.dataset = dataset 58 | self.num_replicas = num_replicas 59 | self.rank = rank 60 | self.epoch = 0 61 | self.drop_last = drop_last 62 | # If the dataset length is evenly divisible by # of replicas, then there 63 | # is no need to drop any data, since the dataset will be split equally. 64 | if ( 65 | self.drop_last and len(self.dataset) % self.num_replicas != 0 66 | ): # type: ignore[arg-type] 67 | # Split to nearest available length that is evenly divisible. 68 | # This is to ensure each rank receives the same amount of data when 69 | # using this Sampler. 70 | self.num_samples = math.ceil( 71 | (len(self.dataset) - self.num_replicas) 72 | / self.num_replicas # type: ignore[arg-type] 73 | ) 74 | else: 75 | self.num_samples = math.ceil( 76 | len(self.dataset) / self.num_replicas 77 | ) # type: ignore[arg-type] 78 | self.total_size = self.num_samples * self.num_replicas 79 | self.shuffle = shuffle 80 | self.seed = seed 81 | self.batch_size = batch_size 82 | self.id_with_length = self._get_sample_lengths() 83 | self.id_buckets = self.make_buckets(bucket_width=2.0) 84 | 85 | def _get_sample_lengths(self): 86 | id_with_lengths = [] 87 | for i in range(len(self.dataset)): 88 | id_with_lengths.append((i, self.dataset.get_sample_length(i))) 89 | id_with_lengths.sort(key=lambda x: x[1]) 90 | return id_with_lengths 91 | 92 | def make_buckets(self, bucket_width: float = 2.0): 93 | buckets = [] 94 | cur = [] 95 | max_sec = bucket_width 96 | for id, sec in self.id_with_length: 97 | if sec < max_sec: 98 | cur.append(id) 99 | else: 100 | buckets.append(cur) 101 | cur = [id] 102 | max_sec += bucket_width 103 | if len(cur) > 0: 104 | buckets.append(cur) 105 | return buckets 106 | 107 | def __iter__(self) -> Iterator[T_co]: 108 | if self.shuffle: 109 | # deterministically shuffle based on epoch and seed 110 | g = torch.Generator() 111 | g.manual_seed(self.seed + self.epoch) 112 | random.seed(self.epoch + self.seed) 113 | shuffled_bucket = [] 114 | for buc in self.id_buckets: 115 | buc_copy = buc.copy() 116 | shuffle(buc_copy) 117 | shuffled_bucket.append(buc_copy) 118 | grouped_batch_size = self.batch_size * self.num_replicas 119 | shuffled_bucket = list(itertools.chain(*shuffled_bucket)) 120 | n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size)) 121 | batches = [ 122 | shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size] 123 | for b in range(n_batch) 124 | ] 125 | shuffle(batches) 126 | indices = list(itertools.chain(*batches)) 127 | else: 128 | # type: 
ignore[arg-type] 129 | indices = list(range(len(self.dataset))) 130 | 131 | if not self.drop_last: 132 | # add extra samples to make it evenly divisible 133 | padding_size = self.total_size - len(indices) 134 | if padding_size <= len(indices): 135 | indices += indices[:padding_size] 136 | else: 137 | indices += (indices * math.ceil(padding_size / len(indices)))[ 138 | :padding_size 139 | ] 140 | else: 141 | # remove tail of data to make it evenly divisible. 142 | indices = indices[: self.total_size] 143 | assert len(indices) == self.total_size 144 | 145 | # subsample 146 | indices = indices[self.rank : self.total_size : self.num_replicas] 147 | assert len(indices) == self.num_samples 148 | 149 | return iter(indices) 150 | 151 | def __len__(self) -> int: 152 | return self.num_samples 153 | 154 | def set_epoch(self, epoch: int) -> None: 155 | r""" 156 | Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas 157 | use a different random ordering for each epoch. Otherwise, the next iteration of this 158 | sampler will yield the same ordering. 159 | 160 | Args: 161 | epoch (int): Epoch number. 162 | """ 163 | self.epoch = epoch 164 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/data/data_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | from pytorch_lightning import LightningDataModule 4 | from AR.data.bucket_sampler import DistributedBucketSampler 5 | from AR.data.dataset import Text2SemanticDataset 6 | from torch.utils.data import DataLoader 7 | 8 | 9 | class Text2SemanticDataModule(LightningDataModule): 10 | def __init__( 11 | self, 12 | config, 13 | train_semantic_path, 14 | train_phoneme_path, 15 | dev_semantic_path=None, 16 | dev_phoneme_path=None, 17 | ): 18 | super().__init__() 19 | self.config = config 20 | self.train_semantic_path = train_semantic_path 21 | self.train_phoneme_path = train_phoneme_path 22 | self.dev_semantic_path = dev_semantic_path 23 | self.dev_phoneme_path = dev_phoneme_path 24 | self.num_workers = self.config["data"]["num_workers"] 25 | 26 | def prepare_data(self): 27 | pass 28 | 29 | def setup(self, stage=None, output_logs=False): 30 | self._train_dataset = Text2SemanticDataset( 31 | phoneme_path=self.train_phoneme_path, 32 | semantic_path=self.train_semantic_path, 33 | max_sec=self.config["data"]["max_sec"], 34 | pad_val=self.config["data"]["pad_val"], 35 | ) 36 | self._dev_dataset = self._train_dataset 37 | # self._dev_dataset = Text2SemanticDataset( 38 | # phoneme_path=self.dev_phoneme_path, 39 | # semantic_path=self.dev_semantic_path, 40 | # max_sample=self.config['data']['max_eval_sample'], 41 | # max_sec=self.config['data']['max_sec'], 42 | # pad_val=self.config['data']['pad_val']) 43 | 44 | def train_dataloader(self): 45 | batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"] 46 | batch_size = max(min(batch_size,len(self._train_dataset)//4),1)#防止不保存 47 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) 48 | return DataLoader( 49 | self._train_dataset, 50 | batch_size=batch_size, 51 | sampler=sampler, 52 | collate_fn=self._train_dataset.collate, 53 | num_workers=self.num_workers, 54 | persistent_workers=True, 55 | prefetch_factor=16, 56 | ) 57 | 58 
| def val_dataloader(self): 59 | return DataLoader( 60 | self._dev_dataset, 61 | batch_size=1, 62 | shuffle=False, 63 | collate_fn=self._train_dataset.collate, 64 | num_workers=max(self.num_workers, 12), 65 | persistent_workers=True, 66 | prefetch_factor=16, 67 | ) 68 | 69 | # 这个会使用到嘛? 70 | def test_dataloader(self): 71 | return DataLoader( 72 | self._dev_dataset, 73 | batch_size=1, 74 | shuffle=False, 75 | collate_fn=self._train_dataset.collate, 76 | ) 77 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/models/__init__.py -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/models/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/models/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/models/__pycache__/t2s_lightning_module.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/models/__pycache__/t2s_lightning_module.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/models/__pycache__/t2s_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/models/__pycache__/t2s_model.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/models/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/models/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/models/t2s_lightning_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import os, sys 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | from typing import Dict 8 | 9 | import torch 10 | from pytorch_lightning import LightningModule 11 | from .t2s_model import Text2SemanticDecoder 12 | from ..modules.lr_schedulers import WarmupCosineLRSchedule 13 | from ..modules.optim import ScaledAdam 14 | 15 | class Text2SemanticLightningModule(LightningModule): 16 | def __init__(self, config, output_dir, is_train=True): 17 | super().__init__() 18 | self.config = config 19 | self.top_k = 3 20 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) 21 | pretrained_s1 = config.get("pretrained_s1") 22 | if pretrained_s1 and is_train: 23 | # 
print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) 24 | print( 25 | self.load_state_dict( 26 | torch.load(pretrained_s1, map_location="cpu")["weight"] 27 | ) 28 | ) 29 | if is_train: 30 | self.automatic_optimization = False 31 | self.save_hyperparameters() 32 | self.eval_dir = output_dir / "eval" 33 | self.eval_dir.mkdir(parents=True, exist_ok=True) 34 | 35 | def training_step(self, batch: Dict, batch_idx: int): 36 | opt = self.optimizers() 37 | scheduler = self.lr_schedulers() 38 | forward=self.model.forward if self.config["train"].get("if_dpo",False)==True else self.model.forward_old 39 | loss, acc = forward( 40 | batch["phoneme_ids"], 41 | batch["phoneme_ids_len"], 42 | batch["semantic_ids"], 43 | batch["semantic_ids_len"], 44 | batch["bert_feature"], 45 | ) 46 | self.manual_backward(loss) 47 | if batch_idx > 0 and batch_idx % 4 == 0: 48 | opt.step() 49 | opt.zero_grad() 50 | scheduler.step() 51 | 52 | self.log( 53 | "total_loss", 54 | loss, 55 | on_step=True, 56 | on_epoch=True, 57 | prog_bar=True, 58 | sync_dist=True, 59 | ) 60 | self.log( 61 | "lr", 62 | scheduler.get_last_lr()[0], 63 | on_epoch=True, 64 | prog_bar=True, 65 | sync_dist=True, 66 | ) 67 | self.log( 68 | f"top_{self.top_k}_acc", 69 | acc, 70 | on_step=True, 71 | on_epoch=True, 72 | prog_bar=True, 73 | sync_dist=True, 74 | ) 75 | 76 | def validation_step(self, batch: Dict, batch_idx: int): 77 | return 78 | 79 | # # get loss 80 | # loss, acc = self.model.forward( 81 | # batch['phoneme_ids'], batch['phoneme_ids_len'], 82 | # batch['semantic_ids'], batch['semantic_ids_len'], 83 | # batch['bert_feature'] 84 | # ) 85 | # 86 | # self.log( 87 | # "val_total_loss", 88 | # loss, 89 | # on_step=True, 90 | # on_epoch=True, 91 | # prog_bar=True, 92 | # sync_dist=True) 93 | # self.log( 94 | # f"val_top_{self.top_k}_acc", 95 | # acc, 96 | # on_step=True, 97 | # on_epoch=True, 98 | # prog_bar=True, 99 | # sync_dist=True) 100 | # 101 | # # get infer output 102 | # semantic_len = batch['semantic_ids'].size(1) 103 | # prompt_len = min(int(semantic_len * 0.5), 150) 104 | # prompt = batch['semantic_ids'][:, :prompt_len] 105 | # pred_semantic = self.model.infer(batch['phoneme_ids'], 106 | # batch['phoneme_ids_len'], prompt, 107 | # batch['bert_feature'] 108 | # ) 109 | # save_name = f'semantic_toks_{batch_idx}.pt' 110 | # save_path = os.path.join(self.eval_dir, save_name) 111 | # torch.save(pred_semantic.detach().cpu(), save_path) 112 | 113 | def configure_optimizers(self): 114 | model_parameters = self.model.parameters() 115 | parameters_names = [] 116 | parameters_names.append( 117 | [name_param_pair[0] for name_param_pair in self.model.named_parameters()] 118 | ) 119 | lm_opt = ScaledAdam( 120 | model_parameters, 121 | lr=0.01, 122 | betas=(0.9, 0.95), 123 | clipping_scale=2.0, 124 | parameters_names=parameters_names, 125 | show_dominant_parameters=False, 126 | clipping_update_period=1000, 127 | ) 128 | 129 | return { 130 | "optimizer": lm_opt, 131 | "lr_scheduler": { 132 | "scheduler": WarmupCosineLRSchedule( 133 | lm_opt, 134 | init_lr=self.config["optimizer"]["lr_init"], 135 | peak_lr=self.config["optimizer"]["lr"], 136 | end_lr=self.config["optimizer"]["lr_end"], 137 | warmup_steps=self.config["optimizer"]["warmup_steps"], 138 | total_steps=self.config["optimizer"]["decay_steps"], 139 | ) 140 | }, 141 | } 142 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py: 
-------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import os, sys 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | from typing import Dict 8 | 9 | import torch 10 | from pytorch_lightning import LightningModule 11 | from AR.models.t2s_model_onnx import Text2SemanticDecoder 12 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule 13 | from AR.modules.optim import ScaledAdam 14 | 15 | 16 | class Text2SemanticLightningModule(LightningModule): 17 | def __init__(self, config, output_dir, is_train=True): 18 | super().__init__() 19 | self.config = config 20 | self.top_k = 3 21 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) 22 | pretrained_s1 = config.get("pretrained_s1") 23 | if pretrained_s1 and is_train: 24 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) 25 | print( 26 | self.load_state_dict( 27 | torch.load(pretrained_s1, map_location="cpu")["weight"] 28 | ) 29 | ) 30 | if is_train: 31 | self.automatic_optimization = False 32 | self.save_hyperparameters() 33 | self.eval_dir = output_dir / "eval" 34 | self.eval_dir.mkdir(parents=True, exist_ok=True) 35 | 36 | def training_step(self, batch: Dict, batch_idx: int): 37 | opt = self.optimizers() 38 | scheduler = self.lr_schedulers() 39 | loss, acc = self.model.forward( 40 | batch["phoneme_ids"], 41 | batch["phoneme_ids_len"], 42 | batch["semantic_ids"], 43 | batch["semantic_ids_len"], 44 | batch["bert_feature"], 45 | ) 46 | self.manual_backward(loss) 47 | if batch_idx > 0 and batch_idx % 4 == 0: 48 | opt.step() 49 | opt.zero_grad() 50 | scheduler.step() 51 | 52 | self.log( 53 | "total_loss", 54 | loss, 55 | on_step=True, 56 | on_epoch=True, 57 | prog_bar=True, 58 | sync_dist=True, 59 | ) 60 | self.log( 61 | "lr", 62 | scheduler.get_last_lr()[0], 63 | on_epoch=True, 64 | prog_bar=True, 65 | sync_dist=True, 66 | ) 67 | self.log( 68 | f"top_{self.top_k}_acc", 69 | acc, 70 | on_step=True, 71 | on_epoch=True, 72 | prog_bar=True, 73 | sync_dist=True, 74 | ) 75 | 76 | def validation_step(self, batch: Dict, batch_idx: int): 77 | return 78 | 79 | def configure_optimizers(self): 80 | model_parameters = self.model.parameters() 81 | parameters_names = [] 82 | parameters_names.append( 83 | [name_param_pair[0] for name_param_pair in self.model.named_parameters()] 84 | ) 85 | lm_opt = ScaledAdam( 86 | model_parameters, 87 | lr=0.01, 88 | betas=(0.9, 0.95), 89 | clipping_scale=2.0, 90 | parameters_names=parameters_names, 91 | show_dominant_parameters=False, 92 | clipping_update_period=1000, 93 | ) 94 | 95 | return { 96 | "optimizer": lm_opt, 97 | "lr_scheduler": { 98 | "scheduler": WarmupCosineLRSchedule( 99 | lm_opt, 100 | init_lr=self.config["optimizer"]["lr_init"], 101 | peak_lr=self.config["optimizer"]["lr"], 102 | end_lr=self.config["optimizer"]["lr_end"], 103 | warmup_steps=self.config["optimizer"]["warmup_steps"], 104 | total_steps=self.config["optimizer"]["decay_steps"], 105 | ) 106 | }, 107 | } 108 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/models/utils.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/utils.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | 
import torch 4 | import torch.nn.functional as F 5 | from typing import Tuple 6 | 7 | def sequence_mask(length, max_length=None): 8 | if max_length is None: 9 | max_length = length.max() 10 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 11 | return x.unsqueeze(0) < length.unsqueeze(1) 12 | 13 | 14 | def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: 15 | """ 16 | Args: 17 | lengths: 18 | A 1-D tensor containing sentence lengths. 19 | max_len: 20 | The length of masks. 21 | Returns: 22 | Return a 2-D bool tensor, where masked positions 23 | are filled with `True` and non-masked positions are 24 | filled with `False`. 25 | 26 | #>>> lengths = torch.tensor([1, 3, 2, 5]) 27 | #>>> make_pad_mask(lengths) 28 | tensor([[False, True, True, True, True], 29 | [False, False, False, True, True], 30 | [False, False, True, True, True], 31 | [False, False, False, False, False]]) 32 | """ 33 | assert lengths.ndim == 1, lengths.ndim 34 | max_len = max(max_len, lengths.max()) 35 | n = lengths.size(0) 36 | seq_range = torch.arange(0, max_len, device=lengths.device) 37 | expaned_lengths = seq_range.unsqueeze(0).expand(n, max_len) 38 | 39 | return expaned_lengths >= lengths.unsqueeze(-1) 40 | 41 | 42 | # https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py 43 | def top_k_top_p_filtering( 44 | logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1 45 | ): 46 | """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering 47 | Args: 48 | logits: logits distribution shape (batch size, vocabulary size) 49 | if top_k > 0: keep only top k tokens with highest probability (top-k filtering). 50 | if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). 51 | Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) 52 | Make sure we keep at least min_tokens_to_keep per batch example in the output 53 | From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 54 | """ 55 | if top_k > 0: 56 | top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check 57 | # Remove all tokens with a probability less than the last token of the top-k 58 | indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] 59 | logits[indices_to_remove] = filter_value 60 | 61 | if top_p < 1.0: 62 | sorted_logits, sorted_indices = torch.sort(logits, descending=True) 63 | cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) 64 | 65 | # Remove tokens with cumulative probability above the threshold (token with 0 are kept) 66 | sorted_indices_to_remove = cumulative_probs > top_p 67 | if min_tokens_to_keep > 1: 68 | # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) 69 | sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 70 | # Shift the indices to the right to keep also the first token above the threshold 71 | sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() 72 | sorted_indices_to_remove[..., 0] = 0 73 | 74 | # scatter sorted tensors to original indexing 75 | indices_to_remove = sorted_indices_to_remove.scatter( 76 | 1, sorted_indices, sorted_indices_to_remove 77 | ) 78 | logits[indices_to_remove] = filter_value 79 | return logits 80 | 81 | 82 | def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0): 83 | # temperature: (`optional`) float 84 | # The value used to module the next token probabilities. 
Must be strictly positive. Default to 1.0. 85 | # top_k: (`optional`) int 86 | # The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. 87 | # top_p: (`optional`) float 88 | # The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. 89 | 90 | # Temperature (higher temperature => more likely to sample low probability tokens) 91 | if temperature != 1.0: 92 | logits = logits / temperature 93 | # Top-p/top-k filtering 94 | logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p) 95 | # Sample 96 | token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1) 97 | return token 98 | 99 | 100 | from typing import Optional, Tuple 101 | 102 | 103 | def multinomial_sample_one_no_sync( 104 | probs_sort, 105 | ): # Does multinomial sampling without a cuda synchronization 106 | q = torch.empty_like(probs_sort).exponential_(1) 107 | return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) 108 | 109 | 110 | def logits_to_probs( 111 | logits, 112 | previous_tokens: Optional[torch.Tensor] = None, 113 | temperature: float = 1.0, 114 | top_k: Optional[int] = None, 115 | top_p: Optional[int] = None, 116 | repetition_penalty: float = 1.0, 117 | ): 118 | if previous_tokens is not None: 119 | previous_tokens = previous_tokens.squeeze() 120 | # print(logits.shape,previous_tokens.shape) 121 | # pdb.set_trace() 122 | if previous_tokens is not None and repetition_penalty != 1.0: 123 | previous_tokens = previous_tokens.long() 124 | score = torch.gather(logits, dim=0, index=previous_tokens) 125 | score = torch.where( 126 | score < 0, score * repetition_penalty, score / repetition_penalty 127 | ) 128 | logits.scatter_(dim=0, index=previous_tokens, src=score) 129 | 130 | if top_p is not None and top_p < 1.0: 131 | sorted_logits, sorted_indices = torch.sort(logits, descending=True) 132 | cum_probs = torch.cumsum( 133 | torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1 134 | ) 135 | sorted_indices_to_remove = cum_probs > top_p 136 | sorted_indices_to_remove[0] = False # keep at least one option 137 | indices_to_remove = sorted_indices_to_remove.scatter( 138 | dim=0, index=sorted_indices, src=sorted_indices_to_remove 139 | ) 140 | logits = logits.masked_fill(indices_to_remove, -float("Inf")) 141 | 142 | logits = logits / max(temperature, 1e-5) 143 | 144 | if top_k is not None: 145 | v, _ = torch.topk(logits, min(top_k, logits.size(-1))) 146 | pivot = v.select(-1, -1).unsqueeze(-1) 147 | logits = torch.where(logits < pivot, -float("Inf"), logits) 148 | 149 | probs = torch.nn.functional.softmax(logits, dim=-1) 150 | return probs 151 | 152 | 153 | def sample( 154 | logits, 155 | previous_tokens: Optional[torch.Tensor] = None, 156 | **sampling_kwargs, 157 | ) -> Tuple[torch.Tensor, torch.Tensor]: 158 | probs = logits_to_probs( 159 | logits=logits, previous_tokens=previous_tokens, **sampling_kwargs 160 | ) 161 | idx_next = multinomial_sample_one_no_sync(probs) 162 | return idx_next, probs 163 | 164 | def dpo_loss(policy_chosen_logps: torch.FloatTensor, 165 | policy_rejected_logps: torch.FloatTensor, 166 | reference_chosen_logps: torch.FloatTensor, 167 | reference_rejected_logps: torch.FloatTensor, 168 | beta: float, 169 | reference_free: bool = False) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: 170 | pi_logratios = policy_chosen_logps - policy_rejected_logps 171 | ref_logratios = reference_chosen_logps - 
reference_rejected_logps 172 | 173 | if reference_free: 174 | ref_logratios = 0 175 | 176 | logits = pi_logratios - ref_logratios 177 | 178 | losses = -F.logsigmoid(beta * logits) 179 | chosen_rewards = beta * (policy_chosen_logps - reference_chosen_logps).detach() 180 | rejected_rewards = beta * (policy_rejected_logps - reference_rejected_logps).detach() 181 | 182 | return losses.mean(), chosen_rewards, rejected_rewards 183 | 184 | def get_batch_logps(logits_target: torch.FloatTensor, logits_reject: torch.FloatTensor, labels_target: torch.LongTensor, labels_reject: torch.LongTensor, average_log_prob: bool = False) -> Tuple[torch.FloatTensor, torch.FloatTensor]: 185 | 186 | # dummy token; we'll ignore the losses on these tokens later 187 | 188 | per_token_logps_target = torch.gather(logits_target.log_softmax(-1), dim=2, index=labels_target.unsqueeze(2)).squeeze(2) 189 | per_token_logps_reject = torch.gather(logits_reject.log_softmax(-1), dim=2, index=labels_reject.unsqueeze(2)).squeeze(2) 190 | 191 | return per_token_logps_target.sum(-1), per_token_logps_reject.sum(-1) 192 | 193 | def make_reject_y(y_o, y_lens): 194 | def repeat_P(y): 195 | range_idx, _ = torch.randint(0, len(y), size=(2,)).sort() 196 | pre = y[:range_idx[0]] 197 | shf = y[range_idx[1]:] 198 | range_text = y[range_idx[0]:range_idx[1]] 199 | new_y = torch.cat([pre, range_text, range_text, shf]) 200 | return new_y 201 | def lost_P(y): 202 | range_idx, _ = torch.randint(0, len(y), size=(2,)).sort() 203 | pre = y[:range_idx[0]] 204 | shf = y[range_idx[1]:] 205 | range_text = y[range_idx[0]:range_idx[1]] 206 | new_y = torch.cat([pre, shf]) 207 | return new_y 208 | bs = len(y_lens) 209 | reject_y = [] 210 | reject_y_lens = [] 211 | for b in range(bs): 212 | process_item_idx = torch.randint(0, 1, size=(1, ))[0] 213 | if process_item_idx == 0: 214 | new_y = repeat_P(y_o[b]) 215 | reject_y.append(new_y) 216 | reject_y_lens.append(len(new_y)) 217 | elif process_item_idx==1: 218 | new_y = lost_P(y_o[b]) 219 | reject_y.append(new_y) 220 | reject_y_lens.append(len(new_y)) 221 | max_length = max(reject_y_lens) 222 | for b in range(bs): 223 | pad_length = max_length - reject_y_lens[b] 224 | reject_y[b] = torch.cat([reject_y[b], torch.zeros(pad_length, dtype=y_o.dtype, device=y_o.device)], dim=0) 225 | 226 | reject_y = torch.stack(reject_y, dim = 0) 227 | reject_y_lens = torch.tensor(reject_y_lens, device=y_lens.device) 228 | 229 | return reject_y, reject_y_lens 230 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/modules/__init__.py -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/modules/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/__pycache__/activation.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/modules/__pycache__/activation.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/__pycache__/embedding.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/modules/__pycache__/embedding.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/__pycache__/lr_schedulers.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/modules/__pycache__/lr_schedulers.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/__pycache__/optim.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/modules/__pycache__/optim.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/__pycache__/patched_mha_with_cache.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/modules/__pycache__/patched_mha_with_cache.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/__pycache__/scaling.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/modules/__pycache__/scaling.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/__pycache__/transformer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/modules/__pycache__/transformer.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/activation_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py 2 | from typing import Optional 3 | from typing import Tuple 4 | import torch 5 | from torch import Tensor 6 | from torch.nn import Linear 7 | from torch.nn import Module 8 | from torch.nn.init import constant_ 9 | from torch.nn.init import xavier_normal_ 10 | from torch.nn.init import xavier_uniform_ 11 | from torch.nn.modules.linear import NonDynamicallyQuantizableLinear 12 | from torch.nn.parameter import Parameter 13 | 14 | from torch.nn import functional as F 15 | from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched 16 | 17 | 18 | class MultiheadAttention(Module): 19 | __constants__ = ["batch_first"] 20 | bias_k: 
Optional[torch.Tensor] 21 | bias_v: Optional[torch.Tensor] 22 | 23 | def __init__( 24 | self, 25 | embed_dim, 26 | num_heads, 27 | dropout=0.0, 28 | bias=True, 29 | add_bias_kv=False, 30 | add_zero_attn=False, 31 | kdim=None, 32 | vdim=None, 33 | batch_first=False, 34 | linear1_cls=Linear, 35 | linear2_cls=Linear, 36 | device=None, 37 | dtype=None, 38 | ) -> None: 39 | factory_kwargs = {"device": device, "dtype": dtype} 40 | super(MultiheadAttention, self).__init__() 41 | self.embed_dim = embed_dim 42 | self.kdim = kdim if kdim is not None else embed_dim 43 | self.vdim = vdim if vdim is not None else embed_dim 44 | self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim 45 | 46 | self.num_heads = num_heads 47 | self.dropout = dropout 48 | self.batch_first = batch_first 49 | self.head_dim = embed_dim // num_heads 50 | assert ( 51 | self.head_dim * num_heads == self.embed_dim 52 | ), "embed_dim must be divisible by num_heads" 53 | 54 | if add_bias_kv: 55 | self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) 56 | self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) 57 | else: 58 | self.bias_k = self.bias_v = None 59 | 60 | if linear1_cls == Linear: 61 | if not self._qkv_same_embed_dim: 62 | self.q_proj_weight = Parameter( 63 | torch.empty((embed_dim, embed_dim), **factory_kwargs) 64 | ) 65 | self.k_proj_weight = Parameter( 66 | torch.empty((embed_dim, self.kdim), **factory_kwargs) 67 | ) 68 | self.v_proj_weight = Parameter( 69 | torch.empty((embed_dim, self.vdim), **factory_kwargs) 70 | ) 71 | self.register_parameter("in_proj_weight", None) 72 | else: 73 | self.in_proj_weight = Parameter( 74 | torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) 75 | ) 76 | self.register_parameter("q_proj_weight", None) 77 | self.register_parameter("k_proj_weight", None) 78 | self.register_parameter("v_proj_weight", None) 79 | 80 | if bias: 81 | self.in_proj_bias = Parameter( 82 | torch.empty(3 * embed_dim, **factory_kwargs) 83 | ) 84 | else: 85 | self.register_parameter("in_proj_bias", None) 86 | self.out_proj = NonDynamicallyQuantizableLinear( 87 | embed_dim, embed_dim, bias=bias, **factory_kwargs 88 | ) 89 | 90 | self._reset_parameters() 91 | else: 92 | if not self._qkv_same_embed_dim: 93 | raise NotImplementedError 94 | else: 95 | self.in_proj_linear = linear1_cls( 96 | embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs 97 | ) 98 | self.in_proj_weight = self.in_proj_linear.weight 99 | 100 | self.register_parameter("q_proj_weight", None) 101 | self.register_parameter("k_proj_weight", None) 102 | self.register_parameter("v_proj_weight", None) 103 | 104 | if bias: 105 | self.in_proj_bias = self.in_proj_linear.bias 106 | else: 107 | self.register_parameter("in_proj_bias", None) 108 | 109 | self.out_proj = linear2_cls( 110 | embed_dim, embed_dim, bias=bias, **factory_kwargs 111 | ) 112 | 113 | if self.bias_k is not None: 114 | xavier_normal_(self.bias_k) 115 | if self.bias_v is not None: 116 | xavier_normal_(self.bias_v) 117 | 118 | self.add_zero_attn = add_zero_attn 119 | 120 | def _reset_parameters(self): 121 | if self._qkv_same_embed_dim: 122 | xavier_uniform_(self.in_proj_weight) 123 | else: 124 | xavier_uniform_(self.q_proj_weight) 125 | xavier_uniform_(self.k_proj_weight) 126 | xavier_uniform_(self.v_proj_weight) 127 | 128 | if self.in_proj_bias is not None: 129 | constant_(self.in_proj_bias, 0.0) 130 | constant_(self.out_proj.bias, 0.0) 131 | 132 | if self.bias_k is not None: 133 | xavier_normal_(self.bias_k) 134 | if self.bias_v is 
not None: 135 | xavier_normal_(self.bias_v) 136 | 137 | def __setstate__(self, state): 138 | # Support loading old MultiheadAttention checkpoints generated by v1.1.0 139 | if "_qkv_same_embed_dim" not in state: 140 | state["_qkv_same_embed_dim"] = True 141 | 142 | super(MultiheadAttention, self).__setstate__(state) 143 | 144 | def forward( 145 | self, 146 | query: Tensor, 147 | key: Tensor, 148 | value: Tensor, 149 | key_padding_mask: Optional[Tensor] = None, 150 | need_weights: bool = True, 151 | attn_mask: Optional[Tensor] = None, 152 | average_attn_weights: bool = True, 153 | cache=None, 154 | ) -> Tuple[Tensor, Optional[Tensor]]: 155 | any_nested = query.is_nested or key.is_nested or value.is_nested 156 | query = key = value = query.transpose(1, 0) 157 | attn_output = multi_head_attention_forward_patched( 158 | query, 159 | key, 160 | value, 161 | self.embed_dim, 162 | self.num_heads, 163 | self.in_proj_weight, 164 | self.in_proj_bias, 165 | self.bias_k, 166 | self.bias_v, 167 | self.add_zero_attn, 168 | self.dropout, 169 | self.out_proj.weight, 170 | self.out_proj.bias, 171 | training=self.training, 172 | key_padding_mask=key_padding_mask, 173 | need_weights=need_weights, 174 | attn_mask=attn_mask, 175 | average_attn_weights=average_attn_weights, 176 | cache=cache, 177 | ) 178 | return attn_output.transpose(1, 0) 179 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/embedding.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | 50 | self.reverse = False 51 | self.pe = None 52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000)) 53 | 54 | def extend_pe(self, x): 55 | """Reset the positional encodings.""" 56 | if self.pe is not None: 57 | if self.pe.size(1) >= x.size(1): 58 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 60 | return 61 | pe = torch.zeros(x.size(1), self.embedding_dim) 62 | if self.reverse: 63 | position = torch.arange( 64 | x.size(1) - 1, -1, -1.0, dtype=torch.float32 65 | ).unsqueeze(1) 66 | else: 67 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) 68 | div_term = torch.exp( 69 | 
torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) 70 | * -(math.log(10000.0) / self.embedding_dim) 71 | ) 72 | pe[:, 0::2] = torch.sin(position * div_term) 73 | pe[:, 1::2] = torch.cos(position * div_term) 74 | pe = pe.unsqueeze(0) 75 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach() 76 | 77 | def forward(self, x: torch.Tensor) -> torch.Tensor: 78 | self.extend_pe(x) 79 | output = x.unsqueeze(-1) if x.ndim == 2 else x 80 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] 81 | return self.dropout(output) 82 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/embedding_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | self.reverse = False 50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) 51 | 52 | def extend_pe(self, x): 53 | position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1) 54 | scpe = (position * self.div_term).unsqueeze(0) 55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) 56 | pe = pe.contiguous().view(1, -1, self.embedding_dim) 57 | return pe 58 | 59 | def forward(self, x: torch.Tensor) -> torch.Tensor: 60 | pe = self.extend_pe(x) 61 | output = x.unsqueeze(-1) if x.ndim == 2 else x 62 | output = output * self.x_scale + self.alpha * pe 63 | return self.dropout(output) 64 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import math 4 | 5 | import torch 6 | from matplotlib import pyplot as plt 7 | from torch import nn 8 | from torch.optim import Adam 9 | 10 | 11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): 12 | """ 13 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. 
14 | """ 15 | 16 | def __init__( 17 | self, 18 | optimizer, 19 | init_lr, 20 | peak_lr, 21 | end_lr, 22 | warmup_steps=10000, 23 | total_steps=400000, 24 | current_step=0, 25 | ): 26 | self.init_lr = init_lr 27 | self.peak_lr = peak_lr 28 | self.end_lr = end_lr 29 | self.optimizer = optimizer 30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps 31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) 32 | self._current_step = current_step 33 | self.lr = init_lr 34 | self.warmup_steps = warmup_steps 35 | self.total_steps = total_steps 36 | self._last_lr = [self.lr] 37 | 38 | def set_lr(self, lr): 39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups] 40 | for g in self.optimizer.param_groups: 41 | # g['lr'] = lr 42 | g["lr"] = self.end_lr ###锁定用线性 43 | 44 | def step(self): 45 | if self._current_step < self.warmup_steps: 46 | lr = self.init_lr + self._warmup_rate * self._current_step 47 | 48 | elif self._current_step > self.total_steps: 49 | lr = self.end_lr 50 | 51 | else: 52 | decay_ratio = (self._current_step - self.warmup_steps) / ( 53 | self.total_steps - self.warmup_steps 54 | ) 55 | if decay_ratio < 0.0 or decay_ratio > 1.0: 56 | raise RuntimeError( 57 | "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings." 58 | ) 59 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) 60 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) 61 | 62 | self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! 63 | self.set_lr(lr) 64 | self.lr = lr 65 | self._current_step += 1 66 | return self.lr 67 | 68 | 69 | if __name__ == "__main__": 70 | m = nn.Linear(10, 10) 71 | opt = Adam(m.parameters(), lr=1e-4) 72 | s = WarmupCosineLRSchedule( 73 | opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0 74 | ) 75 | lrs = [] 76 | for i in range(25000): 77 | s.step() 78 | lrs.append(s.lr) 79 | print(s.lr) 80 | 81 | plt.plot(lrs) 82 | plt.plot(range(0, 25000), lrs) 83 | plt.show() 84 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py: -------------------------------------------------------------------------------- 1 | from torch.nn.functional import * 2 | from torch.nn.functional import ( 3 | _mha_shape_check, 4 | _canonical_mask, 5 | _none_or_dtype, 6 | _in_projection_packed, 7 | ) 8 | 9 | def multi_head_attention_forward_patched( 10 | query, 11 | key, 12 | value, 13 | embed_dim_to_check: int, 14 | num_heads: int, 15 | in_proj_weight, 16 | in_proj_bias: Optional[Tensor], 17 | bias_k: Optional[Tensor], 18 | bias_v: Optional[Tensor], 19 | add_zero_attn: bool, 20 | dropout_p: float, 21 | out_proj_weight: Tensor, 22 | out_proj_bias: Optional[Tensor], 23 | training: bool = True, 24 | key_padding_mask: Optional[Tensor] = None, 25 | need_weights: bool = True, 26 | attn_mask: Optional[Tensor] = None, 27 | use_separate_proj_weight: bool = False, 28 | q_proj_weight: Optional[Tensor] = None, 29 | k_proj_weight: Optional[Tensor] = None, 30 | v_proj_weight: Optional[Tensor] = None, 31 | static_k: Optional[Tensor] = None, 32 | static_v: Optional[Tensor] = None, 33 | average_attn_weights: bool = True, 34 | is_causal: bool = False, 35 | cache=None, 36 | ) -> Tuple[Tensor, Optional[Tensor]]: 37 | 38 | # set up shape vars 39 | _, _, embed_dim = query.shape 40 | attn_mask = _canonical_mask( 41 | mask=attn_mask, 42 | mask_name="attn_mask", 43 | other_type=None, 44 | other_name="", 45 | target_type=query.dtype, 46 | check_other=False, 47 | ) 48 | head_dim = embed_dim 
// num_heads 49 | 50 | proj_qkv = linear(query, in_proj_weight, in_proj_bias) 51 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() 52 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] 53 | 54 | if cache["first_infer"] == 1: 55 | cache["k"][cache["stage"]] = k 56 | cache["v"][cache["stage"]] = v 57 | else: 58 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) 59 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) 60 | k = cache["k"][cache["stage"]] 61 | v = cache["v"][cache["stage"]] 62 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] 63 | 64 | attn_mask = _canonical_mask( 65 | mask=attn_mask, 66 | mask_name="attn_mask", 67 | other_type=None, 68 | other_name="", 69 | target_type=q.dtype, 70 | check_other=False, 71 | ) 72 | attn_mask = attn_mask.unsqueeze(0) 73 | 74 | q = q.view(-1, num_heads, head_dim).transpose(0, 1) 75 | k = k.view(-1, num_heads, head_dim).transpose(0, 1) 76 | v = v.view(-1, num_heads, head_dim).transpose(0, 1) 77 | 78 | dropout_p = 0.0 79 | attn_mask = attn_mask.unsqueeze(0) 80 | q = q.view(num_heads, -1, head_dim).unsqueeze(0) 81 | k = k.view(num_heads, -1, head_dim).unsqueeze(0) 82 | v = v.view(num_heads, -1, head_dim).unsqueeze(0) 83 | attn_output = scaled_dot_product_attention( 84 | q, k, v, attn_mask, dropout_p, is_causal 85 | ) 86 | attn_output = ( 87 | attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) 88 | ) 89 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias) 90 | attn_output = attn_output.view(-1, 1, attn_output.size(1)) 91 | 92 | return attn_output 93 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/text_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/text_processing/__init__.py -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import re 5 | from typing import Dict 6 | from typing import List 7 | 8 | import regex 9 | from gruut import sentences 10 | from gruut.const import Sentence 11 | from gruut.const import Word 12 | from AR.text_processing.symbols import SYMBOL_TO_ID 13 | 14 | 15 | class GruutPhonemizer: 16 | def __init__(self, language: str): 17 | self._phonemizer = sentences 18 | self.lang = language 19 | self.symbol_to_id = SYMBOL_TO_ID 20 | self._special_cases_dict: Dict[str] = { 21 | r"\.\.\.": "... ", 22 | ";": "; ", 23 | ":": ": ", 24 | ",": ", ", 25 | r"\.": ". ", 26 | "!": "! ", 27 | r"\?": "? 
", 28 | "—": "—", 29 | "…": "… ", 30 | "«": "«", 31 | "»": "»", 32 | } 33 | self._punctuation_regexp: str = ( 34 | rf"([{''.join(self._special_cases_dict.keys())}])" 35 | ) 36 | 37 | def _normalize_punctuation(self, text: str) -> str: 38 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) 39 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) 40 | text = regex.sub(r"\pZ+", r" ", text) 41 | return text.strip() 42 | 43 | def _convert_punctuation(self, word: Word) -> str: 44 | if not word.phonemes: 45 | return "" 46 | if word.phonemes[0] in ["‖", "|"]: 47 | return word.text.strip() 48 | 49 | phonemes = "".join(word.phonemes) 50 | # remove modifier characters ˈˌː with regex 51 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) 52 | return phonemes.strip() 53 | 54 | def phonemize(self, text: str, espeak: bool = False) -> str: 55 | text_to_phonemize: str = self._normalize_punctuation(text) 56 | sents: List[Sentence] = [ 57 | sent 58 | for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) 59 | ] 60 | words: List[str] = [ 61 | self._convert_punctuation(word) for word in itertools.chain(*sents) 62 | ] 63 | return " ".join(words) 64 | 65 | def transform(self, phonemes): 66 | # convert phonemes to ids 67 | # dictionary is in symbols.py 68 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] 69 | 70 | 71 | if __name__ == "__main__": 72 | phonemizer = GruutPhonemizer("en-us") 73 | # text -> IPA 74 | phonemes = phonemizer.phonemize("Hello, wor-ld ?") 75 | print("phonemes:", phonemes) 76 | print("len(phonemes):", len(phonemes)) 77 | phoneme_ids = phonemizer.transform(phonemes) 78 | print("phoneme_ids:", phoneme_ids) 79 | print("len(phoneme_ids):", len(phoneme_ids)) 80 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/text_processing/symbols.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | PAD = "_" 4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” ' 5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 6 | IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 7 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) 8 | SPACE_ID = SYMBOLS.index(" ") 9 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} 10 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} 11 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def str2bool(str): 5 | return True if str.lower() == 'true' else False 6 | 7 | 8 | def get_newest_ckpt(string_list): 9 | # 定义一个正则表达式模式,用于匹配字符串中的数字 10 | pattern = r'epoch=(\d+)-step=(\d+)\.ckpt' 11 | 12 | # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 13 | extracted_info = [] 14 | for string in string_list: 15 | match = re.match(pattern, string) 16 | if match: 17 | epoch = int(match.group(1)) 18 | step = int(match.group(2)) 19 | extracted_info.append((epoch, step, string)) 20 | # 按照 epoch 后面的数字和 step 后面的数字进行排序 21 | sorted_info = sorted( 22 | extracted_info, key=lambda x: (x[0], x[1]), reverse=True) 23 | # 获取最新的 ckpt 文件名 24 | newest_ckpt = sorted_info[0][2] 25 | return newest_ckpt 26 | 27 | 28 
| # 文本存在且不为空时 return True 29 | def check_txt_file(file_path): 30 | try: 31 | with open(file_path, 'r') as file: 32 | text = file.readline().strip() 33 | assert text.strip() != '' 34 | return text 35 | except Exception: 36 | return False 37 | return False 38 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/utils/__pycache__/io.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/AR/utils/__pycache__/io.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/utils/initialize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Initialize modules for espnet2 neural networks.""" 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | 7 | def initialize(model: torch.nn.Module, init: str): 8 | """Initialize weights of a neural network module. 9 | 10 | Parameters are initialized using the given method or distribution. 11 | 12 | Custom initialization routines can be implemented into submodules 13 | as function `espnet_initialization_fn` within the custom module. 14 | 15 | Args: 16 | model: Target. 17 | init: Method of initialization. 
18 | """ 19 | assert check_argument_types() 20 | print("init with", init) 21 | 22 | # weight init 23 | for p in model.parameters(): 24 | if p.dim() > 1: 25 | if init == "xavier_uniform": 26 | torch.nn.init.xavier_uniform_(p.data) 27 | elif init == "xavier_normal": 28 | torch.nn.init.xavier_normal_(p.data) 29 | elif init == "kaiming_uniform": 30 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") 31 | elif init == "kaiming_normal": 32 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") 33 | else: 34 | raise ValueError("Unknown initialization: " + init) 35 | # bias init 36 | for name, p in model.named_parameters(): 37 | if ".bias" in name and p.dim() == 1: 38 | p.data.zero_() 39 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/AR/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | import yaml 5 | 6 | 7 | def load_yaml_config(path): 8 | with open(path) as f: 9 | config = yaml.full_load(f) 10 | return config 11 | 12 | 13 | def save_config_to_yaml(config, path): 14 | assert path.endswith(".yaml") 15 | with open(path, "w") as f: 16 | f.write(yaml.dump(config)) 17 | f.close() 18 | 19 | 20 | def write_args(args, path): 21 | args_dict = dict( 22 | (name, getattr(args, name)) for name in dir(args) if not name.startswith("_") 23 | ) 24 | with open(path, "a") as args_file: 25 | args_file.write("==> torch version: {}\n".format(torch.__version__)) 26 | args_file.write( 27 | "==> cudnn version: {}\n".format(torch.backends.cudnn.version()) 28 | ) 29 | args_file.write("==> Cmd:\n") 30 | args_file.write(str(sys.argv)) 31 | args_file.write("\n==> args:\n") 32 | for k, v in sorted(args_dict.items()): 33 | args_file.write(" %s: %s\n" % (str(k), str(v))) 34 | args_file.close() 35 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/__init__.py -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/cankao.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/cankao.wav -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/cankao2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/cankao2.wav -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import cnhubert, whisper_enc 2 | 3 | content_module_map = { 4 | 'cnhubert': cnhubert, 5 | 'whisper': whisper_enc 6 | } -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/feature_extractor/cnhubert.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import librosa 4 | import torch 5 | import torch.nn.functional as F 6 | import soundfile as sf 7 | import logging 8 | 9 | # logging.getLogger("numba").setLevel(logging.WARNING) 10 | 11 | from transformers import ( 12 | Wav2Vec2FeatureExtractor, 13 | HubertModel, 14 | ) 15 | 16 | # from ..utils import load_wav_to_torch_and_resample 17 | import torch.nn as nn 18 | 19 | cnhubert_base_path = None 20 | 21 | 22 | class CNHubert(nn.Module): 23 | def __init__(self): 24 | super().__init__() 25 | self.model = HubertModel.from_pretrained(cnhubert_base_path) 26 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( 27 | cnhubert_base_path 28 | ) 29 | 30 | def forward(self, x): 31 | input_values = self.feature_extractor( 32 | x, return_tensors="pt", sampling_rate=16000 33 | ).input_values.to(x.device) 34 | feats = self.model(input_values)["last_hidden_state"] 35 | return feats 36 | 37 | 38 | # class CNHubertLarge(nn.Module): 39 | # def __init__(self): 40 | # super().__init__() 41 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 42 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 43 | # def forward(self, x): 44 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 45 | # feats = self.model(input_values)["last_hidden_state"] 46 | # return feats 47 | # 48 | # class CVec(nn.Module): 49 | # def __init__(self): 50 | # super().__init__() 51 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 52 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 53 | # def forward(self, x): 54 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 55 | # feats = self.model(input_values)["last_hidden_state"] 56 | # return feats 57 | # 58 | # class cnw2v2base(nn.Module): 59 | # def __init__(self): 60 | # super().__init__() 61 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 62 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 63 | # def forward(self, x): 64 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 65 | # feats = self.model(input_values)["last_hidden_state"] 66 | # return feats 67 | 68 | 69 | def get_model(): 70 | model = CNHubert() 71 | model.eval() 72 | return model 73 | 74 | 75 | # def get_large_model(): 76 | # model = CNHubertLarge() 77 | # model.eval() 78 | # return model 79 | # 80 | # def get_model_cvec(): 81 | # model = CVec() 82 | # model.eval() 83 | # return model 84 | # 85 | # def get_model_cnw2v2base(): 86 | # model = cnw2v2base() 87 | # model.eval() 88 | # return model 89 | 90 | 91 | def get_content(hmodel, wav_16k_tensor): 92 | with torch.no_grad(): 93 | feats = hmodel(wav_16k_tensor) 94 | return feats.transpose(1, 2) 95 | 96 | 97 | # if __name__ == "__main__": 98 | # model = get_model() 99 | # 
src_path = "/Users/Shared/原音频2.wav" 100 | # wav_16k_tensor = load_wav_to_torch_and_resample(src_path, 16000) 101 | # model = model 102 | # wav_16k_tensor = wav_16k_tensor 103 | # feats = get_content(model, wav_16k_tensor) 104 | # print(feats.shape) 105 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/feature_extractor/whisper_enc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_model(): 5 | import whisper 6 | 7 | model = whisper.load_model("small", device="cpu") 8 | 9 | return model.encoder 10 | 11 | 12 | def get_content(model=None, wav_16k_tensor=None): 13 | from whisper import log_mel_spectrogram, pad_or_trim 14 | 15 | dev = next(model.parameters()).device 16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] 17 | # if torch.cuda.is_available(): 18 | # mel = mel.to(torch.float16) 19 | feature_len = mel.shape[-1] // 2 20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" 21 | with torch.no_grad(): 22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[ 23 | :1, :feature_len, : 24 | ].transpose(1, 2) 25 | return feature 26 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/module/__init__.py -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/module/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.nn import functional as F 4 | 5 | 6 | def init_weights(m, mean=0.0, std=0.01): 7 | classname = m.__class__.__name__ 8 | if classname.find("Conv") != -1: 9 | m.weight.data.normal_(mean, std) 10 | 11 | 12 | def get_padding(kernel_size, dilation=1): 13 | return int((kernel_size * dilation - dilation) / 2) 14 | 15 | 16 | def convert_pad_shape(pad_shape): 17 | l = pad_shape[::-1] 18 | pad_shape = [item for sublist in l for item in sublist] 19 | return pad_shape 20 | 21 | 22 | def intersperse(lst, item): 23 | result = [item] * (len(lst) * 2 + 1) 24 | result[1::2] = lst 25 | return result 26 | 27 | 28 | def kl_divergence(m_p, logs_p, m_q, logs_q): 29 | """KL(P||Q)""" 30 | kl = (logs_q - logs_p) - 0.5 31 | kl += ( 32 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 33 | ) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | ret[i] = x[i, :, idx_str:idx_end] 54 | return ret 55 | 56 | 57 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 58 | b, d, t = x.size() 59 | if x_lengths is None: 60 | x_lengths = t 61 | ids_str_max = x_lengths - segment_size + 1 62 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 63 | ret = slice_segments(x, ids_str, segment_size) 64 | return ret, 
ids_str 65 | 66 | 67 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 68 | position = torch.arange(length, dtype=torch.float) 69 | num_timescales = channels // 2 70 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 71 | num_timescales - 1 72 | ) 73 | inv_timescales = min_timescale * torch.exp( 74 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 75 | ) 76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 78 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 79 | signal = signal.view(1, channels, length) 80 | return signal 81 | 82 | 83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 84 | b, channels, length = x.size() 85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 86 | return x + signal.to(dtype=x.dtype, device=x.device) 87 | 88 | 89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def convert_pad_shape(pad_shape): 111 | l = pad_shape[::-1] 112 | pad_shape = [item for sublist in l for item in sublist] 113 | return pad_shape 114 | 115 | 116 | def shift_1d(x): 117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 118 | return x 119 | 120 | 121 | def sequence_mask(length, max_length=None): 122 | if max_length is None: 123 | max_length = length.max() 124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 125 | return x.unsqueeze(0) < length.unsqueeze(1) 126 | 127 | 128 | def generate_path(duration, mask): 129 | """ 130 | duration: [b, 1, t_x] 131 | mask: [b, 1, t_y, t_x] 132 | """ 133 | device = duration.device 134 | 135 | b, _, t_y, t_x = mask.shape 136 | cum_duration = torch.cumsum(duration, -1) 137 | 138 | cum_duration_flat = cum_duration.view(b * t_x) 139 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 140 | path = path.view(b, t_x, t_y) 141 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 142 | path = path.unsqueeze(1).transpose(2, 3) * mask 143 | return path 144 | 145 | 146 | def clip_grad_value_(parameters, clip_value, norm_type=2): 147 | if isinstance(parameters, torch.Tensor): 148 | parameters = [parameters] 149 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 150 | norm_type = float(norm_type) 151 | if clip_value is not None: 152 | clip_value = float(clip_value) 153 | 154 | total_norm = 0 155 | for p in parameters: 156 | param_norm = p.grad.data.norm(norm_type) 157 | total_norm += param_norm.item() ** norm_type 158 | if clip_value is not None: 159 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 160 | total_norm = total_norm ** (1.0 / norm_type) 161 | return total_norm 162 | 163 | 164 | def squeeze(x, x_mask=None, 
n_sqz=2): 165 | b, c, t = x.size() 166 | 167 | t = (t // n_sqz) * n_sqz 168 | x = x[:, :, :t] 169 | x_sqz = x.view(b, c, t // n_sqz, n_sqz) 170 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) 171 | 172 | if x_mask is not None: 173 | x_mask = x_mask[:, :, n_sqz - 1 :: n_sqz] 174 | else: 175 | x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) 176 | return x_sqz * x_mask, x_mask 177 | 178 | 179 | def unsqueeze(x, x_mask=None, n_sqz=2): 180 | b, c, t = x.size() 181 | 182 | x_unsqz = x.view(b, n_sqz, c // n_sqz, t) 183 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) 184 | 185 | if x_mask is not None: 186 | x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) 187 | else: 188 | x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) 189 | return x_unsqz * x_mask, x_mask 190 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/module/losses.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | 6 | 7 | def feature_loss(fmap_r, fmap_g): 8 | loss = 0 9 | for dr, dg in zip(fmap_r, fmap_g): 10 | for rl, gl in zip(dr, dg): 11 | rl = rl.float().detach() 12 | gl = gl.float() 13 | loss += torch.mean(torch.abs(rl - gl)) 14 | 15 | return loss * 2 16 | 17 | 18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 19 | loss = 0 20 | r_losses = [] 21 | g_losses = [] 22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 23 | dr = dr.float() 24 | dg = dg.float() 25 | r_loss = torch.mean((1 - dr) ** 2) 26 | g_loss = torch.mean(dg**2) 27 | loss += r_loss + g_loss 28 | r_losses.append(r_loss.item()) 29 | g_losses.append(g_loss.item()) 30 | 31 | return loss, r_losses, g_losses 32 | 33 | 34 | def generator_loss(disc_outputs): 35 | loss = 0 36 | gen_losses = [] 37 | for dg in disc_outputs: 38 | dg = dg.float() 39 | l = torch.mean((1 - dg) ** 2) 40 | gen_losses.append(l) 41 | loss += l 42 | 43 | return loss, gen_losses 44 | 45 | 46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 47 | """ 48 | z_p, logs_q: [b, h, t_t] 49 | m_p, logs_p: [b, h, t_t] 50 | """ 51 | z_p = z_p.float() 52 | logs_q = logs_q.float() 53 | m_p = m_p.float() 54 | logs_p = logs_p.float() 55 | z_mask = z_mask.float() 56 | 57 | kl = logs_p - logs_q - 0.5 58 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 59 | kl = torch.sum(kl * z_mask) 60 | l = kl / torch.sum(z_mask) 61 | return l 62 | 63 | 64 | def mle_loss(z, m, logs, logdet, mask): 65 | l = torch.sum(logs) + 0.5 * torch.sum( 66 | torch.exp(-2 * logs) * ((z - m) ** 2) 67 | ) # neg normal likelihood w/o the constant term 68 | l = l - torch.sum(logdet) # log jacobian determinant 69 | l = l / torch.sum( 70 | torch.ones_like(z) * mask 71 | ) # averaging across batch, channel and time axes 72 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term 73 | return l 74 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/module/mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import numpy as np 9 | import librosa 10 | import librosa.util as librosa_util 11 | from librosa.util import normalize, pad_center, tiny 
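# Descriptive note (added): the helpers in this module build linear- and mel-spectrograms —
# reflect padding, STFT magnitude, an optional mel filterbank projection, then log
# dynamic-range compression; the Hann window and mel basis are cached per (dtype, device)
# so they are not rebuilt on every call.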
12 | from scipy.signal import get_window 13 | from scipy.io.wavfile import read 14 | from librosa.filters import mel as librosa_mel_fn 15 | 16 | MAX_WAV_VALUE = 32768.0 17 | 18 | 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 20 | """ 21 | PARAMS 22 | ------ 23 | C: compression factor 24 | """ 25 | return torch.log(torch.clamp(x, min=clip_val) * C) 26 | 27 | 28 | def dynamic_range_decompression_torch(x, C=1): 29 | """ 30 | PARAMS 31 | ------ 32 | C: compression factor used to compress 33 | """ 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 52 | if torch.min(y) < -1.0: 53 | print("min value is ", torch.min(y)) 54 | if torch.max(y) > 1.0: 55 | print("max value is ", torch.max(y)) 56 | 57 | global hann_window 58 | dtype_device = str(y.dtype) + "_" + str(y.device) 59 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 62 | dtype=y.dtype, device=y.device 63 | ) 64 | 65 | y = torch.nn.functional.pad( 66 | y.unsqueeze(1), 67 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 68 | mode="reflect", 69 | ) 70 | y = y.squeeze(1) 71 | spec = torch.stft( 72 | y, 73 | n_fft, 74 | hop_length=hop_size, 75 | win_length=win_size, 76 | window=hann_window[wnsize_dtype_device], 77 | center=center, 78 | pad_mode="reflect", 79 | normalized=False, 80 | onesided=True, 81 | return_complex=False, 82 | ) 83 | 84 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 85 | return spec 86 | 87 | 88 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 89 | global mel_basis 90 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 91 | fmax_dtype_device = str(fmax) + "_" + dtype_device 92 | if fmax_dtype_device not in mel_basis: 93 | mel = librosa_mel_fn( 94 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 95 | ) 96 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 97 | dtype=spec.dtype, device=spec.device 98 | ) 99 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 100 | spec = spectral_normalize_torch(spec) 101 | return spec 102 | 103 | 104 | def mel_spectrogram_torch( 105 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 106 | ): 107 | if torch.min(y) < -1.0: 108 | print("min value is ", torch.min(y)) 109 | if torch.max(y) > 1.0: 110 | print("max value is ", torch.max(y)) 111 | 112 | global mel_basis, hann_window 113 | dtype_device = str(y.dtype) + "_" + str(y.device) 114 | fmax_dtype_device = str(fmax) + "_" + dtype_device 115 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 116 | if fmax_dtype_device not in mel_basis: 117 | mel = librosa_mel_fn( 118 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 119 | ) 120 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 121 | dtype=y.dtype, device=y.device 122 | ) 123 | if wnsize_dtype_device not in hann_window: 124 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 125 | dtype=y.dtype, device=y.device 126 | ) 127 | 128 | y = torch.nn.functional.pad( 129 | y.unsqueeze(1), 130 | 
(int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 131 | mode="reflect", 132 | ) 133 | y = y.squeeze(1) 134 | 135 | spec = torch.stft( 136 | y, 137 | n_fft, 138 | hop_length=hop_size, 139 | win_length=win_size, 140 | window=hann_window[wnsize_dtype_device], 141 | center=center, 142 | pad_mode="reflect", 143 | normalized=False, 144 | onesided=True, 145 | return_complex=False, 146 | ) 147 | 148 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 149 | 150 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 151 | spec = spectral_normalize_torch(spec) 152 | 153 | return spec 154 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/module/mrte_model.py: -------------------------------------------------------------------------------- 1 | # This is Multi-reference timbre encoder 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn.utils import remove_weight_norm, weight_norm 6 | from .attentions import MultiHeadAttention 7 | 8 | 9 | class MRTE(nn.Module): 10 | def __init__( 11 | self, 12 | content_enc_channels=192, 13 | hidden_size=512, 14 | out_channels=192, 15 | kernel_size=5, 16 | n_heads=4, 17 | ge_layer=2, 18 | ): 19 | super(MRTE, self).__init__() 20 | self.cross_attention = MultiHeadAttention(hidden_size, hidden_size, n_heads) 21 | self.c_pre = nn.Conv1d(content_enc_channels, hidden_size, 1) 22 | self.text_pre = nn.Conv1d(content_enc_channels, hidden_size, 1) 23 | self.c_post = nn.Conv1d(hidden_size, out_channels, 1) 24 | 25 | def forward(self, ssl_enc, ssl_mask, text, text_mask, ge, test=None): 26 | if ge == None: 27 | ge = 0 28 | attn_mask = text_mask.unsqueeze(2) * ssl_mask.unsqueeze(-1) 29 | 30 | ssl_enc = self.c_pre(ssl_enc * ssl_mask) 31 | text_enc = self.text_pre(text * text_mask) 32 | if test != None: 33 | if test == 0: 34 | x = ( 35 | self.cross_attention( 36 | ssl_enc * ssl_mask, text_enc * text_mask, attn_mask 37 | ) 38 | + ssl_enc 39 | + ge 40 | ) 41 | elif test == 1: 42 | x = ssl_enc + ge 43 | elif test == 2: 44 | x = ( 45 | self.cross_attention( 46 | ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask 47 | ) 48 | + ge 49 | ) 50 | else: 51 | raise ValueError("test should be 0,1,2") 52 | else: 53 | x = ( 54 | self.cross_attention( 55 | ssl_enc * ssl_mask, text_enc * text_mask, attn_mask 56 | ) 57 | + ssl_enc 58 | + ge 59 | ) 60 | x = self.c_post(x * ssl_mask) 61 | return x 62 | 63 | 64 | class SpeakerEncoder(torch.nn.Module): 65 | def __init__( 66 | self, 67 | mel_n_channels=80, 68 | model_num_layers=2, 69 | model_hidden_size=256, 70 | model_embedding_size=256, 71 | ): 72 | super(SpeakerEncoder, self).__init__() 73 | self.lstm = nn.LSTM( 74 | mel_n_channels, model_hidden_size, model_num_layers, batch_first=True 75 | ) 76 | self.linear = nn.Linear(model_hidden_size, model_embedding_size) 77 | self.relu = nn.ReLU() 78 | 79 | def forward(self, mels): 80 | self.lstm.flatten_parameters() 81 | _, (hidden, _) = self.lstm(mels.transpose(-1, -2)) 82 | embeds_raw = self.relu(self.linear(hidden[-1])) 83 | return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 84 | 85 | 86 | class MELEncoder(nn.Module): 87 | def __init__( 88 | self, 89 | in_channels, 90 | out_channels, 91 | hidden_channels, 92 | kernel_size, 93 | dilation_rate, 94 | n_layers, 95 | ): 96 | super().__init__() 97 | self.in_channels = in_channels 98 | self.out_channels = out_channels 99 | self.hidden_channels = hidden_channels 100 | self.kernel_size = kernel_size 101 | self.dilation_rate = dilation_rate 102 | self.n_layers = n_layers 103 | 104 
| self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 105 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers) 106 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 107 | 108 | def forward(self, x): 109 | # print(x.shape,x_lengths.shape) 110 | x = self.pre(x) 111 | x = self.enc(x) 112 | x = self.proj(x) 113 | return x 114 | 115 | 116 | class WN(torch.nn.Module): 117 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers): 118 | super(WN, self).__init__() 119 | assert kernel_size % 2 == 1 120 | self.hidden_channels = hidden_channels 121 | self.kernel_size = kernel_size 122 | self.dilation_rate = dilation_rate 123 | self.n_layers = n_layers 124 | 125 | self.in_layers = torch.nn.ModuleList() 126 | self.res_skip_layers = torch.nn.ModuleList() 127 | 128 | for i in range(n_layers): 129 | dilation = dilation_rate**i 130 | padding = int((kernel_size * dilation - dilation) / 2) 131 | in_layer = nn.Conv1d( 132 | hidden_channels, 133 | 2 * hidden_channels, 134 | kernel_size, 135 | dilation=dilation, 136 | padding=padding, 137 | ) 138 | in_layer = weight_norm(in_layer) 139 | self.in_layers.append(in_layer) 140 | 141 | # last one is not necessary 142 | if i < n_layers - 1: 143 | res_skip_channels = 2 * hidden_channels 144 | else: 145 | res_skip_channels = hidden_channels 146 | 147 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 148 | res_skip_layer = weight_norm(res_skip_layer, name="weight") 149 | self.res_skip_layers.append(res_skip_layer) 150 | 151 | def forward(self, x): 152 | output = torch.zeros_like(x) 153 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 154 | 155 | for i in range(self.n_layers): 156 | x_in = self.in_layers[i](x) 157 | 158 | acts = fused_add_tanh_sigmoid_multiply(x_in, n_channels_tensor) 159 | 160 | res_skip_acts = self.res_skip_layers[i](acts) 161 | if i < self.n_layers - 1: 162 | res_acts = res_skip_acts[:, : self.hidden_channels, :] 163 | x = x + res_acts 164 | output = output + res_skip_acts[:, self.hidden_channels :, :] 165 | else: 166 | output = output + res_skip_acts 167 | return output 168 | 169 | def remove_weight_norm(self): 170 | for l in self.in_layers: 171 | remove_weight_norm(l) 172 | for l in self.res_skip_layers: 173 | remove_weight_norm(l) 174 | 175 | 176 | @torch.jit.script 177 | def fused_add_tanh_sigmoid_multiply(input, n_channels): 178 | n_channels_int = n_channels[0] 179 | t_act = torch.tanh(input[:, :n_channels_int, :]) 180 | s_act = torch.sigmoid(input[:, n_channels_int:, :]) 181 | acts = t_act * s_act 182 | return acts 183 | 184 | 185 | if __name__ == "__main__": 186 | content_enc = torch.randn(3, 192, 100) 187 | content_mask = torch.ones(3, 1, 100) 188 | ref_mel = torch.randn(3, 128, 30) 189 | ref_mask = torch.ones(3, 1, 30) 190 | model = MRTE() 191 | out = model(content_enc, content_mask, ref_mel, ref_mask) 192 | print(out.shape) 193 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/module/quantize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
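# Descriptive note (added): ResidualVectorQuantizer wraps core_vq.ResidualVectorQuantization;
# each of the n_q codebooks quantizes the residual left by the previous stage, and decoding
# sums the per-stage codebook vectors back together.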
6 | 7 | """Residual vector quantizer implementation.""" 8 | 9 | from dataclasses import dataclass, field 10 | import math 11 | import typing as tp 12 | 13 | import torch 14 | from torch import nn 15 | 16 | from .core_vq import ResidualVectorQuantization 17 | 18 | 19 | @dataclass 20 | class QuantizedResult: 21 | quantized: torch.Tensor 22 | codes: torch.Tensor 23 | bandwidth: torch.Tensor # bandwidth in kb/s used, per batch item. 24 | penalty: tp.Optional[torch.Tensor] = None 25 | metrics: dict = field(default_factory=dict) 26 | 27 | 28 | class ResidualVectorQuantizer(nn.Module): 29 | """Residual Vector Quantizer. 30 | Args: 31 | dimension (int): Dimension of the codebooks. 32 | n_q (int): Number of residual vector quantizers used. 33 | bins (int): Codebook size. 34 | decay (float): Decay for exponential moving average over the codebooks. 35 | kmeans_init (bool): Whether to use kmeans to initialize the codebooks. 36 | kmeans_iters (int): Number of iterations used for kmeans initialization. 37 | threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes 38 | that have an exponential moving average cluster size less than the specified threshold with 39 | randomly selected vector from the current batch. 40 | """ 41 | 42 | def __init__( 43 | self, 44 | dimension: int = 256, 45 | n_q: int = 8, 46 | bins: int = 1024, 47 | decay: float = 0.99, 48 | kmeans_init: bool = True, 49 | kmeans_iters: int = 50, 50 | threshold_ema_dead_code: int = 2, 51 | ): 52 | super().__init__() 53 | self.n_q = n_q 54 | self.dimension = dimension 55 | self.bins = bins 56 | self.decay = decay 57 | self.kmeans_init = kmeans_init 58 | self.kmeans_iters = kmeans_iters 59 | self.threshold_ema_dead_code = threshold_ema_dead_code 60 | self.vq = ResidualVectorQuantization( 61 | dim=self.dimension, 62 | codebook_size=self.bins, 63 | num_quantizers=self.n_q, 64 | decay=self.decay, 65 | kmeans_init=self.kmeans_init, 66 | kmeans_iters=self.kmeans_iters, 67 | threshold_ema_dead_code=self.threshold_ema_dead_code, 68 | ) 69 | 70 | def forward( 71 | self, 72 | x: torch.Tensor, 73 | n_q: tp.Optional[int] = None, 74 | layers: tp.Optional[list] = None, 75 | ) -> QuantizedResult: 76 | """Residual vector quantization on the given input tensor. 77 | Args: 78 | x (torch.Tensor): Input tensor. 79 | n_q (int): Number of quantizer used to quantize. Default: All quantizers. 80 | layers (list): Layer that need to return quantized. Defalt: None. 81 | Returns: 82 | QuantizedResult: 83 | The quantized (or approximately quantized) representation with 84 | the associated numbert quantizers and layer quantized required to return. 85 | """ 86 | n_q = n_q if n_q else self.n_q 87 | if layers and max(layers) >= n_q: 88 | raise ValueError( 89 | f"Last layer index in layers: A {max(layers)}. Number of quantizers in RVQ: B {self.n_q}. A must less than B." 90 | ) 91 | quantized, codes, commit_loss, quantized_list = self.vq( 92 | x, n_q=n_q, layers=layers 93 | ) 94 | return quantized, codes, torch.mean(commit_loss), quantized_list 95 | 96 | def encode( 97 | self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None 98 | ) -> torch.Tensor: 99 | """Encode a given input tensor with the specified sample rate at the given bandwidth. 100 | The RVQ encode method sets the appropriate number of quantizer to use 101 | and returns indices for each quantizer. 102 | Args: 103 | x (torch.Tensor): Input tensor. 104 | n_q (int): Number of quantizer used to quantize. Default: All quantizers. 
105 | st (int): Start to encode input from which layers. Default: 0. 106 | """ 107 | n_q = n_q if n_q else self.n_q 108 | st = st or 0 109 | codes = self.vq.encode(x, n_q=n_q, st=st) 110 | return codes 111 | 112 | def decode(self, codes: torch.Tensor, st: int = 0) -> torch.Tensor: 113 | """Decode the given codes to the quantized representation. 114 | Args: 115 | codes (torch.Tensor): Input indices for each quantizer. 116 | st (int): Start to decode input codes from which layers. Default: 0. 117 | """ 118 | quantized = self.vq.decode(codes, st=st) 119 | return quantized 120 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/module/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform( 13 | inputs, 14 | unnormalized_widths, 15 | unnormalized_heights, 16 | unnormalized_derivatives, 17 | inverse=False, 18 | tails=None, 19 | tail_bound=1.0, 20 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 21 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 22 | min_derivative=DEFAULT_MIN_DERIVATIVE, 23 | ): 24 | if tails is None: 25 | spline_fn = rational_quadratic_spline 26 | spline_kwargs = {} 27 | else: 28 | spline_fn = unconstrained_rational_quadratic_spline 29 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound} 30 | 31 | outputs, logabsdet = spline_fn( 32 | inputs=inputs, 33 | unnormalized_widths=unnormalized_widths, 34 | unnormalized_heights=unnormalized_heights, 35 | unnormalized_derivatives=unnormalized_derivatives, 36 | inverse=inverse, 37 | min_bin_width=min_bin_width, 38 | min_bin_height=min_bin_height, 39 | min_derivative=min_derivative, 40 | **spline_kwargs 41 | ) 42 | return outputs, logabsdet 43 | 44 | 45 | def searchsorted(bin_locations, inputs, eps=1e-6): 46 | bin_locations[..., -1] += eps 47 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 48 | 49 | 50 | def unconstrained_rational_quadratic_spline( 51 | inputs, 52 | unnormalized_widths, 53 | unnormalized_heights, 54 | unnormalized_derivatives, 55 | inverse=False, 56 | tails="linear", 57 | tail_bound=1.0, 58 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 59 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 60 | min_derivative=DEFAULT_MIN_DERIVATIVE, 61 | ): 62 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 63 | outside_interval_mask = ~inside_interval_mask 64 | 65 | outputs = torch.zeros_like(inputs) 66 | logabsdet = torch.zeros_like(inputs) 67 | 68 | if tails == "linear": 69 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 70 | constant = np.log(np.exp(1 - min_derivative) - 1) 71 | unnormalized_derivatives[..., 0] = constant 72 | unnormalized_derivatives[..., -1] = constant 73 | 74 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 75 | logabsdet[outside_interval_mask] = 0 76 | else: 77 | raise RuntimeError("{} tails are not implemented.".format(tails)) 78 | 79 | ( 80 | outputs[inside_interval_mask], 81 | logabsdet[inside_interval_mask], 82 | ) = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | 
inverse=inverse, 88 | left=-tail_bound, 89 | right=tail_bound, 90 | bottom=-tail_bound, 91 | top=tail_bound, 92 | min_bin_width=min_bin_width, 93 | min_bin_height=min_bin_height, 94 | min_derivative=min_derivative, 95 | ) 96 | 97 | return outputs, logabsdet 98 | 99 | 100 | def rational_quadratic_spline( 101 | inputs, 102 | unnormalized_widths, 103 | unnormalized_heights, 104 | unnormalized_derivatives, 105 | inverse=False, 106 | left=0.0, 107 | right=1.0, 108 | bottom=0.0, 109 | top=1.0, 110 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 111 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 112 | min_derivative=DEFAULT_MIN_DERIVATIVE, 113 | ): 114 | if torch.min(inputs) < left or torch.max(inputs) > right: 115 | raise ValueError("Input to a transform is not within its domain") 116 | 117 | num_bins = unnormalized_widths.shape[-1] 118 | 119 | if min_bin_width * num_bins > 1.0: 120 | raise ValueError("Minimal bin width too large for the number of bins") 121 | if min_bin_height * num_bins > 1.0: 122 | raise ValueError("Minimal bin height too large for the number of bins") 123 | 124 | widths = F.softmax(unnormalized_widths, dim=-1) 125 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 126 | cumwidths = torch.cumsum(widths, dim=-1) 127 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) 128 | cumwidths = (right - left) * cumwidths + left 129 | cumwidths[..., 0] = left 130 | cumwidths[..., -1] = right 131 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 132 | 133 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 134 | 135 | heights = F.softmax(unnormalized_heights, dim=-1) 136 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 137 | cumheights = torch.cumsum(heights, dim=-1) 138 | cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) 139 | cumheights = (top - bottom) * cumheights + bottom 140 | cumheights[..., 0] = bottom 141 | cumheights[..., -1] = top 142 | heights = cumheights[..., 1:] - cumheights[..., :-1] 143 | 144 | if inverse: 145 | bin_idx = searchsorted(cumheights, inputs)[..., None] 146 | else: 147 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 148 | 149 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 150 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 151 | 152 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 153 | delta = heights / widths 154 | input_delta = delta.gather(-1, bin_idx)[..., 0] 155 | 156 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 157 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 158 | 159 | input_heights = heights.gather(-1, bin_idx)[..., 0] 160 | 161 | if inverse: 162 | a = (inputs - input_cumheights) * ( 163 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 164 | ) + input_heights * (input_delta - input_derivatives) 165 | b = input_heights * input_derivatives - (inputs - input_cumheights) * ( 166 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 167 | ) 168 | c = -input_delta * (inputs - input_cumheights) 169 | 170 | discriminant = b.pow(2) - 4 * a * c 171 | assert (discriminant >= 0).all() 172 | 173 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 174 | outputs = root * input_bin_widths + input_cumwidths 175 | 176 | theta_one_minus_theta = root * (1 - root) 177 | denominator = input_delta + ( 178 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 179 | * theta_one_minus_theta 180 | ) 181 | derivative_numerator = input_delta.pow(2) * ( 182 | 
input_derivatives_plus_one * root.pow(2) 183 | + 2 * input_delta * theta_one_minus_theta 184 | + input_derivatives * (1 - root).pow(2) 185 | ) 186 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 187 | 188 | return outputs, -logabsdet 189 | else: 190 | theta = (inputs - input_cumwidths) / input_bin_widths 191 | theta_one_minus_theta = theta * (1 - theta) 192 | 193 | numerator = input_heights * ( 194 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta 195 | ) 196 | denominator = input_delta + ( 197 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 198 | * theta_one_minus_theta 199 | ) 200 | outputs = input_cumheights + numerator / denominator 201 | 202 | derivative_numerator = input_delta.pow(2) * ( 203 | input_derivatives_plus_one * theta.pow(2) 204 | + 2 * input_delta * theta_one_minus_theta 205 | + input_derivatives * (1 - theta).pow(2) 206 | ) 207 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 208 | 209 | return outputs, logabsdet 210 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/my_utils.py: -------------------------------------------------------------------------------- 1 | import ffmpeg 2 | import numpy as np 3 | 4 | 5 | def load_audio(file, sr): 6 | try: 7 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 8 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 9 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 10 | file = ( 11 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 12 | ) # 防止小白拷路径头尾带了空格和"和回车 13 | out, _ = ( 14 | ffmpeg.input(file, threads=0) 15 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 16 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 17 | ) 18 | except Exception as e: 19 | raise RuntimeError(f"Failed to load audio: {e}") 20 | 21 | return np.frombuffer(out, np.float32).flatten() 22 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/output.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/output.wav -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | def cleaned_text_to_sequence(cleaned_text): 7 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | return phones 15 | 16 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/chinese.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pdb 3 | import re 4 | 5 | import cn2an 6 | from pypinyin import lazy_pinyin, Style 7 | 8 | from .symbols import punctuation 9 | from .tone_sandhi import ToneSandhi 10 | from .zh_normalization.text_normlization import TextNormalizer 11 | 12 | normalizer = lambda x: cn2an.transform(x, "an2cn") 13 | 14 | current_file_path = os.path.dirname(__file__) 15 | pinyin_to_symbol_map = { 16 | line.split("\t")[0]: line.strip().split("\t")[1] 17 | for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() 18 | } 19 | 20 | import jieba_fast.posseg as psg 21 | 22 | 23 | rep_map = { 24 | ":": ",", 25 | ";": ",", 26 | ",": ",", 27 | "。": ".", 28 | "!": "!", 29 | "?": "?", 30 | "\n": ".", 31 | "·": ",", 32 | "、": ",", 33 | "...": "…", 34 | "$": ".", 35 | "/": ",", 36 | "—": "-", 37 | "~": "…", 38 | "~":"…", 39 | } 40 | 41 | tone_modifier = ToneSandhi() 42 | 43 | 44 | def replace_punctuation(text): 45 | text = text.replace("嗯", "恩").replace("呣", "母") 46 | pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) 47 | 48 | replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) 49 | 50 | replaced_text = re.sub( 51 | r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text 52 | ) 53 | 54 | return replaced_text 55 | 56 | 57 | def g2p(text): 58 | pattern = r"(?<=[{0}])\s*".format("".join(punctuation)) 59 | sentences = [i for i in re.split(pattern, text) if i.strip() != ""] 60 | phones, word2ph = _g2p(sentences) 61 | return phones, word2ph 62 | 63 | 64 | def _get_initials_finals(word): 65 | initials = [] 66 | finals = [] 67 | orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) 68 | orig_finals = lazy_pinyin( 69 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 70 | ) 71 | for c, v in zip(orig_initials, orig_finals): 72 | initials.append(c) 73 | finals.append(v) 74 | return initials, finals 75 | 76 | 77 | def _g2p(segments): 78 | phones_list = [] 79 | word2ph = [] 80 | for seg in segments: 81 | pinyins = [] 82 | # Replace all English words in the sentence 83 | seg = re.sub("[a-zA-Z]+", "", seg) 84 | seg_cut = psg.lcut(seg) 85 | initials = [] 86 | finals = [] 87 | seg_cut = tone_modifier.pre_merge_for_modify(seg_cut) 88 | for word, pos in seg_cut: 89 | if pos == "eng": 90 | continue 91 | sub_initials, sub_finals = _get_initials_finals(word) 92 | sub_finals = tone_modifier.modified_tone(word, pos, sub_finals) 93 | initials.append(sub_initials) 94 | finals.append(sub_finals) 95 | 96 | # assert len(sub_initials) == len(sub_finals) == len(word) 97 | initials = sum(initials, []) 98 | finals = sum(finals, []) 99 | # 100 | for c, v in zip(initials, finals): 101 | raw_pinyin = c + v 102 | # NOTE: post process for pypinyin outputs 103 | # we discriminate i, ii and iii 104 | if c == v: 105 | assert c in punctuation 106 | phone = [c] 107 | word2ph.append(1) 108 | else: 109 | v_without_tone = v[:-1] 110 | tone = v[-1] 111 | 112 | pinyin = c + v_without_tone 113 | assert tone in "12345" 114 | 115 | if c: 116 | # 多音节 117 | v_rep_map = { 118 | "uei": "ui", 119 | "iou": "iu", 120 | 
"uen": "un", 121 | } 122 | if v_without_tone in v_rep_map.keys(): 123 | pinyin = c + v_rep_map[v_without_tone] 124 | else: 125 | # 单音节 126 | pinyin_rep_map = { 127 | "ing": "ying", 128 | "i": "yi", 129 | "in": "yin", 130 | "u": "wu", 131 | } 132 | if pinyin in pinyin_rep_map.keys(): 133 | pinyin = pinyin_rep_map[pinyin] 134 | else: 135 | single_rep_map = { 136 | "v": "yu", 137 | "e": "e", 138 | "i": "y", 139 | "u": "w", 140 | } 141 | if pinyin[0] in single_rep_map.keys(): 142 | pinyin = single_rep_map[pinyin[0]] + pinyin[1:] 143 | 144 | assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin) 145 | new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ") 146 | new_v = new_v + tone 147 | phone = [new_c, new_v] 148 | word2ph.append(len(phone)) 149 | 150 | phones_list += phone 151 | return phones_list, word2ph 152 | 153 | 154 | def text_normalize(text): 155 | # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization 156 | tx = TextNormalizer() 157 | sentences = tx.normalize(text) 158 | dest_text = "" 159 | for sentence in sentences: 160 | dest_text += replace_punctuation(sentence) 161 | return dest_text 162 | 163 | 164 | if __name__ == "__main__": 165 | text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏" 166 | text = "呣呣呣~就是…大人的鼹鼠党吧?" 167 | text = "你好" 168 | text = text_normalize(text) 169 | print(g2p(text)) 170 | 171 | 172 | # # 示例用法 173 | # text = "这是一个示例文本:,你好!这是一个测试..." 174 | # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试 175 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from . import chinese, japanese, cleaned_text_to_sequence, symbols, english 2 | 3 | language_module_map = {"zh": chinese, "ja": japanese, "en": english} 4 | special = [ 5 | # ("%", "zh", "SP"), 6 | ("¥", "zh", "SP2"), 7 | ("^", "zh", "SP3"), 8 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 9 | ] 10 | 11 | 12 | def clean_text(text, language): 13 | if(language not in language_module_map): 14 | language="en" 15 | text=" " 16 | for special_s, special_l, target_symbol in special: 17 | if special_s in text and language == special_l: 18 | return clean_special(text, language, special_s, target_symbol) 19 | language_module = language_module_map[language] 20 | norm_text = language_module.text_normalize(text) 21 | if language == "zh": 22 | phones, word2ph = language_module.g2p(norm_text) 23 | assert len(phones) == sum(word2ph) 24 | assert len(norm_text) == len(word2ph) 25 | else: 26 | phones = language_module.g2p(norm_text) 27 | word2ph = None 28 | 29 | for ph in phones: 30 | assert ph in symbols 31 | return phones, word2ph, norm_text 32 | 33 | 34 | def clean_special(text, language, special_s, target_symbol): 35 | """ 36 | 特殊静音段sp符号处理 37 | """ 38 | text = text.replace(special_s, ",") 39 | language_module = language_module_map[language] 40 | norm_text = language_module.text_normalize(text) 41 | phones = language_module.g2p(norm_text) 42 | new_ph = [] 43 | for ph in phones[0]: 44 | assert ph in symbols 45 | if ph == ",": 46 | new_ph.append(target_symbol) 47 | else: 48 | new_ph.append(ph) 49 | return new_ph, phones[1], norm_text 50 | 51 | 52 | def text_to_sequence(text, language): 53 | phones = clean_text(text) 54 | return cleaned_text_to_sequence(phones) 55 | 56 | 57 | if __name__ == "__main__": 58 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) 59 | -------------------------------------------------------------------------------- 
/demo/TTS/GPT_SoVITS/text/engdict-hot.rep: -------------------------------------------------------------------------------- 1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1 2 | JSON JH EY1 S AH0 N -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/engdict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/text/engdict_cache.pickle -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/japanese.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py 2 | import re 3 | import sys 4 | 5 | import pyopenjtalk 6 | 7 | 8 | from . import symbols 9 | # Regular expression matching Japanese without punctuation marks: 10 | _japanese_characters = re.compile( 11 | r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 12 | ) 13 | 14 | # Regular expression matching non-Japanese characters or punctuation marks: 15 | _japanese_marks = re.compile( 16 | r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" 17 | ) 18 | 19 | # List of (symbol, Japanese) pairs for marks: 20 | _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]] 21 | 22 | 23 | # List of (consonant, sokuon) pairs: 24 | _real_sokuon = [ 25 | (re.compile("%s" % x[0]), x[1]) 26 | for x in [ 27 | (r"Q([↑↓]*[kg])", r"k#\1"), 28 | (r"Q([↑↓]*[tdjʧ])", r"t#\1"), 29 | (r"Q([↑↓]*[sʃ])", r"s\1"), 30 | (r"Q([↑↓]*[pb])", r"p#\1"), 31 | ] 32 | ] 33 | 34 | # List of (consonant, hatsuon) pairs: 35 | _real_hatsuon = [ 36 | (re.compile("%s" % x[0]), x[1]) 37 | for x in [ 38 | (r"N([↑↓]*[pbm])", r"m\1"), 39 | (r"N([↑↓]*[ʧʥj])", r"n^\1"), 40 | (r"N([↑↓]*[tdn])", r"n\1"), 41 | (r"N([↑↓]*[kg])", r"ŋ\1"), 42 | ] 43 | ] 44 | 45 | 46 | def post_replace_ph(ph): 47 | rep_map = { 48 | ":": ",", 49 | ";": ",", 50 | ",": ",", 51 | "。": ".", 52 | "!": "!", 53 | "?": "?", 54 | "\n": ".", 55 | "·": ",", 56 | "、": ",", 57 | "...": "…", 58 | } 59 | if ph in rep_map.keys(): 60 | ph = rep_map[ph] 61 | if ph in symbols: 62 | return ph 63 | if ph not in symbols: 64 | ph = "UNK" 65 | return ph 66 | 67 | 68 | def symbols_to_japanese(text): 69 | for regex, replacement in _symbols_to_japanese: 70 | text = re.sub(regex, replacement, text) 71 | return text 72 | 73 | 74 | def preprocess_jap(text, with_prosody=False): 75 | """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html""" 76 | text = symbols_to_japanese(text) 77 | sentences = re.split(_japanese_marks, text) 78 | marks = re.findall(_japanese_marks, text) 79 | text = [] 80 | for i, sentence in enumerate(sentences): 81 | if re.match(_japanese_characters, sentence): 82 | if with_prosody: 83 | text += pyopenjtalk_g2p_prosody(sentence)[1:-1] 84 | else: 85 | p = pyopenjtalk.g2p(sentence) 86 | text += p.split(" ") 87 | 88 | if i < len(marks): 89 | if marks[i] == " ":# 防止意外的UNK 90 | continue 91 | text += [marks[i].replace(" ", "")] 92 | return text 93 | 94 | 95 | def text_normalize(text): 96 | # todo: jap text normalize 97 | return text 98 | 99 | # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py 100 | def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True): 101 | """Extract 
phoneme + prosoody symbol sequence from input full-context labels. 102 | 103 | The algorithm is based on `Prosodic features control by symbols as input of 104 | sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks. 105 | 106 | Args: 107 | text (str): Input text. 108 | drop_unvoiced_vowels (bool): whether to drop unvoiced vowels. 109 | 110 | Returns: 111 | List[str]: List of phoneme + prosody symbols. 112 | 113 | Examples: 114 | >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody 115 | >>> pyopenjtalk_g2p_prosody("こんにちは。") 116 | ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$'] 117 | 118 | .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic 119 | modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104 120 | 121 | """ 122 | labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text)) 123 | N = len(labels) 124 | 125 | phones = [] 126 | for n in range(N): 127 | lab_curr = labels[n] 128 | 129 | # current phoneme 130 | p3 = re.search(r"\-(.*?)\+", lab_curr).group(1) 131 | # deal unvoiced vowels as normal vowels 132 | if drop_unvoiced_vowels and p3 in "AEIOU": 133 | p3 = p3.lower() 134 | 135 | # deal with sil at the beginning and the end of text 136 | if p3 == "sil": 137 | assert n == 0 or n == N - 1 138 | if n == 0: 139 | phones.append("^") 140 | elif n == N - 1: 141 | # check question form or not 142 | e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr) 143 | if e3 == 0: 144 | phones.append("$") 145 | elif e3 == 1: 146 | phones.append("?") 147 | continue 148 | elif p3 == "pau": 149 | phones.append("_") 150 | continue 151 | else: 152 | phones.append(p3) 153 | 154 | # accent type and position info (forward or backward) 155 | a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr) 156 | a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr) 157 | a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr) 158 | 159 | # number of mora in accent phrase 160 | f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr) 161 | 162 | a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1]) 163 | # accent phrase border 164 | if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl": 165 | phones.append("#") 166 | # pitch falling 167 | elif a1 == 0 and a2_next == a2 + 1 and a2 != f1: 168 | phones.append("]") 169 | # pitch rising 170 | elif a2 == 1 and a2_next == 2: 171 | phones.append("[") 172 | 173 | return phones 174 | 175 | # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py 176 | def _numeric_feature_by_regex(regex, s): 177 | match = re.search(regex, s) 178 | if match is None: 179 | return -50 180 | return int(match.group(1)) 181 | 182 | def g2p(norm_text, with_prosody=False): 183 | phones = preprocess_jap(norm_text, with_prosody) 184 | phones = [post_replace_ph(i) for i in phones] 185 | # todo: implement tones and word2ph 186 | return phones 187 | 188 | 189 | if __name__ == "__main__": 190 | phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!") 191 | print(phones) -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/namedict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/text/namedict_cache.pickle -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/opencpop-strict.txt: 
-------------------------------------------------------------------------------- 1 | a AA a 2 | ai AA ai 3 | an AA an 4 | ang AA ang 5 | ao AA ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | bei b ei 12 | ben b en 13 | beng b eng 14 | bi b i 15 | bian b ian 16 | biao b iao 17 | bie b ie 18 | bin b in 19 | bing b ing 20 | bo b o 21 | bu b u 22 | ca c a 23 | cai c ai 24 | can c an 25 | cang c ang 26 | cao c ao 27 | ce c e 28 | cei c ei 29 | cen c en 30 | ceng c eng 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch ang 35 | chao ch ao 36 | che ch e 37 | chen ch en 38 | cheng ch eng 39 | chi ch ir 40 | chong ch ong 41 | chou ch ou 42 | chu ch u 43 | chua ch ua 44 | chuai ch uai 45 | chuan ch uan 46 | chuang ch uang 47 | chui ch ui 48 | chun ch un 49 | chuo ch uo 50 | ci c i0 51 | cong c ong 52 | cou c ou 53 | cu c u 54 | cuan c uan 55 | cui c ui 56 | cun c un 57 | cuo c uo 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d ang 62 | dao d ao 63 | de d e 64 | dei d ei 65 | den d en 66 | deng d eng 67 | di d i 68 | dia d ia 69 | dian d ian 70 | diao d iao 71 | die d ie 72 | ding d ing 73 | diu d iu 74 | dong d ong 75 | dou d ou 76 | du d u 77 | duan d uan 78 | dui d ui 79 | dun d un 80 | duo d uo 81 | e EE e 82 | ei EE ei 83 | en EE en 84 | eng EE eng 85 | er EE er 86 | fa f a 87 | fan f an 88 | fang f ang 89 | fei f ei 90 | fen f en 91 | feng f eng 92 | fo f o 93 | fou f ou 94 | fu f u 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g ang 99 | gao g ao 100 | ge g e 101 | gei g ei 102 | gen g en 103 | geng g eng 104 | gong g ong 105 | gou g ou 106 | gu g u 107 | gua g ua 108 | guai g uai 109 | guan g uan 110 | guang g uang 111 | gui g ui 112 | gun g un 113 | guo g uo 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h ang 118 | hao h ao 119 | he h e 120 | hei h ei 121 | hen h en 122 | heng h eng 123 | hong h ong 124 | hou h ou 125 | hu h u 126 | hua h ua 127 | huai h uai 128 | huan h uan 129 | huang h uang 130 | hui h ui 131 | hun h un 132 | huo h uo 133 | ji j i 134 | jia j ia 135 | jian j ian 136 | jiang j iang 137 | jiao j iao 138 | jie j ie 139 | jin j in 140 | jing j ing 141 | jiong j iong 142 | jiu j iu 143 | ju j v 144 | jv j v 145 | juan j van 146 | jvan j van 147 | jue j ve 148 | jve j ve 149 | jun j vn 150 | jvn j vn 151 | ka k a 152 | kai k ai 153 | kan k an 154 | kang k ang 155 | kao k ao 156 | ke k e 157 | kei k ei 158 | ken k en 159 | keng k eng 160 | kong k ong 161 | kou k ou 162 | ku k u 163 | kua k ua 164 | kuai k uai 165 | kuan k uan 166 | kuang k uang 167 | kui k ui 168 | kun k un 169 | kuo k uo 170 | la l a 171 | lai l ai 172 | lan l an 173 | lang l ang 174 | lao l ao 175 | le l e 176 | lei l ei 177 | leng l eng 178 | li l i 179 | lia l ia 180 | lian l ian 181 | liang l iang 182 | liao l iao 183 | lie l ie 184 | lin l in 185 | ling l ing 186 | liu l iu 187 | lo l o 188 | long l ong 189 | lou l ou 190 | lu l u 191 | luan l uan 192 | lun l un 193 | luo l uo 194 | lv l v 195 | lve l ve 196 | ma m a 197 | mai m ai 198 | man m an 199 | mang m ang 200 | mao m ao 201 | me m e 202 | mei m ei 203 | men m en 204 | meng m eng 205 | mi m i 206 | mian m ian 207 | miao m iao 208 | mie m ie 209 | min m in 210 | ming m ing 211 | miu m iu 212 | mo m o 213 | mou m ou 214 | mu m u 215 | na n a 216 | nai n ai 217 | nan n an 218 | nang n ang 219 | nao n ao 220 | ne n e 221 | nei n ei 222 | nen n en 223 | neng n eng 224 | ni n i 225 | nian n ian 226 | niang n iang 227 | niao n iao 228 | nie n ie 229 | nin n in 230 | ning n ing 231 | niu n iu 232 | nong n 
ong 233 | nou n ou 234 | nu n u 235 | nuan n uan 236 | nun n un 237 | nuo n uo 238 | nv n v 239 | nve n ve 240 | o OO o 241 | ou OO ou 242 | pa p a 243 | pai p ai 244 | pan p an 245 | pang p ang 246 | pao p ao 247 | pei p ei 248 | pen p en 249 | peng p eng 250 | pi p i 251 | pian p ian 252 | piao p iao 253 | pie p ie 254 | pin p in 255 | ping p ing 256 | po p o 257 | pou p ou 258 | pu p u 259 | qi q i 260 | qia q ia 261 | qian q ian 262 | qiang q iang 263 | qiao q iao 264 | qie q ie 265 | qin q in 266 | qing q ing 267 | qiong q iong 268 | qiu q iu 269 | qu q v 270 | qv q v 271 | quan q van 272 | qvan q van 273 | que q ve 274 | qve q ve 275 | qun q vn 276 | qvn q vn 277 | ran r an 278 | rang r ang 279 | rao r ao 280 | re r e 281 | ren r en 282 | reng r eng 283 | ri r ir 284 | rong r ong 285 | rou r ou 286 | ru r u 287 | rua r ua 288 | ruan r uan 289 | rui r ui 290 | run r un 291 | ruo r uo 292 | sa s a 293 | sai s ai 294 | san s an 295 | sang s ang 296 | sao s ao 297 | se s e 298 | sen s en 299 | seng s eng 300 | sha sh a 301 | shai sh ai 302 | shan sh an 303 | shang sh ang 304 | shao sh ao 305 | she sh e 306 | shei sh ei 307 | shen sh en 308 | sheng sh eng 309 | shi sh ir 310 | shou sh ou 311 | shu sh u 312 | shua sh ua 313 | shuai sh uai 314 | shuan sh uan 315 | shuang sh uang 316 | shui sh ui 317 | shun sh un 318 | shuo sh uo 319 | si s i0 320 | song s ong 321 | sou s ou 322 | su s u 323 | suan s uan 324 | sui s ui 325 | sun s un 326 | suo s uo 327 | ta t a 328 | tai t ai 329 | tan t an 330 | tang t ang 331 | tao t ao 332 | te t e 333 | tei t ei 334 | teng t eng 335 | ti t i 336 | tian t ian 337 | tiao t iao 338 | tie t ie 339 | ting t ing 340 | tong t ong 341 | tou t ou 342 | tu t u 343 | tuan t uan 344 | tui t ui 345 | tun t un 346 | tuo t uo 347 | wa w a 348 | wai w ai 349 | wan w an 350 | wang w ang 351 | wei w ei 352 | wen w en 353 | weng w eng 354 | wo w o 355 | wu w u 356 | xi x i 357 | xia x ia 358 | xian x ian 359 | xiang x iang 360 | xiao x iao 361 | xie x ie 362 | xin x in 363 | xing x ing 364 | xiong x iong 365 | xiu x iu 366 | xu x v 367 | xv x v 368 | xuan x van 369 | xvan x van 370 | xue x ve 371 | xve x ve 372 | xun x vn 373 | xvn x vn 374 | ya y a 375 | yan y En 376 | yang y ang 377 | yao y ao 378 | ye y E 379 | yi y i 380 | yin y in 381 | ying y ing 382 | yo y o 383 | yong y ong 384 | you y ou 385 | yu y v 386 | yv y v 387 | yuan y van 388 | yvan y van 389 | yue y ve 390 | yve y ve 391 | yun y vn 392 | yvn y vn 393 | za z a 394 | zai z ai 395 | zan z an 396 | zang z ang 397 | zao z ao 398 | ze z e 399 | zei z ei 400 | zen z en 401 | zeng z eng 402 | zha zh a 403 | zhai zh ai 404 | zhan zh an 405 | zhang zh ang 406 | zhao zh ao 407 | zhe zh e 408 | zhei zh ei 409 | zhen zh en 410 | zheng zh eng 411 | zhi zh ir 412 | zhong zh ong 413 | zhou zh ou 414 | zhu zh u 415 | zhua zh ua 416 | zhuai zh uai 417 | zhuan zh uan 418 | zhuang zh uang 419 | zhui zh ui 420 | zhun zh un 421 | zhuo zh uo 422 | zi z i0 423 | zong z ong 424 | zou z ou 425 | zu z u 426 | zuan z uan 427 | zui z ui 428 | zun z un 429 | zuo z uo 430 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/symbols.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿 4 | punctuation = ["!", "?", "…", ",", "."] # @是SP停顿 5 | punctuation.append("-") 6 | pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"] 7 | # pu_symbols = punctuation + ["SP", 'SP2', 
'SP3','SP4', "UNK"] 8 | pad = "_" 9 | 10 | c = [ 11 | "AA", 12 | "EE", 13 | "OO", 14 | "b", 15 | "c", 16 | "ch", 17 | "d", 18 | "f", 19 | "g", 20 | "h", 21 | "j", 22 | "k", 23 | "l", 24 | "m", 25 | "n", 26 | "p", 27 | "q", 28 | "r", 29 | "s", 30 | "sh", 31 | "t", 32 | "w", 33 | "x", 34 | "y", 35 | "z", 36 | "zh", 37 | ] 38 | v = [ 39 | "E1", 40 | "En1", 41 | "a1", 42 | "ai1", 43 | "an1", 44 | "ang1", 45 | "ao1", 46 | "e1", 47 | "ei1", 48 | "en1", 49 | "eng1", 50 | "er1", 51 | "i1", 52 | "i01", 53 | "ia1", 54 | "ian1", 55 | "iang1", 56 | "iao1", 57 | "ie1", 58 | "in1", 59 | "ing1", 60 | "iong1", 61 | "ir1", 62 | "iu1", 63 | "o1", 64 | "ong1", 65 | "ou1", 66 | "u1", 67 | "ua1", 68 | "uai1", 69 | "uan1", 70 | "uang1", 71 | "ui1", 72 | "un1", 73 | "uo1", 74 | "v1", 75 | "van1", 76 | "ve1", 77 | "vn1", 78 | "E2", 79 | "En2", 80 | "a2", 81 | "ai2", 82 | "an2", 83 | "ang2", 84 | "ao2", 85 | "e2", 86 | "ei2", 87 | "en2", 88 | "eng2", 89 | "er2", 90 | "i2", 91 | "i02", 92 | "ia2", 93 | "ian2", 94 | "iang2", 95 | "iao2", 96 | "ie2", 97 | "in2", 98 | "ing2", 99 | "iong2", 100 | "ir2", 101 | "iu2", 102 | "o2", 103 | "ong2", 104 | "ou2", 105 | "u2", 106 | "ua2", 107 | "uai2", 108 | "uan2", 109 | "uang2", 110 | "ui2", 111 | "un2", 112 | "uo2", 113 | "v2", 114 | "van2", 115 | "ve2", 116 | "vn2", 117 | "E3", 118 | "En3", 119 | "a3", 120 | "ai3", 121 | "an3", 122 | "ang3", 123 | "ao3", 124 | "e3", 125 | "ei3", 126 | "en3", 127 | "eng3", 128 | "er3", 129 | "i3", 130 | "i03", 131 | "ia3", 132 | "ian3", 133 | "iang3", 134 | "iao3", 135 | "ie3", 136 | "in3", 137 | "ing3", 138 | "iong3", 139 | "ir3", 140 | "iu3", 141 | "o3", 142 | "ong3", 143 | "ou3", 144 | "u3", 145 | "ua3", 146 | "uai3", 147 | "uan3", 148 | "uang3", 149 | "ui3", 150 | "un3", 151 | "uo3", 152 | "v3", 153 | "van3", 154 | "ve3", 155 | "vn3", 156 | "E4", 157 | "En4", 158 | "a4", 159 | "ai4", 160 | "an4", 161 | "ang4", 162 | "ao4", 163 | "e4", 164 | "ei4", 165 | "en4", 166 | "eng4", 167 | "er4", 168 | "i4", 169 | "i04", 170 | "ia4", 171 | "ian4", 172 | "iang4", 173 | "iao4", 174 | "ie4", 175 | "in4", 176 | "ing4", 177 | "iong4", 178 | "ir4", 179 | "iu4", 180 | "o4", 181 | "ong4", 182 | "ou4", 183 | "u4", 184 | "ua4", 185 | "uai4", 186 | "uan4", 187 | "uang4", 188 | "ui4", 189 | "un4", 190 | "uo4", 191 | "v4", 192 | "van4", 193 | "ve4", 194 | "vn4", 195 | "E5", 196 | "En5", 197 | "a5", 198 | "ai5", 199 | "an5", 200 | "ang5", 201 | "ao5", 202 | "e5", 203 | "ei5", 204 | "en5", 205 | "eng5", 206 | "er5", 207 | "i5", 208 | "i05", 209 | "ia5", 210 | "ian5", 211 | "iang5", 212 | "iao5", 213 | "ie5", 214 | "in5", 215 | "ing5", 216 | "iong5", 217 | "ir5", 218 | "iu5", 219 | "o5", 220 | "ong5", 221 | "ou5", 222 | "u5", 223 | "ua5", 224 | "uai5", 225 | "uan5", 226 | "uang5", 227 | "ui5", 228 | "un5", 229 | "uo5", 230 | "v5", 231 | "van5", 232 | "ve5", 233 | "vn5", 234 | ] 235 | 236 | v_without_tone = [ 237 | "E", 238 | "En", 239 | "a", 240 | "ai", 241 | "an", 242 | "ang", 243 | "ao", 244 | "e", 245 | "ei", 246 | "en", 247 | "eng", 248 | "er", 249 | "i", 250 | "i0", 251 | "ia", 252 | "ian", 253 | "iang", 254 | "iao", 255 | "ie", 256 | "in", 257 | "ing", 258 | "iong", 259 | "ir", 260 | "iu", 261 | "o", 262 | "ong", 263 | "ou", 264 | "u", 265 | "ua", 266 | "uai", 267 | "uan", 268 | "uang", 269 | "ui", 270 | "un", 271 | "uo", 272 | "v", 273 | "van", 274 | "ve", 275 | "vn", 276 | ] 277 | 278 | # japanese 279 | ja_symbols = [ 280 | "I", 281 | "N", 282 | "U", 283 | "a", 284 | "b", 285 | "by", 286 | "ch", 287 | "cl", 288 | "d", 289 | "dy", 290 | "e", 291 | "f", 292 
| "g", 293 | "gy", 294 | "h", 295 | "hy", 296 | "i", 297 | "j", 298 | "k", 299 | "ky", 300 | "m", 301 | "my", 302 | "n", 303 | "ny", 304 | "o", 305 | "p", 306 | "py", 307 | "r", 308 | "ry", 309 | "s", 310 | "sh", 311 | "t", 312 | "ts", 313 | "u", 314 | "v", 315 | "w", 316 | "y", 317 | "z", 318 | # "[", #上升调型 319 | # "]", #下降调型 320 | # "$", #结束符 321 | # "^", #开始符 322 | ] 323 | 324 | arpa = { 325 | "AH0", 326 | "S", 327 | "AH1", 328 | "EY2", 329 | "AE2", 330 | "EH0", 331 | "OW2", 332 | "UH0", 333 | "NG", 334 | "B", 335 | "G", 336 | "AY0", 337 | "M", 338 | "AA0", 339 | "F", 340 | "AO0", 341 | "ER2", 342 | "UH1", 343 | "IY1", 344 | "AH2", 345 | "DH", 346 | "IY0", 347 | "EY1", 348 | "IH0", 349 | "K", 350 | "N", 351 | "W", 352 | "IY2", 353 | "T", 354 | "AA1", 355 | "ER1", 356 | "EH2", 357 | "OY0", 358 | "UH2", 359 | "UW1", 360 | "Z", 361 | "AW2", 362 | "AW1", 363 | "V", 364 | "UW2", 365 | "AA2", 366 | "ER", 367 | "AW0", 368 | "UW0", 369 | "R", 370 | "OW1", 371 | "EH1", 372 | "ZH", 373 | "AE0", 374 | "IH2", 375 | "IH", 376 | "Y", 377 | "JH", 378 | "P", 379 | "AY1", 380 | "EY0", 381 | "OY2", 382 | "TH", 383 | "HH", 384 | "D", 385 | "ER0", 386 | "CH", 387 | "AO1", 388 | "AE1", 389 | "AO2", 390 | "OY1", 391 | "AY2", 392 | "IH1", 393 | "OW0", 394 | "L", 395 | "SH", 396 | } 397 | 398 | symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa) 399 | symbols = sorted(set(symbols)) 400 | if __name__ == "__main__": 401 | print(len(symbols)) 402 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .text_normlization import * 15 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/char_convert.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/char_convert.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/chronology.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/chronology.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/constants.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/constants.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/num.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/num.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/phonecode.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/phonecode.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/quantifier.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/quantifier.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/text_normlization.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/GPT_SoVITS/text/zh_normalization/__pycache__/text_normlization.cpython-310.pyc -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip('0')) 25 | if num_string.startswith('0'): 26 | result = DIGITS['0'] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' 32 | r':([0-5][0-9])' 33 | r'(:([0-5][0-9]))?') 34 | 35 | # 时间范围,如8:30-12:30 36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' 37 | r':([0-5][0-9])' 38 | r'(:([0-5][0-9]))?' 
39 | r'(~|-)' 40 | r'([0-1]?[0-9]|2[0-3])' 41 | r':([0-5][0-9])' 42 | r'(:([0-5][0-9]))?') 43 | 44 | 45 | def replace_time(match) -> str: 46 | """ 47 | Args: 48 | match (re.Match) 49 | Returns: 50 | str 51 | """ 52 | 53 | is_range = len(match.groups()) > 5 54 | 55 | hour = match.group(1) 56 | minute = match.group(2) 57 | second = match.group(4) 58 | 59 | if is_range: 60 | hour_2 = match.group(6) 61 | minute_2 = match.group(7) 62 | second_2 = match.group(9) 63 | 64 | result = f"{num2str(hour)}点" 65 | if minute.lstrip('0'): 66 | if int(minute) == 30: 67 | result += "半" 68 | else: 69 | result += f"{_time_num2str(minute)}分" 70 | if second and second.lstrip('0'): 71 | result += f"{_time_num2str(second)}秒" 72 | 73 | if is_range: 74 | result += "至" 75 | result += f"{num2str(hour_2)}点" 76 | if minute_2.lstrip('0'): 77 | if int(minute) == 30: 78 | result += "半" 79 | else: 80 | result += f"{_time_num2str(minute_2)}分" 81 | if second_2 and second_2.lstrip('0'): 82 | result += f"{_time_num2str(second_2)}秒" 83 | 84 | return result 85 | 86 | 87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年' 88 | r'((0?[1-9]|1[0-2])月)?' 89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') 90 | 91 | 92 | def replace_date(match) -> str: 93 | """ 94 | Args: 95 | match (re.Match) 96 | Returns: 97 | str 98 | """ 99 | year = match.group(1) 100 | month = match.group(3) 101 | day = match.group(5) 102 | result = "" 103 | if year: 104 | result += f"{verbalize_digit(year)}年" 105 | if month: 106 | result += f"{verbalize_cardinal(month)}月" 107 | if day: 108 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 109 | return result 110 | 111 | 112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 113 | RE_DATE2 = re.compile( 114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') 115 | 116 | 117 | def replace_date2(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | year = match.group(1) 125 | month = match.group(3) 126 | day = match.group(4) 127 | result = "" 128 | if year: 129 | result += f"{verbalize_digit(year)}年" 130 | if month: 131 | result += f"{verbalize_cardinal(month)}月" 132 | if day: 133 | result += f"{verbalize_cardinal(day)}日" 134 | return result 135 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = { 22 | ord(char) + 65248: ord(char) 23 | for char in string.ascii_letters 24 | } 25 | 26 | # 英文字符半角 -> 全角映射表 27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 28 | 29 | # 数字字符全角 -> 半角映射表 (num: 10) 30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 31 | # 数字字符半角 -> 全角映射表 32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 33 | 34 | # 标点符号全角 -> 半角映射表 (num: 32) 35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 36 | # 标点符号半角 -> 全角映射表 37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 38 | 39 | # 空格 (num: 1) 40 | F2H_SPACE = {'\u3000': ' '} 41 | H2F_SPACE = {' ': '\u3000'} 42 | 43 | # 非"有拼音的汉字"的字符串,可用于NSW提取 44 | if SUPPORT_UCS4: 45 | RE_NSW = re.compile(r'(?:[^' 46 | r'\u3007' # 〇 47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] 51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] 52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] 53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] 54 | r'])+') 55 | else: 56 | RE_NSW = re.compile( # pragma: no cover 57 | r'(?:[^' 58 | r'\u3007' # 〇 59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 62 | r'])+') 63 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/num.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Rules to verbalize numbers into Chinese characters. 
16 | https://zh.wikipedia.org/wiki/中文数字#現代中文 17 | """ 18 | import re 19 | from collections import OrderedDict 20 | from typing import List 21 | 22 | DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')} 23 | UNITS = OrderedDict({ 24 | 1: '十', 25 | 2: '百', 26 | 3: '千', 27 | 4: '万', 28 | 8: '亿', 29 | }) 30 | 31 | COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)' 32 | 33 | # 分数表达式 34 | RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') 35 | 36 | 37 | def replace_frac(match) -> str: 38 | """ 39 | Args: 40 | match (re.Match) 41 | Returns: 42 | str 43 | """ 44 | sign = match.group(1) 45 | nominator = match.group(2) 46 | denominator = match.group(3) 47 | sign: str = "负" if sign else "" 48 | nominator: str = num2str(nominator) 49 | denominator: str = num2str(denominator) 50 | result = f"{sign}{denominator}分之{nominator}" 51 | return result 52 | 53 | 54 | # 百分数表达式 55 | RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%') 56 | 57 | 58 | def replace_percentage(match) -> str: 59 | """ 60 | Args: 61 | match (re.Match) 62 | Returns: 63 | str 64 | """ 65 | sign = match.group(1) 66 | percent = match.group(2) 67 | sign: str = "负" if sign else "" 68 | percent: str = num2str(percent) 69 | result = f"{sign}百分之{percent}" 70 | return result 71 | 72 | 73 | # 整数表达式 74 | # 带负号的整数 -10 75 | RE_INTEGER = re.compile(r'(-)' r'(\d+)') 76 | 77 | 78 | def replace_negative_num(match) -> str: 79 | """ 80 | Args: 81 | match (re.Match) 82 | Returns: 83 | str 84 | """ 85 | sign = match.group(1) 86 | number = match.group(2) 87 | sign: str = "负" if sign else "" 88 | number: str = num2str(number) 89 | result = f"{sign}{number}" 90 | return result 91 | 92 | 93 | # 编号-无符号整形 94 | # 00078 95 | RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') 96 | 97 | 98 | def replace_default_num(match): 99 | """ 100 | Args: 101 | match (re.Match) 102 | Returns: 103 | str 104 | """ 105 | number = match.group(0) 106 | return verbalize_digit(number, alt_one=True) 107 | 108 | 109 | # 加减乘除 110 | RE_ASMD = re.compile( 111 | r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))') 112 | asmd_map = { 113 | '+': '加', 114 | '-': '减', 115 | '×': '乘', 116 | '÷': '除', 117 | '=': '等于' 118 | } 119 | 120 | 121 | def replace_asmd(match) -> str: 122 | """ 123 | Args: 124 | match (re.Match) 125 | Returns: 126 | str 127 | """ 128 | result = match.group(1) + asmd_map[match.group(8)] + match.group(9) 129 | return result 130 | 131 | 132 | # 数字表达式 133 | # 纯小数 134 | RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') 135 | # 正整数 + 量词 136 | RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" 
+ COM_QUANTIFIERS) 137 | RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') 138 | 139 | 140 | def replace_positive_quantifier(match) -> str: 141 | """ 142 | Args: 143 | match (re.Match) 144 | Returns: 145 | str 146 | """ 147 | number = match.group(1) 148 | match_2 = match.group(2) 149 | if match_2 == "+": 150 | match_2 = "多" 151 | match_2: str = match_2 if match_2 else "" 152 | quantifiers: str = match.group(3) 153 | number: str = num2str(number) 154 | result = f"{number}{match_2}{quantifiers}" 155 | return result 156 | 157 | 158 | def replace_number(match) -> str: 159 | """ 160 | Args: 161 | match (re.Match) 162 | Returns: 163 | str 164 | """ 165 | sign = match.group(1) 166 | number = match.group(2) 167 | pure_decimal = match.group(5) 168 | if pure_decimal: 169 | result = num2str(pure_decimal) 170 | else: 171 | sign: str = "负" if sign else "" 172 | number: str = num2str(number) 173 | result = f"{sign}{number}" 174 | return result 175 | 176 | 177 | # 范围表达式 178 | # match.group(1) and match.group(8) are copy from RE_NUMBER 179 | 180 | RE_RANGE = re.compile( 181 | r""" 182 | (? str: 191 | """ 192 | Args: 193 | match (re.Match) 194 | Returns: 195 | str 196 | """ 197 | first, second = match.group(1), match.group(6) 198 | first = RE_NUMBER.sub(replace_number, first) 199 | second = RE_NUMBER.sub(replace_number, second) 200 | result = f"{first}到{second}" 201 | return result 202 | 203 | 204 | # ~至表达式 205 | RE_TO_RANGE = re.compile( 206 | r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)') 207 | 208 | def replace_to_range(match) -> str: 209 | """ 210 | Args: 211 | match (re.Match) 212 | Returns: 213 | str 214 | """ 215 | result = match.group(0).replace('~', '至') 216 | return result 217 | 218 | 219 | def _get_value(value_string: str, use_zero: bool=True) -> List[str]: 220 | stripped = value_string.lstrip('0') 221 | if len(stripped) == 0: 222 | return [] 223 | elif len(stripped) == 1: 224 | if use_zero and len(stripped) < len(value_string): 225 | return [DIGITS['0'], DIGITS[stripped]] 226 | else: 227 | return [DIGITS[stripped]] 228 | else: 229 | largest_unit = next( 230 | power for power in reversed(UNITS.keys()) if power < len(stripped)) 231 | first_part = value_string[:-largest_unit] 232 | second_part = value_string[-largest_unit:] 233 | return _get_value(first_part) + [UNITS[largest_unit]] + _get_value( 234 | second_part) 235 | 236 | 237 | def verbalize_cardinal(value_string: str) -> str: 238 | if not value_string: 239 | return '' 240 | 241 | # 000 -> '零' , 0 -> '零' 242 | value_string = value_string.lstrip('0') 243 | if len(value_string) == 0: 244 | return DIGITS['0'] 245 | 246 | result_symbols = _get_value(value_string) 247 | # verbalized number starting with '一十*' is abbreviated as `十*` 248 | if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[ 249 | '1'] and result_symbols[1] == UNITS[1]: 250 | result_symbols = result_symbols[1:] 251 | return ''.join(result_symbols) 252 | 253 | 254 | def verbalize_digit(value_string: str, alt_one=False) -> str: 255 | result_symbols = [DIGITS[digit] for digit in value_string] 256 | result = ''.join(result_symbols) 257 | if alt_one: 258 | result = result.replace("一", "幺") 259 | return result 260 | 261 | 262 | def num2str(value_string: str) -> str: 263 | integer_decimal = value_string.split('.') 264 | if len(integer_decimal) == 1: 265 | integer = integer_decimal[0] 266 | decimal = '' 267 | elif 
len(integer_decimal) == 2: 268 | integer, decimal = integer_decimal 269 | else: 270 | raise ValueError( 271 | f"The value string: '${value_string}' has more than one point in it." 272 | ) 273 | 274 | result = verbalize_cardinal(integer) 275 | 276 | decimal = decimal.rstrip('0') 277 | if decimal: 278 | # '.22' is verbalized as '零点二二' 279 | # '3.20' is verbalized as '三点二 280 | result = result if result else "零" 281 | result += '点' + verbalize_digit(decimal) 282 | return result 283 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile( 25 | r"(? str: 34 | if mobile: 35 | sp_parts = phone_string.strip('+').split() 36 | result = ','.join( 37 | [verbalize_digit(part, alt_one=True) for part in sp_parts]) 38 | return result 39 | else: 40 | sil_parts = phone_string.split('-') 41 | result = ','.join( 42 | [verbalize_digit(part, alt_one=True) for part in sil_parts]) 43 | return result 44 | 45 | 46 | def replace_phone(match) -> str: 47 | """ 48 | Args: 49 | match (re.Match) 50 | Returns: 51 | str 52 | """ 53 | return phone2str(match.group(0), mobile=False) 54 | 55 | 56 | def replace_mobile(match) -> str: 57 | """ 58 | Args: 59 | match (re.Match) 60 | Returns: 61 | str 62 | """ 63 | return phone2str(match.group(0)) 64 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒" 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /demo/TTS/GPT_SoVITS/text/zh_normalization/text_normlization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import re 15 | from typing import List 16 | 17 | from .char_convert import tranditional_to_simplified 18 | from .chronology import RE_DATE 19 | from .chronology import RE_DATE2 20 | from .chronology import RE_TIME 21 | from .chronology import RE_TIME_RANGE 22 | from .chronology import replace_date 23 | from .chronology import replace_date2 24 | from .chronology import replace_time 25 | from .constants import F2H_ASCII_LETTERS 26 | from .constants import F2H_DIGITS 27 | from .constants import F2H_SPACE 28 | from .num import RE_DECIMAL_NUM 29 | from .num import RE_DEFAULT_NUM 30 | from .num import RE_FRAC 31 | from .num import RE_INTEGER 32 | from .num import RE_NUMBER 33 | from .num import RE_PERCENTAGE 34 | from .num import RE_POSITIVE_QUANTIFIERS 35 | from .num import RE_RANGE 36 | from .num import RE_TO_RANGE 37 | from .num import RE_ASMD 38 | from .num import replace_default_num 39 | from .num import replace_frac 40 | from .num import replace_negative_num 41 | from .num import replace_number 42 | from .num import replace_percentage 43 | from .num import replace_positive_quantifier 44 | from .num import replace_range 45 | from .num import replace_to_range 46 | from .num import replace_asmd 47 | from .phonecode import RE_MOBILE_PHONE 48 | from .phonecode import RE_NATIONAL_UNIFORM_NUMBER 49 | from .phonecode import RE_TELEPHONE 50 | from .phonecode import replace_mobile 51 | from .phonecode import replace_phone 52 | from .quantifier import RE_TEMPERATURE 53 | from .quantifier import replace_measure 54 | from .quantifier import replace_temperature 55 | 56 | 57 | class TextNormalizer(): 58 | def __init__(self): 59 | self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)') 60 | 61 | def _split(self, text: str, lang="zh") -> List[str]: 62 | """Split long text into sentences with sentence-splitting punctuations. 63 | Args: 64 | text (str): The input text. 65 | Returns: 66 | List[str]: Sentences. 
67 | """ 68 | # Only for pure Chinese here 69 | if lang == "zh": 70 | text = text.replace(" ", "") 71 | # 过滤掉特殊字符 72 | text = re.sub(r'[——《》【】<>{}()()#&@“”^_|\\]', '', text) 73 | text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) 74 | text = text.strip() 75 | sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] 76 | return sentences 77 | 78 | def _post_replace(self, sentence: str) -> str: 79 | sentence = sentence.replace('/', '每') 80 | # sentence = sentence.replace('~', '至') 81 | # sentence = sentence.replace('~', '至') 82 | sentence = sentence.replace('①', '一') 83 | sentence = sentence.replace('②', '二') 84 | sentence = sentence.replace('③', '三') 85 | sentence = sentence.replace('④', '四') 86 | sentence = sentence.replace('⑤', '五') 87 | sentence = sentence.replace('⑥', '六') 88 | sentence = sentence.replace('⑦', '七') 89 | sentence = sentence.replace('⑧', '八') 90 | sentence = sentence.replace('⑨', '九') 91 | sentence = sentence.replace('⑩', '十') 92 | sentence = sentence.replace('α', '阿尔法') 93 | sentence = sentence.replace('β', '贝塔') 94 | sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛') 95 | sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔') 96 | sentence = sentence.replace('ε', '艾普西龙') 97 | sentence = sentence.replace('ζ', '捷塔') 98 | sentence = sentence.replace('η', '依塔') 99 | sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔') 100 | sentence = sentence.replace('ι', '艾欧塔') 101 | sentence = sentence.replace('κ', '喀帕') 102 | sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达') 103 | sentence = sentence.replace('μ', '缪') 104 | sentence = sentence.replace('ν', '拗') 105 | sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西') 106 | sentence = sentence.replace('ο', '欧米克伦') 107 | sentence = sentence.replace('π', '派').replace('Π', '派') 108 | sentence = sentence.replace('ρ', '肉') 109 | sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace( 110 | 'σ', '西格玛') 111 | sentence = sentence.replace('τ', '套') 112 | sentence = sentence.replace('υ', '宇普西龙') 113 | sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾') 114 | sentence = sentence.replace('χ', '器') 115 | sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛') 116 | sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽') 117 | # re filter special characters, have one more character "-" than line 68 118 | sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|\\]', '', sentence) 119 | return sentence 120 | 121 | def normalize_sentence(self, sentence: str) -> str: 122 | # basic character conversions 123 | sentence = tranditional_to_simplified(sentence) 124 | sentence = sentence.translate(F2H_ASCII_LETTERS).translate( 125 | F2H_DIGITS).translate(F2H_SPACE) 126 | 127 | # number related NSW verbalization 128 | sentence = RE_DATE.sub(replace_date, sentence) 129 | sentence = RE_DATE2.sub(replace_date2, sentence) 130 | 131 | # range first 132 | sentence = RE_TIME_RANGE.sub(replace_time, sentence) 133 | sentence = RE_TIME.sub(replace_time, sentence) 134 | 135 | # 处理~波浪号作为至的替换 136 | sentence = RE_TO_RANGE.sub(replace_to_range, sentence) 137 | sentence = RE_TEMPERATURE.sub(replace_temperature, sentence) 138 | sentence = replace_measure(sentence) 139 | sentence = RE_FRAC.sub(replace_frac, sentence) 140 | sentence = RE_PERCENTAGE.sub(replace_percentage, sentence) 141 | sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence) 142 | 143 | sentence = RE_TELEPHONE.sub(replace_phone, sentence) 144 | sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence) 145 | 146 | sentence = 
RE_RANGE.sub(replace_range, sentence) 147 | 148 | # 处理加减乘除 149 | while RE_ASMD.search(sentence): 150 | sentence = RE_ASMD.sub(replace_asmd, sentence) 151 | 152 | sentence = RE_INTEGER.sub(replace_negative_num, sentence) 153 | sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) 154 | sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, 155 | sentence) 156 | sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence) 157 | sentence = RE_NUMBER.sub(replace_number, sentence) 158 | sentence = self._post_replace(sentence) 159 | 160 | return sentence 161 | 162 | def normalize(self, text: str) -> List[str]: 163 | sentences = self._split(text) 164 | sentences = [self.normalize_sentence(sent) for sent in sentences] 165 | return sentences 166 | -------------------------------------------------------------------------------- /demo/TTS/TEMP/jieba.cache: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/TEMP/jieba.cache -------------------------------------------------------------------------------- /demo/TTS/TEMP/tmp_s2.json: -------------------------------------------------------------------------------- 1 | {"train": {"log_interval": 100, "eval_interval": 500, "seed": 1234, "epochs": 25, "learning_rate": 0.0001, "betas": [0.8, 0.99], "eps": 1e-09, "batch_size": 2, "fp16_run": true, "lr_decay": 0.999875, "segment_size": 20480, "init_lr_ratio": 1, "warmup_epochs": 0, "c_mel": 45, "c_kl": 1.0, "text_low_lr_rate": 0.4, "pretrained_s2G": "/root/GPT-SoVITS/pretrained_models/s2G488k.pth", "pretrained_s2D": "/root/GPT-SoVITS/pretrained_models/s2D488k.pth", "if_save_latest": true, "if_save_every_weights": true, "save_every_epoch": 5, "gpu_numbers": "0"}, "data": {"max_wav_value": 32768.0, "sampling_rate": 32000, "filter_length": 2048, "hop_length": 640, "win_length": 2048, "n_mel_channels": 128, "mel_fmin": 0.0, "mel_fmax": null, "add_blank": true, "n_speakers": 300, "cleaned_text": true, "exp_dir": "logs/hutao"}, "model": {"inter_channels": 192, "hidden_channels": 192, "filter_channels": 768, "n_heads": 2, "n_layers": 6, "kernel_size": 3, "p_dropout": 0.1, "resblock": "1", "resblock_kernel_sizes": [3, 7, 11], "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], "upsample_rates": [10, 8, 2, 2, 2], "upsample_initial_channel": 512, "upsample_kernel_sizes": [16, 16, 8, 2, 2], "n_layers_q": 3, "use_spectral_norm": false, "gin_channels": 512, "semantic_frame_rate": "25hz", "freeze_quantizer": true}, "s2_ckpt_dir": "logs/hutao", "content_module": "cnhubert", "save_weight_dir": "SoVITS_weights", "name": "hutao"} -------------------------------------------------------------------------------- /demo/TTS/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/TTS/__init__.py -------------------------------------------------------------------------------- /demo/TTS/config.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | import torch 4 | 5 | # 推理用的指定模型 6 | sovits_path = "" 7 | gpt_path = "" 8 | is_half_str = os.environ.get("is_half", "True") 9 | is_half = True if is_half_str.lower() == 'true' else False 10 | is_share_str = os.environ.get("is_share","False") 11 | is_share= True if is_share_str.lower() == 'true' else False 12 | 13 | cnhubert_path = 
"GPT_SoVITS/pretrained_models/chinese-hubert-base" 14 | bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" 15 | pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth" 16 | pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" 17 | 18 | exp_root = "logs" 19 | python_exec = sys.executable or "python" 20 | if torch.cuda.is_available(): 21 | infer_device = "cuda" 22 | else: 23 | infer_device = "cpu" 24 | 25 | webui_port_main = 9874 26 | webui_port_uvr5 = 9873 27 | webui_port_infer_tts = 9872 28 | webui_port_subfix = 9871 29 | 30 | api_port = 9880 31 | 32 | if infer_device == "cuda": 33 | gpu_name = torch.cuda.get_device_name(0) 34 | if ( 35 | ("16" in gpu_name and "V100" not in gpu_name.upper()) 36 | or "P40" in gpu_name.upper() 37 | or "P10" in gpu_name.upper() 38 | or "1060" in gpu_name 39 | or "1070" in gpu_name 40 | or "1080" in gpu_name 41 | ): 42 | is_half=False 43 | 44 | if(infer_device=="cpu"):is_half=False 45 | 46 | class Config: 47 | def __init__(self): 48 | self.sovits_path = sovits_path 49 | self.gpt_path = gpt_path 50 | self.is_half = is_half 51 | 52 | self.cnhubert_path = cnhubert_path 53 | self.bert_path = bert_path 54 | self.pretrained_sovits_path = pretrained_sovits_path 55 | self.pretrained_gpt_path = pretrained_gpt_path 56 | 57 | self.exp_root = exp_root 58 | self.python_exec = python_exec 59 | self.infer_device = infer_device 60 | 61 | self.webui_port_main = webui_port_main 62 | self.webui_port_uvr5 = webui_port_uvr5 63 | self.webui_port_infer_tts = webui_port_infer_tts 64 | self.webui_port_subfix = webui_port_subfix 65 | 66 | self.api_port = api_port 67 | -------------------------------------------------------------------------------- /demo/TTS/data_process.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | path = '/root/GPT-SoVITS/Hutao/' 4 | files = os.listdir(path) 5 | 6 | 7 | for id,value in enumerate(files): 8 | format = ' |hutao|zh| '.split('|') 9 | words = value.split('.')[0] 10 | format[3] = words 11 | format[0] = f'{path+str(id)}.wav' 12 | os.system(f'mv {path+value} {path+str(id)}.wav') 13 | list_content = '|'.join(format) + '\n' 14 | 15 | with open('/root/GPT-SoVITS/hutao.list','a') as f: 16 | f.write(list_content) -------------------------------------------------------------------------------- /demo/TTS/i18n/locale/pt_BR.json: -------------------------------------------------------------------------------- 1 | { 2 | "很遗憾您这没有能用的显卡来支持您训练": "Infelizmente, você não possui uma placa de vídeo funcional para suportar seu treinamento", 3 | "UVR5已开启": "UVR5 está ativado", 4 | "UVR5已关闭": "UVR5 está desativado", 5 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Este software é de código aberto sob a licença MIT. O autor não tem controle sobre o software. Aqueles que usam o software e difundem os sons exportados pelo software são totalmente responsáveis.
Se você não concorda com esta cláusula, não pode usar ou citar nenhum código e arquivo dentro do pacote de software. Consulte o diretório raiz LICENSE para mais detalhes.

Traduzido por Rafael Godoy Ebert", 6 | "0-前置数据集获取工具": "0- Ferramenta de aquisição de conjunto de dados pré-frontal", 7 | "0a-UVR5人声伴奏分离&去混响去延迟工具": "0A-UVR5 separação de voz e acompanhamento instrumental & ferramenta para remover reverberação e atraso", 8 | "是否开启UVR5-WebUI": "Se deseja ativar a UVR5-WEBUI", 9 | "UVR5进程输出信息": "Informações de saída do processo UVR5", 10 | "0b-语音切分工具": "0b- Ferramenta de corte de voz", 11 | "音频自动切分输入路径,可文件可文件夹": "Caminho de entrada automático de corte de áudio, pode ser um arquivo ou uma pasta", 12 | "切分后的子音频的输出根目录": "Diretório raiz de saída do sub-áudio após o corte", 13 | "threshold:音量小于这个值视作静音的备选切割点": "Limiar: O volume menor que este valor é considerado como um ponto de corte mudo alternativo", 14 | "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: O comprimento mínimo de cada parágrafo, se o primeiro for muito curto, conecte-o continuamente aos próximos até ultrapassar este valor", 15 | "min_interval:最短切割间隔": "min_interval: O intervalo de corte mínimo", 16 | "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "HOP_SIZE: Como calcular a curva de volume, quanto menor a precisão, maior a quantidade de cálculos (não significa que quanto maior a precisão, melhor o efeito)", 17 | "max_sil_kept:切完后静音最多留多长": "max_sil_kept: Depois de cortar, por quanto tempo no máximo o silêncio é mantido", 18 | "开启语音切割": "Ativar corte de voz", 19 | "终止语音切割": "Encerrar corte de voz", 20 | "max:归一化后最大值多少": "MAX: Qual é o valor máximo após a normalização?", 21 | "alpha_mix:混多少比例归一化后音频进来": "alpha_mix: Em que proporção o áudio normalizado é misturado de volta", 22 | "切割使用的进程数": "Número de processos para corte", 23 | "语音切割进程输出信息": "Informações de saída do processo de corte de voz", 24 | "0c-中文批量离线ASR工具": "0c- Ferramenta chinesa de ASR offline em lote", 25 | "开启离线批量ASR": "Ativar ASR offline em lote", 26 | "终止ASR进程": "Encerrar processo ASR", 27 | "批量ASR(中文only)输入文件夹路径": "Caminho da pasta de entrada para ASR em lote (apenas chinês)", 28 | "ASR进程输出信息": "Informações de saída do processo ASR", 29 | "0d-语音文本校对标注工具": "0d- Ferramenta de correção e marcação de texto de voz", 30 | "是否开启打标WebUI": "Se deseja abrir o webui de marcação", 31 | "打标数据标注文件路径": "Caminho do arquivo de marcação de dados de marcação", 32 | "打标工具进程输出信息": "Informações de saída do processo da ferramenta de marcação", 33 | "1-GPT-SoVITS-TTS": "1-GPT-SOVITS-TTS", 34 | "*实验/模型名": "*Nome do experimento/modelo", 35 | "显卡信息": "Informações da placa de vídeo", 36 | "预训练的SoVITS-G模型路径": "Caminho do modelo SoVITS-G pre-train", 37 | "预训练的SoVITS-D模型路径": "Caminho do modelo SoVITS-D pre-train", 38 | "预训练的GPT模型路径": "Caminho do modelo GPT pre-train", 39 | "1A-训练集格式化工具": "1A-Ferramenta de formatação de conjunto de dados de treinamento", 40 | "输出logs/实验名目录下应有23456开头的文件和文件夹": "Logs de saída/deve haver arquivos e pastas começando com 23456 no diretório do nome do experimento", 41 | "*文本标注文件": "*Arquivo de marcação de texto", 42 | "*训练集音频文件目录": "*Diretório de arquivos de áudio do conjunto de treinamento", 43 | "训练集音频文件目录 拼接 list文件里波形对应的文件名。": "Diretório de arquivos de áudio do conjunto de treinamento. 
Concatene o nome do arquivo correspondente à forma de onda no arquivo de lista", 44 | "1Aa-文本内容": "1AA-Conteúdo do texto", 45 | "GPU卡号以-分割,每个卡号一个进程": "Número da placa de vídeo dividido por-, cada número de placa é um processo", 46 | "预训练的中文BERT模型路径": "Caminho do modelo BERT chinês pre-train", 47 | "开启文本获取": "Ativar obtenção de texto", 48 | "终止文本获取进程": "Encerrar processo de obtenção de texto", 49 | "文本进程输出信息": "Informações de saída do processo de texto", 50 | "1Ab-SSL自监督特征提取": "1AB-Extração de características auto-supervisionadas SSL", 51 | "预训练的SSL模型路径": "Caminho do modelo SSL pre-train", 52 | "开启SSL提取": "Ativar extração SSL", 53 | "终止SSL提取进程": "Encerrar processo de extração SSL", 54 | "SSL进程输出信息": "Informações de saída do processo SSL", 55 | "1Ac-语义token提取": "1AC-Extração de token semântico", 56 | "开启语义token提取": "Ativar extração de token semântico", 57 | "终止语义token提取进程": "Encerrar processo de extração de token semântico", 58 | "语义token提取进程输出信息": "Informações de saída do processo de extração de token semântico", 59 | "1Aabc-训练集格式化一键三连": "1AABC-Formatação de conjunto de treinamento em um clique", 60 | "开启一键三连": "Ativar um clique", 61 | "终止一键三连": "Encerrar um clique", 62 | "一键三连进程输出信息": "Informações de saída do processo de um clique", 63 | "1B-微调训练": "1B-Treinamento de ajuste fino", 64 | "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1ba-Treinamento SoVITS. O arquivo de modelo para compartilhamento é gerado em SOVITS_WEIGHTS", 65 | "每张显卡的batch_size": "Tamanho do lote de cada placa de vídeo", 66 | "总训练轮数total_epoch,不建议太高": "Total de epoch de treinamento, não é recomendável um valor muito alto", 67 | "文本模块学习率权重": "Weight da taxa de aprendizado do módulo de texto", 68 | "保存频率save_every_epoch": "Frequência de salvamento save_every_epoch", 69 | "是否仅保存最新的ckpt文件以节省硬盘空间": "Se deve salvar apenas o último arquivo CKPT para economizar espaço em disco", 70 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "Se deve salvar o modelo pequeno final na pasta Weights em cada ponto de salvamento de tempo", 71 | "开启SoVITS训练": "Ativar treinamento SoVITS", 72 | "终止SoVITS训练": "Encerrar treinamento SoVITS", 73 | "SoVITS训练进程输出信息": "Informações de saída do processo de treinamento SoVITS", 74 | "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1BB-Treinamento GPT. O arquivo de modelo para compartilhamento é gerado em GPT_WEIGHTS", 75 | "总训练轮数total_epoch": "Total de epoch de treinamento", 76 | "开启GPT训练": "Ativar treinamento GPT", 77 | "终止GPT训练": "Encerrar treinamento GPT", 78 | "GPT训练进程输出信息": "Informações de saída do processo de treinamento GPT", 79 | "1C-推理": "1C-raciocínio", 80 | "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Selecione os modelos armazenados em Sovits_weights e GPT_WEIGHTS. 
O padrão é o modelo inferior, experiência para 5 segundos de Zero Shot TTS", 81 | "*GPT模型列表": "*Lista de modelos GPT", 82 | "*SoVITS模型列表": "*Lista de modelos Sovits", 83 | "GPU卡号,只能填1个整数": "Número da placa de vídeo, só é possível preencher com um número inteiro", 84 | "刷新模型路径": "Atualizar caminho do modelo", 85 | "是否开启TTS推理WebUI": "Se deseja ativar o webui de raciocínio TTS", 86 | "TTS推理WebUI进程输出信息": "Informações de saída do processo webui de raciocínio TTS", 87 | "2-GPT-SoVITS-变声": "2-gpt-sovits-mudança de voz", 88 | "施工中,请静候佳音": "Em construção, por favor, aguarde por um bom som", 89 | "TTS推理进程已开启": "O processo de inferência TTS foi iniciado", 90 | "TTS推理进程已关闭": "O processo de inferência TTS foi desativado", 91 | "打标工具WebUI已开启": "A ferramenta de marcação WebUI está ativada", 92 | "打标工具WebUI已关闭": "A ferramenta de marcação WebUI foi desativado" 93 | } 94 | -------------------------------------------------------------------------------- /demo/TTS/i18n/locale/zh_HK.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", 3 | "A模型权重": "A模型權重", 4 | "A模型路径": "A模型路徑", 5 | "B模型路径": "B模型路徑", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt處理", 13 | "harvest进程数": "harvest進程數", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一鍵訓練", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "儲存名", 32 | "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", 33 | "保存的模型名不带后缀": "儲存的模型名不帶副檔名", 34 | "保存频率save_every_epoch": "保存頻率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", 38 | "停止音频转换": "停止音訊轉換", 39 | "全流程结束!": "全流程结束!", 40 | "刷新音色列表和索引路径": "刷新音色列表和索引路徑", 41 | "加载模型": "載入模型", 42 | "加载预训练底模D路径": "加載預訓練底模D路徑", 43 | "加载预训练底模G路径": "加載預訓練底模G路徑", 44 | "单次推理": "单次推理", 45 | "卸载音色省显存": "卸載音色節省 VRAM", 46 | "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", 47 | "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", 48 | "否": "否", 49 | "启用相位声码器": "启用相位声码器", 50 | "响应阈值": "響應閾值", 51 | "响度因子": "響度因子", 52 | "处理数据": "處理資料", 53 | "导出Onnx模型": "导出Onnx模型", 54 | "导出文件格式": "導出檔格式", 55 | "常见问题解答": "常見問題解答", 56 | "常规设置": "一般設定", 57 | "开始音频转换": "開始音訊轉換", 58 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 59 | "性能设置": "效能設定", 60 | "总训练轮数total_epoch": "總訓練輪數total_epoch", 61 | "批量推理": "批量推理", 62 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", 63 | "指定输出主人声文件夹": "指定输出主人声文件夹", 64 | "指定输出文件夹": "指定輸出資料夾", 65 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 66 | "推理时间(ms):": "推理時間(ms):", 67 | "推理音色": "推理音色", 68 | "提取": "提取", 69 | "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", 70 | "是": "是", 71 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", 72 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", 73 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", 74 | "显卡信息": "顯示卡資訊", 75 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", 76 | "查看": "查看", 77 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", 78 | "检索特征占比": "檢索特徵佔比", 79 | "模型": "模型", 80 | "模型推理": "模型推理", 81 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", 82 | "模型是否带音高指导": "模型是否帶音高指導", 83 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", 84 | "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", 85 | "模型版本型号": "模型版本型號", 86 | "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", 87 | "模型路径": "模型路徑", 88 | "每张显卡的batch_size": "每张显卡的batch_size", 89 | "淡入淡出长度": "淡入淡出長度", 90 | "版本": "版本", 91 | "特征提取": "特徵提取", 92 | "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", 93 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", 94 | "目标采样率": "目標取樣率", 95 | "算法延迟(ms):": "算法延迟(ms):", 96 | "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", 97 | "融合": "融合", 98 | "要改的模型信息": "要改的模型資訊", 99 | "要置入的模型信息": "要置入的模型資訊", 100 | "训练": "訓練", 101 | "训练模型": "訓練模型", 102 | "训练特征索引": "訓練特徵索引", 103 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 104 | "请指定说话人id": "請指定說話人id", 105 | "请选择index文件": "请选择index文件", 106 | "请选择pth文件": "请选择pth文件", 107 | "请选择说话人id": "請選擇說話人ID", 108 | "转换": "轉換", 109 | "输入实验名": "輸入實驗名稱", 110 | "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", 111 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", 112 | "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", 113 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", 114 | "输入监听": "输入监听", 115 | "输入训练文件夹路径": "輸入訓練檔案夾路徑", 116 | "输入设备": "輸入設備", 117 | "输入降噪": "輸入降噪", 118 | "输出信息": "輸出訊息", 119 | "输出变声": "输出变声", 120 | "输出设备": "輸出設備", 121 | "输出降噪": "輸出降噪", 122 | "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", 123 | "选择.index文件": "選擇 .index 檔案", 124 | "选择.pth文件": "選擇 .pth 檔案", 125 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 126 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 127 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 128 | "采样率:": "采样率:", 129 | "采样长度": "取樣長度", 130 | "重载设备列表": "重載設備列表", 131 | "音调设置": "音調設定", 132 | "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)", 133 | "音高算法": "音高演算法", 134 | "额外推理时长": "額外推理時長" 135 | } 136 | -------------------------------------------------------------------------------- /demo/TTS/i18n/locale/zh_SG.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", 3 | "A模型权重": "A模型權重", 4 | "A模型路径": "A模型路徑", 5 | "B模型路径": "B模型路徑", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt處理", 13 | "harvest进程数": "harvest進程數", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. 
": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一鍵訓練", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "儲存名", 32 | "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", 33 | "保存的模型名不带后缀": "儲存的模型名不帶副檔名", 34 | "保存频率save_every_epoch": "保存頻率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", 38 | "停止音频转换": "停止音訊轉換", 39 | "全流程结束!": "全流程结束!", 40 | "刷新音色列表和索引路径": "刷新音色列表和索引路徑", 41 | "加载模型": "載入模型", 42 | "加载预训练底模D路径": "加載預訓練底模D路徑", 43 | "加载预训练底模G路径": "加載預訓練底模G路徑", 44 | "单次推理": "单次推理", 45 | "卸载音色省显存": "卸載音色節省 VRAM", 46 | "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", 47 | "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", 48 | "否": "否", 49 | "启用相位声码器": "启用相位声码器", 50 | "响应阈值": "響應閾值", 51 | "响度因子": "響度因子", 52 | "处理数据": "處理資料", 53 | "导出Onnx模型": "导出Onnx模型", 54 | "导出文件格式": "導出檔格式", 55 | "常见问题解答": "常見問題解答", 56 | "常规设置": "一般設定", 57 | "开始音频转换": "開始音訊轉換", 58 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 59 | "性能设置": "效能設定", 60 | "总训练轮数total_epoch": "總訓練輪數total_epoch", 61 | "批量推理": "批量推理", 62 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", 63 | "指定输出主人声文件夹": "指定输出主人声文件夹", 64 | "指定输出文件夹": "指定輸出資料夾", 65 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 66 | "推理时间(ms):": "推理時間(ms):", 67 | "推理音色": "推理音色", 68 | "提取": "提取", 69 | "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", 70 | "是": "是", 71 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", 72 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", 73 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", 74 | "显卡信息": "顯示卡資訊", 75 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", 76 | "查看": "查看", 77 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", 78 | "检索特征占比": "檢索特徵佔比", 79 | "模型": "模型", 80 | "模型推理": "模型推理", 81 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", 82 | "模型是否带音高指导": "模型是否帶音高指導", 83 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", 84 | "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", 85 | "模型版本型号": "模型版本型號", 86 | "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", 87 | "模型路径": "模型路徑", 88 | "每张显卡的batch_size": "每张显卡的batch_size", 89 | "淡入淡出长度": "淡入淡出長度", 90 | "版本": "版本", 91 | "特征提取": "特徵提取", 92 | "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", 93 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", 94 | "目标采样率": "目標取樣率", 95 | "算法延迟(ms):": "算法延迟(ms):", 96 | "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", 97 | "融合": "融合", 98 | "要改的模型信息": "要改的模型資訊", 99 | "要置入的模型信息": "要置入的模型資訊", 100 | "训练": "訓練", 101 | "训练模型": "訓練模型", 102 | "训练特征索引": "訓練特徵索引", 103 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 104 | "请指定说话人id": "請指定說話人id", 105 | "请选择index文件": "请选择index文件", 106 | "请选择pth文件": "请选择pth文件", 107 | "请选择说话人id": "請選擇說話人ID", 108 | "转换": "轉換", 109 | "输入实验名": "輸入實驗名稱", 110 | "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", 111 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", 112 | "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", 113 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", 114 | "输入监听": "输入监听", 115 | "输入训练文件夹路径": "輸入訓練檔案夾路徑", 116 | "输入设备": "輸入設備", 117 | "输入降噪": "輸入降噪", 118 | "输出信息": "輸出訊息", 119 | "输出变声": "输出变声", 120 | "输出设备": "輸出設備", 121 | "输出降噪": "輸出降噪", 122 | "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", 123 | "选择.index文件": "選擇 .index 檔案", 124 | "选择.pth文件": "選擇 .pth 檔案", 125 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 126 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 127 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 128 | "采样率:": "采样率:", 129 | "采样长度": "取樣長度", 130 | "重载设备列表": "重載設備列表", 131 | "音调设置": "音調設定", 132 | "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)", 133 | "音高算法": "音高演算法", 134 | "额外推理时长": "額外推理時長" 135 | } 136 | -------------------------------------------------------------------------------- /demo/TTS/i18n/locale/zh_TW.json: -------------------------------------------------------------------------------- 1 | { 2 | ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", 3 | "A模型权重": "A模型權重", 4 | "A模型路径": "A模型路徑", 5 | "B模型路径": "B模型路徑", 6 | "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", 7 | "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", 8 | "Index Rate": "Index Rate", 9 | "Onnx导出": "Onnx导出", 10 | "Onnx输出路径": "Onnx输出路径", 11 | "RVC模型路径": "RVC模型路径", 12 | "ckpt处理": "ckpt處理", 13 | "harvest进程数": "harvest進程數", 14 | "index文件路径不可包含中文": "index文件路径不可包含中文", 15 | "pth文件路径不可包含中文": "pth文件路径不可包含中文", 16 | "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", 17 | "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. 
": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", 18 | "step1:正在处理数据": "step1:正在处理数据", 19 | "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", 20 | "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", 21 | "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", 22 | "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", 23 | "step3a:正在训练模型": "step3a:正在训练模型", 24 | "一键训练": "一鍵訓練", 25 | "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", 26 | "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", 27 | "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", 28 | "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", 29 | "使用模型采样率": "使用模型采样率", 30 | "使用设备采样率": "使用设备采样率", 31 | "保存名": "儲存名", 32 | "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", 33 | "保存的模型名不带后缀": "儲存的模型名不帶副檔名", 34 | "保存频率save_every_epoch": "保存頻率save_every_epoch", 35 | "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", 36 | "修改": "修改", 37 | "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", 38 | "停止音频转换": "停止音訊轉換", 39 | "全流程结束!": "全流程结束!", 40 | "刷新音色列表和索引路径": "刷新音色列表和索引路徑", 41 | "加载模型": "載入模型", 42 | "加载预训练底模D路径": "加載預訓練底模D路徑", 43 | "加载预训练底模G路径": "加載預訓練底模G路徑", 44 | "单次推理": "单次推理", 45 | "卸载音色省显存": "卸載音色節省 VRAM", 46 | "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", 47 | "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", 48 | "否": "否", 49 | "启用相位声码器": "启用相位声码器", 50 | "响应阈值": "響應閾值", 51 | "响度因子": "響度因子", 52 | "处理数据": "處理資料", 53 | "导出Onnx模型": "导出Onnx模型", 54 | "导出文件格式": "導出檔格式", 55 | "常见问题解答": "常見問題解答", 56 | "常规设置": "一般設定", 57 | "开始音频转换": "開始音訊轉換", 58 | "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", 59 | "性能设置": "效能設定", 60 | "总训练轮数total_epoch": "總訓練輪數total_epoch", 61 | "批量推理": "批量推理", 62 | "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", 63 | "指定输出主人声文件夹": "指定输出主人声文件夹", 64 | "指定输出文件夹": "指定輸出資料夾", 65 | "指定输出非主人声文件夹": "指定输出非主人声文件夹", 66 | "推理时间(ms):": "推理時間(ms):", 67 | "推理音色": "推理音色", 68 | "提取": "提取", 69 | "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", 70 | "是": "是", 71 | "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", 72 | "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", 73 | "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", 74 | "显卡信息": "顯示卡資訊", 75 | "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", 76 | "查看": "查看", 77 | "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", 78 | "检索特征占比": "檢索特徵佔比", 79 | "模型": "模型", 80 | "模型推理": "模型推理", 81 | "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", 82 | "模型是否带音高指导": "模型是否帶音高指導", 83 | "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", 84 | "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", 85 | "模型版本型号": "模型版本型號", 86 | "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", 87 | "模型路径": "模型路徑", 88 | "每张显卡的batch_size": "每张显卡的batch_size", 89 | "淡入淡出长度": "淡入淡出長度", 90 | "版本": "版本", 91 | "特征提取": "特徵提取", 92 | "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", 93 | "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", 94 | "目标采样率": "目標取樣率", 95 | "算法延迟(ms):": "算法延迟(ms):", 96 | "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", 97 | "融合": "融合", 98 | "要改的模型信息": "要改的模型資訊", 99 | "要置入的模型信息": "要置入的模型資訊", 100 | "训练": "訓練", 101 | "训练模型": "訓練模型", 102 | "训练特征索引": "訓練特徵索引", 103 | "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", 104 | "请指定说话人id": "請指定說話人id", 105 | "请选择index文件": "请选择index文件", 106 | "请选择pth文件": "请选择pth文件", 107 | "请选择说话人id": "請選擇說話人ID", 108 | "转换": "轉換", 109 | "输入实验名": "輸入實驗名稱", 110 | "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", 111 | "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", 112 | "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", 113 | "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", 114 | "输入监听": "输入监听", 115 | "输入训练文件夹路径": "輸入訓練檔案夾路徑", 116 | "输入设备": "輸入設備", 117 | "输入降噪": "輸入降噪", 118 | "输出信息": "輸出訊息", 119 | "输出变声": "输出变声", 120 | "输出设备": "輸出設備", 121 | "输出降噪": "輸出降噪", 122 | "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", 123 | "选择.index文件": "選擇 .index 檔案", 124 | "选择.pth文件": "選擇 .pth 檔案", 125 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", 126 | "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", 127 | "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", 128 | "采样率:": "采样率:", 129 | "采样长度": "取樣長度", 130 | "重载设备列表": "重載設備列表", 131 | "音调设置": "音調設定", 132 | "音频设备(请使用同种类驱动)": "音訊設備 (請使用同種類驅動)", 133 | "音高算法": "音高演算法", 134 | "额外推理时长": "額外推理時長" 135 | } 136 | -------------------------------------------------------------------------------- /demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/__init__.py -------------------------------------------------------------------------------- /demo/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import streamlit as st 3 | from streamlit.components.v1 import html 4 | import streamlit.components.v1 as components 5 | import torch 6 | import lmdeploy 7 | import json 8 | import os 9 | import time 10 | import soundfile as sf 11 | import copy 12 | import pandas as pd 13 | import altair as alt 14 | from datetime import datetime 15 | from io import BytesIO 16 | from transformers import AutoModelForMaskedLM, AutoTokenizer 17 | from dataclasses import asdict, dataclass 18 | from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig, 
ChatTemplateConfig 19 | 20 | from TTS.GPT_SoVITS.utils import HParams 21 | 22 | from config import backend_config,chat_template_config,IS_TURBOMIND 23 | from config import prompt_text, prompt_language, text_language, ref_wav_path 24 | from TTS.GPT_SoVITS.tts import get_tts_wav, load_tts_model 25 | from TTS.GPT_SoVITS.feature_extractor import cnhubert 26 | from modelscope.hub.api import HubApi 27 | 28 | hubapi = os.getenv("HUBAPI") 29 | api = HubApi() 30 | api.login(hubapi) 31 | 32 | from modelscope import snapshot_download 33 | model_dir1 = snapshot_download('NobodyYing/HeartLink_7B_qlora_analyse', cache_dir='/home/xlab-app-center') 34 | 35 | model_dir2 = snapshot_download('NobodyYing/GPT_SoVITS_pretrained_models', cache_dir='/home/xlab-app-center') 36 | 37 | model_dir3 = snapshot_download('NobodyYing/GPT_weights_hutao', cache_dir='/home/xlab-app-center') 38 | 39 | model_dir4 = snapshot_download('NobodyYing/SoVITS_weights_hutao', cache_dir='/home/xlab-app-center') 40 | 41 | gradient_text_html = """ 42 | 62 |
63 |
HeartLink
64 |
65 | """ 66 | st.markdown(gradient_text_html, unsafe_allow_html=True) 67 | 68 | def on_btn_click(): 69 | del st.session_state.messages 70 | 71 | def turbomind_generation_config(): 72 | with st.sidebar: 73 | st.title("HeartLink——共情大模型") 74 | st.subheader("目前支持功能") 75 | st.markdown("- 💖 共情对话") 76 | st.markdown("- 💬 语音生成(胡桃)") 77 | st.markdown("- 📊 情绪分析") 78 | with st.container(height=200, border=True): 79 | st.subheader("模型配置") 80 | max_length = st.slider('Max Length', 81 | min_value=8, 82 | max_value=4096, 83 | value=4096) 84 | top_p = st.slider('Top P', 0.0, 1.0, 0.8, step=0.01) 85 | temperature = st.slider('Temperature', 0.0, 1.0, 0.8, step=0.01) 86 | 87 | st.button('清空历史对话', on_click=on_btn_click) 88 | 89 | tb_generation_config = GenerationConfig(top_p=top_p, 90 | temperature=temperature, 91 | max_new_tokens=max_length,) 92 | 93 | return tb_generation_config 94 | 95 | @st.cache_resource 96 | def load_llm_model(): 97 | if IS_TURBOMIND: 98 | pipe = pipeline('/home/xlab-app-center/NobodyYing/HeartLink_7B_qlora_analyse', 99 | backend_config=backend_config, 100 | chat_template_config=chat_template_config, 101 | ) 102 | return pipe 103 | 104 | 105 | def llm_prompt(): 106 | prompts = [] 107 | mes = copy.deepcopy(st.session_state.messages) 108 | for detail in mes: 109 | if detail["role"] == "robot" or detail["role"] == "assistant": 110 | detail["role"] = "assistant" 111 | del detail['wav'] 112 | del detail['emotions'] 113 | del detail['avatar'] 114 | prompts.append(detail) 115 | return prompts 116 | 117 | def main(): 118 | print('load llm') 119 | pipe = load_llm_model() 120 | print('load llm done') 121 | 122 | print('load tts model') 123 | tokenizer, bert_model, ssl_model, vq_model, hps, t2s_model, max_sec = load_tts_model() 124 | print('load tts model done') 125 | 126 | tb_generation_config = turbomind_generation_config() 127 | 128 | if 'messages' not in st.session_state: 129 | st.session_state.messages = [] 130 | 131 | # with st.container(): 132 | # col1, col2 = st.columns([3, 2]) 133 | 134 | for message in st.session_state.messages: 135 | with st.chat_message(message['role'], avatar=message.get('avatar')): 136 | if message['role'] == 'robot' or message['role'] == 'assistant': 137 | try: 138 | content = json.loads(message['content'])["共情回复"] 139 | except (json.JSONDecodeError, KeyError): 140 | content = message['content'] 141 | st.markdown(content) 142 | else: 143 | st.markdown(message['content']) 144 | 145 | if message.get("wav") is not None: 146 | sub_audio_io = message['wav'] 147 | sub_audio_io.seek(0) 148 | st.audio(sub_audio_io, format='audio/wav') 149 | 150 | if prompt := st.chat_input('请告诉我你的经历与感受~'): 151 | with st.chat_message('user',): 152 | st.markdown(prompt) 153 | 154 | st.session_state.messages.append({ 155 | 'role': 'user', 156 | 'content': prompt, 157 | }) 158 | 159 | prompts = llm_prompt() 160 | with st.chat_message('robot',avatar='/home/xlab-app-center/demo/asserts/logo.jpg'): 161 | message_placeholder = st.empty() 162 | loading_placeholder = st.empty() 163 | # border,width,height调圈大小,
加justify-content: center;居中 164 | loading_placeholder.markdown(""" 165 |
166 |
167 | 正在生成文本,请稍等 168 |
169 | 183 | """, unsafe_allow_html=True) 184 | items = '' 185 | print(st.session_state.messages) 186 | while True: 187 | for item in pipe.stream_infer(prompts=prompts, gen_config=tb_generation_config): 188 | items += item.text 189 | print(item.text,end='') 190 | try: 191 | response = json.loads(items)["共情回复"] 192 | emotion = json.loads(items)["情绪"].replace("," ,",") 193 | break 194 | except: 195 | continue 196 | 197 | loading_placeholder.empty() 198 | message_placeholder.markdown(response) 199 | 200 | with st.spinner("正在生成语音,请稍等~"): 201 | sr, audio_io = get_tts_wav(ref_wav_path=ref_wav_path, prompt_text=prompt_text, prompt_language=prompt_language, text=response, text_language=text_language, 202 | tokenizer=tokenizer, bert_model=bert_model, ssl_model=ssl_model, vq_model=vq_model, hps=hps, t2s_model=t2s_model, max_sec=max_sec, 203 | ) 204 | try: 205 | st.audio(data=audio_io, format="audio/wav", autoplay=True) 206 | except: 207 | st.audio(data=audio_io, format="audio/wav") 208 | 209 | emotions = emotion.split(',') 210 | 211 | try: 212 | print(st.session_state.messages) 213 | tmp = copy.deepcopy(st.session_state.messages[-2]["emotions"]) 214 | for e in emotions: 215 | try: 216 | tmp[e] += 1 217 | except: 218 | tmp[e] = 1 219 | 220 | except: 221 | print(st.session_state.messages) 222 | print('false') 223 | tmp = {e: 1 for e in emotions} 224 | print(tmp) 225 | 226 | st.session_state.messages.append({ 227 | 'role': 'assistant', 228 | 'content': items, 229 | 'wav': audio_io, 230 | 'emotions': tmp, 231 | 'avatar': '/home/xlab-app-center/demo/asserts/logo.jpg', 232 | }) 233 | 234 | 235 | with st.sidebar: 236 | st.subheader("情绪分析图表") 237 | df = pd.DataFrame(list(tmp.items()), columns=['Emotion', 'Count']) 238 | chart = alt.Chart(df).mark_bar(size=50).encode( 239 | x=alt.X('Count:Q', title='Count'), 240 | y=alt.Y('Emotion:N', title='Emotion', axis=alt.Axis(labelAngle=0)), 241 | 242 | color=alt.Color('Emotion:N', legend=None) 243 | ).properties( 244 | width=400, 245 | height=400 246 | ).interactive() 247 | with st.container(height=400, border=True): 248 | st.altair_chart(chart, use_container_width=True) 249 | 250 | torch.cuda.empty_cache() 251 | 252 | if __name__ == '__main__': 253 | main() 254 | -------------------------------------------------------------------------------- /demo/asserts/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/demo/asserts/logo.jpg -------------------------------------------------------------------------------- /demo/config.py: -------------------------------------------------------------------------------- 1 | from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig, ChatTemplateConfig 2 | import os 3 | 4 | ####################################################################### 5 | # PART 1 lmdeploy # 6 | ####################################################################### 7 | SYSTEM = os.getenv("SYSTEM") 8 | 9 | IS_TURBOMIND = True 10 | IS_PYTORCH = False 11 | 12 | backend_config = TurbomindEngineConfig(cache_max_entry_count=0.3) 13 | chat_template_config = ChatTemplateConfig(model_name='internlm2',meta_instruction=SYSTEM) 14 | 15 | ####################################################################### 16 | # PART 2 TTS # 17 | ####################################################################### 18 | prompt_text = "胡桃的胡是胡吃海喝的胡,胡桃的桃却不是淘气的淘!嘿嘿…不、不好笑吗?" 
19 | prompt_language = "中文" 20 | text_language = "中文" 21 | ref_wav_path = "/home/xlab-app-center/demo/TTS/GPT_SoVITS/cankao2.wav" 22 | -------------------------------------------------------------------------------- /finetune_config/xtuner_config/README.md: -------------------------------------------------------------------------------- 1 | # 基于 Xtuner 的微调指南 2 | 3 | ## 1. 环境准备 4 | 5 | - 微调硬件:对于 7B 的模型需要 A100(40G) 6 | 7 | 8 | - 使用 conda 先构建一个 Python-3.10 的虚拟环境 9 | 10 | ```bash 11 | conda create --name finetune_xtuner python=3.10 -y 12 | conda activate finetune_xtuner 13 | ``` 14 | 15 | - 通过 pip 安装 XTuner: 16 | 17 | ```shell 18 | pip install -U xtuner 19 | ``` 20 | 21 | 亦可集成 DeepSpeed 安装: 22 | 23 | ```shell 24 | pip install -U 'xtuner[deepspeed]' 25 | ``` 26 | 27 | - 从源码安装 XTuner: 28 | 29 | ```shell 30 | git clone https://github.com/InternLM/xtuner.git 31 | cd xtuner 32 | pip install -e '.[all]' 33 | ``` 34 | 35 | 36 | ## 2. 微调配置 37 | - 进入配置文件夹 38 | 39 | ```shell 40 | cd xtuner_config 41 | ``` 42 | - 查看 XTuner 支持模型 43 | ```shell 44 | xtuner list-cfg 45 | ``` 46 | - 拷贝目标模型的配置文件与修改配置文件参数 47 | ```shell 48 | xtuner copy-cfg internlm2_chat_7b_qlora_oasst1_e3 . 49 | mv internlm2_chat_7b_qlora_oasst1_e3.py internlm2_chat_7b_qlora.py 50 | vim internlm2_chat_7b_qlora.py 51 | ``` 52 | 53 | ## 3. 微调训练 54 | - 运行微调训练 55 | ```shell 56 | # 单卡 57 | xtuner train /root/SoulStar/finetune_config/xtuner_config/internlm2_chat_7b_qlora.py --deepspeed deepspeed_zero2 58 | ``` 59 | - `--deepspeed` 表示使用 [DeepSpeed](https://github.com/microsoft/DeepSpeed) 🚀 来优化训练过程。XTuner 内置了多种策略,包括 ZeRO-1、ZeRO-2、ZeRO-3 等。如果用户期望关闭此功能,请直接移除此参数。 60 | 61 | - 更多示例,请查阅[文档](https://github.com/InternLM/xtuner/blob/main/docs/zh_cn/user_guides/finetune.md)。 62 | 63 | ## 4. 模型参数转换合并 64 | - pth 格式参数转换为 huggingface 格式 65 | ```shell 66 | # 创建用于存放Hugging Face格式参数的hf文件夹 67 | mkdir /root/SoulStar/finetune_config/xtuner_config/work_dirs/hf 68 | 69 | export MKL_SERVICE_FORCE_INTEL=1 70 | 71 | # 配置文件存放的位置 72 | export CONFIG_NAME_OR_PATH=/root/SoulStar/finetune_config/xtuner_config/internlm2_chat_7b_qlora.py 73 | 74 | # 模型训练后得到的pth格式参数存放的位置 75 | export PTH=/root/SoulStar/finetune_config/xtuner_config/work_dirs/internlm2_chat_7b_qlora/iter_2500.pth 76 | 77 | # pth文件转换为Hugging Face格式后参数存放的位置 78 | export SAVE_PATH=/root/SoulStar/finetune_config/xtuner_config/work_dirs/hf 79 | 80 | # 执行参数转换 81 | xtuner convert pth_to_hf $CONFIG_NAME_OR_PATH $PTH $SAVE_PATH 82 | ``` 83 | 84 | - huggingface 格式参数合并 85 | ```shell 86 | export MKL_SERVICE_FORCE_INTEL=1 87 | export MKL_THREADING_LAYER='GNU' 88 | 89 | # 原始模型参数存放的位置 90 | export NAME_OR_PATH_TO_LLM=/root/model/Shanghai_AI_Laboratory/internlm2-chat-7b 91 | 92 | # Hugging Face格式参数存放的位置 93 | export NAME_OR_PATH_TO_ADAPTER=/root/SoulStar/finetune_config/xtuner_config/work_dirs/hf 94 | 95 | # 最终Merge后的参数存放的位置 96 | mkdir /root/model/internlm2-chat-7b-soulstar 97 | export SAVE_PATH=/root/model/internlm2-chat-7b-soulstar 98 | 99 | # 执行参数Merge 100 | xtuner convert merge \ 101 | $NAME_OR_PATH_TO_LLM \ 102 | $NAME_OR_PATH_TO_ADAPTER \ 103 | $SAVE_PATH \ 104 | --max-shard-size 2GB 105 | ``` 106 | -------------------------------------------------------------------------------- /nltk_data/corpora/cmudict.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/nltk_data/corpora/cmudict.zip -------------------------------------------------------------------------------- /nltk_data/corpora/cmudict/README: 
-------------------------------------------------------------------------------- 1 | The Carnegie Mellon Pronouncing Dictionary [cmudict.0.7a] 2 | 3 | ftp://ftp.cs.cmu.edu/project/speech/dict/ 4 | https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict/cmudict.0.7a 5 | 6 | Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved. 7 | 8 | File Format: Each line consists of an uppercased word, 9 | a counter (for alternative pronunciations), and a transcription. 10 | Vowels are marked for stress (1=primary, 2=secondary, 0=no stress). 11 | E.g.: NATURAL 1 N AE1 CH ER0 AH0 L 12 | 13 | The dictionary contains 127069 entries. Of these, 119400 words are assigned 14 | a unique pronunciation, 6830 words have two pronunciations, and 839 words have 15 | three or more pronunciations. Many of these are fast-speech variants. 16 | 17 | Phonemes: There are 39 phonemes, as shown below: 18 | 19 | Phoneme Example Translation Phoneme Example Translation 20 | ------- ------- ----------- ------- ------- ----------- 21 | AA odd AA D AE at AE T 22 | AH hut HH AH T AO ought AO T 23 | AW cow K AW AY hide HH AY D 24 | B be B IY CH cheese CH IY Z 25 | D dee D IY DH thee DH IY 26 | EH Ed EH D ER hurt HH ER T 27 | EY ate EY T F fee F IY 28 | G green G R IY N HH he HH IY 29 | IH it IH T IY eat IY T 30 | JH gee JH IY K key K IY 31 | L lee L IY M me M IY 32 | N knee N IY NG ping P IH NG 33 | OW oat OW T OY toy T OY 34 | P pee P IY R read R IY D 35 | S sea S IY SH she SH IY 36 | T tea T IY TH theta TH EY T AH 37 | UH hood HH UH D UW two T UW 38 | V vee V IY W we W IY 39 | Y yield Y IY L D Z zee Z IY 40 | ZH seizure S IY ZH ER 41 | 42 | (For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2 43 | are contiguous, and not separated by FIRE'S 1.) 44 | 45 | Redistribution and use in source and binary forms, with or without 46 | modification, are permitted provided that the following conditions 47 | are met: 48 | 49 | 1. Redistributions of source code must retain the above copyright 50 | notice, this list of conditions and the following disclaimer. 51 | The contents of this file are deemed to be source code. 52 | 53 | 2. Redistributions in binary form must reproduce the above copyright 54 | notice, this list of conditions and the following disclaimer in 55 | the documentation and/or other materials provided with the 56 | distribution. 57 | 58 | This work was supported in part by funding from the Defense Advanced 59 | Research Projects Agency, the Office of Naval Research and the National 60 | Science Foundation of the United States of America, and by member 61 | companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge 62 | the contributions of many volunteers to the expansion and improvement of 63 | this dictionary. 64 | 65 | THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 66 | ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 67 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 68 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 69 | NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 70 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 71 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 72 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 73 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 74 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 75 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 76 | 77 | -------------------------------------------------------------------------------- /nltk_data/taggers/averaged_perceptron_tagger.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/nltk_data/taggers/averaged_perceptron_tagger.zip -------------------------------------------------------------------------------- /nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nobody-ML/HeartLink/317c04bd94f834ce78d3e16bc6a62ee6e3f6f383/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | tensorboard 4 | librosa==0.9.2 5 | numba==0.56.4 6 | pytorch-lightning 7 | # gradio==3.38.0 8 | # gradio_client==0.8.1 9 | ffmpeg-python 10 | onnxruntime 11 | tqdm 12 | funasr==1.0.0 13 | cn2an 14 | pypinyin 15 | pyopenjtalk 16 | g2p_en 17 | torchaudio 18 | modelscope==1.10.0 19 | sentencepiece 20 | transformers 21 | chardet 22 | PyYAML 23 | psutil 24 | jieba_fast 25 | jieba 26 | LangSegment>=0.2.0 27 | Faster_Whisper 28 | wordsegment 29 | lmdeploy==0.4.2 30 | streamlit==1.35.0 31 | --------------------------------------------------------------------------------
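The pinned requirements above cover the LMDeploy/Turbomind inference stack, the GPT-SoVITS TTS dependencies, and the Streamlit front end used by `demo/app.py`. As a rough, hedged sketch only (not taken from the repository's own docs), a local setup might look like the following; the environment name is arbitrary, the Python version mirrors the XTuner guide above, and `HUBAPI` / `SYSTEM` are the environment variables read by `demo/app.py` and `demo/config.py` in this dump:

```bash
# Hypothetical local run — assumes a CUDA-capable GPU and that demo/app.py is the entry point.
conda create --name heartlink python=3.10 -y
conda activate heartlink
pip install -r requirements.txt

# demo/app.py logs in to ModelScope with the HUBAPI token before calling snapshot_download
# for the HeartLink and GPT-SoVITS weights; demo/config.py reads the chat system prompt
# (meta_instruction) from SYSTEM.
export HUBAPI="<your ModelScope access token>"
export SYSTEM="<system prompt for the empathy model>"

# Launch the Streamlit demo; note that the hard-coded model and asset paths in app.py
# and config.py assume the /home/xlab-app-center cache directory used on OpenXLab.
streamlit run demo/app.py
```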