├── .dockerignore
├── .gitignore
├── 0 一键启动脚本
├── 0 一键更新项目.bat
├── 1 一键更新本项目所需要的依赖.bat
├── 10 启动模型管理界面(可选).bat
├── 11 启动原项目的训练界面(小白别开,请根据页面上的文档链接自行研究,推理群不包解答).bat
├── 3 启动GSVI.bat
├── 5 启动纯粹的后端(不推荐).bat
├── 999 强制更新:会覆盖你的设置,慎用,和0功能类似.bat
├── Cfg
│ ├── About.txt
│ └── Cfg.ini
├── GPT-soVITS Start.exe
└── 说明.txt
├── Docker
├── damo.sha256
├── download.py
├── download.sh
├── links.sha256
└── links.txt
├── Dockerfile
├── GPT_SoVITS
├── AR
│ ├── __init__.py
│ ├── data
│ │ ├── __init__.py
│ │ ├── bucket_sampler.py
│ │ ├── data_module.py
│ │ └── dataset.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── t2s_lightning_module.py
│ │ ├── t2s_lightning_module_onnx.py
│ │ ├── t2s_model.py
│ │ ├── t2s_model_batch_only.py
│ │ ├── t2s_model_onnx.py
│ │ └── utils.py
│ ├── modules
│ │ ├── __init__.py
│ │ ├── activation.py
│ │ ├── activation_onnx.py
│ │ ├── embedding.py
│ │ ├── embedding_onnx.py
│ │ ├── lr_schedulers.py
│ │ ├── optim.py
│ │ ├── patched_mha_with_cache.py
│ │ ├── patched_mha_with_cache_onnx.py
│ │ ├── scaling.py
│ │ ├── transformer.py
│ │ └── transformer_onnx.py
│ ├── text_processing
│ │ ├── __init__.py
│ │ ├── phonemizer.py
│ │ └── symbols.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── initialize.py
│ │ └── io.py
├── TTS_infer_pack
│ ├── TTS.py
│ ├── TextPreprocessor.py
│ ├── __init__.py
│ └── text_segmentation_method.py
├── configs
│ ├── s1.yaml
│ ├── s1big.yaml
│ ├── s1big2.yaml
│ ├── s1longer.yaml
│ ├── s1mq.yaml
│ ├── s2.json
│ ├── train.yaml
│ └── tts_infer.yaml
├── feature_extractor
│ ├── __init__.py
│ ├── cnhubert.py
│ └── whisper_enc.py
├── inference_gui.py
├── inference_webui.py
├── inference_webui_old.py
├── module
│ ├── __init__.py
│ ├── attentions.py
│ ├── attentions_onnx.py
│ ├── commons.py
│ ├── core_vq.py
│ ├── data_utils.py
│ ├── losses.py
│ ├── mel_processing.py
│ ├── models.py
│ ├── models_onnx.py
│ ├── modules.py
│ ├── mrte_model.py
│ ├── quantize.py
│ └── transforms.py
├── my_utils.py
├── onnx_export.py
├── prepare_datasets
│ ├── 1-get-text.py
│ ├── 2-get-hubert-wav32k.py
│ └── 3-get-semantic.py
├── pretrained_models
│ └── .gitignore
├── process_ckpt.py
├── s1_train.py
├── s2_train.py
├── text
│ ├── __init__.py
│ ├── chinese.py
│ ├── cleaner.py
│ ├── cmudict-fast.rep
│ ├── cmudict.rep
│ ├── engdict-hot.rep
│ ├── engdict_cache.pickle
│ ├── english.py
│ ├── japanese.py
│ ├── namedict_cache.pickle
│ ├── opencpop-strict.txt
│ ├── symbols.py
│ ├── tone_sandhi.py
│ └── zh_normalization
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── char_convert.py
│ │ ├── chronology.py
│ │ ├── constants.py
│ │ ├── num.py
│ │ ├── phonecode.py
│ │ ├── quantifier.py
│ │ └── text_normlization.py
└── utils.py
├── LICENSE
├── README.md
├── Synthesizers
├── base
│ ├── Base_TTS_Synthesizer.py
│ ├── Base_TTS_Task.py
│ ├── __init__.py
│ └── config_utils.py
├── gsv_fast
│ ├── GSV_Synthesizer.py
│ ├── __init__.py
│ ├── configs
│ │ ├── i18n
│ │ │ └── locale
│ │ │ │ ├── en_US.json
│ │ │ │ ├── zh_CN.json
│ │ │ │ └── zh_TW.json
│ │ ├── params_config.json
│ │ └── ui_config.json
│ ├── gsv_config.py
│ ├── gsv_task.py
│ └── ssml_dealer.py
└── remote
│ ├── Remote_Synthesizer.py
│ ├── __init__.py
│ ├── configs
│ ├── config.json
│ ├── i18n
│ │ └── locale
│ │ │ ├── en_US.json
│ │ │ ├── zh_CN.json
│ │ │ └── zh_TW.json
│ ├── params_config.json
│ └── ui_config.json
│ └── remote_task.py
├── api_doc.md
├── app.py
├── colab_webui.ipynb
├── common_config.json
├── docker-compose.yaml
├── dockerbuild.sh
├── docs
├── cn
│ ├── Changelog_CN.md
│ └── README.md
├── ja
│ ├── Changelog_JA.md
│ └── README.md
├── ko
│ ├── Changelog_KO.md
│ └── README.md
└── tr
│ └── README.md
├── gpt-sovits_kaggle.ipynb
├── gsv_config.json
├── i18n
└── locale
│ ├── en_US.json
│ ├── es_ES.json
│ ├── fr_FR.json
│ ├── it_IT.json
│ ├── ja_JP.json
│ ├── ko_KR.json
│ ├── pt_BR.json
│ ├── ru_RU.json
│ ├── tr_TR.json
│ ├── zh_CN.json
│ ├── zh_HK.json
│ ├── zh_SG.json
│ └── zh_TW.json
├── install.sh
├── pure_api.py
├── requirements.txt
├── src
├── api_utils.py
└── common_config_manager.py
├── tmp_audio
└── .gitignore
├── tools
├── __init__.py
├── asr
│ ├── config.py
│ ├── fasterwhisper_asr.py
│ ├── funasr_asr.py
│ └── models
│ │ └── .gitignore
├── cmd-denoise.py
├── denoise-model
│ └── .gitignore
├── i18n
│ ├── i18n.py
│ ├── locale
│ │ ├── en_US.json
│ │ ├── es_ES.json
│ │ ├── fr_FR.json
│ │ ├── it_IT.json
│ │ ├── ja_JP.json
│ │ ├── ko_KR.json
│ │ ├── ru_RU.json
│ │ ├── tr_TR.json
│ │ ├── zh_CN.json
│ │ ├── zh_HK.json
│ │ ├── zh_SG.json
│ │ └── zh_TW.json
│ ├── locale_diff.py
│ └── scan_i18n.py
├── my_utils.py
├── normalize_loudness.py
├── slice_audio.py
├── slicer2.py
├── srt_slicer
│ ├── i18n
│ │ └── locale
│ │ │ ├── en_US.json
│ │ │ └── zh_CN.json
│ ├── srt_utils.py
│ └── webui.py
├── subfix_webui.py
└── uvr5
│ ├── lib
│ ├── lib_v5
│ │ ├── dataset.py
│ │ ├── layers.py
│ │ ├── layers_123812KB.py
│ │ ├── layers_123821KB.py
│ │ ├── layers_33966KB.py
│ │ ├── layers_537227KB.py
│ │ ├── layers_537238KB.py
│ │ ├── layers_new.py
│ │ ├── model_param_init.py
│ │ ├── modelparams
│ │ │ ├── 1band_sr16000_hl512.json
│ │ │ ├── 1band_sr32000_hl512.json
│ │ │ ├── 1band_sr33075_hl384.json
│ │ │ ├── 1band_sr44100_hl1024.json
│ │ │ ├── 1band_sr44100_hl256.json
│ │ │ ├── 1band_sr44100_hl512.json
│ │ │ ├── 1band_sr44100_hl512_cut.json
│ │ │ ├── 2band_32000.json
│ │ │ ├── 2band_44100_lofi.json
│ │ │ ├── 2band_48000.json
│ │ │ ├── 3band_44100.json
│ │ │ ├── 3band_44100_mid.json
│ │ │ ├── 3band_44100_msb2.json
│ │ │ ├── 4band_44100.json
│ │ │ ├── 4band_44100_mid.json
│ │ │ ├── 4band_44100_msb.json
│ │ │ ├── 4band_44100_msb2.json
│ │ │ ├── 4band_44100_reverse.json
│ │ │ ├── 4band_44100_sw.json
│ │ │ ├── 4band_v2.json
│ │ │ ├── 4band_v2_sn.json
│ │ │ ├── 4band_v3.json
│ │ │ └── ensemble.json
│ │ ├── nets.py
│ │ ├── nets_123812KB.py
│ │ ├── nets_123821KB.py
│ │ ├── nets_33966KB.py
│ │ ├── nets_537227KB.py
│ │ ├── nets_537238KB.py
│ │ ├── nets_61968KB.py
│ │ ├── nets_new.py
│ │ └── spec_utils.py
│ ├── name_params.json
│ └── utils.py
│ ├── mdxnet.py
│ ├── uvr5_weights
│ └── .gitignore
│ ├── vr.py
│ └── webui.py
└── webuis
├── builders
└── gradio_builder.py
└── character_manager
├── i18n
└── locale
│ ├── en_US.json
│ ├── zh_CN.json
│ └── zh_TW.json
└── webui.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | docs
2 | logs
3 | output
4 | reference
5 | SoVITS_weights
6 | GPT_weights
7 | TEMP
8 | GPT_SoVITS
9 | trained
10 | .git
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__
3 | *.pyc
4 | env
5 | runtime
6 | .idea
7 | output
8 | logs
9 | reference
10 | GPT_weights
11 | SoVITS_weights
12 | TEMP
13 | PortableGit
14 | cache
15 |
16 | ffmpeg.exe
17 | ffprobe.exe
18 | tmp_audio
19 | trained*
20 | history
21 | app.log
22 |
--------------------------------------------------------------------------------
/0 一键启动脚本/0 一键更新项目.bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | setlocal
4 |
5 |
6 | echo 设置需要同步的本地仓库路径
7 | set REPO_PATH=../
8 |
9 | echo 切换到仓库目录
10 | cd /d %REPO_PATH%
11 |
12 | echo 设置 PortableGit 的路径
13 | set GIT_PATH=PortableGit/bin
14 |
15 | echo 更新所有子模块
16 | "%GIT_PATH%\git.exe" submodule update --init --recursive
17 |
18 | echo 执行 git pull 更新本地仓库
19 | "%GIT_PATH%\git.exe" stash
20 | "%GIT_PATH%\git.exe" pull https://gitee.com/xxoy/GPT-SoVITS-Inference.git stable
21 |
22 | echo.
23 | echo 更新完成!
24 | pause
25 |
--------------------------------------------------------------------------------
/0 一键启动脚本/1 一键更新本项目所需要的依赖.bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | cd ../
4 | echo 请确保您的主项目运行正常
5 | runtime\python.exe -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
6 | runtime\python.exe -m pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn
7 | runtime\python.exe -m pip install -r ./requirements.txt
8 |
9 | pause
--------------------------------------------------------------------------------
/0 一键启动脚本/10 启动模型管理界面(可选).bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | cd ../
4 | echo 尝试启动程序,请耐心等待gradio启动,等待十几秒,若未自动弹出浏览器,请手动打开浏览器输入http://127.0.0.1:9868
5 | runtime\python.exe ./webuis/character_manager/webui.py
6 |
7 | pause
--------------------------------------------------------------------------------
/0 一键启动脚本/11 启动原项目的训练界面(小白别开,请根据页面上的文档链接自行研究,推理群不包解答).bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | cd ../
4 | echo 尝试启动原版的训练推理界面
5 |
6 | runtime\python.exe ./webui.py
7 |
8 | pause
--------------------------------------------------------------------------------
/0 一键启动脚本/3 启动GSVI.bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | cd ../
4 | echo 尝试启动程序,请耐心等待gradio启动,等待十几秒,若未自动弹出浏览器,请手动打开浏览器输入你配置的网址,例如:http://127.0.0.1:5000
5 | runtime\python.exe app.py
6 |
7 | pause
--------------------------------------------------------------------------------
/0 一键启动脚本/5 启动纯粹的后端(不推荐).bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | cd ../
4 | echo 尝试启动程序
5 | runtime\python.exe pure_api.py
6 |
7 | pause
--------------------------------------------------------------------------------
/0 一键启动脚本/999 强制更新:会覆盖你的设置,慎用,和0功能类似.bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | setlocal
4 |
5 |
6 | echo 设置需要同步的本地仓库路径
7 | set REPO_PATH=../
8 |
9 | echo 切换到仓库目录
10 | cd /d %REPO_PATH%
11 |
12 | echo 设置 PortableGit 的路径
13 | set GIT_PATH=PortableGit/bin
14 |
15 | echo 强制覆盖本地仓库
16 | "%GIT_PATH%\git.exe" fetch https://gitee.com/xxoy/GPT-SoVITS-Inference.git stable
17 | "%GIT_PATH%\git.exe" reset --hard FETCH_HEAD
18 |
19 | echo.
20 | echo 更新完成!
21 | pause
--------------------------------------------------------------------------------
/0 一键启动脚本/Cfg/About.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/0 一键启动脚本/Cfg/About.txt
--------------------------------------------------------------------------------
/0 一键启动脚本/Cfg/Cfg.ini:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/0 一键启动脚本/Cfg/Cfg.ini
--------------------------------------------------------------------------------
/0 一键启动脚本/GPT-soVITS Start.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/0 一键启动脚本/GPT-soVITS Start.exe
--------------------------------------------------------------------------------
/0 一键启动脚本/说明.txt:
--------------------------------------------------------------------------------
1 |
2 | 1. Starting GSVI
3 |
4 | GSVI now has the backend functionality built in, and it is used the same way.
5 | Depending on your settings, for example if your address is 127.0.0.1:5000,
6 | opening that address directly gives you a simple gradio synthesis interface.
7 | Visiting a link such as http://127.0.0.1:5000/tts?text=你好 generates an audio file and returns it to you.
8 |
9 | You can start it with the launcher in this directory, or run "3 启动GSVI.bat" manually.
10 |
11 | For more examples, see https://www.yuque.com/xter/zibxlp
12 |
13 | 2. Pure backend (not recommended)
14 |
15 | If you want a pure backend, run "5 启动纯粹的后端(不推荐).bat".
16 | Usage is exactly the same as above; for more details see the API documentation, also at the link above.
17 | It is entirely unnecessary, though: attaching a simple gradio interface costs no performance.
18 |
19 | 3. Model management interface
20 |
21 | There is a rather crude gradio interface that badly needs an upgrade, but it will do for now.
22 | You can start it with the launcher in this directory, or double-click "10 启动模型管理界面(可选).bat".
23 |
24 | 4. Starting the original GSV training interface
25 |
26 | You can start the GSV WebUI,
27 | but we do not actually recommend it: this inference project is not intended for training, its dependency versions are too new, and it reportedly runs into minor issues.
28 | You will also need to add the denoising models yourself; to save space, the new version no longer ships them by default.
29 | Paths: tools\uvr5\uvr5_weights\, tools\asr\models\models\, and tools\denoise-model\
30 | You can copy a fresh tools\ directory over from the original project.
31 |
32 | About configuration files: you can edit gsv_config.json and common_config.json in the root directory; see the Yuque documentation for details.
33 |
34 | Also, if you run into problems, update first (999-0-1: run these 3 .bat files).
35 |
36 |
37 |
--------------------------------------------------------------------------------
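
The GET endpoint described in section 1 of 说明.txt above can be exercised with a short script. The following is a minimal sketch only: it assumes the service is running at the example address 127.0.0.1:5000 and that the response body is the generated audio; the output file name output.wav is a placeholder chosen here, not something defined by the project.

    # Minimal sketch: fetch synthesized audio from the GSVI /tts endpoint.
    from urllib.parse import urlencode
    from urllib.request import urlopen

    params = urlencode({"text": "你好"})  # URL-encode the text parameter
    with urlopen(f"http://127.0.0.1:5000/tts?{params}") as resp:
        audio = resp.read()  # response body is the generated audio file

    with open("output.wav", "wb") as f:  # placeholder output name/extension
        f.write(audio)
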
/Docker/damo.sha256:
--------------------------------------------------------------------------------
1 | 5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025 /workspace/tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.pb
2 | b3be75be477f0780277f3bae0fe489f48718f585f3a6e45d7dd1fbb1a4255fc5 /workspace/tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pb
3 | a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916 /workspace/tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pb
--------------------------------------------------------------------------------
/Docker/download.py:
--------------------------------------------------------------------------------
1 | # Download ModelScope (damo) ASR-related models
2 | from modelscope import snapshot_download
3 | model_dir = snapshot_download('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',revision="v2.0.4")
4 | model_dir = snapshot_download('damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',revision="v2.0.4")
5 | model_dir = snapshot_download('damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',revision="v2.0.4")
6 |
--------------------------------------------------------------------------------
/Docker/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -Eeuo pipefail
4 |
5 | echo "Downloading models..."
6 |
7 | aria2c --disable-ipv6 --input-file /workspace/Docker/links.txt --dir /workspace --continue
8 |
9 | echo "Checking SHA256..."
10 |
11 | parallel --will-cite -a /workspace/Docker/links.sha256 "echo -n {} | sha256sum -c"
12 |
--------------------------------------------------------------------------------
/Docker/links.sha256:
--------------------------------------------------------------------------------
1 | b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1 /workspace/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
2 | fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8 /workspace/GPT_SoVITS/pretrained_models/s2D488k.pth
3 | 020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835 /workspace/GPT_SoVITS/pretrained_models/s2G488k.pth
4 | 24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c /workspace/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
5 | e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8 /workspace/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
6 | 39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05 /workspace/tools/uvr5/uvr5_weights/HP2_all_vocals.pth
7 | 45e6b65199e781b4a6542002699be9f19cd3d1cb7d1558bc2bfbcd84674dfe28 /workspace/tools/uvr5/uvr5_weights/HP3_all_vocals.pth
8 | 5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee /workspace/tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
9 | 8c8fd1582f9aabc363e47af62ddb88df6cae7e064cae75bbf041a067a5e0aee2 /workspace/tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
10 | 01376dd2a571bf3cb9cced680732726d2d732609d09216a610b0d110f133febe /workspace/tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
11 | 56aba59db3bcdd14a14464e62f3129698ecdea62eee0f003b9360923eb3ac79e /workspace/tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
12 | 233bb5c6aaa365e568659a0a81211746fa881f8f47f82d9e864fce1f7692db80 /workspace/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
--------------------------------------------------------------------------------
/Docker/links.txt:
--------------------------------------------------------------------------------
1 | # GPT-SoVITS models
2 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s1bert25hz-2kh-longer-epoch%3D68e-step%3D50232.ckpt
3 | out=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
4 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2D488k.pth
5 | out=GPT_SoVITS/pretrained_models/s2D488k.pth
6 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2G488k.pth
7 | out=GPT_SoVITS/pretrained_models/s2G488k.pth
8 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/config.json
9 | out=GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json
10 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/preprocessor_config.json
11 | out=GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json
12 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/pytorch_model.bin
13 | out=GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
14 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/config.json
15 | out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json
16 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/pytorch_model.bin
17 | out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
18 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/tokenizer.json
19 | out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json
20 | # UVR5
21 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth
22 | out=tools/uvr5/uvr5_weights/HP2_all_vocals.pth
23 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth
24 | out=tools/uvr5/uvr5_weights/HP3_all_vocals.pth
25 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth
26 | out=tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
27 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth
28 | out=tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
29 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth
30 | out=tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
31 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth
32 | out=tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
33 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
34 | out=tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Base CUDA image
2 | FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04
3 |
4 | LABEL maintainer="breakstring@hotmail.com"
5 | LABEL version="dev-20240209"
6 | LABEL description="Docker image for GPT-SoVITS-Inference"
7 |
8 |
9 | # Install 3rd party apps
10 | ENV DEBIAN_FRONTEND=noninteractive
11 | ENV TZ=Etc/UTC
12 | RUN sed -i 's|http://archive.ubuntu.com/ubuntu/|http://mirrors.aliyun.com/ubuntu/|g' /etc/apt/sources.list && \
13 | apt-get update && \
14 | apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \
15 | git lfs install && \
16 | rm -rf /var/lib/apt/lists/*
17 |
18 | # Copy only requirements.txt initially to leverage Docker cache
19 | WORKDIR /workspace
20 | COPY requirements.txt /workspace/
21 | RUN pip install --no-cache-dir -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
22 |
23 | # Define a build-time argument for image type
24 | ARG IMAGE_TYPE=full
25 |
26 | # Conditional logic based on the IMAGE_TYPE argument
27 | # Always copy the Docker directory, but only use it if IMAGE_TYPE is not "elite"
28 | COPY ./Docker /workspace/Docker
29 | # The "elite" image type does not include the extra models
30 |
31 | # If you can download directly from the official sources (may require a proxy), uncomment the block below; otherwise follow the instructions in ReadMe.md and place the model files into the corresponding folders yourself
32 | #RUN if [ "$IMAGE_TYPE" != "elite" ]; then \
33 | # chmod +x /workspace/Docker/download.sh && \
34 | # /workspace/Docker/download.sh && \
35 | # python /workspace/Docker/download.py && \
36 | # pip install -i https://pypi.tuna.tsinghua.edu.cn/simple nltk && \
37 | # python -m nltk.downloader averaged_perceptron_tagger cmudict; \
38 | # fi
39 |
40 |
41 |
42 | # Copy the rest of the application
43 | COPY . /workspace
44 |
45 | #EXPOSE 9871 9872 9873 9874 9880
46 | EXPOSE 5000
47 |
48 |
49 | CMD ["python", "app.py"]
50 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/AR/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/AR/data/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/data_module.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | from pytorch_lightning import LightningDataModule
4 | from AR.data.bucket_sampler import DistributedBucketSampler
5 | from AR.data.dataset import Text2SemanticDataset
6 | from torch.utils.data import DataLoader
7 |
8 |
9 | class Text2SemanticDataModule(LightningDataModule):
10 | def __init__(
11 | self,
12 | config,
13 | train_semantic_path,
14 | train_phoneme_path,
15 | dev_semantic_path=None,
16 | dev_phoneme_path=None,
17 | ):
18 | super().__init__()
19 | self.config = config
20 | self.train_semantic_path = train_semantic_path
21 | self.train_phoneme_path = train_phoneme_path
22 | self.dev_semantic_path = dev_semantic_path
23 | self.dev_phoneme_path = dev_phoneme_path
24 | self.num_workers = self.config["data"]["num_workers"]
25 |
26 | def prepare_data(self):
27 | pass
28 |
29 | def setup(self, stage=None, output_logs=False):
30 | self._train_dataset = Text2SemanticDataset(
31 | phoneme_path=self.train_phoneme_path,
32 | semantic_path=self.train_semantic_path,
33 | max_sec=self.config["data"]["max_sec"],
34 | pad_val=self.config["data"]["pad_val"],
35 | )
36 | self._dev_dataset = self._train_dataset
37 | # self._dev_dataset = Text2SemanticDataset(
38 | # phoneme_path=self.dev_phoneme_path,
39 | # semantic_path=self.dev_semantic_path,
40 | # max_sample=self.config['data']['max_eval_sample'],
41 | # max_sec=self.config['data']['max_sec'],
42 | # pad_val=self.config['data']['pad_val'])
43 |
44 | def train_dataloader(self):
45 | batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"]
46 | batch_size = max(min(batch_size,len(self._train_dataset)//4),1)  # guard against nothing being saved
47 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size)
48 | return DataLoader(
49 | self._train_dataset,
50 | batch_size=batch_size,
51 | sampler=sampler,
52 | collate_fn=self._train_dataset.collate,
53 | num_workers=self.num_workers,
54 | persistent_workers=True,
55 | prefetch_factor=16,
56 | )
57 |
58 | def val_dataloader(self):
59 | return DataLoader(
60 | self._dev_dataset,
61 | batch_size=1,
62 | shuffle=False,
63 | collate_fn=self._train_dataset.collate,
64 | num_workers=max(self.num_workers, 12),
65 | persistent_workers=True,
66 | prefetch_factor=16,
67 | )
68 |
69 | # Is this ever actually used?
70 | def test_dataloader(self):
71 | return DataLoader(
72 | self._dev_dataset,
73 | batch_size=1,
74 | shuffle=False,
75 | collate_fn=self._train_dataset.collate,
76 | )
77 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/AR/models/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | import os, sys
4 |
5 | now_dir = os.getcwd()
6 | sys.path.append(now_dir)
7 | from typing import Dict
8 |
9 | import torch
10 | from pytorch_lightning import LightningModule
11 | from AR.models.t2s_model_onnx import Text2SemanticDecoder
12 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule
13 | from AR.modules.optim import ScaledAdam
14 |
15 |
16 | class Text2SemanticLightningModule(LightningModule):
17 | def __init__(self, config, output_dir, is_train=True):
18 | super().__init__()
19 | self.config = config
20 | self.top_k = 3
21 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
22 | pretrained_s1 = config.get("pretrained_s1")
23 | if pretrained_s1 and is_train:
24 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
25 | print(
26 | self.load_state_dict(
27 | torch.load(pretrained_s1, map_location="cpu")["weight"]
28 | )
29 | )
30 | if is_train:
31 | self.automatic_optimization = False
32 | self.save_hyperparameters()
33 | self.eval_dir = output_dir / "eval"
34 | self.eval_dir.mkdir(parents=True, exist_ok=True)
35 |
36 | def training_step(self, batch: Dict, batch_idx: int):
37 | opt = self.optimizers()
38 | scheduler = self.lr_schedulers()
39 | loss, acc = self.model.forward(
40 | batch["phoneme_ids"],
41 | batch["phoneme_ids_len"],
42 | batch["semantic_ids"],
43 | batch["semantic_ids_len"],
44 | batch["bert_feature"],
45 | )
46 | self.manual_backward(loss)
47 | if batch_idx > 0 and batch_idx % 4 == 0:
48 | opt.step()
49 | opt.zero_grad()
50 | scheduler.step()
51 |
52 | self.log(
53 | "total_loss",
54 | loss,
55 | on_step=True,
56 | on_epoch=True,
57 | prog_bar=True,
58 | sync_dist=True,
59 | )
60 | self.log(
61 | "lr",
62 | scheduler.get_last_lr()[0],
63 | on_epoch=True,
64 | prog_bar=True,
65 | sync_dist=True,
66 | )
67 | self.log(
68 | f"top_{self.top_k}_acc",
69 | acc,
70 | on_step=True,
71 | on_epoch=True,
72 | prog_bar=True,
73 | sync_dist=True,
74 | )
75 |
76 | def validation_step(self, batch: Dict, batch_idx: int):
77 | return
78 |
79 | def configure_optimizers(self):
80 | model_parameters = self.model.parameters()
81 | parameters_names = []
82 | parameters_names.append(
83 | [name_param_pair[0] for name_param_pair in self.model.named_parameters()]
84 | )
85 | lm_opt = ScaledAdam(
86 | model_parameters,
87 | lr=0.01,
88 | betas=(0.9, 0.95),
89 | clipping_scale=2.0,
90 | parameters_names=parameters_names,
91 | show_dominant_parameters=False,
92 | clipping_update_period=1000,
93 | )
94 |
95 | return {
96 | "optimizer": lm_opt,
97 | "lr_scheduler": {
98 | "scheduler": WarmupCosineLRSchedule(
99 | lm_opt,
100 | init_lr=self.config["optimizer"]["lr_init"],
101 | peak_lr=self.config["optimizer"]["lr"],
102 | end_lr=self.config["optimizer"]["lr_end"],
103 | warmup_steps=self.config["optimizer"]["warmup_steps"],
104 | total_steps=self.config["optimizer"]["decay_steps"],
105 | )
106 | },
107 | }
108 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/AR/modules/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/embedding.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
2 | import math
3 |
4 | import torch
5 | from torch import nn
6 |
7 |
8 | class TokenEmbedding(nn.Module):
9 | def __init__(
10 | self,
11 | embedding_dim: int,
12 | vocab_size: int,
13 | dropout: float = 0.0,
14 | ):
15 | super().__init__()
16 |
17 | self.vocab_size = vocab_size
18 | self.embedding_dim = embedding_dim
19 |
20 | self.dropout = torch.nn.Dropout(p=dropout)
21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
22 |
23 | @property
24 | def weight(self) -> torch.Tensor:
25 | return self.word_embeddings.weight
26 |
27 | def embedding(self, index: int) -> torch.Tensor:
28 | return self.word_embeddings.weight[index : index + 1]
29 |
30 | def forward(self, x: torch.Tensor):
31 | x = self.word_embeddings(x)
32 | x = self.dropout(x)
33 | return x
34 |
35 |
36 | class SinePositionalEmbedding(nn.Module):
37 | def __init__(
38 | self,
39 | embedding_dim: int,
40 | dropout: float = 0.0,
41 | scale: bool = False,
42 | alpha: bool = False,
43 | ):
44 | super().__init__()
45 | self.embedding_dim = embedding_dim
46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
48 | self.dropout = torch.nn.Dropout(p=dropout)
49 |
50 | self.reverse = False
51 | self.pe = None
52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000))
53 |
54 | def extend_pe(self, x):
55 | """Reset the positional encodings."""
56 | if self.pe is not None:
57 | if self.pe.size(1) >= x.size(1):
58 | if self.pe.dtype != x.dtype or self.pe.device != x.device:
59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device)
60 | return
61 | pe = torch.zeros(x.size(1), self.embedding_dim)
62 | if self.reverse:
63 | position = torch.arange(
64 | x.size(1) - 1, -1, -1.0, dtype=torch.float32
65 | ).unsqueeze(1)
66 | else:
67 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
68 | div_term = torch.exp(
69 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32)
70 | * -(math.log(10000.0) / self.embedding_dim)
71 | )
72 | pe[:, 0::2] = torch.sin(position * div_term)
73 | pe[:, 1::2] = torch.cos(position * div_term)
74 | pe = pe.unsqueeze(0)
75 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach()
76 |
77 | def forward(self, x: torch.Tensor) -> torch.Tensor:
78 | self.extend_pe(x)
79 | output = x.unsqueeze(-1) if x.ndim == 2 else x
80 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)]
81 | return self.dropout(output)
82 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/embedding_onnx.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
2 | import math
3 |
4 | import torch
5 | from torch import nn
6 |
7 |
8 | class TokenEmbedding(nn.Module):
9 | def __init__(
10 | self,
11 | embedding_dim: int,
12 | vocab_size: int,
13 | dropout: float = 0.0,
14 | ):
15 | super().__init__()
16 |
17 | self.vocab_size = vocab_size
18 | self.embedding_dim = embedding_dim
19 |
20 | self.dropout = torch.nn.Dropout(p=dropout)
21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
22 |
23 | @property
24 | def weight(self) -> torch.Tensor:
25 | return self.word_embeddings.weight
26 |
27 | def embedding(self, index: int) -> torch.Tensor:
28 | return self.word_embeddings.weight[index : index + 1]
29 |
30 | def forward(self, x: torch.Tensor):
31 | x = self.word_embeddings(x)
32 | x = self.dropout(x)
33 | return x
34 |
35 |
36 | class SinePositionalEmbedding(nn.Module):
37 | def __init__(
38 | self,
39 | embedding_dim: int,
40 | dropout: float = 0.0,
41 | scale: bool = False,
42 | alpha: bool = False,
43 | ):
44 | super().__init__()
45 | self.embedding_dim = embedding_dim
46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
48 | self.dropout = torch.nn.Dropout(p=dropout)
49 | self.reverse = False
50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim))
51 |
52 | def extend_pe(self, x):
53 | position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1)
54 | scpe = (position * self.div_term).unsqueeze(0)
55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0)
56 | pe = pe.contiguous().view(1, -1, self.embedding_dim)
57 | return pe
58 |
59 | def forward(self, x: torch.Tensor) -> torch.Tensor:
60 | pe = self.extend_pe(x)
61 | output = x.unsqueeze(-1) if x.ndim == 2 else x
62 | output = output * self.x_scale + self.alpha * pe
63 | return self.dropout(output)
64 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/lr_schedulers.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | import math
4 |
5 | import torch
6 | from matplotlib import pyplot as plt
7 | from torch import nn
8 | from torch.optim import Adam
9 |
10 |
11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler):
12 | """
13 | Implements a warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr', for multiple optimizers.
14 | """
15 |
16 | def __init__(
17 | self,
18 | optimizer,
19 | init_lr,
20 | peak_lr,
21 | end_lr,
22 | warmup_steps=10000,
23 | total_steps=400000,
24 | current_step=0,
25 | ):
26 | self.init_lr = init_lr
27 | self.peak_lr = peak_lr
28 | self.end_lr = end_lr
29 | self.optimizer = optimizer
30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps
31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps)
32 | self._current_step = current_step
33 | self.lr = init_lr
34 | self.warmup_steps = warmup_steps
35 | self.total_steps = total_steps
36 | self._last_lr = [self.lr]
37 |
38 | def set_lr(self, lr):
39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups]
40 | for g in self.optimizer.param_groups:
41 | # g['lr'] = lr
42 | g["lr"] = self.end_lr  ### locked to use a linear LR
43 |
44 | def step(self):
45 | if self._current_step < self.warmup_steps:
46 | lr = self.init_lr + self._warmup_rate * self._current_step
47 |
48 | elif self._current_step > self.total_steps:
49 | lr = self.end_lr
50 |
51 | else:
52 | decay_ratio = (self._current_step - self.warmup_steps) / (
53 | self.total_steps - self.warmup_steps
54 | )
55 | if decay_ratio < 0.0 or decay_ratio > 1.0:
56 | raise RuntimeError(
57 | "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings."
58 | )
59 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
60 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr)
61 |
62 | self.lr = lr = self.end_lr = 0.002  ### locked to use a linear LR ### it would not behave, so just hard-lock it!
63 | self.set_lr(lr)
64 | self.lr = lr
65 | self._current_step += 1
66 | return self.lr
67 |
68 |
69 | if __name__ == "__main__":
70 | m = nn.Linear(10, 10)
71 | opt = Adam(m.parameters(), lr=1e-4)
72 | s = WarmupCosineLRSchedule(
73 | opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0
74 | )
75 | lrs = []
76 | for i in range(25000):
77 | s.step()
78 | lrs.append(s.lr)
79 | print(s.lr)
80 |
81 | plt.plot(lrs)
82 | plt.plot(range(0, 25000), lrs)
83 | plt.show()
84 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py:
--------------------------------------------------------------------------------
1 | from torch.nn.functional import *
2 | from torch.nn.functional import (
3 | _mha_shape_check,
4 | _canonical_mask,
5 | _none_or_dtype,
6 | _in_projection_packed,
7 | )
8 |
9 | def multi_head_attention_forward_patched(
10 | query,
11 | key,
12 | value,
13 | embed_dim_to_check: int,
14 | num_heads: int,
15 | in_proj_weight,
16 | in_proj_bias: Optional[Tensor],
17 | bias_k: Optional[Tensor],
18 | bias_v: Optional[Tensor],
19 | add_zero_attn: bool,
20 | dropout_p: float,
21 | out_proj_weight: Tensor,
22 | out_proj_bias: Optional[Tensor],
23 | training: bool = True,
24 | key_padding_mask: Optional[Tensor] = None,
25 | need_weights: bool = True,
26 | attn_mask: Optional[Tensor] = None,
27 | use_separate_proj_weight: bool = False,
28 | q_proj_weight: Optional[Tensor] = None,
29 | k_proj_weight: Optional[Tensor] = None,
30 | v_proj_weight: Optional[Tensor] = None,
31 | static_k: Optional[Tensor] = None,
32 | static_v: Optional[Tensor] = None,
33 | average_attn_weights: bool = True,
34 | is_causal: bool = False,
35 | cache=None,
36 | ) -> Tuple[Tensor, Optional[Tensor]]:
37 |
38 | # set up shape vars
39 | _, _, embed_dim = query.shape
40 | attn_mask = _canonical_mask(
41 | mask=attn_mask,
42 | mask_name="attn_mask",
43 | other_type=None,
44 | other_name="",
45 | target_type=query.dtype,
46 | check_other=False,
47 | )
48 | head_dim = embed_dim // num_heads
49 |
50 | proj_qkv = linear(query, in_proj_weight, in_proj_bias)
51 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
52 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2]
53 |
54 | if cache["first_infer"] == 1:
55 | cache["k"][cache["stage"]] = k
56 | cache["v"][cache["stage"]] = v
57 | else:
58 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0)
59 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0)
60 | k = cache["k"][cache["stage"]]
61 | v = cache["v"][cache["stage"]]
62 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"]
63 |
64 | attn_mask = _canonical_mask(
65 | mask=attn_mask,
66 | mask_name="attn_mask",
67 | other_type=None,
68 | other_name="",
69 | target_type=q.dtype,
70 | check_other=False,
71 | )
72 | attn_mask = attn_mask.unsqueeze(0)
73 |
74 | q = q.view(-1, num_heads, head_dim).transpose(0, 1)
75 | k = k.view(-1, num_heads, head_dim).transpose(0, 1)
76 | v = v.view(-1, num_heads, head_dim).transpose(0, 1)
77 |
78 | dropout_p = 0.0
79 | attn_mask = attn_mask.unsqueeze(0)
80 | q = q.view(num_heads, -1, head_dim).unsqueeze(0)
81 | k = k.view(num_heads, -1, head_dim).unsqueeze(0)
82 | v = v.view(num_heads, -1, head_dim).unsqueeze(0)
83 | attn_output = scaled_dot_product_attention(
84 | q, k, v, attn_mask, dropout_p, is_causal
85 | )
86 | attn_output = (
87 | attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim)
88 | )
89 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
90 | attn_output = attn_output.view(-1, 1, attn_output.size(1))
91 |
92 | return attn_output
93 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/AR/text_processing/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/phonemizer.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | import itertools
4 | import re
5 | from typing import Dict
6 | from typing import List
7 |
8 | import regex
9 | from gruut import sentences
10 | from gruut.const import Sentence
11 | from gruut.const import Word
12 | from AR.text_processing.symbols import SYMBOL_TO_ID
13 |
14 |
15 | class GruutPhonemizer:
16 | def __init__(self, language: str):
17 | self._phonemizer = sentences
18 | self.lang = language
19 | self.symbol_to_id = SYMBOL_TO_ID
20 | self._special_cases_dict: Dict[str, str] = {
21 | r"\.\.\.": "... ",
22 | ";": "; ",
23 | ":": ": ",
24 | ",": ", ",
25 | r"\.": ". ",
26 | "!": "! ",
27 | r"\?": "? ",
28 | "—": "—",
29 | "…": "… ",
30 | "«": "«",
31 | "»": "»",
32 | }
33 | self._punctuation_regexp: str = (
34 | rf"([{''.join(self._special_cases_dict.keys())}])"
35 | )
36 |
37 | def _normalize_punctuation(self, text: str) -> str:
38 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text)
39 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text)
40 | text = regex.sub(r"\pZ+", r" ", text)
41 | return text.strip()
42 |
43 | def _convert_punctuation(self, word: Word) -> str:
44 | if not word.phonemes:
45 | return ""
46 | if word.phonemes[0] in ["‖", "|"]:
47 | return word.text.strip()
48 |
49 | phonemes = "".join(word.phonemes)
50 | # remove modifier characters ˈˌː with regex
51 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes)
52 | return phonemes.strip()
53 |
54 | def phonemize(self, text: str, espeak: bool = False) -> str:
55 | text_to_phonemize: str = self._normalize_punctuation(text)
56 | sents: List[Sentence] = [
57 | sent
58 | for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)
59 | ]
60 | words: List[str] = [
61 | self._convert_punctuation(word) for word in itertools.chain(*sents)
62 | ]
63 | return " ".join(words)
64 |
65 | def transform(self, phonemes):
66 | # convert phonemes to ids
67 | # dictionary is in symbols.py
68 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()]
69 |
70 |
71 | if __name__ == "__main__":
72 | phonemizer = GruutPhonemizer("en-us")
73 | # text -> IPA
74 | phonemes = phonemizer.phonemize("Hello, wor-ld ?")
75 | print("phonemes:", phonemes)
76 | print("len(phonemes):", len(phonemes))
77 | phoneme_ids = phonemizer.transform(phonemes)
78 | print("phoneme_ids:", phoneme_ids)
79 | print("len(phoneme_ids):", len(phoneme_ids))
80 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/symbols.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | PAD = "_"
4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” '
5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
6 | IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
7 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS)
8 | SPACE_ID = SYMBOLS.index(" ")
9 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)}
10 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)}
11 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/utils/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def str2bool(str):
5 | return True if str.lower() == 'true' else False
6 |
7 |
8 | def get_newest_ckpt(string_list):
9 | # Define a regex pattern that matches the numbers in each string
10 | pattern = r'epoch=(\d+)-step=(\d+)\.ckpt'
11 |
12 | # Use the regex to extract the numbers from each string and build a list of tuples
13 | extracted_info = []
14 | for string in string_list:
15 | match = re.match(pattern, string)
16 | if match:
17 | epoch = int(match.group(1))
18 | step = int(match.group(2))
19 | extracted_info.append((epoch, step, string))
20 | # Sort by the number after "epoch", then by the number after "step"
21 | sorted_info = sorted(
22 | extracted_info, key=lambda x: (x[0], x[1]), reverse=True)
23 | # Take the newest ckpt file name
24 | newest_ckpt = sorted_info[0][2]
25 | return newest_ckpt
26 |
27 |
28 | # Returns the file's first line if it exists and is non-empty, otherwise False
29 | def check_txt_file(file_path):
30 | try:
31 | with open(file_path, 'r') as file:
32 | text = file.readline().strip()
33 | assert text.strip() != ''
34 | return text
35 | except Exception:
36 | return False
37 | return False
38 |
--------------------------------------------------------------------------------
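
A minimal usage sketch of get_newest_ckpt from the file above, with hypothetical checkpoint file names: the entry with the highest (epoch, step) pair is returned.

    # Hypothetical file names; get_newest_ckpt picks the highest (epoch, step) pair.
    ckpts = ["epoch=4-step=1000.ckpt", "epoch=12-step=300.ckpt", "epoch=12-step=900.ckpt"]
    print(get_newest_ckpt(ckpts))  # -> "epoch=12-step=900.ckpt"
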
/GPT_SoVITS/AR/utils/initialize.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Initialize modules for espnet2 neural networks."""
3 | import torch
4 | from typeguard import check_argument_types
5 |
6 |
7 | def initialize(model: torch.nn.Module, init: str):
8 | """Initialize weights of a neural network module.
9 |
10 | Parameters are initialized using the given method or distribution.
11 |
12 | Custom initialization routines can be implemented into submodules
13 | as function `espnet_initialization_fn` within the custom module.
14 |
15 | Args:
16 | model: Target.
17 | init: Method of initialization.
18 | """
19 | assert check_argument_types()
20 | print("init with", init)
21 |
22 | # weight init
23 | for p in model.parameters():
24 | if p.dim() > 1:
25 | if init == "xavier_uniform":
26 | torch.nn.init.xavier_uniform_(p.data)
27 | elif init == "xavier_normal":
28 | torch.nn.init.xavier_normal_(p.data)
29 | elif init == "kaiming_uniform":
30 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu")
31 | elif init == "kaiming_normal":
32 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu")
33 | else:
34 | raise ValueError("Unknown initialization: " + init)
35 | # bias init
36 | for name, p in model.named_parameters():
37 | if ".bias" in name and p.dim() == 1:
38 | p.data.zero_()
39 |
--------------------------------------------------------------------------------
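
A minimal usage sketch of initialize() from the file above, assuming the function is in scope and typeguard 2.x (implied by the file's import) is installed; the wrapped Linear layer is just an arbitrary example module.

    import torch

    net = torch.nn.Sequential(torch.nn.Linear(8, 8))
    initialize(net, "xavier_uniform")  # weights get xavier-uniform init, ".bias" parameters are zeroed
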
/GPT_SoVITS/AR/utils/io.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | import yaml
5 |
6 |
7 | def load_yaml_config(path):
8 | with open(path) as f:
9 | config = yaml.full_load(f)
10 | return config
11 |
12 |
13 | def save_config_to_yaml(config, path):
14 | assert path.endswith(".yaml")
15 | with open(path, "w") as f:
16 | f.write(yaml.dump(config))
17 | f.close()
18 |
19 |
20 | def write_args(args, path):
21 | args_dict = dict(
22 | (name, getattr(args, name)) for name in dir(args) if not name.startswith("_")
23 | )
24 | with open(path, "a") as args_file:
25 | args_file.write("==> torch version: {}\n".format(torch.__version__))
26 | args_file.write(
27 | "==> cudnn version: {}\n".format(torch.backends.cudnn.version())
28 | )
29 | args_file.write("==> Cmd:\n")
30 | args_file.write(str(sys.argv))
31 | args_file.write("\n==> args:\n")
32 | for k, v in sorted(args_dict.items()):
33 | args_file.write(" %s: %s\n" % (str(k), str(v)))
34 | args_file.close()
35 |
--------------------------------------------------------------------------------
/GPT_SoVITS/TTS_infer_pack/__init__.py:
--------------------------------------------------------------------------------
1 | from . import TTS, text_segmentation_method
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 300
4 | batch_size: 8
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 16
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 512
24 | hidden_dim: 512
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 12
28 | dropout: 0
29 | EOS: 1024
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1big.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 300
4 | batch_size: 8
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 16-mixed
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 1024
24 | hidden_dim: 1024
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 16
28 | dropout: 0
29 | EOS: 1024
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1big2.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 300
4 | batch_size: 12
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 16-mixed
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 1024
24 | hidden_dim: 1024
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 6
28 | dropout: 0
29 | EOS: 1024
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1longer.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 20
4 | batch_size: 8
5 | save_every_n_epoch: 1
6 | precision: 16-mixed
7 | gradient_clip: 1.0
8 | optimizer:
9 | lr: 0.01
10 | lr_init: 0.00001
11 | lr_end: 0.0001
12 | warmup_steps: 2000
13 | decay_steps: 40000
14 | data:
15 | max_eval_sample: 8
16 | max_sec: 54
17 | num_workers: 4
18 | pad_val: 1024 # same with EOS in model
19 | model:
20 | vocab_size: 1025
21 | phoneme_vocab_size: 512
22 | embedding_dim: 512
23 | hidden_dim: 512
24 | head: 16
25 | linear_units: 2048
26 | n_layer: 24
27 | dropout: 0
28 | EOS: 1024
29 | random_bert: 0
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1mq.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 100
4 | batch_size: 6
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 32
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 40
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | saving_path: "ckpt/"
22 | resume_checkpoint: null
23 | vocoder_config_path: "quantizer/new_ckpt/config.json"
24 | vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000"
25 | datadir: "/home/liweiche/GigaSpeech/wavs"
26 | metapath: "/home/liweiche/GigaSpeech/train2.json"
27 | val_metapath: "/home/liweiche/GigaSpeech/dev2.json"
28 | sampledir: "logs/"
29 | pretrained_path: null
30 | lr: 0.0001
31 | batch_size: 200.0
32 | train_bucket_size: 8192
33 | training_step: 800000
34 | optim_flat_percent: 0.0
35 | warmup_step: 50
36 | adam_beta1: 0.9
37 | adam_beta2: 0.98
38 | ffd_size: 3072
39 | hidden_size: 768
40 | enc_nlayers: 6
41 | dec_nlayers: 6
42 | nheads: 12
43 | ar_layer: 4
44 | ar_ffd_size: 1024
45 | ar_hidden_size: 256
46 | ar_nheads: 4
47 | aligner_softmax_temp: 1.0
48 | layer_norm_eps: 0.00001
49 | speaker_embed_dropout: 0.05
50 | label_smoothing: 0.0
51 | val_check_interval: 5000
52 | check_val_every_n_epoch: 1
53 | precision: "fp16"
54 | nworkers: 16
55 | distributed: true
56 | accelerator: "ddp"
57 | version: null
58 | accumulate_grad_batches: 1
59 | use_repetition_token: true
60 | use_repetition_gating: false
61 | repetition_penalty: 1.0
62 | sampling_temperature: 1.0
63 | top_k: -1
64 | min_top_k: 3
65 | top_p: 0.8
66 | sample_num: 4
67 | length_penalty_max_length: 15000
68 | length_penalty_max_prob: 0.95
69 | max_input_length: 2048
70 | max_output_length: 2000
71 | sample_rate: 16000
72 | n_codes: 1024
73 | n_cluster_groups: 1
74 | phone_context_window: 4
75 | phoneset_size: 1000
76 | inference:
77 | top_k: 5
78 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s2.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 100,
4 | "eval_interval": 500,
5 | "seed": 1234,
6 | "epochs": 100,
7 | "learning_rate": 0.0001,
8 | "betas": [
9 | 0.8,
10 | 0.99
11 | ],
12 | "eps": 1e-09,
13 | "batch_size": 32,
14 | "fp16_run": true,
15 | "lr_decay": 0.999875,
16 | "segment_size": 20480,
17 | "init_lr_ratio": 1,
18 | "warmup_epochs": 0,
19 | "c_mel": 45,
20 | "c_kl": 1.0,
21 | "text_low_lr_rate": 0.4
22 | },
23 | "data": {
24 | "max_wav_value": 32768.0,
25 | "sampling_rate": 32000,
26 | "filter_length": 2048,
27 | "hop_length": 640,
28 | "win_length": 2048,
29 | "n_mel_channels": 128,
30 | "mel_fmin": 0.0,
31 | "mel_fmax": null,
32 | "add_blank": true,
33 | "n_speakers": 300,
34 | "cleaned_text": true
35 | },
36 | "model": {
37 | "inter_channels": 192,
38 | "hidden_channels": 192,
39 | "filter_channels": 768,
40 | "n_heads": 2,
41 | "n_layers": 6,
42 | "kernel_size": 3,
43 | "p_dropout": 0.1,
44 | "resblock": "1",
45 | "resblock_kernel_sizes": [
46 | 3,
47 | 7,
48 | 11
49 | ],
50 | "resblock_dilation_sizes": [
51 | [
52 | 1,
53 | 3,
54 | 5
55 | ],
56 | [
57 | 1,
58 | 3,
59 | 5
60 | ],
61 | [
62 | 1,
63 | 3,
64 | 5
65 | ]
66 | ],
67 | "upsample_rates": [
68 | 10,
69 | 8,
70 | 2,
71 | 2,
72 | 2
73 | ],
74 | "upsample_initial_channel": 512,
75 | "upsample_kernel_sizes": [
76 | 16,
77 | 16,
78 | 8,
79 | 2,
80 | 2
81 | ],
82 | "n_layers_q": 3,
83 | "use_spectral_norm": false,
84 | "gin_channels": 512,
85 | "semantic_frame_rate": "25hz",
86 | "freeze_quantizer": true
87 | },
88 | "s2_ckpt_dir": "logs/s2/big2k1",
89 | "content_module": "cnhubert"
90 | }
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/train.yaml:
--------------------------------------------------------------------------------
1 | gpu:
2 | n_card: 1
3 | n_process_per_card: 2
4 | io:
5 | text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS
6 | save_every_n_epoch: 1
7 | precision: 16-mixed
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 512
24 | hidden_dim: 512
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 24
28 | dropout: 0
29 | EOS: 1024
30 | random_bert: 0
31 | inference:
32 | top_k: 5
33 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/tts_infer.yaml:
--------------------------------------------------------------------------------
1 | custom:
2 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
3 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
4 | device: cuda
5 | is_half: true
6 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
7 | vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
8 | default:
9 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
10 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
11 | device: cpu
12 | is_half: false
13 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
14 | vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
15 |
--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/__init__.py:
--------------------------------------------------------------------------------
1 | from . import cnhubert, whisper_enc
2 |
3 | content_module_map = {
4 | 'cnhubert': cnhubert,
5 | 'whisper': whisper_enc
6 | }
--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/cnhubert.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import librosa
4 | import torch
5 | import torch.nn.functional as F
6 | import soundfile as sf
7 | import logging
8 |
9 | logging.getLogger("numba").setLevel(logging.WARNING)
10 |
11 | from transformers import (
12 | Wav2Vec2FeatureExtractor,
13 | HubertModel,
14 | )
15 |
16 | import utils
17 | import torch.nn as nn
18 |
19 | cnhubert_base_path = None
20 |
21 |
22 | class CNHubert(nn.Module):
23 | def __init__(self, base_path:str=None):
24 | super().__init__()
25 | if base_path is None:
26 | base_path = cnhubert_base_path
27 | self.model = HubertModel.from_pretrained(base_path)
28 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
29 | base_path
30 | )
31 |
32 |
33 | def forward(self, x):
34 | input_values = self.feature_extractor(
35 | x, return_tensors="pt", sampling_rate=16000
36 | ).input_values.to(x.device)
37 | feats = self.model(input_values)["last_hidden_state"]
38 | return feats
39 |
40 |
41 | # class CNHubertLarge(nn.Module):
42 | # def __init__(self):
43 | # super().__init__()
44 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
45 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
46 | # def forward(self, x):
47 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
48 | # feats = self.model(input_values)["last_hidden_state"]
49 | # return feats
50 | #
51 | # class CVec(nn.Module):
52 | # def __init__(self):
53 | # super().__init__()
54 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
55 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
56 | # def forward(self, x):
57 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
58 | # feats = self.model(input_values)["last_hidden_state"]
59 | # return feats
60 | #
61 | # class cnw2v2base(nn.Module):
62 | # def __init__(self):
63 | # super().__init__()
64 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
65 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
66 | # def forward(self, x):
67 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
68 | # feats = self.model(input_values)["last_hidden_state"]
69 | # return feats
70 |
71 |
72 | def get_model():
73 | model = CNHubert()
74 | model.eval()
75 | return model
76 |
77 |
78 | # def get_large_model():
79 | # model = CNHubertLarge()
80 | # model.eval()
81 | # return model
82 | #
83 | # def get_model_cvec():
84 | # model = CVec()
85 | # model.eval()
86 | # return model
87 | #
88 | # def get_model_cnw2v2base():
89 | # model = cnw2v2base()
90 | # model.eval()
91 | # return model
92 |
93 |
94 | def get_content(hmodel, wav_16k_tensor):
95 | with torch.no_grad():
96 | feats = hmodel(wav_16k_tensor)
97 | return feats.transpose(1, 2)
98 |
99 |
100 | if __name__ == "__main__":
101 | model = get_model()
102 | src_path = "/Users/Shared/原音频2.wav"
103 | wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000)
104 | model = model
105 | wav_16k_tensor = wav_16k_tensor
106 | feats = get_content(model, wav_16k_tensor)
107 | print(feats.shape)
108 |
--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/whisper_enc.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_model():
5 | import whisper
6 |
7 | model = whisper.load_model("small", device="cpu")
8 |
9 | return model.encoder
10 |
11 |
12 | def get_content(model=None, wav_16k_tensor=None):
13 | from whisper import log_mel_spectrogram, pad_or_trim
14 |
15 | dev = next(model.parameters()).device
16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000]
17 | # if torch.cuda.is_available():
18 | # mel = mel.to(torch.float16)
19 | feature_len = mel.shape[-1] // 2
20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频"
21 | with torch.no_grad():
22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[
23 | :1, :feature_len, :
24 | ].transpose(1, 2)
25 | return feature
26 |
--------------------------------------------------------------------------------
/GPT_SoVITS/module/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/module/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/module/losses.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch.nn import functional as F
5 |
6 |
7 | def feature_loss(fmap_r, fmap_g):
8 | loss = 0
9 | for dr, dg in zip(fmap_r, fmap_g):
10 | for rl, gl in zip(dr, dg):
11 | rl = rl.float().detach()
12 | gl = gl.float()
13 | loss += torch.mean(torch.abs(rl - gl))
14 |
15 | return loss * 2
16 |
17 |
18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
19 | loss = 0
20 | r_losses = []
21 | g_losses = []
22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
23 | dr = dr.float()
24 | dg = dg.float()
25 | r_loss = torch.mean((1 - dr) ** 2)
26 | g_loss = torch.mean(dg**2)
27 | loss += r_loss + g_loss
28 | r_losses.append(r_loss.item())
29 | g_losses.append(g_loss.item())
30 |
31 | return loss, r_losses, g_losses
32 |
33 |
34 | def generator_loss(disc_outputs):
35 | loss = 0
36 | gen_losses = []
37 | for dg in disc_outputs:
38 | dg = dg.float()
39 | l = torch.mean((1 - dg) ** 2)
40 | gen_losses.append(l)
41 | loss += l
42 |
43 | return loss, gen_losses
44 |
45 |
46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
47 | """
48 | z_p, logs_q: [b, h, t_t]
49 | m_p, logs_p: [b, h, t_t]
50 | """
51 | z_p = z_p.float()
52 | logs_q = logs_q.float()
53 | m_p = m_p.float()
54 | logs_p = logs_p.float()
55 | z_mask = z_mask.float()
56 |
57 | kl = logs_p - logs_q - 0.5
58 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
59 | kl = torch.sum(kl * z_mask)
60 | l = kl / torch.sum(z_mask)
61 | return l
62 |
63 |
64 | def mle_loss(z, m, logs, logdet, mask):
65 | l = torch.sum(logs) + 0.5 * torch.sum(
66 | torch.exp(-2 * logs) * ((z - m) ** 2)
67 | ) # neg normal likelihood w/o the constant term
68 | l = l - torch.sum(logdet) # log jacobian determinant
69 | l = l / torch.sum(
70 | torch.ones_like(z) * mask
71 | ) # averaging across batch, channel and time axes
72 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term
73 | return l
74 |
--------------------------------------------------------------------------------
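For reference, `kl_loss` above is the masked KL term of VITS-style training: the KL between the flow-mapped posterior sample and the prior N(m_p, exp(logs_p)^2), with the posterior-variance term replaced by the single sample `z_p` (a one-sample Monte-Carlo estimate), summed over channels and averaged over unmasked frames:

```
\mathcal{L}_{\mathrm{KL}}
  = \frac{\sum_{b,t} m_{b,t} \sum_{h}
      \left[ \log\sigma_p - \log\sigma_q - \tfrac{1}{2}
             + \frac{(z - \mu_p)^2}{2\sigma_p^2} \right]}
    {\sum_{b,t} m_{b,t}},
  \qquad \sigma_q = e^{\texttt{logs\_q}},\ \sigma_p = e^{\texttt{logs\_p}},\ z = \texttt{z\_p}
```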
/GPT_SoVITS/my_utils.py:
--------------------------------------------------------------------------------
1 | import ffmpeg
2 | import numpy as np
3 |
4 |
5 | def load_audio(file, sr):
6 | try:
7 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
8 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
9 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
10 | file = (
11 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
12 |         )  # strip stray spaces, quotes and newlines that users often copy along with the path
13 | out, _ = (
14 | ffmpeg.input(file, threads=0)
15 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
16 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
17 | )
18 | except Exception as e:
19 | raise RuntimeError(f"Failed to load audio: {e}")
20 |
21 | return np.frombuffer(out, np.float32).flatten()
22 |
--------------------------------------------------------------------------------
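`load_audio` shells out to the ffmpeg CLI, so ffmpeg must be on `PATH`; usage is a one-liner (the file path below is only an example):

```
from my_utils import load_audio   # assumes GPT_SoVITS/ is on sys.path

audio = load_audio("ref_audio.wav", 32000)   # any ffmpeg-readable file -> mono float32 at 32 kHz
print(audio.shape, audio.dtype)              # (n_samples,) float32
```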
/GPT_SoVITS/prepare_datasets/3-get-semantic.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | inp_text = os.environ.get("inp_text")
4 | exp_name = os.environ.get("exp_name")
5 | i_part = os.environ.get("i_part")
6 | all_parts = os.environ.get("all_parts")
7 | os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES")
8 | opt_dir = os.environ.get("opt_dir")
9 | pretrained_s2G = os.environ.get("pretrained_s2G")
10 | s2config_path = os.environ.get("s2config_path")
11 | is_half = eval(os.environ.get("is_half", "True"))
12 | import math, traceback
13 | import multiprocessing
14 | import sys, pdb
15 |
16 | now_dir = os.getcwd()
17 | sys.path.append(now_dir)
18 | from random import shuffle
19 | import torch.multiprocessing as mp
20 | from glob import glob
21 | from tqdm import tqdm
22 | import logging, librosa, utils, torch
23 | from module.models import SynthesizerTrn
24 |
25 | logging.getLogger("numba").setLevel(logging.WARNING)
26 | # from config import pretrained_s2G
27 |
28 | # inp_text=sys.argv[1]
29 | # exp_name=sys.argv[2]
30 | # i_part=sys.argv[3]
31 | # all_parts=sys.argv[4]
32 | # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[5]
33 | # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
34 |
35 |
36 | hubert_dir = "%s/4-cnhubert" % (opt_dir)
37 | semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
38 | if os.path.exists(semantic_path) == False:
39 | os.makedirs(opt_dir, exist_ok=True)
40 |
41 | if torch.cuda.is_available():
42 | device = "cuda"
43 | # elif torch.backends.mps.is_available():
44 | # device = "mps"
45 | else:
46 | device = "cpu"
47 | hps = utils.get_hparams_from_file(s2config_path)
48 | vq_model = SynthesizerTrn(
49 | hps.data.filter_length // 2 + 1,
50 | hps.train.segment_size // hps.data.hop_length,
51 | n_speakers=hps.data.n_speakers,
52 | **hps.model
53 | )
54 | if is_half == True:
55 | vq_model = vq_model.half().to(device)
56 | else:
57 | vq_model = vq_model.to(device)
58 | vq_model.eval()
59 | # utils.load_checkpoint(utils.latest_checkpoint_path(hps.s2_ckpt_dir, "G_*.pth"), vq_model, None, True)
60 | # utils.load_checkpoint(pretrained_s2G, vq_model, None, True)
61 | print(
62 | vq_model.load_state_dict(
63 | torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False
64 | )
65 | )
66 |
67 | def name2go(wav_name, lines):
68 | hubert_path = "%s/%s.pt" % (hubert_dir, wav_name)
69 | if os.path.exists(hubert_path) == False:
70 | return
71 | ssl_content = torch.load(hubert_path, map_location="cpu")
72 | if is_half == True:
73 | ssl_content = ssl_content.half().to(device)
74 | else:
75 | ssl_content = ssl_content.to(device)
76 | codes = vq_model.extract_latent(ssl_content)
77 | semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()])
78 | lines.append("%s\t%s" % (wav_name, semantic))
79 |
80 | with open(inp_text, "r", encoding="utf8") as f:
81 | lines = f.read().strip("\n").split("\n")
82 |
83 | lines1 = []
84 | for line in lines[int(i_part) :: int(all_parts)]:
85 | # print(line)
86 | try:
87 | # wav_name,text=line.split("\t")
88 | wav_name, spk_name, language, text = line.split("|")
89 | wav_name = os.path.basename(wav_name)
90 | # name2go(name,lines1)
91 | name2go(wav_name, lines1)
92 | except:
93 | print(line, traceback.format_exc())
94 | with open(semantic_path, "w", encoding="utf8") as f:
95 | f.write("\n".join(lines1))
96 |
--------------------------------------------------------------------------------
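`3-get-semantic.py` is driven entirely by environment variables (normally set by the training WebUI). A hedged sketch of running shard 0 of 1 by hand — every path here is a placeholder, and `opt_dir` must already contain the `4-cnhubert/*.pt` features produced by the previous preparation step:

```
import os, subprocess

env = dict(
    os.environ,
    inp_text="data/my_voice.list",    # annotation lines: wav|speaker|language|text
    exp_name="my_voice",
    i_part="0", all_parts="1",        # process shard 0 of 1
    _CUDA_VISIBLE_DEVICES="0",
    opt_dir="logs/my_voice",
    pretrained_s2G="GPT_SoVITS/pretrained_models/s2G488k.pth",
    s2config_path="GPT_SoVITS/configs/s2.json",
    is_half="True",
)
subprocess.run(["python", "GPT_SoVITS/prepare_datasets/3-get-semantic.py"], env=env, check=True)
# Writes logs/my_voice/6-name2semantic-0.tsv, one "wav_name<TAB>semantic ids" row per clip.
```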
/GPT_SoVITS/pretrained_models/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
--------------------------------------------------------------------------------
/GPT_SoVITS/process_ckpt.py:
--------------------------------------------------------------------------------
1 | import traceback
2 | from collections import OrderedDict
3 | from time import time as ttime
4 | import shutil,os
5 | import torch
6 | from tools.i18n.i18n import I18nAuto
7 |
8 | i18n = I18nAuto()
9 |
10 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path
11 | dir=os.path.dirname(path)
12 | name=os.path.basename(path)
13 | tmp_path="%s.pth"%(ttime())
14 | torch.save(fea,tmp_path)
15 | shutil.move(tmp_path,"%s/%s"%(dir,name))
16 |
17 | def savee(ckpt, name, epoch, steps, hps):
18 | try:
19 | opt = OrderedDict()
20 | opt["weight"] = {}
21 | for key in ckpt.keys():
22 | if "enc_q" in key:
23 | continue
24 | opt["weight"][key] = ckpt[key].half()
25 | opt["config"] = hps
26 | opt["info"] = "%sepoch_%siteration" % (epoch, steps)
27 | # torch.save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
28 | my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
29 | return "Success."
30 | except:
31 | return traceback.format_exc()
32 |
--------------------------------------------------------------------------------
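`savee` writes a small dict-style checkpoint: fp16 weights with the `enc_q.*` posterior encoder dropped, plus the hparams and an info string. Reading one back looks like this (illustrative path; `3-get-semantic.py` earlier in this listing loads `pretrained_s2G` through the same `["weight"]` key):

```
import torch

ckpt = torch.load("SoVITS_weights/my_voice_e8_s248.pth", map_location="cpu")  # example path
print(ckpt["info"])        # e.g. "8epoch_248iteration"
hps = ckpt["config"]       # hyperparameters saved alongside the weights
state = ckpt["weight"]     # fp16 tensors, enc_q.* excluded
# vq_model.load_state_dict(state, strict=False)
```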
/GPT_SoVITS/text/__init__.py:
--------------------------------------------------------------------------------
1 | from text.symbols import *
2 |
3 |
4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
5 |
6 | def cleaned_text_to_sequence(cleaned_text):
7 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8 | Args:
9 |       cleaned_text: string of phoneme symbols to convert to a sequence
10 | Returns:
11 | List of integers corresponding to the symbols in the text
12 | '''
13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 | return phones
15 |
16 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from text import chinese, japanese, cleaned_text_to_sequence, symbols, english
2 |
3 | language_module_map = {"zh": chinese, "ja": japanese, "en": english}
4 | special = [
5 | # ("%", "zh", "SP"),
6 | ("¥", "zh", "SP2"),
7 | ("^", "zh", "SP3"),
8 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧
9 | ]
10 |
11 |
12 | def clean_text(text, language):
13 | if(language not in language_module_map):
14 | language="en"
15 | text=" "
16 | for special_s, special_l, target_symbol in special:
17 | if special_s in text and language == special_l:
18 | return clean_special(text, language, special_s, target_symbol)
19 | language_module = language_module_map[language]
20 | norm_text = language_module.text_normalize(text)
21 | if language == "zh":
22 | phones, word2ph = language_module.g2p(norm_text)
23 | assert len(phones) == sum(word2ph)
24 | assert len(norm_text) == len(word2ph)
25 | else:
26 | phones = language_module.g2p(norm_text)
27 | word2ph = None
28 |
29 | for ph in phones:
30 | assert ph in symbols
31 | return phones, word2ph, norm_text
32 |
33 |
34 | def clean_special(text, language, special_s, target_symbol):
35 | """
36 |     Handle special silence-segment (SP) symbols.
37 | """
38 | text = text.replace(special_s, ",")
39 | language_module = language_module_map[language]
40 | norm_text = language_module.text_normalize(text)
41 | phones = language_module.g2p(norm_text)
42 | new_ph = []
43 | for ph in phones[0]:
44 | assert ph in symbols
45 | if ph == ",":
46 | new_ph.append(target_symbol)
47 | else:
48 | new_ph.append(ph)
49 | return new_ph, phones[1], norm_text
50 |
51 |
52 | def text_to_sequence(text, language):
53 |     phones, word2ph, norm_text = clean_text(text, language)
54 | return cleaned_text_to_sequence(phones)
55 |
56 |
57 | if __name__ == "__main__":
58 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh"))
59 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/engdict-hot.rep:
--------------------------------------------------------------------------------
1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1
2 | JSON JH EY1 S AH0 N
--------------------------------------------------------------------------------
/GPT_SoVITS/text/engdict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/text/engdict_cache.pickle
--------------------------------------------------------------------------------
/GPT_SoVITS/text/namedict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/text/namedict_cache.pickle
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/README.md:
--------------------------------------------------------------------------------
1 | ## Supported NSW (Non-Standard-Word) Normalization
2 |
3 | |NSW type|raw|normalized|
4 | |:--|:-|:-|
5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九|
6 | |cardinal|这块黄金重达324.75克<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日|
9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我|
10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度|
11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨|
13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万|
14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
15 | ## References
16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files)
17 |
--------------------------------------------------------------------------------
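The entry point for all of the rules in this package is the `TextNormalizer` class re-exported by `__init__.py` below (inherited from PaddleSpeech). A usage sketch, assuming `GPT_SoVITS/` is on `sys.path`:

```
from text.zh_normalization import TextNormalizer

tx = TextNormalizer()
for sent in tx.normalize("她出生于86年8月18日,今天的最低气温达到-10°C"):
    print(sent)   # roughly: 她出生于八六年八月十八日, / 今天的最低气温达到零下十度
```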
/GPT_SoVITS/text/zh_normalization/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from text.zh_normalization.text_normlization import *
15 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/chronology.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import DIGITS
17 | from .num import num2str
18 | from .num import verbalize_cardinal
19 | from .num import verbalize_digit
20 |
21 |
22 | def _time_num2str(num_string: str) -> str:
23 | """A special case for verbalizing number in time."""
24 | result = num2str(num_string.lstrip('0'))
25 | if num_string.startswith('0'):
26 | result = DIGITS['0'] + result
27 | return result
28 |
29 |
30 | # 时刻表达式
31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
32 | r':([0-5][0-9])'
33 | r'(:([0-5][0-9]))?')
34 |
35 | # 时间范围,如8:30-12:30
36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
37 | r':([0-5][0-9])'
38 | r'(:([0-5][0-9]))?'
39 | r'(~|-)'
40 | r'([0-1]?[0-9]|2[0-3])'
41 | r':([0-5][0-9])'
42 | r'(:([0-5][0-9]))?')
43 |
44 |
45 | def replace_time(match) -> str:
46 | """
47 | Args:
48 | match (re.Match)
49 | Returns:
50 | str
51 | """
52 |
53 | is_range = len(match.groups()) > 5
54 |
55 | hour = match.group(1)
56 | minute = match.group(2)
57 | second = match.group(4)
58 |
59 | if is_range:
60 | hour_2 = match.group(6)
61 | minute_2 = match.group(7)
62 | second_2 = match.group(9)
63 |
64 | result = f"{num2str(hour)}点"
65 | if minute.lstrip('0'):
66 | if int(minute) == 30:
67 | result += "半"
68 | else:
69 | result += f"{_time_num2str(minute)}分"
70 | if second and second.lstrip('0'):
71 | result += f"{_time_num2str(second)}秒"
72 |
73 | if is_range:
74 | result += "至"
75 | result += f"{num2str(hour_2)}点"
76 | if minute_2.lstrip('0'):
77 |             if int(minute_2) == 30:
78 | result += "半"
79 | else:
80 | result += f"{_time_num2str(minute_2)}分"
81 | if second_2 and second_2.lstrip('0'):
82 | result += f"{_time_num2str(second_2)}秒"
83 |
84 | return result
85 |
86 |
87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年'
88 | r'((0?[1-9]|1[0-2])月)?'
89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?')
90 |
91 |
92 | def replace_date(match) -> str:
93 | """
94 | Args:
95 | match (re.Match)
96 | Returns:
97 | str
98 | """
99 | year = match.group(1)
100 | month = match.group(3)
101 | day = match.group(5)
102 | result = ""
103 | if year:
104 | result += f"{verbalize_digit(year)}年"
105 | if month:
106 | result += f"{verbalize_cardinal(month)}月"
107 | if day:
108 | result += f"{verbalize_cardinal(day)}{match.group(9)}"
109 | return result
110 |
111 |
112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期
113 | RE_DATE2 = re.compile(
114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])')
115 |
116 |
117 | def replace_date2(match) -> str:
118 | """
119 | Args:
120 | match (re.Match)
121 | Returns:
122 | str
123 | """
124 | year = match.group(1)
125 | month = match.group(3)
126 | day = match.group(4)
127 | result = ""
128 | if year:
129 | result += f"{verbalize_digit(year)}年"
130 | if month:
131 | result += f"{verbalize_cardinal(month)}月"
132 | if day:
133 | result += f"{verbalize_cardinal(day)}日"
134 | return result
135 |
--------------------------------------------------------------------------------
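Each regex/replacer pair above is designed to be passed to `re.sub`; `text_normlization.py` chains them into the full pipeline, but they also work standalone (the import again assumes `GPT_SoVITS/` is on `sys.path`):

```
from text.zh_normalization.chronology import RE_TIME, replace_time, RE_DATE, replace_date

print(RE_TIME.sub(replace_time, "等会请在12:05请通知我"))  # -> 等会请在十二点零五分请通知我
print(RE_DATE.sub(replace_date, "她出生于86年8月18日"))    # -> 她出生于八六年八月十八日
```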
/GPT_SoVITS/text/zh_normalization/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 | import string
16 |
17 | from pypinyin.constants import SUPPORT_UCS4
18 |
19 | # 全角半角转换
20 | # 英文字符全角 -> 半角映射表 (num: 52)
21 | F2H_ASCII_LETTERS = {
22 | ord(char) + 65248: ord(char)
23 | for char in string.ascii_letters
24 | }
25 |
26 | # 英文字符半角 -> 全角映射表
27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
28 |
29 | # 数字字符全角 -> 半角映射表 (num: 10)
30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits}
31 | # 数字字符半角 -> 全角映射表
32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}
33 |
34 | # 标点符号全角 -> 半角映射表 (num: 32)
35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
36 | # 标点符号半角 -> 全角映射表
37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
38 |
39 | # 空格 (num: 1)
40 | F2H_SPACE = {'\u3000': ' '}
41 | H2F_SPACE = {' ': '\u3000'}
42 |
43 | # 非"有拼音的汉字"的字符串,可用于NSW提取
44 | if SUPPORT_UCS4:
45 | RE_NSW = re.compile(r'(?:[^'
46 | r'\u3007' # 〇
47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF]
51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F]
52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D]
53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F]
54 | r'])+')
55 | else:
56 | RE_NSW = re.compile( # pragma: no cover
57 | r'(?:[^'
58 | r'\u3007' # 〇
59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
62 | r'])+')
63 |
--------------------------------------------------------------------------------
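The `F2H_*`/`H2F_*` tables map code points to code points, so they plug directly into `str.translate`. A quick sketch of full-width → half-width folding, which is how the normalizer is expected to use them:

```
from text.zh_normalization.constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_PUNCTUATIONS

table = {**F2H_ASCII_LETTERS, **F2H_DIGITS, **F2H_PUNCTUATIONS}
print("GPT-SoVITS 2024!".translate(table))   # -> GPT-SoVITS 2024!
```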
/GPT_SoVITS/text/zh_normalization/phonecode.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import verbalize_digit
17 |
18 | # 规范化固话/手机号码
19 | # 手机
20 | # http://www.jihaoba.com/news/show/13680
21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
22 | # 联通:130、131、132、156、155、186、185、176
23 | # 电信:133、153、189、180、181、177
24 | RE_MOBILE_PHONE = re.compile(
25 |     r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
26 | RE_TELEPHONE = re.compile(
27 |     r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
28 | 
29 | # 全国统一的号码400开头
30 | RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
31 | 
32 | 
33 | def phone2str(phone_string: str, mobile=True) -> str:
34 | if mobile:
35 | sp_parts = phone_string.strip('+').split()
36 | result = ','.join(
37 | [verbalize_digit(part, alt_one=True) for part in sp_parts])
38 | return result
39 | else:
40 | sil_parts = phone_string.split('-')
41 | result = ','.join(
42 | [verbalize_digit(part, alt_one=True) for part in sil_parts])
43 | return result
44 |
45 |
46 | def replace_phone(match) -> str:
47 | """
48 | Args:
49 | match (re.Match)
50 | Returns:
51 | str
52 | """
53 | return phone2str(match.group(0), mobile=False)
54 |
55 |
56 | def replace_mobile(match) -> str:
57 | """
58 | Args:
59 | match (re.Match)
60 | Returns:
61 | str
62 | """
63 | return phone2str(match.group(0))
64 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/quantifier.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import num2str
17 |
18 | # 温度表达式,温度会影响负号的读法
19 | # -3°C 零下三度
20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
21 | measure_dict = {
22 | "cm2": "平方厘米",
23 | "cm²": "平方厘米",
24 | "cm3": "立方厘米",
25 | "cm³": "立方厘米",
26 | "cm": "厘米",
27 | "db": "分贝",
28 | "ds": "毫秒",
29 | "kg": "千克",
30 | "km": "千米",
31 | "m2": "平方米",
32 | "m²": "平方米",
33 | "m³": "立方米",
34 | "m3": "立方米",
35 | "ml": "毫升",
36 | "m": "米",
37 | "mm": "毫米",
38 | "s": "秒"
39 | }
40 |
41 |
42 | def replace_temperature(match) -> str:
43 | """
44 | Args:
45 | match (re.Match)
46 | Returns:
47 | str
48 | """
49 | sign = match.group(1)
50 | temperature = match.group(2)
51 |     unit = match.group(4)
52 | sign: str = "零下" if sign else ""
53 | temperature: str = num2str(temperature)
54 | unit: str = "摄氏度" if unit == "摄氏度" else "度"
55 | result = f"{sign}{temperature}{unit}"
56 | return result
57 |
58 |
59 | def replace_measure(sentence) -> str:
60 | for q_notation in measure_dict:
61 | if q_notation in sentence:
62 | sentence = sentence.replace(q_notation, measure_dict[q_notation])
63 | return sentence
64 |
--------------------------------------------------------------------------------
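Same pattern as `chronology.py` — the temperature rule and the unit dictionary can be exercised on their own:

```
from text.zh_normalization.quantifier import RE_TEMPERATURE, replace_temperature, replace_measure

print(RE_TEMPERATURE.sub(replace_temperature, "今天的最低气温达到-10°C"))  # -> 今天的最低气温达到零下十度
print(replace_measure("全长3km"))                                          # -> 全长3千米
```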
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 RVC-Boss
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Synthesizers/base/Base_TTS_Synthesizer.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | from .Base_TTS_Task import Base_TTS_Task as TTS_Task
4 | import json
5 | from typing import List, Dict, Literal, Optional, Any, Union, Generator, Tuple
6 | from pydantic import BaseModel, Field, model_validator
7 | import numpy as np
8 | from abc import ABC, abstractmethod
9 | from typing import Dict, List, Union, Generator, Tuple
10 | from typing_extensions import Literal
11 | import numpy as np
12 | import wave,io
13 |
14 | class Base_TTS_Synthesizer(ABC):
15 | """
16 | Abstract base class for a Text-To-Speech (TTS) synthesizer.
17 |
18 | Attributes:
19 | ui_config (Dict[str, List]): A dictionary containing UI configuration settings.
20 | debug_mode (bool): Flag to toggle debug mode for additional logging and debugging information.
21 |
22 | """
23 |
24 | ui_config: Dict[str, List] = {}
25 | debug_mode: bool = False
26 |
27 | def __init__(self, **kwargs):
28 | """
29 | Initializes the TTS synthesizer with optional UI configurations and debug mode setting.
30 |
31 | Args:
32 | ui_config (Dict[str, List], optional): Configuration for user interface settings.
33 | debug_mode (bool, optional): Enables or disables debug mode.
34 |
35 | """
36 | self.ui_config = kwargs.get("ui_config", {})
37 | self.debug_mode = kwargs.get("debug_mode", False)
38 |
39 | @abstractmethod
40 | def generate(
41 | self,
42 | task: TTS_Task,
43 | return_type: Literal["filepath", "numpy"] = "numpy",
44 | save_path: Optional[str] = None,
45 | ) -> Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]:
46 | """
47 | Generates speech from a given TTS task.
48 |
49 | Args:
50 | task (TTS_Task): The task containing data and parameters for speech synthesis.
51 | return_type (Literal["filepath", "numpy"], optional): The type of return value, either a file path or audio data.
52 | save_path (str, optional): The path to save the audio file.
53 | Returns:
54 | Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]: Depending on the return_type, returns a file path, a generator of audio data, or other types.
55 |
56 | """
57 | pass
58 |
59 | @abstractmethod
60 | def get_characters(self):
61 | """
62 | Retrieves the available characters and their emotions for the TTS.
63 |
64 | Returns:
65 | Dict[str, List[str]]: A dictionary mapping character names to lists of their emotions.
66 | """
67 | pass
68 |
69 | @abstractmethod
70 | def params_parser(self, data):
71 | """
72 | Parses input data into a TTS_Task.
73 |
74 | Args:
75 | data (Any): The raw input data to be parsed.
76 |
77 | Returns:
78 | TTS_Task: A TTS task object created from the input data.
79 | """
80 | pass
81 |
82 | @abstractmethod
83 | def ms_like_parser(self, data):
84 | """
85 | Parses input data in a Microsoft-like format into a TTS_Task.
86 |
87 | Args:
88 | data (Any): The raw input data to be parsed.
89 |
90 | Returns:
91 | TTS_Task: A TTS task object created from the Microsoft-like formatted input data.
92 | """
93 | pass
94 |
95 |
96 | def get_wave_header_chunk(sample_rate: int, channels: int = 1, sample_width: int = 2):
97 | """
98 | Generate a wave header with no data.
99 |
100 | Args:
101 | sample_rate (int): The sample rate of the audio.
102 | channels (int, optional): The number of audio channels. Defaults to 1.
103 | sample_width (int, optional): The sample width in bytes. Defaults to 2.
104 |
105 | Returns:
106 | bytes: The wave header as bytes.
107 | """
108 | wav_buf = io.BytesIO()
109 | with wave.open(wav_buf, "wb") as vfout:
110 | vfout.setnchannels(channels)
111 | vfout.setsampwidth(sample_width)
112 | vfout.setframerate(sample_rate)
113 |
114 | wav_buf.seek(0)
115 | return wav_buf.read()
116 |
--------------------------------------------------------------------------------
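`get_wave_header_chunk` exists so a server can stream audio: send one WAV header with zero declared data length, then keep appending raw PCM frames. The sketch below shows that pattern; it is an assumption about how the app layer consumes the generator from `generate()`, not code taken from this repo:

```
import numpy as np
from Synthesizers.base import get_wave_header_chunk

def wav_stream(chunks, sample_rate=32000):
    """chunks: iterable of (sr, np.ndarray) pairs, e.g. generate(task, return_type="numpy")."""
    yield get_wave_header_chunk(sample_rate)    # standard 44-byte RIFF header, no frames
    for _, audio in chunks:
        yield audio.astype(np.int16).tobytes()  # 16-bit mono PCM, matching sample_width=2
```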
/Synthesizers/base/__init__.py:
--------------------------------------------------------------------------------
1 | from .Base_TTS_Task import Base_TTS_Task, ParamItem, init_params_config
2 | from .Base_TTS_Synthesizer import Base_TTS_Synthesizer, get_wave_header_chunk
3 | from .config_utils import load_config
--------------------------------------------------------------------------------
/Synthesizers/base/config_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional, Dict, List, Literal
2 | from pydantic import BaseModel
3 | import os, json
4 |
5 | class ConfigItem(BaseModel):
6 | value : Optional[Any] = None
7 | default : Optional[Any] = None
8 | type : Optional[str] = None
9 | description : Optional[str] = None
10 |
11 | def __init__(self, **data):
12 | super().__init__(**data)
13 | if (self.value is None) and self.default is not None:
14 | self.value = self.default
15 |
16 | def is_config_item(item:Dict[str, Any])->bool:
17 | """判断是否为配置项"""
18 | return isinstance(item, dict) and ("value" in item or "default" in item)
19 |
20 | def parse_config_dict(input_config:Dict[str, Any], output_config)->Dict[str, Any]:
21 |
22 | for key, res in input_config.items():
23 | if is_config_item(res):
24 | value = ConfigItem(**res).value
25 | else:
26 | if isinstance(res, dict):
27 | value = parse_config_dict(res, {})
28 | else:
29 | value = res
30 | output_config[key] = value
31 | return output_config
32 |
33 | def load_config(config_path:str)->Dict[str, Any]:
34 | """加载配置文件"""
35 | assert os.path.exists(config_path), f"配置文件不存在: {config_path}"
36 | config:Dict[str, Any] = {}
37 | with open(config_path, 'r', encoding='utf-8') as f:
38 | config = parse_config_dict(json.load(f), {})
39 | return config
40 |
41 |
--------------------------------------------------------------------------------
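`parse_config_dict` collapses `{value/default/type/description}` items to plain values and passes everything else through, so config files can mix documented items with bare values. A self-contained sketch:

```
from Synthesizers.base.config_utils import parse_config_dict

raw = {
    "url": "http://127.0.0.1:5000",                      # bare value: passed through
    "port": {"value": 5000, "description": "API port"},  # config item: collapsed to 5000
    "auth": {"enable": {"default": False}},              # nested item: default fills the value
}
print(parse_config_dict(raw, {}))
# {'url': 'http://127.0.0.1:5000', 'port': 5000, 'auth': {'enable': False}}
```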
/Synthesizers/gsv_fast/__init__.py:
--------------------------------------------------------------------------------
1 | from .GSV_Synthesizer import GSV_Synthesizer as TTS_Synthesizer
2 | from .gsv_task import GSV_TTS_Task as TTS_Task
--------------------------------------------------------------------------------
/Synthesizers/gsv_fast/configs/i18n/locale/zh_CN.json:
--------------------------------------------------------------------------------
1 | {
2 | ", 返回内容:": ", 返回内容:",
3 | "
这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
": "这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路径", 6 | "Sovits模型路径": "Sovits模型路径", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "只有日文", 11 | "all_zh": "只有中文", 12 | "auto": "自动判断", 13 | "auto_cut": "智能切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不并行,越大越快,但是越可能出问题", 15 | "cut0": "仅凭换行切分", 16 | "cut1": "凑四句一切", 17 | "cut2": "凑50字一切", 18 | "cut3": "按中文句号。切", 19 | "cut4": "按英文句号.切", 20 | "cut5": "按标点符号切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json设置(一般不动)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情感列表网址", 29 | "从json中读取": "从json中读取", 30 | "使用前,请确认后端服务已启动。": "使用前,请确认后端服务已启动。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不会有完成提示,没报错就是成功)", 32 | "保存失败!": "保存失败!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端处理后的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "参考音频在3~10秒范围外,请更换!", 38 | "参考音频路径": "参考音频路径", 39 | "发送json格式": "发送json格式", 40 | "发送并开始播放": "发送并开始播放", 41 | "发送请求": "发送请求", 42 | "发送请求到": "发送请求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。", 45 | "基础选项": "基础选项", 46 | "实际输入的参考文本:": "实际输入的参考文本:", 47 | "实际输入的目标文本(切句后):": "实际输入的目标文本(切句后):", 48 | "实际输入的目标文本(每句):": "实际输入的目标文本(每句):", 49 | "实际输入的目标文本:": "实际输入的目标文本:", 50 | "密码": "密码", 51 | "当前人物": "当前人物", 52 | "当前人物变更为: ": "当前人物变更为: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用经典推理模式,部分选项不可用", 54 | "情感列表": "情感列表", 55 | "情感风格": "情感风格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。", 57 | "扫描": "扫描", 58 | "扫描人物列表": "扫描人物列表", 59 | "扫描模型文件夹:": "扫描模型文件夹:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!请把有效文件放置在文件夹下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,当前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示语言", 65 | "文件打开失败,保存失败!": "文件打开失败,保存失败!", 66 | "文本语言": "文本语言", 67 | "是否自动匹配情感": "是否自动匹配情感", 68 | "模型文件夹路径": "模型文件夹路径", 69 | "每句允许最大切分字词数": "每句允许最大切分字词数", 70 | "流式音频": "流式音频", 71 | "添加情感": "添加情感", 72 | "点击查看详细文档": "点击查看详细文档", 73 | "版本": "版本", 74 | "用户名": "用户名", 75 | "种子": "种子", 76 | "简介": "简介", 77 | "缺失某些项,保存失败!": "缺失某些项,保存失败!", 78 | "网址设置": "网址设置", 79 | "自动生成info": "自动生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑问或需要进一步了解,可参考文档:", 81 | "认证信息": "认证信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设", 83 | "语速": "语速", 84 | "请修改后点击下方按钮进行保存": "请修改后点击下方按钮进行保存", 85 | "请求失败,状态码:": "请求失败,状态码:", 86 | "请求失败,请检查URL是否正确": "请求失败,请检查URL是否正确", 87 | "请求完整音频": "请求完整音频", 88 | "请求网址": "请求网址", 89 | "输入文本": "输入文本", 90 | "这是一个由": "这是一个由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "这是展示页面的版本,并未使用后端服务,下面参数无效。", 93 | "选择角色": "选择角色", 94 | "音频输出": "音频输出", 95 | "音频预览": "音频预览", 96 | "项目开源地址:": "项目开源地址:", 97 | "高级选项": "高级选项", 98 | "最大允许长度": "最大允许长度" 99 | } 100 | -------------------------------------------------------------------------------- /Synthesizers/gsv_fast/configs/i18n/locale/zh_TW.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", 返回內容:", 3 | 
"这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
": "這是模型管理介面,為了實現對多段參考音頻分配情緒設計,如果您只有一段可不使用這個介面
若有疑問或需要進一步了解,可參考文件:點擊查看詳細文件。
", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路徑", 6 | "Sovits模型路径": "Sovits模型路徑", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "僅日文", 11 | "all_zh": "僅中文", 12 | "auto": "自動判斷", 13 | "auto_cut": "智慧切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不並行,越大越快,但是越可能出現問題", 15 | "cut0": "僅憑換行切分", 16 | "cut1": "湊四句一切", 17 | "cut2": "湊50字一切", 18 | "cut3": "按中文句號。切", 19 | "cut4": "按英文句號.切", 20 | "cut5": "按標點符號切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json設置(一般不動)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情緒列表網址", 29 | "从json中读取": "從json中讀取", 30 | "使用前,请确认后端服务已启动。": "使用前,請確認後端服務已啟動。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不會有完成提示,沒報錯就是成功)", 32 | "保存失败!": "保存失敗!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端處理後的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!", 38 | "参考音频路径": "參考音頻路徑", 39 | "发送json格式": "發送json格式", 40 | "发送并开始播放": "發送並開始播放", 41 | "发送请求": "發送請求", 42 | "发送请求到": "發送請求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或調節batch size滑條。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或者更換參考音頻(使用模型管理介面)、調節下方batch size滑條。", 45 | "基础选项": "基礎選項", 46 | "实际输入的参考文本:": "實際輸入的參考文本:", 47 | "实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):", 48 | "实际输入的目标文本(每句):": "實際輸入的目標文本(每句):", 49 | "实际输入的目标文本:": "實際輸入的目標文本:", 50 | "密码": "密碼", 51 | "当前人物": "當前人物", 52 | "当前人物变更为: ": "當前人物變更為: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用經典推理模式,部分選項不可用", 54 | "情感列表": "情緒列表", 55 | "情感风格": "情緒風格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "有時掉進黑洞,有時候爬上彩虹。在下一秒鐘,命運如何轉動,沒有人會曉得。我說希望無窮,你猜美夢成空,相信和懷疑,總要決鬥。", 57 | "扫描": "掃描", 58 | "扫描人物列表": "掃描人物列表", 59 | "扫描模型文件夹:": "掃描模型文件夾:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!請把有效文件放置在文件夾下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,當前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示語言", 65 | "文件打开失败,保存失败!": "文件開啟失敗,保存失敗!", 66 | "文本语言": "文本語言", 67 | "是否自动匹配情感": "是否自動匹配情緒", 68 | "模型文件夹路径": "模型文件夾路徑", 69 | "每句允许最大切分字词数": "每句允許最大切分字詞數", 70 | "流式音频": "流式音頻", 71 | "添加情感": "添加情緒", 72 | "点击查看详细文档": "點擊查看詳細文件", 73 | "版本": "版本", 74 | "用户名": "使用者名稱", 75 | "种子": "種子", 76 | "简介": "簡介", 77 | "缺失某些项,保存失败!": "缺失某些項,保存失敗!", 78 | "网址设置": "網址設置", 79 | "自动生成info": "自動生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑問或需要進一步了解,可參考文件:", 81 | "认证信息": "認證信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "認證信息已啟用,您可以在config.json中關閉。\n但是這個功能還沒做好,只是擺設", 83 | "语速": "語速", 84 | "请修改后点击下方按钮进行保存": "請修改後點擊下方按鈕進行保存", 85 | "请求失败,状态码:": "請求失敗,狀態碼:", 86 | "请求失败,请检查URL是否正确": "請求失敗,請檢查URL是否正確", 87 | "请求完整音频": "請求完整音頻", 88 | "请求网址": "請求網址", 89 | "输入文本": "輸入文本", 90 | "这是一个由": "這是一個由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "這是一個配置文件適用於https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一個簡單好用的前後端項目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "這是展示頁面的版本,並未使用後端服務,下面參數無效。", 93 | "选择角色": "選擇角色", 94 | "音频输出": "音頻輸出", 95 | "音频预览": "音頻預覽", 96 | "项目开源地址:": "Github Link:", 97 | "高级选项": "高級選項", 98 | "最大允许长度": "最大允許長度" 99 | } 100 | -------------------------------------------------------------------------------- /Synthesizers/gsv_fast/configs/ui_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "ref_settings": [["ref_audio_path", "prompt_text", 
"prompt_language"]], 3 | "basic_settings": [ 4 | "speed", 5 | 6 | ["text_language", "cut_method", "max_cut_length", "batch_size"] 7 | ], 8 | "advanced_settings": [ 9 | "seed", 10 | "parallel_infer", 11 | ["top_k", "top_p", "temperature", "repetition_penalty"] 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /Synthesizers/gsv_fast/gsv_task.py: -------------------------------------------------------------------------------- 1 | 2 | import os, json, sys 3 | sys.path.append(".") 4 | 5 | from uuid import uuid4 6 | from typing import List, Dict, Literal, Optional, Any, Union 7 | import urllib.parse 8 | import hashlib 9 | 10 | from Synthesizers.base import Base_TTS_Task, ParamItem, init_params_config 11 | 12 | def get_params_config(): 13 | try: 14 | with open(os.path.join("Synthesizers/gsv_fast/configs", "params_config.json"), "r", encoding="utf-8") as f: 15 | return init_params_config(json.load(f)) 16 | except: 17 | raise FileNotFoundError("params_config.json not found or invalid.") 18 | 19 | 20 | params_config = get_params_config() 21 | 22 | from pydantic import BaseModel, Field, model_validator 23 | 24 | class GSV_TTS_Task(Base_TTS_Task): 25 | # character: Optional[str] = None 26 | # emotion: Optional[str] = None 27 | ref_audio_path: Optional[str] = None 28 | prompt_text: Optional[str] = None 29 | prompt_language: Optional[str] = None 30 | text_language: Optional[str] = None 31 | speaker_id: Optional[int] = None 32 | batch_size: Optional[int] = None 33 | top_k: Optional[int] = None 34 | top_p: Optional[float] = None 35 | temperature: Optional[float] = None 36 | cut_method: Optional[str] = None 37 | max_cut_length: Optional[int] = None 38 | seed: Optional[int] = None 39 | save_temp: Optional[bool] = False 40 | parallel_infer : Optional[bool] = True 41 | repetition_penalty : Optional[float] = 1.35 42 | # the gsv_fast model only supports 32000 sample rate 43 | sample_rate: int = 32000 44 | 45 | def __init__(self, other_task: Union[BaseModel, dict, None] = None, **data): 46 | data.setdefault('params_config', params_config) 47 | super().__init__(other_task, **data) 48 | 49 | @property 50 | def md5(self): 51 | m = hashlib.md5() 52 | if self.task_type == "audio": 53 | m.update(self.src.encode()) 54 | elif self.task_type == "ssml": 55 | m.update(self.ssml.encode()) 56 | elif self.task_type == "text": 57 | m.update(self.text.encode()) 58 | m.update(self.text_language.encode()) 59 | m.update(self.character.encode()) 60 | m.update(str(self.speaker_id).encode()) 61 | m.update(str(self.speed).encode()) 62 | m.update(str(self.top_k).encode()) 63 | m.update(str(self.top_p).encode()) 64 | m.update(str(self.temperature).encode()) 65 | m.update(str(self.cut_method).encode()) 66 | m.update(str(self.emotion).encode()) 67 | return m.hexdigest() 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /Synthesizers/remote/Remote_Synthesizer.py: -------------------------------------------------------------------------------- 1 | import io, wave 2 | import os, json, sys 3 | import threading 4 | 5 | from Synthesizers.base import Base_TTS_Synthesizer ,load_config 6 | 7 | from .remote_task import Remote_TTS_Task as TTS_Task, set_based_synthesizer, get_ui_config 8 | import requests 9 | from urllib import parse 10 | from datetime import datetime 11 | from typing import Union, Generator, Tuple, Any, Optional, Dict, Literal 12 | import numpy as np 13 | import soundfile as sf 14 | 15 | class 
Remote_Synthesizer(Base_TTS_Synthesizer): 16 | url :str = "http://127.0.0.1:5000" 17 | tts_endpoint:str = "/tts" 18 | character_endpoint:str = "/character_list" 19 | based_synthesizer :str = "gsv_fast" 20 | class Config: 21 | extra = "ignore" 22 | def __init__(self, config_path:str = None, **kwargs): 23 | super().__init__(**kwargs) 24 | if config_path is None: 25 | config_path = os.path.join(os.path.dirname(__file__), "configs", "config.json") 26 | config_dict = load_config(config_path) 27 | config_dict.update(kwargs) 28 | for key, value in config_dict.items(): 29 | if hasattr(self, key): 30 | setattr(self, key, value) 31 | set_based_synthesizer(self.based_synthesizer) 32 | self.ui_config = get_ui_config(self.based_synthesizer) 33 | 34 | def get_characters(self)-> dict: 35 | url = self.url + self.character_endpoint 36 | res = requests.get(url) 37 | return json.loads(res.text) 38 | 39 | @staticmethod 40 | def stream_audio(url, data: Dict[str, Any]) -> Generator[Tuple[int, np.ndarray], None, None]: 41 | headers = {"Content-Type": "application/json"} 42 | # 发起POST请求,获取响应流 43 | response = requests.post( 44 | url, data=json.dumps(data), headers=headers, stream=True 45 | ) 46 | chunk_size = 1024 47 | # 确保请求成功 48 | if response.status_code == 200: 49 | # 循环读取音频流 50 | for chunk in response.iter_content(chunk_size): 51 | # 将二进制数据转换为numpy数组,这里假设音频数据是16位整数格式 52 | audiodata = np.frombuffer(chunk, dtype=np.int16) 53 | yield 32000, audiodata 54 | else: 55 | raise Exception( 56 | f"Failed to get audio stream, status code: {response.status_code}" 57 | ) 58 | def generate( 59 | self, 60 | task: TTS_Task, 61 | return_type: Literal["filepath", "numpy"] = "numpy", 62 | save_path: Optional[str] = None, 63 | ) -> Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]: 64 | 65 | 66 | url = self.url + self.tts_endpoint 67 | data = task.data 68 | print(return_type) 69 | 70 | if self.debug_mode: 71 | print(f"generate task: \n{data}") 72 | headers = {"Content-Type": "application/json"} 73 | if return_type == "filepath" or ( 74 | return_type == "numpy" and not task.stream 75 | ): 76 | if save_path is None: 77 | save_path = f"tmp_audio/{datetime.now().strftime('%Y%m%d%H%M%S')}.wav" 78 | res = requests.post(url, data=json.dumps(data), headers=headers) 79 | if res.status_code == 200: 80 | with open(save_path, "wb") as f: 81 | f.write(res.content) 82 | if return_type == "filepath": 83 | return save_path 84 | else: 85 | audiodata, sr = sf.read(save_path) 86 | return ((sr, audiodata) for _ in range(1)) 87 | else: 88 | raise Exception(f"remote synthesizer error: {res.text}") 89 | 90 | elif return_type == "numpy" and task.stream: 91 | return self.stream_audio(url, data) 92 | 93 | 94 | def params_parser(self, data) -> TTS_Task: 95 | task = TTS_Task(based_synthesizer=self.based_synthesizer, **data) 96 | return task 97 | 98 | def ms_like_parser(self,data) -> TTS_Task: 99 | task = TTS_Task(based_synthesizer=self.based_synthesizer, **data) 100 | return task 101 | -------------------------------------------------------------------------------- /Synthesizers/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .Remote_Synthesizer import Remote_Synthesizer as TTS_Synthesizer 2 | from .remote_task import Remote_TTS_Task as TTS_Task -------------------------------------------------------------------------------- /Synthesizers/remote/configs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": 
"http://localhost:5000", 3 | "tts_endpoint": "/tts", 4 | "character_endpoint": "/character_list", 5 | "based_synthesizer": "gsv_fast" 6 | } 7 | -------------------------------------------------------------------------------- /Synthesizers/remote/configs/i18n/locale/zh_CN.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", 返回内容:", 3 | "这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
": "这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路径", 6 | "Sovits模型路径": "Sovits模型路径", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "只有日文", 11 | "all_zh": "只有中文", 12 | "auto": "自动判断", 13 | "auto_cut": "智能切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不并行,越大越快,但是越可能出问题", 15 | "cut0": "仅凭换行切分", 16 | "cut1": "凑四句一切", 17 | "cut2": "凑50字一切", 18 | "cut3": "按中文句号。切", 19 | "cut4": "按英文句号.切", 20 | "cut5": "按标点符号切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json设置(一般不动)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情感列表网址", 29 | "从json中读取": "从json中读取", 30 | "使用前,请确认后端服务已启动。": "使用前,请确认后端服务已启动。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不会有完成提示,没报错就是成功)", 32 | "保存失败!": "保存失败!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端处理后的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "参考音频在3~10秒范围外,请更换!", 38 | "参考音频路径": "参考音频路径", 39 | "发送json格式": "发送json格式", 40 | "发送并开始播放": "发送并开始播放", 41 | "发送请求": "发送请求", 42 | "发送请求到": "发送请求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。", 45 | "基础选项": "基础选项", 46 | "实际输入的参考文本:": "实际输入的参考文本:", 47 | "实际输入的目标文本(切句后):": "实际输入的目标文本(切句后):", 48 | "实际输入的目标文本(每句):": "实际输入的目标文本(每句):", 49 | "实际输入的目标文本:": "实际输入的目标文本:", 50 | "密码": "密码", 51 | "当前人物": "当前人物", 52 | "当前人物变更为: ": "当前人物变更为: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用经典推理模式,部分选项不可用", 54 | "情感列表": "情感列表", 55 | "情感风格": "情感风格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。", 57 | "扫描": "扫描", 58 | "扫描人物列表": "扫描人物列表", 59 | "扫描模型文件夹:": "扫描模型文件夹:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!请把有效文件放置在文件夹下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,当前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示语言", 65 | "文件打开失败,保存失败!": "文件打开失败,保存失败!", 66 | "文本语言": "文本语言", 67 | "是否自动匹配情感": "是否自动匹配情感", 68 | "模型文件夹路径": "模型文件夹路径", 69 | "每句允许最大切分字词数": "每句允许最大切分字词数", 70 | "流式音频": "流式音频", 71 | "添加情感": "添加情感", 72 | "点击查看详细文档": "点击查看详细文档", 73 | "版本": "版本", 74 | "用户名": "用户名", 75 | "种子": "种子", 76 | "简介": "简介", 77 | "缺失某些项,保存失败!": "缺失某些项,保存失败!", 78 | "网址设置": "网址设置", 79 | "自动生成info": "自动生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑问或需要进一步了解,可参考文档:", 81 | "认证信息": "认证信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设", 83 | "语速": "语速", 84 | "请修改后点击下方按钮进行保存": "请修改后点击下方按钮进行保存", 85 | "请求失败,状态码:": "请求失败,状态码:", 86 | "请求失败,请检查URL是否正确": "请求失败,请检查URL是否正确", 87 | "请求完整音频": "请求完整音频", 88 | "请求网址": "请求网址", 89 | "输入文本": "输入文本", 90 | "这是一个由": "这是一个由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "这是展示页面的版本,并未使用后端服务,下面参数无效。", 93 | "选择角色": "选择角色", 94 | "音频输出": "音频输出", 95 | "音频预览": "音频预览", 96 | "项目开源地址:": "项目开源地址:", 97 | "高级选项": "高级选项", 98 | "最大允许长度": "最大允许长度" 99 | } 100 | -------------------------------------------------------------------------------- /Synthesizers/remote/configs/i18n/locale/zh_TW.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", 返回內容:", 3 | 
"这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
": "這是模型管理介面,為了實現對多段參考音頻分配情緒設計,如果您只有一段可不使用這個介面
若有疑問或需要進一步了解,可參考文件:點擊查看詳細文件。
", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路徑", 6 | "Sovits模型路径": "Sovits模型路徑", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "僅日文", 11 | "all_zh": "僅中文", 12 | "auto": "自動判斷", 13 | "auto_cut": "智慧切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不並行,越大越快,但是越可能出現問題", 15 | "cut0": "僅憑換行切分", 16 | "cut1": "湊四句一切", 17 | "cut2": "湊50字一切", 18 | "cut3": "按中文句號。切", 19 | "cut4": "按英文句號.切", 20 | "cut5": "按標點符號切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json設置(一般不動)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情緒列表網址", 29 | "从json中读取": "從json中讀取", 30 | "使用前,请确认后端服务已启动。": "使用前,請確認後端服務已啟動。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不會有完成提示,沒報錯就是成功)", 32 | "保存失败!": "保存失敗!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端處理後的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!", 38 | "参考音频路径": "參考音頻路徑", 39 | "发送json格式": "發送json格式", 40 | "发送并开始播放": "發送並開始播放", 41 | "发送请求": "發送請求", 42 | "发送请求到": "發送請求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或調節batch size滑條。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或者更換參考音頻(使用模型管理介面)、調節下方batch size滑條。", 45 | "基础选项": "基礎選項", 46 | "实际输入的参考文本:": "實際輸入的參考文本:", 47 | "实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):", 48 | "实际输入的目标文本(每句):": "實際輸入的目標文本(每句):", 49 | "实际输入的目标文本:": "實際輸入的目標文本:", 50 | "密码": "密碼", 51 | "当前人物": "當前人物", 52 | "当前人物变更为: ": "當前人物變更為: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用經典推理模式,部分選項不可用", 54 | "情感列表": "情緒列表", 55 | "情感风格": "情緒風格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "有時掉進黑洞,有時候爬上彩虹。在下一秒鐘,命運如何轉動,沒有人會曉得。我說希望無窮,你猜美夢成空,相信和懷疑,總要決鬥。", 57 | "扫描": "掃描", 58 | "扫描人物列表": "掃描人物列表", 59 | "扫描模型文件夹:": "掃描模型文件夾:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!請把有效文件放置在文件夾下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,當前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示語言", 65 | "文件打开失败,保存失败!": "文件開啟失敗,保存失敗!", 66 | "文本语言": "文本語言", 67 | "是否自动匹配情感": "是否自動匹配情緒", 68 | "模型文件夹路径": "模型文件夾路徑", 69 | "每句允许最大切分字词数": "每句允許最大切分字詞數", 70 | "流式音频": "流式音頻", 71 | "添加情感": "添加情緒", 72 | "点击查看详细文档": "點擊查看詳細文件", 73 | "版本": "版本", 74 | "用户名": "使用者名稱", 75 | "种子": "種子", 76 | "简介": "簡介", 77 | "缺失某些项,保存失败!": "缺失某些項,保存失敗!", 78 | "网址设置": "網址設置", 79 | "自动生成info": "自動生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑問或需要進一步了解,可參考文件:", 81 | "认证信息": "認證信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "認證信息已啟用,您可以在config.json中關閉。\n但是這個功能還沒做好,只是擺設", 83 | "语速": "語速", 84 | "请修改后点击下方按钮进行保存": "請修改後點擊下方按鈕進行保存", 85 | "请求失败,状态码:": "請求失敗,狀態碼:", 86 | "请求失败,请检查URL是否正确": "請求失敗,請檢查URL是否正確", 87 | "请求完整音频": "請求完整音頻", 88 | "请求网址": "請求網址", 89 | "输入文本": "輸入文本", 90 | "这是一个由": "這是一個由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "這是一個配置文件適用於https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一個簡單好用的前後端項目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "這是展示頁面的版本,並未使用後端服務,下面參數無效。", 93 | "选择角色": "選擇角色", 94 | "音频输出": "音頻輸出", 95 | "音频预览": "音頻預覽", 96 | "项目开源地址:": "Github Link:", 97 | "高级选项": "高級選項", 98 | "最大允许长度": "最大允許長度" 99 | } 100 | -------------------------------------------------------------------------------- /Synthesizers/remote/configs/params_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | } 4 | 
-------------------------------------------------------------------------------- /Synthesizers/remote/configs/ui_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | } 4 | -------------------------------------------------------------------------------- /Synthesizers/remote/remote_task.py: -------------------------------------------------------------------------------- 1 | 2 | import os, json, sys 3 | sys.path.append(".") 4 | 5 | from uuid import uuid4 6 | from typing import List, Dict, Literal, Optional, Any, Union 7 | import urllib.parse 8 | import hashlib 9 | 10 | from Synthesizers.base import Base_TTS_Task, ParamItem, init_params_config 11 | 12 | global global_based_synthesizer 13 | global_based_synthesizer = None 14 | 15 | def set_based_synthesizer(based_synthesizer:str): 16 | global global_based_synthesizer 17 | global_based_synthesizer = based_synthesizer 18 | 19 | def get_params_config(based_synthesizer:str= None): 20 | assert based_synthesizer is not None, "based_synthesizer is not set, please init the remote synthesizer first." 21 | try: 22 | with open(os.path.join(os.path.dirname(__file__), "configs", "params_config.json"), "r", encoding="utf-8") as f: 23 | res:dict = json.load(f) 24 | with open(os.path.join("Synthesizers", based_synthesizer ,"configs", "params_config.json"), "r", encoding="utf-8") as f: 25 | res.update(json.load(f)) 26 | return init_params_config(res) 27 | except: 28 | raise FileNotFoundError("params_config.json not found or invalid.") 29 | 30 | params_config = None 31 | 32 | def get_ui_config(based_synthesizer:str= None)->Dict[str, Any]: 33 | if based_synthesizer is None: 34 | based_synthesizer = global_based_synthesizer 35 | assert based_synthesizer is not None, "based_synthesizer is not set, please init the remote synthesizer first." 36 | 37 | remote_ui_config_path = os.path.join(os.path.dirname(__file__), "configs", "ui_config.json") 38 | based_ui_config_path = os.path.join("Synthesizers", based_synthesizer ,"configs", "ui_config.json") 39 | 40 | ui_config :Dict[str, Any] = {} 41 | try: 42 | with open(remote_ui_config_path, "r", encoding="utf-8") as f: 43 | ui_config.update(json.load(f)) 44 | with open(based_ui_config_path, "r", encoding="utf-8") as f: 45 | ui_config.update(json.load(f)) 46 | return ui_config 47 | except: 48 | raise FileNotFoundError("ui_config.json not found or invalid.") 49 | 50 | from pydantic import BaseModel, Field, model_validator 51 | from copy import deepcopy 52 | class Remote_TTS_Task(Base_TTS_Task): 53 | 54 | is_remote: Optional[bool] = True 55 | data : dict = {} 56 | 57 | class Config: 58 | extra = "ignore" 59 | 60 | def __init__(self, based_synthesizer:str=None, **data): 61 | 62 | global params_config 63 | based_synthesizer = based_synthesizer if based_synthesizer is not None else global_based_synthesizer 64 | assert based_synthesizer is not None, "based_synthesizer is not set, please init the remote synthesizer first." 
65 | if params_config is None: 66 | params_config = get_params_config(based_synthesizer) 67 | copyed_data = deepcopy(data) 68 | copyed_data.setdefault("params_config",params_config) 69 | super().__init__(**copyed_data) 70 | self.data = data 71 | 72 | @property 73 | def md5(self): 74 | m = hashlib.md5() 75 | m.update(self.data.__str__().encode()) 76 | return m.hexdigest() 77 | 78 | def __str__(self): 79 | content = super().__str__() 80 | return f"{content}" 81 | 82 | 83 | -------------------------------------------------------------------------------- /api_doc.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | This document aims to introduce how to use our Text-to-Speech API, including making requests via GET and POST methods. This API supports converting text into the voice of specified characters and supports different languages and emotional expressions. 4 | 5 | ## Character and Emotion List 6 | 7 | To obtain the supported characters and their corresponding emotions, please visit the following URL: 8 | 9 | - URL: `http://127.0.0.1:5000/character_list` 10 | - Returns: A JSON format list of characters and corresponding emotions 11 | - Method: `GET` 12 | 13 | ``` 14 | { 15 | "Hanabi": [ 16 | "default", 17 | "Normal", 18 | "Yandere", 19 | ], 20 | "Hutao": [ 21 | "default" 22 | ] 23 | } 24 | ``` 25 | 26 | ## Regarding Aliases 27 | 28 | From version 2.2.4, an alias system was added. Detailed allowed aliases can be found in `Inference/params_config.json`. 29 | 30 | ## Text-to-Speech 31 | 32 | - URL: `http://127.0.0.1:5000/tts` 33 | - Returns: Audio on success. Error message on failure. 34 | - Method: `GET`/`POST` 35 | 36 | ### GET Method 37 | 38 | #### Format 39 | 40 | ``` 41 | http://127.0.0.1:5000/tts?character={{characterName}}&text={{text}} 42 | ``` 43 | 44 | - Parameter explanation: 45 | - `character`: The name of the character folder, pay attention to case sensitivity, full/half width, and language (Chinese/English). 46 | - `text`: The text to be converted, URL encoding is recommended. 47 | - Optional parameters include `text_language`, `format`, `top_k`, `top_p`, `batch_size`, `speed`, `temperature`, `emotion`, `save_temp`, and `stream`, detailed explanations are provided in the POST section below. 48 | - From version 2.2.4, an alias system was added, with detailed allowed aliases found in `Inference/params_config.json`. 49 | 50 | ### POST Method 51 | 52 | #### JSON Package Format 53 | 54 | ##### All Parameters 55 | 56 | ``` 57 | { 58 | "method": "POST", 59 | "body": { 60 | "character": "${chaName}", 61 | "emotion": "${Emotion}", 62 | "text": "${speakText}", 63 | "text_language": "${textLanguage}", 64 | "batch_size": ${batch_size}, 65 | "speed": ${speed}, 66 | "top_k": ${topK}, 67 | "top_p": ${topP}, 68 | "temperature": ${temperature}, 69 | "stream": "${stream}", 70 | "format": "${Format}", 71 | "save_temp": "${saveTemp}" 72 | } 73 | } 74 | ``` 75 | 76 | You can omit one or more items. From version 2.2.4, an alias system was introduced, detailed allowed aliases can be found in `Inference/params_config.json`. 77 | 78 | ##### Minimal Data: 79 | 80 | ``` 81 | { 82 | "method": "POST", 83 | "body": { 84 | "text": "${speakText}" 85 | } 86 | } 87 | ``` 88 | 89 | ##### Parameter Explanation 90 | 91 | - **text**: The text to be converted, URL encoding is recommended. 92 | - **character**: Character folder name, pay attention to case sensitivity, full/half width, and language. 
93 | - **emotion**: Character emotion, must be an actually supported emotion of the character, otherwise, the default emotion will be used. 94 | - **text_language**: Text language (auto / zh / en / ja), default is multilingual mixed. 95 | - **top_k**, **top_p**, **temperature**: GPT model parameters, no need to modify if unfamiliar. 96 | 97 | - **batch_size**: How many batches at a time, can be increased for faster processing if you have a powerful computer, integer, default is 1. 98 | - **speed**: Speech speed, default is 1.0. 99 | - **save_temp**: Whether to save temporary files, when true, the backend will save the generated audio, and subsequent identical requests will directly return that data, default is false. 100 | - **stream**: Whether to stream, when true, audio will be returned sentence by sentence, default is false. 101 | - **format**: Format, default is WAV, allows MP3/ WAV/ OGG. 102 | 103 | -------------------------------------------------------------------------------- /common_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_config": { 3 | "locale": { 4 | "default": "auto", 5 | "description": "Locale settings for the application", 6 | "label": "语言", 7 | "type": "string", 8 | "choices": ["auto", "en_US", "zh_CN", "zh_TW"] 9 | }, 10 | "server_port": { 11 | "default": 5000, 12 | "description": "Port number for the application, -1 for auto select", 13 | "label": "服务端口", 14 | "type": "integer" 15 | }, 16 | "server_name": { 17 | "default": "0.0.0.0", 18 | "description": "Host address for the application", 19 | "label": "服务主机", 20 | "type": "string", 21 | "choices": ["127.0.0.1", "0.0.0.0"] 22 | }, 23 | "inbrowser": { 24 | "default": true, 25 | "description": "Flag to indicate if the application is running in browser", 26 | "label": "是否在浏览器中打开", 27 | "type": "boolean" 28 | }, 29 | "synthesizer": { 30 | "default": "gsv_fast", 31 | "description": "Synthesizer used by app.py, 'remote' for using TTS service running on a remote host", 32 | "label": "Web UI 所采用的语音合成器", 33 | "type": "string", 34 | "choices": ["gsv_fast", "remote"] 35 | }, 36 | "also_enable_api": { 37 | "default": true, 38 | "description": "Flag to indicate if API is enabled", 39 | "label": "是否启用API", 40 | "type": "boolean" 41 | }, 42 | "max_text_length": { 43 | "default": -1, 44 | "description": "Maximum length of text to synthesize in Web UI", 45 | "label": "Max Text Length", 46 | "type": "integer" 47 | }, 48 | "is_share": { 49 | "default": false, 50 | "description": "Flag to indicate if sharing is enabled", 51 | "label": "是否分享", 52 | "type": "boolean" 53 | } 54 | }, 55 | "pure_api_config": { 56 | "tts_port": { 57 | "default": 5000, 58 | "description": "Port number for TTS service", 59 | "label": "tts服务端口", 60 | "type": "integer" 61 | }, 62 | "tts_host": { 63 | "default": "0.0.0.0", 64 | "description": "Host address for TTS service", 65 | "label": "tts主机端口", 66 | "type": "string", 67 | "choices": ["127.0.0.1", "0.0.0.0"] 68 | }, 69 | "synthesizer": { 70 | "default": "gsv_fast", 71 | "description": "Synthesizer used by api.py", 72 | "label": "api.py 所采用的语音合成器", 73 | "type": "string", 74 | "choices": ["gsv_fast"] 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | gsvi: 5 | image: breakstring/gsvi:latest # please change the image name and tag base your environment. 
If the tag contains the word 'elite', such as "latest-elite", it indicates that the image does not include the necessary models such as gsvi, UVR5, Damo ASR, etc. You will need to download them yourself and map them into the container. 6 | container_name: gsvi-container 7 | environment: 8 | - is_half=False 9 | - is_share=False 10 | volumes: 11 | - ./output:/workspace/output 12 | - ./logs:/workspace/logs 13 | - ./SoVITS_weights:/workspace/SoVITS_weights 14 | - ./reference:/workspace/reference 15 | working_dir: /workspace 16 | ports: 17 | - "9880:9880" 18 | - "9871:9871" 19 | - "9872:9872" 20 | - "9873:9873" 21 | - "9874:9874" 22 | shm_size: 16G 23 | deploy: 24 | resources: 25 | reservations: 26 | devices: 27 | - driver: nvidia 28 | count: "all" 29 | capabilities: [gpu] 30 | stdin_open: true 31 | tty: true 32 | restart: unless-stopped 33 | -------------------------------------------------------------------------------- /dockerbuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 获取当前日期,格式为 YYYYMMDD 4 | DATE=$(date +%Y%m%d) 5 | # 获取最新的 Git commit 哈希值的前 7 位 6 | COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7) 7 | 8 | # 构建 full 版本的镜像 9 | docker build --build-arg IMAGE_TYPE=full -t breakstring/gsvi:latest . 10 | # 为同一个镜像添加带日期的标签 11 | docker tag breakstring/gsvi:latest breakstring/gsvi:dev-$DATE 12 | # 为同一个镜像添加带当前代码库Commit哈希值的标签 13 | docker tag breakstring/gsvi:latest breakstring/gsvi:dev-$COMMIT_HASH 14 | 15 | 16 | # 构建 elite 版本的镜像(无模型下载步骤,需手工将模型下载安装进容器) 17 | docker build --build-arg IMAGE_TYPE=elite -t breakstring/gsvi:latest-elite . 18 | # 为同一个镜像添加带日期的标签 19 | docker tag breakstring/gsvi:latest-elite breakstring/gsvi:dev-$DATE-elite 20 | # 为同一个镜像添加带当前代码库Commit哈希值的标签 21 | docker tag breakstring/gsvi:latest-elite breakstring/gsvi:dev-$COMMIT_HASH-elite 22 | -------------------------------------------------------------------------------- /docs/cn/Changelog_CN.md: -------------------------------------------------------------------------------- 1 | ### 20240121更新 2 | 3 | 1-config添加is_share,诸如colab等场景可以将此改为True,来使得webui映射到公网 4 | 5 | 2-WebUI添加英文系统英文翻译适配 6 | 7 | 3-cmd-asr自动判断是否已自带damo模型,如不在默认目录上将从modelscope自带下载 8 | 9 | 4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复(过滤长度0的样本等) 10 | 11 | 5-清理TEMP文件夹缓存音频等文件 12 | 13 | 6-大幅削弱合成音频包含参考音频结尾的问题 14 | 15 | ### 20240122更新 16 | 17 | 1-修复过短输出文件返回重复参考音频的问题。 18 | 19 | 2-经测试,英文日文训练原生支持(日文训练需要根目录不含非英文等特殊字符)。 20 | 21 | 3-音频路径检查。如果尝试读取输入错的路径报错路径不存在,而非ffmpeg错误。 22 | 23 | ### 20240123更新 24 | 25 | 1-解决hubert提取nan导致SoVITS/GPT训练报错ZeroDivisionError的问题 26 | 27 | 2-支持推理界面快速切换模型 28 | 29 | 3-优化模型文件排序逻辑 30 | 31 | 4-中文分词使用jieba_fast代替jieba 32 | 33 | ### 20240126更新 34 | 35 | 1-支持输出文本中英混合、日英混合 36 | 37 | 2-输出可选切分模式 38 | 39 | 3-修复uvr5读取到目录自动跳出的问题 40 | 41 | 4-修复多个换行导致推理报错 42 | 43 | 5-去除推理界面大量冗余log 44 | 45 | 6-支持mac训练推理 46 | 47 | 7-自动识别不支持半精度的卡强制单精度。cpu推理下强制单精度。 48 | 49 | ### 20240128更新 50 | 51 | 1-修复数字转汉字念法问题 52 | 53 | 2-修复句首少量字容易吞字的问题 54 | 55 | 3-通过限制排除不合理的参考音频长度 56 | 57 | 4-修复GPT训练不保存ckpt的问题 58 | 59 | 5-完善Dockerfile的下载模型流程 60 | 61 | ### 20240129更新 62 | 63 | 1-16系等半精度训练有问题的显卡把训练配置改为单精度训练 64 | 65 | 2-测试更新可用的colab版本 66 | 67 | 3-修复git clone modelscope funasr仓库+老版本funasr导致接口不对齐报错的问题 68 | 69 | 70 | ### 20240130更新 71 | 72 | 1-所有涉及路径的地方双引号自动去除,小白复制路径带双引号不会报错 73 | 74 | 2-修复中英文标点切割问题和句首句尾补标点的问题 75 | 76 | 3-增加按标点符号切分 77 | 78 | ### 20240201更新 79 | 80 | 1-修复uvr5读取格式错误导致分离失败的问题 81 | 82 | 2-支持中日英混合多种文本自动切分识别语种 83 | 84 | ### 20240202更新 85 | 86 | 1-修复asr路径尾缀带/保存文件名报错 87 | 88 | 2-引入paddlespeech的Normalizer 
https://github.com/RVC-Boss/GPT-SoVITS/pull/377 修复一些问题,例如:xx.xx%(带百分号类),元/吨 会读成 元吨 而不是元每吨,下划线不再会报错 89 | 90 | ### 20240207更新 91 | 92 | 1-修正语种传参混乱导致中文推理效果下降 https://github.com/RVC-Boss/GPT-SoVITS/issues/391 93 | 94 | 2-uvr5适配高版本librosa https://github.com/RVC-Boss/GPT-SoVITS/pull/403 95 | 96 | 3-修复uvr5 inf everywhere报错的问题(is_half传参未转换bool导致恒定半精度推理,16系显卡会inf) https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8 97 | 98 | 4-优化英文文本前端 99 | 100 | 5-修复gradio依赖 101 | 102 | 6-支持三连根目录留空自动读取.list全路径 103 | 104 | 7-集成faster whisper ASR日文英文 105 | 106 | ### 20240208更新 107 | 108 | 1-GPT训练卡死(win10 1909)和https://github.com/RVC-Boss/GPT-SoVITS/issues/232 (系统语言繁体)GPT训练报错,[尝试修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)。 109 | 110 | ### 20240212更新 111 | 112 | 1-faster whisper和funasr逻辑优化。faster whisper转镜像站下载,规避huggingface连不上的问题。 113 | 114 | 2-DPO Loss实验性训练选项开启,通过构造负样本训练缓解GPT重复漏字问题。推理界面公开几个推理参数。 https://github.com/RVC-Boss/GPT-SoVITS/pull/457 115 | 116 | ### 20240214更新 117 | 118 | 1-训练支持中文实验名(原来会报错) 119 | 120 | 2-DPO训练改为可勾选选项而非必须。如勾选batch size自动减半。修复推理界面新参数不传参的问题。 121 | 122 | ### 20240216更新 123 | 124 | 1-支持无参考文本输入 125 | 126 | 2-修复中文文本前端bug https://github.com/RVC-Boss/GPT-SoVITS/issues/475 127 | 128 | ### 20240221更新 129 | 130 | 1-数据处理添加语音降噪选项(降噪为只剩16k采样率,除非底噪很大先不急着用哦。) 131 | 132 | 2-中文日文前端处理优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/559 https://github.com/RVC-Boss/GPT-SoVITS/pull/556 https://github.com/RVC-Boss/GPT-SoVITS/pull/532 https://github.com/RVC-Boss/GPT-SoVITS/pull/507 https://github.com/RVC-Boss/GPT-SoVITS/pull/509 133 | 134 | 3-mac CPU推理更快因此把推理设备从mps改到CPU 135 | 136 | 4-colab修复不开启公网url 137 | 138 | ### 20240306更新 139 | 140 | 1-推理加速50%(RTX3090+pytorch2.2.1+cu11.8+win10+py39 tested)https://github.com/RVC-Boss/GPT-SoVITS/pull/672 141 | 142 | 2-如果用faster whisper非中文ASR不再需要先下中文funasr模型 143 | 144 | 3-修复uvr5去混响模型 是否混响 反的 https://github.com/RVC-Boss/GPT-SoVITS/pull/610 145 | 146 | 4-faster whisper如果无cuda可用自动cpu推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/675 147 | 148 | 5-修改is_half的判断使在Mac上能正常CPU推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/573 149 | 150 | 151 | todolist: 152 | 153 | 1-中文多音字推理优化(有没有人来测试的,欢迎把测试结果写在pr评论区里) https://github.com/RVC-Boss/GPT-SoVITS/pull/488 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /docs/ja/Changelog_JA.md: -------------------------------------------------------------------------------- 1 | ### 20240121 更新 2 | 3 | 1. `config`に`is_share`を追加し、Colab などの環境でこれを`True`に設定すると、webui を公共ネットワークにマッピングできます。 4 | 5 | 2. WebUI に英語システムの英語翻訳を追加しました。 6 | 7 | 3. `cmd-asr`は damo モデルが既に含まれているかどうかを自動的に確認し、デフォルトのパスにない場合は modelscope から自動的にダウンロードします。 8 | 9 | 4. [SoVITS 训练报错 ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 修復を試みます(長さ 0 のサンプルをフィルタリングなど) 10 | 11 | 5. TEMP ファイルフォルダからオーディオやその他のファイルをクリーンアップして最適化します。 12 | 13 | 6. 合成オーディオがリファレンスオーディオの終わりを含む問題を大幅に改善しました。 14 | 15 | ### 20240122 更新 16 | 17 | 1. 短すぎる出力ファイルが重複したリファレンスオーディオを返す問題を修正しました。 18 | 19 | 2. 英語-日本語学習がスムーズに進む QA を完了しました。(ただし、日本語学習はルートディレクトリに英語以外の文字が含まれていない必要があります) 20 | 21 | 3. オーディオパスをチェックします。間違ったパスを読み取ろうとすると、「パスが存在しません」というエラーメッセージが返されます。これは ffmpeg モジュールのエラーではありません。 22 | 23 | ### 20240123 更新 24 | 25 | 1. hubert から nan 抽出による SoVITS/GPT 学習中の ZeroDivisionError 関連エラーを修正しました。 26 | 27 | 2. 推論インターフェースでモデルを素早く切り替えることができるようにサポートしました。 28 | 29 | 3. モデルファイルのソートロジックを最適化しました。 30 | 31 | 4. 中国語の分析に`jieba_fast`を`jieba`に置き換えました。 32 | 33 | ### 20240126 更新 34 | 35 | 1. 
中国語と英語、日本語と英語が混在した出力テキストをサポートします。 36 | 37 | 2. 出力で選択的な分割モードをサポートします。 38 | 39 | 3. uvr5 がディレクトリを読み取り、自動的に終了する問題を修正しました。 40 | 41 | 4. 複数の改行による推論エラーを修正しました。 42 | 43 | 5. 推論インターフェースから不要なログを削除しました。 44 | 45 | 6. MacOS での学習と推論をサポートします。 46 | 47 | 7. 半精度をサポートしていないカードを自動的に識別して単精度を強制し、CPU 推論では単精度を強制します。 48 | 49 | ### 20240128 更新 50 | 51 | 1. 数字を漢字で読む問題を修正しました。 52 | 53 | 2. 文章の先頭の一部の単語が欠落する問題を修正しました。 54 | 55 | 3. 不適切な長さのリファレンスオーディオを制限しました。 56 | 57 | 4. GPT 学習時の ckpt が保存されない問題を修正しました。 58 | 59 | 5. Dockerfile のモデルダウンロードプロセスを改善しました。 60 | 61 | ### 20240129 更新 62 | 63 | 1. 16 系などの半精度学習に問題があるカードは、学習構成を単精度学習に変更しました。 64 | 65 | 2. Colab でも使用可能なバージョンをテストして更新しました。 66 | 67 | 3. `git clone modelscope funasr`リポジトリと古いバージョンの funasr を使用してインターフェースが一致しないエラーを修正しました。 68 | 69 | ### 20240130 更新 70 | 71 | 1. パスと関連する文字列を解析して、二重引用符を自動的に削除します。また、パスをコピーする場合、二重引用符が含まれていてもエラーが発生しません。 72 | 73 | 2. 中国語と英語、日本語と英語の混合出力をサポートします。 74 | 75 | 3. 出力で選択的な分割モードをサポートします。 76 | 77 | todolist: 78 | 79 | 1. 同音異義語(中国語)の推論の最適化 80 | 81 | 2. 英語大文字認識と英語ハイフン [問題](https://github.com/RVC-Boss/GPT-SoVITS/issues/271) 82 | 83 | 3. テキストに%記号が含まれているとエラーが発生し、推論が不可能です。また、「元/吨」が「元吨」ではなく「元每吨」と読まれるなどの問題があります。このような問題を解決するには、どのライブラリを使用する必要があり、それに対する改善を検討しています。 84 | 85 | 4. 中-日-英、中-英、日-英を含む 5 つの言語をサポートすることを目標にしています。 86 | -------------------------------------------------------------------------------- /gsv_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "device": "auto", 3 | "is_half": "auto", 4 | 5 | "models_path": "trained", 6 | "cnhubert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", 7 | "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", 8 | "save_prompt_cache": true, 9 | "prompt_cache_dir": "cache/prompt_cache" 10 | } 11 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | conda install -c conda-forge gcc 3 | conda install -c conda-forge gxx 4 | conda install ffmpeg cmake 5 | conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia 6 | pip install -r requirements.txt 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /pure_api.py: -------------------------------------------------------------------------------- 1 | # 在开头加入路径 2 | import os, sys 3 | import importlib 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 8 | 9 | from src.common_config_manager import __version__, api_config 10 | import soundfile as sf 11 | from fastapi import FastAPI, Request, HTTPException 12 | from fastapi.responses import JSONResponse, FileResponse, StreamingResponse 13 | from fastapi.middleware.cors import CORSMiddleware 14 | import tempfile 15 | import uvicorn 16 | import json 17 | 18 | # 将当前文件所在的目录添加到 sys.path 19 | from Synthesizers.base import Base_TTS_Task, Base_TTS_Synthesizer 20 | 21 | # 创建合成器实例 22 | tts_synthesizer:Base_TTS_Synthesizer = None 23 | 24 | def set_tts_synthesizer(synthesizer:Base_TTS_Synthesizer): 25 | global tts_synthesizer 26 | tts_synthesizer = synthesizer 27 | 28 | # 存储临时文件的字典 29 | temp_files = {} 30 | 31 | async def character_list(request: Request): 32 | res = JSONResponse(tts_synthesizer.get_characters()) 33 | return res 34 | 35 | async def tts(request: Request): 36 | 37 | from time import time as tt 38 | t1 = tt() 39 | print(f"Request Time: {t1}") 40 | 41 
| # 尝试从JSON中获取数据,如果不是JSON,则从查询参数中获取 42 | if request.method == "GET": 43 | data = request.query_params 44 | else: 45 | data = await request.json() 46 | 47 | task:Base_TTS_Task = tts_synthesizer.params_parser(data) 48 | 49 | if task.task_type == "text" and task.text.strip() == "": 50 | return HTTPException(status_code=400, detail="Text is empty") 51 | elif task.task_type == "ssml" and task.ssml.strip() == "": 52 | return HTTPException(status_code=400, detail="SSML is empty") 53 | md5_value = task.md5 54 | if task.stream == False: 55 | # TODO: use SQL instead of dict 56 | if task.save_temp and md5_value in temp_files: 57 | return FileResponse(path=temp_files[md5_value], media_type=f'audio/{task.format}') 58 | else: 59 | # 假设 gen 是你的音频生成器 60 | try: 61 | save_path = tts_synthesizer.generate(task, return_type="filepath") 62 | except Exception as e: 63 | return HTTPException(status_code=500, detail=str(e)) 64 | if task.save_temp: 65 | temp_files[md5_value] = save_path 66 | 67 | t2 = tt() 68 | print(f"total time: {t2-t1}") 69 | # 返回文件响应,FileResponse 会负责将文件发送给客户端 70 | return FileResponse(save_path, media_type=f"audio/{task.format}", filename=os.path.basename(save_path)) 71 | else: 72 | gen = tts_synthesizer.generate(task, return_type="numpy") 73 | return StreamingResponse(gen, media_type='audio/wav') 74 | 75 | 76 | 77 | 78 | if __name__ == "__main__": 79 | # 动态导入合成器模块, 此处可写成 from Synthesizers.xxx import TTS_Synthesizer, TTS_Task 80 | from importlib import import_module 81 | from src.api_utils import get_localhost_ipv4_address 82 | synthesizer_name = api_config.synthesizer 83 | synthesizer_module = import_module(f"Synthesizers.{synthesizer_name}") 84 | TTS_Synthesizer = synthesizer_module.TTS_Synthesizer 85 | TTS_Task = synthesizer_module.TTS_Task 86 | # 初始化合成器的类 87 | tts_synthesizer = TTS_Synthesizer(debug_mode=True) 88 | 89 | # 生成一句话充当测试,减少第一次请求的等待时间 90 | gen = tts_synthesizer.generate(tts_synthesizer.params_parser({"text":"你好,世界"}) ) 91 | next(gen) 92 | 93 | # 打印一些辅助信息 94 | print(f"Backend Version: {__version__}") 95 | tts_host = api_config.tts_host 96 | tts_port = api_config.tts_port 97 | ipv4_address = get_localhost_ipv4_address(tts_host) 98 | ipv4_link = f"http://{ipv4_address}:{tts_port}" 99 | print(f"INFO: Local Network URL: {ipv4_link}") 100 | 101 | app = FastAPI() 102 | 103 | # 设置CORS 104 | app.add_middleware( 105 | CORSMiddleware, 106 | allow_origins=["*"], 107 | allow_credentials=True, 108 | allow_methods=["*"], 109 | allow_headers=["*"], 110 | ) 111 | app.add_api_route('/tts', tts, methods=["GET", "POST"]) 112 | app.add_api_route('/character_list', character_list, methods=["GET"]) 113 | uvicorn.run(app, host=tts_host, port=tts_port) 114 | 115 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | pydub 3 | 4 | pydantic 5 | soundfile 6 | flash-attention 7 | numpy 8 | scipy 9 | tensorboard 10 | librosa==0.9.2 11 | numba 12 | pytorch-lightning 13 | gradio>=4.29 14 | gradio_client 15 | ffmpeg-python 16 | onnxruntime 17 | tqdm 18 | funasr==1.0.0 19 | cn2an 20 | pypinyin 21 | pyopenjtalk 22 | g2p_en 23 | torchaudio 24 | modelscope==1.10.0 25 | sentencepiece 26 | transformers 27 | chardet 28 | PyYAML 29 | psutil 30 | jieba_fast 31 | jieba 32 | LangSegment>=0.3.1 33 | Faster_Whisper 34 | fastapi 35 | uvicorn 36 | wordsegment 37 | srt 38 | 39 | pyloudnorm -------------------------------------------------------------------------------- /src/api_utils.py: 
-------------------------------------------------------------------------------- 1 | import socket 2 | # 便于小白理解 3 | def get_localhost_ipv4_address(host = "127.0.0.1"): 4 | 5 | def get_internal_ip(): 6 | """获取内部IP地址""" 7 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 8 | try: 9 | # 这不会发送真正的数据包 10 | s.connect(('10.253.156.219', 1)) 11 | IP = s.getsockname()[0] 12 | except Exception: 13 | IP = '127.0.0.1' 14 | finally: 15 | s.close() 16 | return IP 17 | 18 | if host == "0.0.0.0": 19 | display_hostname = get_internal_ip() 20 | return display_hostname 21 | else: 22 | return host 23 | 24 | def get_gradio_frp(server_name, server_port, share_token) -> str: 25 | from urllib.parse import urlparse, urlunparse 26 | from gradio import networking 27 | share_url = networking.setup_tunnel( 28 | local_host=server_name, 29 | local_port=server_port, 30 | share_token=share_token, 31 | share_server_address=None, 32 | ) 33 | parsed_url = urlparse(share_url) 34 | share_server_protocol = "https" 35 | share_url = urlunparse( 36 | (share_server_protocol,) + parsed_url[1:] 37 | ) 38 | return share_url -------------------------------------------------------------------------------- /src/common_config_manager.py: -------------------------------------------------------------------------------- 1 | import os, sys, json 2 | from typing import List, Any, Optional ,Dict, Literal 3 | from pydantic import BaseModel, Field, model_validator 4 | 5 | __version__ = "2.6.3" 6 | 7 | from Synthesizers.base import load_config 8 | 9 | 10 | class Api_Config(BaseModel): 11 | config_path:str = None 12 | tts_port: int = 5000 13 | tts_host: str = "0.0.0.0" 14 | synthesizer: str = "gsv_fast" 15 | 16 | 17 | def __init__(self, config_path = None): 18 | super().__init__() 19 | 20 | self.config_path = config_path 21 | assert os.path.exists(self.config_path), f"配置文件不存在: {self.config_path}" 22 | if os.path.exists(self.config_path): 23 | all_config = load_config(self.config_path) 24 | config:dict = all_config.get("common", {}) 25 | for key, value in config.items(): 26 | setattr(self, key, value) 27 | 28 | class App_Config(BaseModel): 29 | 30 | config_path:str = None 31 | locale: str = "auto" 32 | is_share: bool = False 33 | inbrowser: bool = True 34 | server_name: str = "0.0.0.0" 35 | server_port: int = -1 # -1 means auto select 36 | also_enable_api: bool = True 37 | synthesizer: str = "gsv_fast" 38 | max_text_length: int = -1 39 | 40 | @model_validator(mode='after') 41 | def check_locale(self): 42 | # Example: validating locale to be one of a set predefined values or patterns 43 | self.locale = self.locale.replace("-", "_") 44 | return self 45 | 46 | @staticmethod 47 | def check_port(port:int, server_name:str): 48 | url = f"http://{server_name}:{port}" 49 | 50 | 51 | def __init__(self, config_path = None): 52 | super().__init__() 53 | 54 | self.config_path = config_path 55 | assert os.path.exists(self.config_path), f"配置文件不存在: {self.config_path}" 56 | if os.path.exists(self.config_path): 57 | all_config = load_config(self.config_path) 58 | config = all_config.get("app_config", {}) 59 | for key, value in config.items(): 60 | setattr(self, key, value) 61 | 62 | app_config = App_Config("common_config.json") 63 | api_config = Api_Config("common_config.json") 64 | 65 | 66 | -------------------------------------------------------------------------------- /tmp_audio/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | 
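`src/common_config_manager.py` above exposes the module-level singletons `app_config` and `api_config`, both populated from `common_config.json`; other modules in this repository (for example `pure_api.py` and `tools/i18n/i18n.py`) simply import them and read attributes. A minimal usage sketch, assuming it is run from the repository root with `common_config.json` present:

```python
# Read the already-parsed configuration singletons; no re-parsing is needed.
from src.common_config_manager import __version__, app_config, api_config

print("backend version:", __version__)
print("web ui:", app_config.server_name, app_config.server_port, app_config.synthesizer)
print("pure api:", api_config.tts_host, api_config.tts_port, api_config.synthesizer)
```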
-------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/tools/__init__.py -------------------------------------------------------------------------------- /tools/asr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def check_fw_local_models(): 4 | ''' 5 | 启动时检查本地是否有 Faster Whisper 模型. 6 | ''' 7 | model_size_list = [ 8 | "tiny", "tiny.en", 9 | "base", "base.en", 10 | "small", "small.en", 11 | "medium", "medium.en", 12 | "large", "large-v1", 13 | "large-v2", "large-v3"] 14 | for i, size in enumerate(model_size_list): 15 | if os.path.exists(f'tools/asr/models/faster-whisper-{size}'): 16 | model_size_list[i] = size + '-local' 17 | return model_size_list 18 | 19 | asr_dict = { 20 | "达摩 ASR (中文)": { 21 | 'lang': ['zh'], 22 | 'size': ['large'], 23 | 'path': 'funasr_asr.py', 24 | }, 25 | "Faster Whisper (多语种)": { 26 | 'lang': ['auto', 'zh', 'en', 'ja'], 27 | 'size': check_fw_local_models(), 28 | 'path': 'fasterwhisper_asr.py' 29 | } 30 | } 31 | 32 | -------------------------------------------------------------------------------- /tools/asr/funasr_asr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import argparse 4 | import os 5 | import traceback 6 | from tqdm import tqdm 7 | 8 | from funasr import AutoModel 9 | 10 | path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' 11 | path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch' 12 | path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch' 13 | path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" 14 | path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch" 15 | path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" 16 | 17 | model = AutoModel( 18 | model = path_asr, 19 | model_revision = "v2.0.4", 20 | vad_model = path_vad, 21 | vad_model_revision = "v2.0.4", 22 | punc_model = path_punc, 23 | punc_model_revision = "v2.0.4", 24 | ) 25 | 26 | def only_asr(input_file): 27 | try: 28 | text = model.generate(input=input_file)[0]["text"] 29 | except: 30 | text = '' 31 | print(traceback.format_exc()) 32 | return text 33 | 34 | def execute_asr(input_folder, output_folder, model_size, language): 35 | input_file_names = os.listdir(input_folder) 36 | input_file_names.sort() 37 | 38 | output = [] 39 | output_file_name = os.path.basename(input_folder) 40 | 41 | for file_name in tqdm(input_file_names): 42 | try: 43 | file_path = os.path.join(input_folder, file_name) 44 | text = model.generate(input=file_path)[0]["text"] 45 | output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}") 46 | except: 47 | print(traceback.format_exc()) 48 | 49 | output_folder = output_folder or "output/asr_opt" 50 | os.makedirs(output_folder, exist_ok=True) 51 | output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') 52 | 53 | with open(output_file_path, "w", encoding="utf-8") as f: 54 | f.write("\n".join(output)) 55 | print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") 56 | return output_file_path 57 | 58 | if __name__ == 
'__main__': 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("-i", "--input_folder", type=str, required=True, 61 | help="Path to the folder containing WAV files.") 62 | parser.add_argument("-o", "--output_folder", type=str, required=True, 63 | help="Output folder to store transcriptions.") 64 | parser.add_argument("-s", "--model_size", type=str, default='large', 65 | help="Model Size of FunASR is Large") 66 | parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh'], 67 | help="Language of the audio files.") 68 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 69 | help="fp16 or fp32")#还没接入 70 | 71 | cmd = parser.parse_args() 72 | execute_asr( 73 | input_folder = cmd.input_folder, 74 | output_folder = cmd.output_folder, 75 | model_size = cmd.model_size, 76 | language = cmd.language, 77 | ) 78 | -------------------------------------------------------------------------------- /tools/asr/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /tools/cmd-denoise.py: -------------------------------------------------------------------------------- 1 | import os,argparse 2 | 3 | from modelscope.pipelines import pipeline 4 | from modelscope.utils.constant import Tasks 5 | from tqdm import tqdm 6 | 7 | path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k' 8 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" 9 | ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise) 10 | def execute_denoise(input_folder,output_folder): 11 | os.makedirs(output_folder,exist_ok=True) 12 | # print(input_folder) 13 | # print(list(os.listdir(input_folder).sort())) 14 | for name in tqdm(os.listdir(input_folder)): 15 | ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-i", "--input_folder", type=str, required=True, 20 | help="Path to the folder containing WAV files.") 21 | parser.add_argument("-o", "--output_folder", type=str, required=True, 22 | help="Output folder to store transcriptions.") 23 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 24 | help="fp16 or fp32")#还没接入 25 | cmd = parser.parse_args() 26 | execute_denoise( 27 | input_folder = cmd.input_folder, 28 | output_folder = cmd.output_folder, 29 | ) -------------------------------------------------------------------------------- /tools/denoise-model/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /tools/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import locale 4 | from src.common_config_manager import app_config 5 | 6 | def load_language_list(language, locale_paths): 7 | language_map = {} 8 | for locale_path in locale_paths: 9 | lang_file = os.path.join(locale_path, f"{language}.json") 10 | if os.path.exists(lang_file): 11 | with open(lang_file, 'r', encoding='utf-8') as f: 12 | language_map.update(json.load(f)) 13 | return language_map 14 | 15 | class I18nAuto: 16 | def __init__(self, language=None, locale_paths=[], locale_path="./i18n/locale"): 17 | 
if language in ["auto", None]: 18 | if app_config.locale in ["auto", None, ""]: 19 | language = locale.getdefaultlocale()[0] 20 | else: 21 | language = app_config.locale 22 | if not any(os.path.exists(os.path.join(locale_path, f"{language}.json")) for locale_path in locale_paths): 23 | language = "zh_CN" 24 | self.language = language 25 | if len(locale_paths): 26 | self.language_map = load_language_list(language, locale_paths) 27 | else: 28 | self.language_map = load_language_list(language, [locale_path]) 29 | 30 | def __call__(self, key): 31 | return self.language_map.get(key, key) 32 | 33 | def __repr__(self): 34 | return "Use Language: " + self.language 35 | -------------------------------------------------------------------------------- /tools/i18n/locale_diff.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import OrderedDict 4 | 5 | # dir_path = "./i18n/locale" # The path to the i18n locale directory, you can change it to your own path 6 | dir_path = "./tools/srt_slicer/i18n/locale" 7 | # Define the standard file name 8 | standard_file = os.path.join(dir_path, "zh_CN.json") 9 | 10 | # Find all JSON files in the directory 11 | languages = [ 12 | os.path.join(dir_path, f) 13 | for f in os.listdir(dir_path) 14 | if f.endswith(".json") and f != standard_file 15 | ] 16 | 17 | # Load the standard file 18 | with open(standard_file, "r", encoding="utf-8") as f: 19 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 20 | 21 | # Loop through each language file 22 | for lang_file in languages: 23 | # Load the language file 24 | with open(lang_file, "r", encoding="utf-8") as f: 25 | lang_data = json.load(f, object_pairs_hook=OrderedDict) 26 | 27 | # Find the difference between the language file and the standard file 28 | diff = set(standard_data.keys()) - set(lang_data.keys()) 29 | 30 | miss = set(lang_data.keys()) - set(standard_data.keys()) 31 | 32 | # Add any missing keys to the language file 33 | for key in diff: 34 | lang_data[key] = standard_data[key] 35 | 36 | # Del any extra keys to the language file 37 | for key in miss: 38 | del lang_data[key] 39 | 40 | # Sort the keys of the language file to match the order of the standard file 41 | lang_data = OrderedDict( 42 | sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])) 43 | ) 44 | 45 | # Save the updated language file 46 | with open(lang_file, "w", encoding="utf-8") as f: 47 | json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True) 48 | f.write("\n") 49 | -------------------------------------------------------------------------------- /tools/i18n/scan_i18n.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import json 3 | from collections import OrderedDict 4 | import os 5 | 6 | # locale_path = "./i18n/locale" # The path to the i18n locale directory, you can change it to your own path 7 | # scan_list = ["./", 8 | # "GPT_SoVITS/", 9 | # "tools/" 10 | # ] # The path to the directory you want to scan, you can change it to your own path 11 | # scan_subfolders = False # Whether to scan subfolders 12 | 13 | locale_path = "./tools/srt_slicer/i18n/locale" 14 | scan_list = ["./tools/srt_slicer"] # The path to the directory you want to scan, you can change it to your own path 15 | scan_subfolders = True 16 | 17 | special_words_to_keep = { 18 | "auto": "自动判断", 19 | "zh": "中文", 20 | "en": "英文", 21 | "ja": "日文", 22 | "all_zh": "只有中文", 23 | "all_ja": "只有日文", 24 | 
"auto_cut": "智能切分", 25 | "cut0": "仅凭换行切分", 26 | "cut1": "凑四句一切", 27 | "cut2": "凑50字一切", 28 | "cut3": "按中文句号。切", 29 | "cut4": "按英文句号.切", 30 | "cut5": "按标点符号切", 31 | 32 | } 33 | 34 | 35 | def extract_i18n_strings(node): 36 | i18n_strings = [] 37 | 38 | if ( 39 | isinstance(node, ast.Call) 40 | and isinstance(node.func, ast.Name) 41 | and node.func.id == "i18n" 42 | ): 43 | for arg in node.args: 44 | if isinstance(arg, ast.Str): 45 | i18n_strings.append(arg.s) 46 | 47 | for child_node in ast.iter_child_nodes(node): 48 | i18n_strings.extend(extract_i18n_strings(child_node)) 49 | 50 | return i18n_strings 51 | 52 | strings = [] 53 | 54 | # for each file, parse the code into an AST 55 | # for each AST, extract the i18n strings 56 | def scan_i18n_strings(filename): 57 | with open(filename, "r", encoding="utf-8") as f: 58 | code = f.read() 59 | if "I18nAuto" in code: 60 | tree = ast.parse(code) 61 | i18n_strings = extract_i18n_strings(tree) 62 | print(filename, len(i18n_strings)) 63 | strings.extend(i18n_strings) 64 | 65 | 66 | # scan the directory for all .py files (recursively) 67 | if scan_subfolders: 68 | for folder in scan_list: 69 | for dirpath, dirnames, filenames in os.walk(folder): 70 | for filename in [f for f in filenames if f.endswith(".py")]: 71 | scan_i18n_strings(os.path.join(dirpath, filename)) 72 | else: 73 | for folder in scan_list: 74 | for filename in os.listdir(folder): 75 | if filename.endswith(".py"): 76 | scan_i18n_strings(os.path.join(folder, filename)) 77 | 78 | code_keys = set(strings) 79 | """ 80 | n_i18n.py 81 | gui_v1.py 26 82 | app.py 16 83 | infer-web.py 147 84 | scan_i18n.py 0 85 | i18n.py 0 86 | lib/train/process_ckpt.py 1 87 | """ 88 | print() 89 | print("Total unique:", len(code_keys)) 90 | 91 | 92 | standard_file = os.path.join(locale_path, "zh_CN.json") 93 | try: 94 | with open(standard_file, "r", encoding="utf-8") as f: 95 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 96 | standard_keys = set(standard_data.keys()) 97 | except FileNotFoundError: 98 | standard_keys = set() 99 | # Define the standard file name 100 | unused_keys = standard_keys - code_keys 101 | print("Unused keys:", len(unused_keys)) 102 | for unused_key in unused_keys: 103 | print("\t", unused_key) 104 | 105 | missing_keys = code_keys - standard_keys 106 | print("Missing keys:", len(missing_keys)) 107 | for missing_key in missing_keys: 108 | print("\t", missing_key) 109 | 110 | 111 | 112 | code_keys_dict = OrderedDict() 113 | for s in strings: 114 | if s in special_words_to_keep: 115 | code_keys_dict[s] = special_words_to_keep[s] 116 | else: 117 | code_keys_dict[s] = s 118 | 119 | # write back 120 | os.makedirs(locale_path, exist_ok=True) 121 | with open(standard_file, "w", encoding="utf-8") as f: 122 | json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True) 123 | f.write("\n") 124 | -------------------------------------------------------------------------------- /tools/my_utils.py: -------------------------------------------------------------------------------- 1 | import platform,os,traceback 2 | import ffmpeg 3 | import numpy as np 4 | 5 | 6 | def load_audio(file, sr): 7 | try: 8 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 9 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 10 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
11 | file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 12 | if os.path.exists(file) == False: 13 | raise RuntimeError( 14 | "You input a wrong audio path that does not exists, please fix it!" 15 | ) 16 | out, _ = ( 17 | ffmpeg.input(file, threads=0) 18 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 19 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 20 | ) 21 | except Exception as e: 22 | traceback.print_exc() 23 | raise RuntimeError(f"Failed to load audio: {e}") 24 | 25 | return np.frombuffer(out, np.float32).flatten() 26 | 27 | 28 | def clean_path(path_str): 29 | if platform.system() == 'Windows': 30 | path_str = path_str.replace('/', '\\') 31 | return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ").strip("\u202a") 32 | -------------------------------------------------------------------------------- /tools/normalize_loudness.py: -------------------------------------------------------------------------------- 1 | import soundfile as sf 2 | import numpy as np 3 | from pyloudnorm import Meter, normalize 4 | import os 5 | 6 | def normalize_loudness(audio_path, target_loudness, target_path): 7 | """ 8 | 归一化音频文件的响度到指定的目标响度。 9 | 10 | 参数: 11 | audio_path (str): 原始音频文件的路径。 12 | target_loudness (float): 目标响度值(LUFS)。 13 | target_path (str): 归一化后音频的保存路径。 14 | 15 | 返回: 16 | bool: 归一化操作是否成功。 17 | """ 18 | try: 19 | # 读取音频文件 20 | data, rate = sf.read(audio_path) 21 | 22 | # 创建响度仪表,基于ITU-R BS.1770 23 | meter = Meter(rate) # 采样率 24 | 25 | # 测量音频的响度 26 | loudness = meter.integrated_loudness(data) 27 | 28 | # 响度归一化 29 | normalized_audio = normalize.loudness(data, loudness, target_loudness) 30 | 31 | os.makedirs(os.path.dirname(target_path), exist_ok=True) 32 | # 保存归一化后的音频文件 33 | sf.write(target_path, normalized_audio, rate) 34 | 35 | return True 36 | except Exception as e: 37 | raise e -------------------------------------------------------------------------------- /tools/slice_audio.py: -------------------------------------------------------------------------------- 1 | import os,sys,numpy as np 2 | import traceback 3 | from scipy.io import wavfile 4 | # parent_directory = os.path.dirname(os.path.abspath(__file__)) 5 | # sys.path.append(parent_directory) 6 | from my_utils import load_audio 7 | from slicer2 import Slicer 8 | 9 | def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): 10 | os.makedirs(opt_root,exist_ok=True) 11 | if os.path.isfile(inp): 12 | input=[inp] 13 | elif os.path.isdir(inp): 14 | input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] 15 | else: 16 | return "输入路径存在但既不是文件也不是文件夹" 17 | slicer = Slicer( 18 | sr=32000, # 长音频采样率 19 | threshold= int(threshold), # 音量小于这个值视作静音的备选切割点 20 | min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 21 | min_interval= int(min_interval), # 最短切割间隔 22 | hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) 23 | max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长 24 | ) 25 | _max=float(_max) 26 | alpha=float(alpha) 27 | for inp_path in input[int(i_part)::int(all_part)]: 28 | # print(inp_path) 29 | try: 30 | name = os.path.basename(inp_path) 31 | audio = load_audio(inp_path, 32000) 32 | # print(audio.shape) 33 | for chunk, start, end in slicer.slice(audio): # start和end是帧数 34 | tmp_max = np.abs(chunk).max() 35 | if(tmp_max>1):chunk/=tmp_max 36 | chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk 37 | wavfile.write( 38 | "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), 39 | 32000, 40 | # 
chunk.astype(np.float32), 41 | (chunk * 32767).astype(np.int16), 42 | ) 43 | except: 44 | print(inp_path,"->fail->",traceback.format_exc()) 45 | return "执行完毕,请检查输出文件" 46 | 47 | print(slice(*sys.argv[1:])) 48 | 49 | -------------------------------------------------------------------------------- /tools/srt_slicer/i18n/locale/en_US.json: -------------------------------------------------------------------------------- 1 | { 2 | "List 合并小工具": "List Merge Tool", 3 | "SRT合并切分插件": "SRT Merge and Split Plugin", 4 | "SRT文件": "SRT File", 5 | "SRT编辑界面": "SRT Edit Interface", 6 | "srt文件内容": "SRT File Content", 7 | "上传SRT文件": "Upload SRT File", 8 | "上传文件": "Upload Files", 9 | "两个文件夹不能相同!!!": "The two folders cannot be the same!!!", 10 | "主文件夹": "Main Folder", 11 | "作者: ": "Author: ", 12 | "使用方法": "How to Use", 13 | "保存合并后字幕": "Save Merged Subtitles", 14 | "保存子文件夹名称": "Save Subfolder Name", 15 | "保存文件夹": "Save Folder", 16 | "允许最短长度": "Minimum Allowed Length", 17 | "内容预览": "Content Preview", 18 | "切分与保存": "Split and Save", 19 | "切分完成": "Split Completed", 20 | "切分并保存音频、list": "Split and Save Audio, List", 21 | "切分预览": "Split Preview", 22 | "判定为短间隔时长": "Judged as Short Interval Duration", 23 | "到": " to ", 24 | "前置保留时间": "Preceding Retention Time", 25 | "前置添加静音时间": "Prepend Silence Time", 26 | "句末加句号": "Add Period at the End of Sentence", 27 | "合并后srt文本": "Merged SRT Text", 28 | "合并后的List": "Merged List", 29 | "合并字幕": "Merge Subtitles", 30 | "合并字幕设置": "Subtitle Merge Settings", 31 | "合并文件夹与List": "Merge Folder and List", 32 | "后置保留时间": "Following Retention Time", 33 | "后置添加静音时间": "Append Silence Time", 34 | "扫描文件夹": "Scan Folder", 35 | "找不到字幕!!!": "Subtitles Not Found!!!", 36 | "找不到音频!!!": "Audio Not Found!!!", 37 | "提供SRT文件(可使用剪映或者ASR工具获得)与原始音频文件。": "Provide SRT File (can be obtained via Clip or ASR tools) and Original Audio File.", 38 | "提前合并时间间隔很短的字幕": "Merge Subtitles with Short Intervals in Advance", 39 | "提示": "Tips", 40 | "文件夹路径": "Folder Path", 41 | "最大间隔时间": "Maximum Interval Time", 42 | "最长允许单句长度": "Maximum Allowed Sentence Length", 43 | "根据面板合并短句并过滤你不希望出现的句子。": "Merge short sentences according to the panel and filter out sentences you do not want to appear.", 44 | "次文件夹": "Second Folder", 45 | "正在切分音频": "Splitting Audio", 46 | "正在建设,敬请期待": "Under Construction, Stay Tuned", 47 | "注意:该文件夹已存在": "Warning: The folder already exists", 48 | "角色名称,留空使用主文件夹的": "Role Name, Leave Blank to Use Main Folder's", 49 | "语言": "Language", 50 | "读取文件": "Read File", 51 | "读取本地文件": "Read Local File", 52 | "过滤字幕": "Filter Subtitles", 53 | "过滤带有英文的": "Filter Out English", 54 | "过滤设置": "Filter Settings", 55 | "过滤词语,一行一个": "Filter Words, One Per Line", 56 | "这是一个插件,用于依靠SRT文件得到切分与打标好的音频。": "This is a plugin for obtaining split and tagged audio based on SRT files.", 57 | "随后保存成切分好的音频与list文件。": "Then save as split audio and list files.", 58 | "音频文件": "Audio File", 59 | "音频格式": "Audio Format" 60 | } 61 | -------------------------------------------------------------------------------- /tools/srt_slicer/i18n/locale/zh_CN.json: -------------------------------------------------------------------------------- 1 | { 2 | "List 合并小工具": "List 合并小工具", 3 | "SRT合并切分插件": "SRT合并切分插件", 4 | "SRT文件": "SRT文件", 5 | "SRT编辑界面": "SRT编辑界面", 6 | "srt文件内容": "srt文件内容", 7 | "上传SRT文件": "上传SRT文件", 8 | "上传文件": "上传文件", 9 | "两个文件夹不能相同!!!": "两个文件夹不能相同!!!", 10 | "主文件夹": "主文件夹", 11 | "作者: ": "作者: ", 12 | "使用方法": "使用方法", 13 | "保存合并后字幕": "保存合并后字幕", 14 | "保存子文件夹名称": "保存子文件夹名称", 15 | "保存文件夹": "保存文件夹", 16 | "允许最短长度": "允许最短长度", 17 | "内容预览": "内容预览", 18 | "切分与保存": 
"切分与保存", 19 | "切分完成": "切分完成", 20 | "切分并保存音频、list": "切分并保存音频、list", 21 | "切分预览": "切分预览", 22 | "判定为短间隔时长": "判定为短间隔时长", 23 | "到": "到", 24 | "前置保留时间": "前置保留时间", 25 | "前置添加静音时间": "前置添加静音时间", 26 | "句末加句号": "句末加句号", 27 | "合并后srt文本": "合并后srt文本", 28 | "合并后的List": "合并后的List", 29 | "合并字幕": "合并字幕", 30 | "合并字幕设置": "合并字幕设置", 31 | "合并文件夹与List": "合并文件夹与List", 32 | "后置保留时间": "后置保留时间", 33 | "后置添加静音时间": "后置添加静音时间", 34 | "扫描文件夹": "扫描文件夹", 35 | "找不到字幕!!!": "找不到字幕!!!", 36 | "找不到音频!!!": "找不到音频!!!", 37 | "提供SRT文件(可使用剪映或者ASR工具获得)与原始音频文件。": "提供SRT文件(可使用剪映或者ASR工具获得)与原始音频文件。", 38 | "提前合并时间间隔很短的字幕": "提前合并时间间隔很短的字幕", 39 | "提示": "提示", 40 | "文件夹路径": "文件夹路径", 41 | "最大间隔时间": "最大间隔时间", 42 | "最长允许单句长度": "最长允许单句长度", 43 | "根据面板合并短句并过滤你不希望出现的句子。": "根据面板合并短句并过滤你不希望出现的句子。", 44 | "次文件夹": "次文件夹", 45 | "正在切分音频": "正在切分音频", 46 | "正在建设,敬请期待": "正在建设,敬请期待", 47 | "注意:该文件夹已存在": "注意:该文件夹已存在", 48 | "角色名称,留空使用主文件夹的": "角色名称,留空使用主文件夹的", 49 | "语言": "语言", 50 | "读取文件": "读取文件", 51 | "读取本地文件": "读取本地文件", 52 | "过滤字幕": "过滤字幕", 53 | "过滤带有英文的": "过滤带有英文的", 54 | "过滤设置": "过滤设置", 55 | "过滤词语,一行一个": "过滤词语,一行一个", 56 | "这是一个插件,用于依靠SRT文件得到切分与打标好的音频。": "这是一个插件,用于依靠SRT文件得到切分与打标好的音频。", 57 | "随后保存成切分好的音频与list文件。": "随后保存成切分好的音频与list文件。", 58 | "音频文件": "音频文件", 59 | "音频格式": "音频格式" 60 | } 61 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if 
self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = 
torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not 
None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | 5 | default_param = {} 6 | default_param["bins"] = 768 7 | default_param["unstable_bins"] = 9 # training only 8 | default_param["reduction_bins"] = 762 # training only 9 | default_param["sr"] = 44100 10 | default_param["pre_filter_start"] = 757 11 | default_param["pre_filter_stop"] = 768 12 | default_param["band"] = {} 13 | 14 | 15 | default_param["band"][1] = { 16 | "sr": 11025, 17 | "hl": 128, 18 | "n_fft": 960, 19 | "crop_start": 0, 20 | "crop_stop": 245, 21 | "lpf_start": 61, # inference only 22 | "res_type": "polyphase", 23 | } 24 | 25 | default_param["band"][2] = { 26 | "sr": 44100, 27 | "hl": 512, 28 | "n_fft": 1536, 29 | "crop_start": 24, 30 | "crop_stop": 547, 31 | "hpf_start": 81, # inference only 32 | "res_type": "sinc_best", 33 | } 34 | 35 | 36 | def int_keys(d): 37 | r = {} 38 | for k, v in d: 39 | if k.isdigit(): 40 | k = int(k) 41 | r[k] = v 42 | return r 43 | 44 | 45 | class ModelParameters(object): 46 | def __init__(self, config_path=""): 47 | if ".pth" == pathlib.Path(config_path).suffix: 48 | import zipfile 49 | 50 | with zipfile.ZipFile(config_path, "r") as zip: 51 | self.param = json.loads( 52 | zip.read("param.json"), object_pairs_hook=int_keys 53 | ) 54 | elif ".json" == pathlib.Path(config_path).suffix: 55 | with open(config_path, "r") as f: 56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 57 | else: 58 | self.param = default_param 59 | 60 | for k in [ 61 | "mid_side", 62 | "mid_side_b", 63 | "mid_side_b2", 64 | "stereo_w", 65 | "stereo_n", 66 | "reverse", 67 | ]: 68 | if not k in self.param: 69 | self.param[k] = False 70 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 
5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | 
"hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | 
-------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | 
"hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | 
"hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 
34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets.py: -------------------------------------------------------------------------------- 1 | import layers 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import spec_utils 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 16) 44 | self.stg1_high_band_net = BaseASPPNet(2, 16) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(8, 16) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(16, 32) 51 | 52 | self.out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : 
aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | 
pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | 
input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tools/uvr5/lib/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def load_data(file_name: str = "./lib/name_params.json") -> dict: 9 | with open(file_name, "r") as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def make_padding(width, cropsize, offset): 16 | left = offset 17 | roi_size = cropsize - left * 2 18 | if roi_size == 0: 19 | roi_size = cropsize 20 | right = roi_size - (width % roi_size) + left 21 | 22 | return left, right, roi_size 23 | 24 | 25 | def inference(X_spec, device, model, aggressiveness, data): 26 | """ 27 | data : dic configs 28 | """ 29 | 30 | def _execute( 31 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True 32 | ): 33 | model.eval() 34 | with torch.no_grad(): 35 | preds = [] 36 | 37 | iterations = [n_window] 38 | 39 | total_iterations = sum(iterations) 40 | for i in tqdm(range(n_window)): 41 | start = i * roi_size 42 | X_mag_window = X_mag_pad[ 43 | None, :, :, start : start + data["window_size"] 44 | ] 45 | X_mag_window = torch.from_numpy(X_mag_window) 46 | if is_half: 47 | X_mag_window = X_mag_window.half() 48 | X_mag_window = X_mag_window.to(device) 49 | 50 | pred = model.predict(X_mag_window, aggressiveness) 51 | 52 | pred = pred.detach().cpu().numpy() 53 | preds.append(pred[0]) 54 | 55 | pred = np.concatenate(preds, axis=2) 56 | return pred 57 | 58 | def preprocess(X_spec): 59 | X_mag = np.abs(X_spec) 60 | X_phase = np.angle(X_spec) 61 | 62 | return X_mag, X_phase 63 | 64 | X_mag, X_phase = preprocess(X_spec) 65 | 66 | coef = X_mag.max() 67 | X_mag_pre = X_mag / coef 68 | 69 | n_frame = X_mag_pre.shape[2] 70 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) 71 | n_window = int(np.ceil(n_frame / roi_size)) 72 | 73 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 74 | 75 | if list(model.state_dict().values())[0].dtype == torch.float16: 76 | is_half = True 77 | else: 78 | is_half = False 79 | pred = _execute( 80 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 81 | ) 82 | pred = pred[:, :, :n_frame] 83 | 84 | if data["tta"]: 85 | pad_l += roi_size // 2 86 | pad_r += roi_size // 2 87 | n_window += 1 88 | 89 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 90 | 91 | pred_tta = _execute( 92 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 93 | ) 
94 | pred_tta = pred_tta[:, :, roi_size // 2 :] 95 | pred_tta = pred_tta[:, :, :n_frame] 96 | 97 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) 98 | else: 99 | return pred * coef, X_mag, np.exp(1.0j * X_phase) 100 | 101 | 102 | def _get_name_params(model_path, model_hash): 103 | data = load_data() 104 | flag = False 105 | ModelName = model_path 106 | for type in list(data): 107 | for model in list(data[type][0]): 108 | for i in range(len(data[type][0][model])): 109 | if str(data[type][0][model][i]["hash_name"]) == model_hash: 110 | flag = True 111 | elif str(data[type][0][model][i]["hash_name"]) in ModelName: 112 | flag = True 113 | 114 | if flag: 115 | model_params_auto = data[type][0][model][i]["model_params"] 116 | param_name_auto = data[type][0][model][i]["param_name"] 117 | if type == "equivalent": 118 | return param_name_auto, model_params_auto 119 | else: 120 | flag = False 121 | return param_name_auto, model_params_auto 122 | -------------------------------------------------------------------------------- /tools/uvr5/uvr5_weights/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /webuis/character_manager/i18n/locale/zh_CN.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", 返回内容:", 3 | "这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
": "这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路径", 6 | "Sovits模型路径": "Sovits模型路径", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "只有日文", 11 | "all_zh": "只有中文", 12 | "auto": "自动判断", 13 | "auto_cut": "智能切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不并行,越大越快,但是越可能出问题", 15 | "cut0": "仅凭换行切分", 16 | "cut1": "凑四句一切", 17 | "cut2": "凑50字一切", 18 | "cut3": "按中文句号。切", 19 | "cut4": "按英文句号.切", 20 | "cut5": "按标点符号切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json设置(一般不动)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情感列表网址", 29 | "从json中读取": "从json中读取", 30 | "使用前,请确认后端服务已启动。": "使用前,请确认后端服务已启动。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不会有完成提示,没报错就是成功)", 32 | "保存失败!": "保存失败!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端处理后的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "参考音频在3~10秒范围外,请更换!", 38 | "参考音频路径": "参考音频路径", 39 | "发送json格式": "发送json格式", 40 | "发送并开始播放": "发送并开始播放", 41 | "发送请求": "发送请求", 42 | "发送请求到": "发送请求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。", 45 | "基础选项": "基础选项", 46 | "实际输入的参考文本:": "实际输入的参考文本:", 47 | "实际输入的目标文本(切句后):": "实际输入的目标文本(切句后):", 48 | "实际输入的目标文本(每句):": "实际输入的目标文本(每句):", 49 | "实际输入的目标文本:": "实际输入的目标文本:", 50 | "密码": "密码", 51 | "当前人物": "当前人物", 52 | "当前人物变更为: ": "当前人物变更为: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用经典推理模式,部分选项不可用", 54 | "情感列表": "情感列表", 55 | "情感风格": "情感风格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。", 57 | "扫描": "扫描", 58 | "扫描人物列表": "扫描人物列表", 59 | "扫描模型文件夹:": "扫描模型文件夹:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!请把有效文件放置在文件夹下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,当前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示语言", 65 | "文件打开失败,保存失败!": "文件打开失败,保存失败!", 66 | "文本语言": "文本语言", 67 | "是否自动匹配情感": "是否自动匹配情感", 68 | "模型文件夹路径": "模型文件夹路径", 69 | "每句允许最大切分字词数": "每句允许最大切分字词数", 70 | "流式音频": "流式音频", 71 | "添加情感": "添加情感", 72 | "点击查看详细文档": "点击查看详细文档", 73 | "版本": "版本", 74 | "用户名": "用户名", 75 | "种子": "种子", 76 | "简介": "简介", 77 | "缺失某些项,保存失败!": "缺失某些项,保存失败!", 78 | "网址设置": "网址设置", 79 | "自动生成info": "自动生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑问或需要进一步了解,可参考文档:", 81 | "认证信息": "认证信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设", 83 | "语速": "语速", 84 | "请修改后点击下方按钮进行保存": "请修改后点击下方按钮进行保存", 85 | "请求失败,状态码:": "请求失败,状态码:", 86 | "请求失败,请检查URL是否正确": "请求失败,请检查URL是否正确", 87 | "请求完整音频": "请求完整音频", 88 | "请求网址": "请求网址", 89 | "输入文本": "输入文本", 90 | "这是一个由": "这是一个由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "这是展示页面的版本,并未使用后端服务,下面参数无效。", 93 | "选择角色": "选择角色", 94 | "音频输出": "音频输出", 95 | "音频预览": "音频预览", 96 | "项目开源地址:": "项目开源地址:", 97 | "高级选项": "高级选项", 98 | "最大允许长度": "最大允许长度" 99 | } 100 | -------------------------------------------------------------------------------- /webuis/character_manager/i18n/locale/zh_TW.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", 返回內容:", 3 | "这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
": "這是模型管理介面,為了實現對多段參考音頻分配情緒設計,如果您只有一段可不使用這個介面
若有疑問或需要進一步了解,可參考文件:點擊查看詳細文件。
", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路徑", 6 | "Sovits模型路径": "Sovits模型路徑", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "僅日文", 11 | "all_zh": "僅中文", 12 | "auto": "自動判斷", 13 | "auto_cut": "智慧切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不並行,越大越快,但是越可能出現問題", 15 | "cut0": "僅憑換行切分", 16 | "cut1": "湊四句一切", 17 | "cut2": "湊50字一切", 18 | "cut3": "按中文句號。切", 19 | "cut4": "按英文句號.切", 20 | "cut5": "按標點符號切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json設置(一般不動)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情緒列表網址", 29 | "从json中读取": "從json中讀取", 30 | "使用前,请确认后端服务已启动。": "使用前,請確認後端服務已啟動。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不會有完成提示,沒報錯就是成功)", 32 | "保存失败!": "保存失敗!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端處理後的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!", 38 | "参考音频路径": "參考音頻路徑", 39 | "发送json格式": "發送json格式", 40 | "发送并开始播放": "發送並開始播放", 41 | "发送请求": "發送請求", 42 | "发送请求到": "發送請求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或調節batch size滑條。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或者更換參考音頻(使用模型管理介面)、調節下方batch size滑條。", 45 | "基础选项": "基礎選項", 46 | "实际输入的参考文本:": "實際輸入的參考文本:", 47 | "实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):", 48 | "实际输入的目标文本(每句):": "實際輸入的目標文本(每句):", 49 | "实际输入的目标文本:": "實際輸入的目標文本:", 50 | "密码": "密碼", 51 | "当前人物": "當前人物", 52 | "当前人物变更为: ": "當前人物變更為: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用經典推理模式,部分選項不可用", 54 | "情感列表": "情緒列表", 55 | "情感风格": "情緒風格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "有時掉進黑洞,有時候爬上彩虹。在下一秒鐘,命運如何轉動,沒有人會曉得。我說希望無窮,你猜美夢成空,相信和懷疑,總要決鬥。", 57 | "扫描": "掃描", 58 | "扫描人物列表": "掃描人物列表", 59 | "扫描模型文件夹:": "掃描模型文件夾:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!請把有效文件放置在文件夾下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,當前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示語言", 65 | "文件打开失败,保存失败!": "文件開啟失敗,保存失敗!", 66 | "文本语言": "文本語言", 67 | "是否自动匹配情感": "是否自動匹配情緒", 68 | "模型文件夹路径": "模型文件夾路徑", 69 | "每句允许最大切分字词数": "每句允許最大切分字詞數", 70 | "流式音频": "流式音頻", 71 | "添加情感": "添加情緒", 72 | "点击查看详细文档": "點擊查看詳細文件", 73 | "版本": "版本", 74 | "用户名": "使用者名稱", 75 | "种子": "種子", 76 | "简介": "簡介", 77 | "缺失某些项,保存失败!": "缺失某些項,保存失敗!", 78 | "网址设置": "網址設置", 79 | "自动生成info": "自動生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑問或需要進一步了解,可參考文件:", 81 | "认证信息": "認證信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "認證信息已啟用,您可以在config.json中關閉。\n但是這個功能還沒做好,只是擺設", 83 | "语速": "語速", 84 | "请修改后点击下方按钮进行保存": "請修改後點擊下方按鈕進行保存", 85 | "请求失败,状态码:": "請求失敗,狀態碼:", 86 | "请求失败,请检查URL是否正确": "請求失敗,請檢查URL是否正確", 87 | "请求完整音频": "請求完整音頻", 88 | "请求网址": "請求網址", 89 | "输入文本": "輸入文本", 90 | "这是一个由": "這是一個由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "這是一個配置文件適用於https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一個簡單好用的前後端項目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "這是展示頁面的版本,並未使用後端服務,下面參數無效。", 93 | "选择角色": "選擇角色", 94 | "音频输出": "音頻輸出", 95 | "音频预览": "音頻預覽", 96 | "项目开源地址:": "Github Link:", 97 | "高级选项": "高級選項", 98 | "最大允许长度": "最大允許長度" 99 | } 100 | --------------------------------------------------------------------------------