├── .dockerignore
├── .gitignore
├── 0 一键启动脚本
├── 0 一键更新项目.bat
├── 1 一键更新本项目所需要的依赖.bat
├── 10 启动模型管理界面(可选).bat
├── 11 启动原项目的训练界面(小白别开,请根据页面上的文档链接自行研究,推理群不包解答).bat
├── 3 启动GSVI.bat
├── 5 启动纯粹的后端(不推荐).bat
├── 999 强制更新:会覆盖你的设置,慎用,和0功能类似.bat
├── Cfg
│ ├── About.txt
│ └── Cfg.ini
├── GPT-soVITS Start.exe
└── 说明.txt
├── Docker
├── damo.sha256
├── download.py
├── download.sh
├── links.sha256
└── links.txt
├── Dockerfile
├── GPT_SoVITS
├── AR
│ ├── __init__.py
│ ├── data
│ │ ├── __init__.py
│ │ ├── bucket_sampler.py
│ │ ├── data_module.py
│ │ └── dataset.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── t2s_lightning_module.py
│ │ ├── t2s_lightning_module_onnx.py
│ │ ├── t2s_model.py
│ │ ├── t2s_model_batch_only.py
│ │ ├── t2s_model_onnx.py
│ │ └── utils.py
│ ├── modules
│ │ ├── __init__.py
│ │ ├── activation.py
│ │ ├── activation_onnx.py
│ │ ├── embedding.py
│ │ ├── embedding_onnx.py
│ │ ├── lr_schedulers.py
│ │ ├── optim.py
│ │ ├── patched_mha_with_cache.py
│ │ ├── patched_mha_with_cache_onnx.py
│ │ ├── scaling.py
│ │ ├── transformer.py
│ │ └── transformer_onnx.py
│ ├── text_processing
│ │ ├── __init__.py
│ │ ├── phonemizer.py
│ │ └── symbols.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── initialize.py
│ │ └── io.py
├── TTS_infer_pack
│ ├── TTS.py
│ ├── TextPreprocessor.py
│ ├── __init__.py
│ └── text_segmentation_method.py
├── configs
│ ├── s1.yaml
│ ├── s1big.yaml
│ ├── s1big2.yaml
│ ├── s1longer.yaml
│ ├── s1mq.yaml
│ ├── s2.json
│ ├── train.yaml
│ └── tts_infer.yaml
├── feature_extractor
│ ├── __init__.py
│ ├── cnhubert.py
│ └── whisper_enc.py
├── inference_gui.py
├── inference_webui.py
├── inference_webui_old.py
├── module
│ ├── __init__.py
│ ├── attentions.py
│ ├── attentions_onnx.py
│ ├── commons.py
│ ├── core_vq.py
│ ├── data_utils.py
│ ├── losses.py
│ ├── mel_processing.py
│ ├── models.py
│ ├── models_onnx.py
│ ├── modules.py
│ ├── mrte_model.py
│ ├── quantize.py
│ └── transforms.py
├── my_utils.py
├── onnx_export.py
├── prepare_datasets
│ ├── 1-get-text.py
│ ├── 2-get-hubert-wav32k.py
│ └── 3-get-semantic.py
├── pretrained_models
│ └── .gitignore
├── process_ckpt.py
├── s1_train.py
├── s2_train.py
├── text
│ ├── __init__.py
│ ├── chinese.py
│ ├── cleaner.py
│ ├── cmudict-fast.rep
│ ├── cmudict.rep
│ ├── engdict-hot.rep
│ ├── engdict_cache.pickle
│ ├── english.py
│ ├── japanese.py
│ ├── namedict_cache.pickle
│ ├── opencpop-strict.txt
│ ├── symbols.py
│ ├── tone_sandhi.py
│ └── zh_normalization
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── char_convert.py
│ │ ├── chronology.py
│ │ ├── constants.py
│ │ ├── num.py
│ │ ├── phonecode.py
│ │ ├── quantifier.py
│ │ └── text_normlization.py
└── utils.py
├── LICENSE
├── README.md
├── Synthesizers
├── base
│ ├── Base_TTS_Synthesizer.py
│ ├── Base_TTS_Task.py
│ ├── __init__.py
│ └── config_utils.py
├── gsv_fast
│ ├── GSV_Synthesizer.py
│ ├── __init__.py
│ ├── configs
│ │ ├── i18n
│ │ │ └── locale
│ │ │ │ ├── en_US.json
│ │ │ │ ├── zh_CN.json
│ │ │ │ └── zh_TW.json
│ │ ├── params_config.json
│ │ └── ui_config.json
│ ├── gsv_config.py
│ ├── gsv_task.py
│ └── ssml_dealer.py
└── remote
│ ├── Remote_Synthesizer.py
│ ├── __init__.py
│ ├── configs
│ ├── config.json
│ ├── i18n
│ │ └── locale
│ │ │ ├── en_US.json
│ │ │ ├── zh_CN.json
│ │ │ └── zh_TW.json
│ ├── params_config.json
│ └── ui_config.json
│ └── remote_task.py
├── api_doc.md
├── app.py
├── colab_webui.ipynb
├── common_config.json
├── docker-compose.yaml
├── dockerbuild.sh
├── docs
├── cn
│ ├── Changelog_CN.md
│ └── README.md
├── ja
│ ├── Changelog_JA.md
│ └── README.md
├── ko
│ ├── Changelog_KO.md
│ └── README.md
└── tr
│ └── README.md
├── gpt-sovits_kaggle.ipynb
├── gsv_config.json
├── i18n
└── locale
│ ├── en_US.json
│ ├── es_ES.json
│ ├── fr_FR.json
│ ├── it_IT.json
│ ├── ja_JP.json
│ ├── ko_KR.json
│ ├── pt_BR.json
│ ├── ru_RU.json
│ ├── tr_TR.json
│ ├── zh_CN.json
│ ├── zh_HK.json
│ ├── zh_SG.json
│ └── zh_TW.json
├── install.sh
├── pure_api.py
├── requirements.txt
├── src
├── api_utils.py
└── common_config_manager.py
├── tmp_audio
└── .gitignore
├── tools
├── __init__.py
├── asr
│ ├── config.py
│ ├── fasterwhisper_asr.py
│ ├── funasr_asr.py
│ └── models
│ │ └── .gitignore
├── cmd-denoise.py
├── denoise-model
│ └── .gitignore
├── i18n
│ ├── i18n.py
│ ├── locale
│ │ ├── en_US.json
│ │ ├── es_ES.json
│ │ ├── fr_FR.json
│ │ ├── it_IT.json
│ │ ├── ja_JP.json
│ │ ├── ko_KR.json
│ │ ├── ru_RU.json
│ │ ├── tr_TR.json
│ │ ├── zh_CN.json
│ │ ├── zh_HK.json
│ │ ├── zh_SG.json
│ │ └── zh_TW.json
│ ├── locale_diff.py
│ └── scan_i18n.py
├── my_utils.py
├── normalize_loudness.py
├── slice_audio.py
├── slicer2.py
├── srt_slicer
│ ├── i18n
│ │ └── locale
│ │ │ ├── en_US.json
│ │ │ └── zh_CN.json
│ ├── srt_utils.py
│ └── webui.py
├── subfix_webui.py
└── uvr5
│ ├── lib
│ ├── lib_v5
│ │ ├── dataset.py
│ │ ├── layers.py
│ │ ├── layers_123812KB.py
│ │ ├── layers_123821KB.py
│ │ ├── layers_33966KB.py
│ │ ├── layers_537227KB.py
│ │ ├── layers_537238KB.py
│ │ ├── layers_new.py
│ │ ├── model_param_init.py
│ │ ├── modelparams
│ │ │ ├── 1band_sr16000_hl512.json
│ │ │ ├── 1band_sr32000_hl512.json
│ │ │ ├── 1band_sr33075_hl384.json
│ │ │ ├── 1band_sr44100_hl1024.json
│ │ │ ├── 1band_sr44100_hl256.json
│ │ │ ├── 1band_sr44100_hl512.json
│ │ │ ├── 1band_sr44100_hl512_cut.json
│ │ │ ├── 2band_32000.json
│ │ │ ├── 2band_44100_lofi.json
│ │ │ ├── 2band_48000.json
│ │ │ ├── 3band_44100.json
│ │ │ ├── 3band_44100_mid.json
│ │ │ ├── 3band_44100_msb2.json
│ │ │ ├── 4band_44100.json
│ │ │ ├── 4band_44100_mid.json
│ │ │ ├── 4band_44100_msb.json
│ │ │ ├── 4band_44100_msb2.json
│ │ │ ├── 4band_44100_reverse.json
│ │ │ ├── 4band_44100_sw.json
│ │ │ ├── 4band_v2.json
│ │ │ ├── 4band_v2_sn.json
│ │ │ ├── 4band_v3.json
│ │ │ └── ensemble.json
│ │ ├── nets.py
│ │ ├── nets_123812KB.py
│ │ ├── nets_123821KB.py
│ │ ├── nets_33966KB.py
│ │ ├── nets_537227KB.py
│ │ ├── nets_537238KB.py
│ │ ├── nets_61968KB.py
│ │ ├── nets_new.py
│ │ └── spec_utils.py
│ ├── name_params.json
│ └── utils.py
│ ├── mdxnet.py
│ ├── uvr5_weights
│ └── .gitignore
│ ├── vr.py
│ └── webui.py
└── webuis
├── builders
└── gradio_builder.py
└── character_manager
├── i18n
└── locale
│ ├── en_US.json
│ ├── zh_CN.json
│ └── zh_TW.json
└── webui.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | docs
2 | logs
3 | output
4 | reference
5 | SoVITS_weights
6 | GPT_weights
7 | TEMP
8 | GPT_SoVITS
9 | trained
10 | .git
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__
3 | *.pyc
4 | env
5 | runtime
6 | .idea
7 | output
8 | logs
9 | reference
10 | GPT_weights
11 | SoVITS_weights
12 | TEMP
13 | PortableGit
14 | cache
15 |
16 | ffmpeg.exe
17 | ffprobe.exe
18 | tmp_audio
19 | trained*
20 | history
21 | app.log
22 |
--------------------------------------------------------------------------------
/0 一键启动脚本/0 一键更新项目.bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | setlocal
4 |
5 |
6 | echo 设置需要同步的本地仓库路径
7 | set REPO_PATH=../
8 |
9 | echo 切换到仓库目录
10 | cd /d %REPO_PATH%
11 |
12 | echo 设置 PortableGit 的路径
13 | set GIT_PATH=PortableGit/bin
14 |
15 | echo 更新所有子模块
16 | "%GIT_PATH%\git.exe" submodule update --init --recursive
17 |
18 | echo 执行 git pull 更新本地仓库
19 | "%GIT_PATH%\git.exe" stash
20 | "%GIT_PATH%\git.exe" pull https://gitee.com/xxoy/GPT-SoVITS-Inference.git stable
21 |
22 | echo.
23 | echo 更新完成!
24 | pause
25 |
--------------------------------------------------------------------------------
/0 一键启动脚本/1 一键更新本项目所需要的依赖.bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | cd ../
4 | echo 请确保您的主项目运行正常
5 | runtime\python.exe -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
6 | runtime\python.exe -m pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn
7 | runtime\python.exe -m pip install -r ./requirements.txt
8 |
9 | pause
--------------------------------------------------------------------------------
/0 一键启动脚本/10 启动模型管理界面(可选).bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | cd ../
4 | echo 尝试启动程序,请耐心等待gradio启动,等待十几秒,若未自动弹出浏览器,请手动打开浏览器输入http://127.0.0.1:9868
5 | runtime\python.exe ./webuis/character_manager/webui.py
6 |
7 | pause
--------------------------------------------------------------------------------
/0 一键启动脚本/11 启动原项目的训练界面(小白别开,请根据页面上的文档链接自行研究,推理群不包解答).bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | cd ../
4 | echo 尝试启动原版的训练推理界面
5 |
6 | runtime\python.exe ./webui.py
7 |
8 | pause
--------------------------------------------------------------------------------
/0 一键启动脚本/3 启动GSVI.bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | cd ../
4 | echo 尝试启动程序,请耐心等待gradio启动,等待十几秒,若未自动弹出浏览器,请手动打开浏览器输入你配置的网址,例如:http://127.0.0.1:5000
5 | runtime\python.exe app.py
6 |
7 | pause
--------------------------------------------------------------------------------
/0 一键启动脚本/5 启动纯粹的后端(不推荐).bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | cd ../
4 | echo 尝试启动程序
5 | runtime\python.exe pure_api.py
6 |
7 | pause
--------------------------------------------------------------------------------
/0 一键启动脚本/999 强制更新:会覆盖你的设置,慎用,和0功能类似.bat:
--------------------------------------------------------------------------------
1 | CHCP 65001
2 | @echo off
3 | setlocal
4 |
5 |
6 | echo 设置需要同步的本地仓库路径
7 | set REPO_PATH=../
8 |
9 | echo 切换到仓库目录
10 | cd /d %REPO_PATH%
11 |
12 | echo 设置 PortableGit 的路径
13 | set GIT_PATH=PortableGit/bin
14 |
15 | echo 强制覆盖本地仓库
16 | "%GIT_PATH%\git.exe" fetch https://gitee.com/xxoy/GPT-SoVITS-Inference.git stable
17 | "%GIT_PATH%\git.exe" reset --hard FETCH_HEAD
18 |
19 | echo.
20 | echo 更新完成!
21 | pause
--------------------------------------------------------------------------------
/0 一键启动脚本/Cfg/About.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/0 一键启动脚本/Cfg/About.txt
--------------------------------------------------------------------------------
/0 一键启动脚本/Cfg/Cfg.ini:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/0 一键启动脚本/Cfg/Cfg.ini
--------------------------------------------------------------------------------
/0 一键启动脚本/GPT-soVITS Start.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/0 一键启动脚本/GPT-soVITS Start.exe
--------------------------------------------------------------------------------
/0 一键启动脚本/说明.txt:
--------------------------------------------------------------------------------
1 |
2 | 1. Starting GSVI
3 |
4 | GSVI now has the backend functionality built in, and it is used the same way.
5 | Depending on your settings, for example if your address is 127.0.0.1:5000,
6 | opening that address directly gives you a simple gradio synthesis interface.
7 | Visiting a link such as http://127.0.0.1:5000/tts?text=你好 generates an audio file and returns it to you.
8 |
9 | You can start it with the launcher in this directory, or run "3 启动GSVI.bat" manually.
10 |
11 | For more examples, see https://www.yuque.com/xter/zibxlp
12 |
13 | 2. Pure backend (not recommended)
14 |
15 | If you want a pure backend, run "5 启动纯粹的后端(不推荐).bat".
16 | Usage is exactly the same as above; for more details see the API documentation, also at the link above.
17 | It is entirely unnecessary, though: attaching a simple gradio interface costs no performance.
18 |
19 | 3. Model management interface
20 |
21 | There is a rather crude gradio interface that badly needs an upgrade, but it will do for now.
22 | You can start it with the launcher in this directory, or double-click "10 启动模型管理界面(可选).bat".
23 |
24 | 4. Starting the original GSV training interface
25 |
26 | You can start the GSV WebUI,
27 | but we do not actually recommend it: this inference project is not intended for training, its dependency versions are too new, and it reportedly runs into minor issues.
28 | You will also need to add the denoising models yourself; to save space, the new version no longer ships them by default.
29 | Paths: tools\uvr5\uvr5_weights\, tools\asr\models\models\, and tools\denoise-model\
30 | You can copy a fresh tools\ directory over from the original project.
31 |
32 | About configuration files: you can edit gsv_config.json and common_config.json in the root directory; see the Yuque documentation for details.
33 |
34 | Also, if you run into problems, update first (999-0-1: run these 3 .bat files).
35 |
36 |
37 |
--------------------------------------------------------------------------------
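
The GET endpoint described in section 1 of 说明.txt above can be exercised with a short script. The following is a minimal sketch only: it assumes the service is running at the example address 127.0.0.1:5000 and that the response body is the generated audio; the output file name output.wav is a placeholder chosen here, not something defined by the project.

    # Minimal sketch: fetch synthesized audio from the GSVI /tts endpoint.
    from urllib.parse import urlencode
    from urllib.request import urlopen

    params = urlencode({"text": "你好"})  # URL-encode the text parameter
    with urlopen(f"http://127.0.0.1:5000/tts?{params}") as resp:
        audio = resp.read()  # response body is the generated audio file

    with open("output.wav", "wb") as f:  # placeholder output name/extension
        f.write(audio)
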
/Docker/damo.sha256:
--------------------------------------------------------------------------------
1 | 5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025 /workspace/tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.pb
2 | b3be75be477f0780277f3bae0fe489f48718f585f3a6e45d7dd1fbb1a4255fc5 /workspace/tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pb
3 | a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916 /workspace/tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pb
--------------------------------------------------------------------------------
/Docker/download.py:
--------------------------------------------------------------------------------
1 | # Download ModelScope (damo) ASR-related models
2 | from modelscope import snapshot_download
3 | model_dir = snapshot_download('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',revision="v2.0.4")
4 | model_dir = snapshot_download('damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',revision="v2.0.4")
5 | model_dir = snapshot_download('damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',revision="v2.0.4")
6 |
--------------------------------------------------------------------------------
/Docker/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -Eeuo pipefail
4 |
5 | echo "Downloading models..."
6 |
7 | aria2c --disable-ipv6 --input-file /workspace/Docker/links.txt --dir /workspace --continue
8 |
9 | echo "Checking SHA256..."
10 |
11 | parallel --will-cite -a /workspace/Docker/links.sha256 "echo -n {} | sha256sum -c"
12 |
--------------------------------------------------------------------------------
/Docker/links.sha256:
--------------------------------------------------------------------------------
1 | b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1 /workspace/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
2 | fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8 /workspace/GPT_SoVITS/pretrained_models/s2D488k.pth
3 | 020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835 /workspace/GPT_SoVITS/pretrained_models/s2G488k.pth
4 | 24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c /workspace/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
5 | e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8 /workspace/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
6 | 39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05 /workspace/tools/uvr5/uvr5_weights/HP2_all_vocals.pth
7 | 45e6b65199e781b4a6542002699be9f19cd3d1cb7d1558bc2bfbcd84674dfe28 /workspace/tools/uvr5/uvr5_weights/HP3_all_vocals.pth
8 | 5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee /workspace/tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
9 | 8c8fd1582f9aabc363e47af62ddb88df6cae7e064cae75bbf041a067a5e0aee2 /workspace/tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
10 | 01376dd2a571bf3cb9cced680732726d2d732609d09216a610b0d110f133febe /workspace/tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
11 | 56aba59db3bcdd14a14464e62f3129698ecdea62eee0f003b9360923eb3ac79e /workspace/tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
12 | 233bb5c6aaa365e568659a0a81211746fa881f8f47f82d9e864fce1f7692db80 /workspace/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
--------------------------------------------------------------------------------
/Docker/links.txt:
--------------------------------------------------------------------------------
1 | # GPT-SoVITS models
2 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s1bert25hz-2kh-longer-epoch%3D68e-step%3D50232.ckpt
3 | out=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
4 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2D488k.pth
5 | out=GPT_SoVITS/pretrained_models/s2D488k.pth
6 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2G488k.pth
7 | out=GPT_SoVITS/pretrained_models/s2G488k.pth
8 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/config.json
9 | out=GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json
10 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/preprocessor_config.json
11 | out=GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json
12 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/pytorch_model.bin
13 | out=GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
14 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/config.json
15 | out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json
16 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/pytorch_model.bin
17 | out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
18 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/tokenizer.json
19 | out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json
20 | # UVR5
21 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth
22 | out=tools/uvr5/uvr5_weights/HP2_all_vocals.pth
23 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth
24 | out=tools/uvr5/uvr5_weights/HP3_all_vocals.pth
25 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth
26 | out=tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
27 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth
28 | out=tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
29 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth
30 | out=tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
31 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth
32 | out=tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
33 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
34 | out=tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Base CUDA image
2 | FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04
3 |
4 | LABEL maintainer="breakstring@hotmail.com"
5 | LABEL version="dev-20240209"
6 | LABEL description="Docker image for GPT-SoVITS-Inference"
7 |
8 |
9 | # Install 3rd party apps
10 | ENV DEBIAN_FRONTEND=noninteractive
11 | ENV TZ=Etc/UTC
12 | RUN sed -i 's|http://archive.ubuntu.com/ubuntu/|http://mirrors.aliyun.com/ubuntu/|g' /etc/apt/sources.list && \
13 | apt-get update && \
14 | apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \
15 | git lfs install && \
16 | rm -rf /var/lib/apt/lists/*
17 |
18 | # Copy only requirements.txt initially to leverage Docker cache
19 | WORKDIR /workspace
20 | COPY requirements.txt /workspace/
21 | RUN pip install --no-cache-dir -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
22 |
23 | # Define a build-time argument for image type
24 | ARG IMAGE_TYPE=full
25 |
26 | # Conditional logic based on the IMAGE_TYPE argument
27 | # Always copy the Docker directory, but only use it if IMAGE_TYPE is not "elite"
28 | COPY ./Docker /workspace/Docker
29 | # The "elite" image type does not include the extra models
30 |
31 | # If you can download directly from the official sources (may require a proxy), uncomment the block below; otherwise follow the instructions in ReadMe.md and place the model files into the corresponding folders yourself
32 | #RUN if [ "$IMAGE_TYPE" != "elite" ]; then \
33 | # chmod +x /workspace/Docker/download.sh && \
34 | # /workspace/Docker/download.sh && \
35 | # python /workspace/Docker/download.py && \
36 | # pip install -i https://pypi.tuna.tsinghua.edu.cn/simple nltk && \
37 | # python -m nltk.downloader averaged_perceptron_tagger cmudict; \
38 | # fi
39 |
40 |
41 |
42 | # Copy the rest of the application
43 | COPY . /workspace
44 |
45 | #EXPOSE 9871 9872 9873 9874 9880
46 | EXPOSE 5000
47 |
48 |
49 | CMD ["python", "app.py"]
50 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/AR/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/AR/data/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/data_module.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | from pytorch_lightning import LightningDataModule
4 | from AR.data.bucket_sampler import DistributedBucketSampler
5 | from AR.data.dataset import Text2SemanticDataset
6 | from torch.utils.data import DataLoader
7 |
8 |
9 | class Text2SemanticDataModule(LightningDataModule):
10 | def __init__(
11 | self,
12 | config,
13 | train_semantic_path,
14 | train_phoneme_path,
15 | dev_semantic_path=None,
16 | dev_phoneme_path=None,
17 | ):
18 | super().__init__()
19 | self.config = config
20 | self.train_semantic_path = train_semantic_path
21 | self.train_phoneme_path = train_phoneme_path
22 | self.dev_semantic_path = dev_semantic_path
23 | self.dev_phoneme_path = dev_phoneme_path
24 | self.num_workers = self.config["data"]["num_workers"]
25 |
26 | def prepare_data(self):
27 | pass
28 |
29 | def setup(self, stage=None, output_logs=False):
30 | self._train_dataset = Text2SemanticDataset(
31 | phoneme_path=self.train_phoneme_path,
32 | semantic_path=self.train_semantic_path,
33 | max_sec=self.config["data"]["max_sec"],
34 | pad_val=self.config["data"]["pad_val"],
35 | )
36 | self._dev_dataset = self._train_dataset
37 | # self._dev_dataset = Text2SemanticDataset(
38 | # phoneme_path=self.dev_phoneme_path,
39 | # semantic_path=self.dev_semantic_path,
40 | # max_sample=self.config['data']['max_eval_sample'],
41 | # max_sec=self.config['data']['max_sec'],
42 | # pad_val=self.config['data']['pad_val'])
43 |
44 | def train_dataloader(self):
45 | batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"]
46 | batch_size = max(min(batch_size,len(self._train_dataset)//4),1)  # guard against nothing being saved
47 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size)
48 | return DataLoader(
49 | self._train_dataset,
50 | batch_size=batch_size,
51 | sampler=sampler,
52 | collate_fn=self._train_dataset.collate,
53 | num_workers=self.num_workers,
54 | persistent_workers=True,
55 | prefetch_factor=16,
56 | )
57 |
58 | def val_dataloader(self):
59 | return DataLoader(
60 | self._dev_dataset,
61 | batch_size=1,
62 | shuffle=False,
63 | collate_fn=self._train_dataset.collate,
64 | num_workers=max(self.num_workers, 12),
65 | persistent_workers=True,
66 | prefetch_factor=16,
67 | )
68 |
69 | # Is this ever actually used?
70 | def test_dataloader(self):
71 | return DataLoader(
72 | self._dev_dataset,
73 | batch_size=1,
74 | shuffle=False,
75 | collate_fn=self._train_dataset.collate,
76 | )
77 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/AR/models/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | import os, sys
4 |
5 | now_dir = os.getcwd()
6 | sys.path.append(now_dir)
7 | from typing import Dict
8 |
9 | import torch
10 | from pytorch_lightning import LightningModule
11 | from AR.models.t2s_model_onnx import Text2SemanticDecoder
12 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule
13 | from AR.modules.optim import ScaledAdam
14 |
15 |
16 | class Text2SemanticLightningModule(LightningModule):
17 | def __init__(self, config, output_dir, is_train=True):
18 | super().__init__()
19 | self.config = config
20 | self.top_k = 3
21 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
22 | pretrained_s1 = config.get("pretrained_s1")
23 | if pretrained_s1 and is_train:
24 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
25 | print(
26 | self.load_state_dict(
27 | torch.load(pretrained_s1, map_location="cpu")["weight"]
28 | )
29 | )
30 | if is_train:
31 | self.automatic_optimization = False
32 | self.save_hyperparameters()
33 | self.eval_dir = output_dir / "eval"
34 | self.eval_dir.mkdir(parents=True, exist_ok=True)
35 |
36 | def training_step(self, batch: Dict, batch_idx: int):
37 | opt = self.optimizers()
38 | scheduler = self.lr_schedulers()
39 | loss, acc = self.model.forward(
40 | batch["phoneme_ids"],
41 | batch["phoneme_ids_len"],
42 | batch["semantic_ids"],
43 | batch["semantic_ids_len"],
44 | batch["bert_feature"],
45 | )
46 | self.manual_backward(loss)
47 | if batch_idx > 0 and batch_idx % 4 == 0:
48 | opt.step()
49 | opt.zero_grad()
50 | scheduler.step()
51 |
52 | self.log(
53 | "total_loss",
54 | loss,
55 | on_step=True,
56 | on_epoch=True,
57 | prog_bar=True,
58 | sync_dist=True,
59 | )
60 | self.log(
61 | "lr",
62 | scheduler.get_last_lr()[0],
63 | on_epoch=True,
64 | prog_bar=True,
65 | sync_dist=True,
66 | )
67 | self.log(
68 | f"top_{self.top_k}_acc",
69 | acc,
70 | on_step=True,
71 | on_epoch=True,
72 | prog_bar=True,
73 | sync_dist=True,
74 | )
75 |
76 | def validation_step(self, batch: Dict, batch_idx: int):
77 | return
78 |
79 | def configure_optimizers(self):
80 | model_parameters = self.model.parameters()
81 | parameters_names = []
82 | parameters_names.append(
83 | [name_param_pair[0] for name_param_pair in self.model.named_parameters()]
84 | )
85 | lm_opt = ScaledAdam(
86 | model_parameters,
87 | lr=0.01,
88 | betas=(0.9, 0.95),
89 | clipping_scale=2.0,
90 | parameters_names=parameters_names,
91 | show_dominant_parameters=False,
92 | clipping_update_period=1000,
93 | )
94 |
95 | return {
96 | "optimizer": lm_opt,
97 | "lr_scheduler": {
98 | "scheduler": WarmupCosineLRSchedule(
99 | lm_opt,
100 | init_lr=self.config["optimizer"]["lr_init"],
101 | peak_lr=self.config["optimizer"]["lr"],
102 | end_lr=self.config["optimizer"]["lr_end"],
103 | warmup_steps=self.config["optimizer"]["warmup_steps"],
104 | total_steps=self.config["optimizer"]["decay_steps"],
105 | )
106 | },
107 | }
108 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/AR/modules/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/embedding.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
2 | import math
3 |
4 | import torch
5 | from torch import nn
6 |
7 |
8 | class TokenEmbedding(nn.Module):
9 | def __init__(
10 | self,
11 | embedding_dim: int,
12 | vocab_size: int,
13 | dropout: float = 0.0,
14 | ):
15 | super().__init__()
16 |
17 | self.vocab_size = vocab_size
18 | self.embedding_dim = embedding_dim
19 |
20 | self.dropout = torch.nn.Dropout(p=dropout)
21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
22 |
23 | @property
24 | def weight(self) -> torch.Tensor:
25 | return self.word_embeddings.weight
26 |
27 | def embedding(self, index: int) -> torch.Tensor:
28 | return self.word_embeddings.weight[index : index + 1]
29 |
30 | def forward(self, x: torch.Tensor):
31 | x = self.word_embeddings(x)
32 | x = self.dropout(x)
33 | return x
34 |
35 |
36 | class SinePositionalEmbedding(nn.Module):
37 | def __init__(
38 | self,
39 | embedding_dim: int,
40 | dropout: float = 0.0,
41 | scale: bool = False,
42 | alpha: bool = False,
43 | ):
44 | super().__init__()
45 | self.embedding_dim = embedding_dim
46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
48 | self.dropout = torch.nn.Dropout(p=dropout)
49 |
50 | self.reverse = False
51 | self.pe = None
52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000))
53 |
54 | def extend_pe(self, x):
55 | """Reset the positional encodings."""
56 | if self.pe is not None:
57 | if self.pe.size(1) >= x.size(1):
58 | if self.pe.dtype != x.dtype or self.pe.device != x.device:
59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device)
60 | return
61 | pe = torch.zeros(x.size(1), self.embedding_dim)
62 | if self.reverse:
63 | position = torch.arange(
64 | x.size(1) - 1, -1, -1.0, dtype=torch.float32
65 | ).unsqueeze(1)
66 | else:
67 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
68 | div_term = torch.exp(
69 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32)
70 | * -(math.log(10000.0) / self.embedding_dim)
71 | )
72 | pe[:, 0::2] = torch.sin(position * div_term)
73 | pe[:, 1::2] = torch.cos(position * div_term)
74 | pe = pe.unsqueeze(0)
75 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach()
76 |
77 | def forward(self, x: torch.Tensor) -> torch.Tensor:
78 | self.extend_pe(x)
79 | output = x.unsqueeze(-1) if x.ndim == 2 else x
80 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)]
81 | return self.dropout(output)
82 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/embedding_onnx.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
2 | import math
3 |
4 | import torch
5 | from torch import nn
6 |
7 |
8 | class TokenEmbedding(nn.Module):
9 | def __init__(
10 | self,
11 | embedding_dim: int,
12 | vocab_size: int,
13 | dropout: float = 0.0,
14 | ):
15 | super().__init__()
16 |
17 | self.vocab_size = vocab_size
18 | self.embedding_dim = embedding_dim
19 |
20 | self.dropout = torch.nn.Dropout(p=dropout)
21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
22 |
23 | @property
24 | def weight(self) -> torch.Tensor:
25 | return self.word_embeddings.weight
26 |
27 | def embedding(self, index: int) -> torch.Tensor:
28 | return self.word_embeddings.weight[index : index + 1]
29 |
30 | def forward(self, x: torch.Tensor):
31 | x = self.word_embeddings(x)
32 | x = self.dropout(x)
33 | return x
34 |
35 |
36 | class SinePositionalEmbedding(nn.Module):
37 | def __init__(
38 | self,
39 | embedding_dim: int,
40 | dropout: float = 0.0,
41 | scale: bool = False,
42 | alpha: bool = False,
43 | ):
44 | super().__init__()
45 | self.embedding_dim = embedding_dim
46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
48 | self.dropout = torch.nn.Dropout(p=dropout)
49 | self.reverse = False
50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim))
51 |
52 | def extend_pe(self, x):
53 | position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1)
54 | scpe = (position * self.div_term).unsqueeze(0)
55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0)
56 | pe = pe.contiguous().view(1, -1, self.embedding_dim)
57 | return pe
58 |
59 | def forward(self, x: torch.Tensor) -> torch.Tensor:
60 | pe = self.extend_pe(x)
61 | output = x.unsqueeze(-1) if x.ndim == 2 else x
62 | output = output * self.x_scale + self.alpha * pe
63 | return self.dropout(output)
64 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/lr_schedulers.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | import math
4 |
5 | import torch
6 | from matplotlib import pyplot as plt
7 | from torch import nn
8 | from torch.optim import Adam
9 |
10 |
11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler):
12 | """
13 | Implements a warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr', for multiple optimizers.
14 | """
15 |
16 | def __init__(
17 | self,
18 | optimizer,
19 | init_lr,
20 | peak_lr,
21 | end_lr,
22 | warmup_steps=10000,
23 | total_steps=400000,
24 | current_step=0,
25 | ):
26 | self.init_lr = init_lr
27 | self.peak_lr = peak_lr
28 | self.end_lr = end_lr
29 | self.optimizer = optimizer
30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps
31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps)
32 | self._current_step = current_step
33 | self.lr = init_lr
34 | self.warmup_steps = warmup_steps
35 | self.total_steps = total_steps
36 | self._last_lr = [self.lr]
37 |
38 | def set_lr(self, lr):
39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups]
40 | for g in self.optimizer.param_groups:
41 | # g['lr'] = lr
42 | g["lr"] = self.end_lr  ### locked to use a linear LR
43 |
44 | def step(self):
45 | if self._current_step < self.warmup_steps:
46 | lr = self.init_lr + self._warmup_rate * self._current_step
47 |
48 | elif self._current_step > self.total_steps:
49 | lr = self.end_lr
50 |
51 | else:
52 | decay_ratio = (self._current_step - self.warmup_steps) / (
53 | self.total_steps - self.warmup_steps
54 | )
55 | if decay_ratio < 0.0 or decay_ratio > 1.0:
56 | raise RuntimeError(
57 | "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings."
58 | )
59 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
60 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr)
61 |
62 | self.lr = lr = self.end_lr = 0.002  ### locked to use a linear LR ### it would not behave, so just hard-lock it!
63 | self.set_lr(lr)
64 | self.lr = lr
65 | self._current_step += 1
66 | return self.lr
67 |
68 |
69 | if __name__ == "__main__":
70 | m = nn.Linear(10, 10)
71 | opt = Adam(m.parameters(), lr=1e-4)
72 | s = WarmupCosineLRSchedule(
73 | opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0
74 | )
75 | lrs = []
76 | for i in range(25000):
77 | s.step()
78 | lrs.append(s.lr)
79 | print(s.lr)
80 |
81 | plt.plot(lrs)
82 | plt.plot(range(0, 25000), lrs)
83 | plt.show()
84 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py:
--------------------------------------------------------------------------------
1 | from torch.nn.functional import *
2 | from torch.nn.functional import (
3 | _mha_shape_check,
4 | _canonical_mask,
5 | _none_or_dtype,
6 | _in_projection_packed,
7 | )
8 |
9 | def multi_head_attention_forward_patched(
10 | query,
11 | key,
12 | value,
13 | embed_dim_to_check: int,
14 | num_heads: int,
15 | in_proj_weight,
16 | in_proj_bias: Optional[Tensor],
17 | bias_k: Optional[Tensor],
18 | bias_v: Optional[Tensor],
19 | add_zero_attn: bool,
20 | dropout_p: float,
21 | out_proj_weight: Tensor,
22 | out_proj_bias: Optional[Tensor],
23 | training: bool = True,
24 | key_padding_mask: Optional[Tensor] = None,
25 | need_weights: bool = True,
26 | attn_mask: Optional[Tensor] = None,
27 | use_separate_proj_weight: bool = False,
28 | q_proj_weight: Optional[Tensor] = None,
29 | k_proj_weight: Optional[Tensor] = None,
30 | v_proj_weight: Optional[Tensor] = None,
31 | static_k: Optional[Tensor] = None,
32 | static_v: Optional[Tensor] = None,
33 | average_attn_weights: bool = True,
34 | is_causal: bool = False,
35 | cache=None,
36 | ) -> Tuple[Tensor, Optional[Tensor]]:
37 |
38 | # set up shape vars
39 | _, _, embed_dim = query.shape
40 | attn_mask = _canonical_mask(
41 | mask=attn_mask,
42 | mask_name="attn_mask",
43 | other_type=None,
44 | other_name="",
45 | target_type=query.dtype,
46 | check_other=False,
47 | )
48 | head_dim = embed_dim // num_heads
49 |
50 | proj_qkv = linear(query, in_proj_weight, in_proj_bias)
51 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
52 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2]
53 |
54 | if cache["first_infer"] == 1:
55 | cache["k"][cache["stage"]] = k
56 | cache["v"][cache["stage"]] = v
57 | else:
58 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0)
59 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0)
60 | k = cache["k"][cache["stage"]]
61 | v = cache["v"][cache["stage"]]
62 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"]
63 |
64 | attn_mask = _canonical_mask(
65 | mask=attn_mask,
66 | mask_name="attn_mask",
67 | other_type=None,
68 | other_name="",
69 | target_type=q.dtype,
70 | check_other=False,
71 | )
72 | attn_mask = attn_mask.unsqueeze(0)
73 |
74 | q = q.view(-1, num_heads, head_dim).transpose(0, 1)
75 | k = k.view(-1, num_heads, head_dim).transpose(0, 1)
76 | v = v.view(-1, num_heads, head_dim).transpose(0, 1)
77 |
78 | dropout_p = 0.0
79 | attn_mask = attn_mask.unsqueeze(0)
80 | q = q.view(num_heads, -1, head_dim).unsqueeze(0)
81 | k = k.view(num_heads, -1, head_dim).unsqueeze(0)
82 | v = v.view(num_heads, -1, head_dim).unsqueeze(0)
83 | attn_output = scaled_dot_product_attention(
84 | q, k, v, attn_mask, dropout_p, is_causal
85 | )
86 | attn_output = (
87 | attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim)
88 | )
89 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
90 | attn_output = attn_output.view(-1, 1, attn_output.size(1))
91 |
92 | return attn_output
93 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/AR/text_processing/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/phonemizer.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | import itertools
4 | import re
5 | from typing import Dict
6 | from typing import List
7 |
8 | import regex
9 | from gruut import sentences
10 | from gruut.const import Sentence
11 | from gruut.const import Word
12 | from AR.text_processing.symbols import SYMBOL_TO_ID
13 |
14 |
15 | class GruutPhonemizer:
16 | def __init__(self, language: str):
17 | self._phonemizer = sentences
18 | self.lang = language
19 | self.symbol_to_id = SYMBOL_TO_ID
20 | self._special_cases_dict: Dict[str, str] = {
21 | r"\.\.\.": "... ",
22 | ";": "; ",
23 | ":": ": ",
24 | ",": ", ",
25 | r"\.": ". ",
26 | "!": "! ",
27 | r"\?": "? ",
28 | "—": "—",
29 | "…": "… ",
30 | "«": "«",
31 | "»": "»",
32 | }
33 | self._punctuation_regexp: str = (
34 | rf"([{''.join(self._special_cases_dict.keys())}])"
35 | )
36 |
37 | def _normalize_punctuation(self, text: str) -> str:
38 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text)
39 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text)
40 | text = regex.sub(r"\pZ+", r" ", text)
41 | return text.strip()
42 |
43 | def _convert_punctuation(self, word: Word) -> str:
44 | if not word.phonemes:
45 | return ""
46 | if word.phonemes[0] in ["‖", "|"]:
47 | return word.text.strip()
48 |
49 | phonemes = "".join(word.phonemes)
50 | # remove modifier characters ˈˌː with regex
51 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes)
52 | return phonemes.strip()
53 |
54 | def phonemize(self, text: str, espeak: bool = False) -> str:
55 | text_to_phonemize: str = self._normalize_punctuation(text)
56 | sents: List[Sentence] = [
57 | sent
58 | for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)
59 | ]
60 | words: List[str] = [
61 | self._convert_punctuation(word) for word in itertools.chain(*sents)
62 | ]
63 | return " ".join(words)
64 |
65 | def transform(self, phonemes):
66 | # convert phonemes to ids
67 | # dictionary is in symbols.py
68 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()]
69 |
70 |
71 | if __name__ == "__main__":
72 | phonemizer = GruutPhonemizer("en-us")
73 | # text -> IPA
74 | phonemes = phonemizer.phonemize("Hello, wor-ld ?")
75 | print("phonemes:", phonemes)
76 | print("len(phonemes):", len(phonemes))
77 | phoneme_ids = phonemizer.transform(phonemes)
78 | print("phoneme_ids:", phoneme_ids)
79 | print("len(phoneme_ids):", len(phoneme_ids))
80 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/symbols.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | PAD = "_"
4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” '
5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
6 | IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
7 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS)
8 | SPACE_ID = SYMBOLS.index(" ")
9 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)}
10 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)}
11 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/utils/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def str2bool(str):
5 | return True if str.lower() == 'true' else False
6 |
7 |
8 | def get_newest_ckpt(string_list):
9 | # Define a regex pattern that matches the numbers in each string
10 | pattern = r'epoch=(\d+)-step=(\d+)\.ckpt'
11 |
12 | # Use the regex to extract the numbers from each string and build a list of tuples
13 | extracted_info = []
14 | for string in string_list:
15 | match = re.match(pattern, string)
16 | if match:
17 | epoch = int(match.group(1))
18 | step = int(match.group(2))
19 | extracted_info.append((epoch, step, string))
20 | # Sort by the number after "epoch", then by the number after "step"
21 | sorted_info = sorted(
22 | extracted_info, key=lambda x: (x[0], x[1]), reverse=True)
23 | # Take the newest ckpt file name
24 | newest_ckpt = sorted_info[0][2]
25 | return newest_ckpt
26 |
27 |
28 | # Returns the file's first line if it exists and is non-empty, otherwise False
29 | def check_txt_file(file_path):
30 | try:
31 | with open(file_path, 'r') as file:
32 | text = file.readline().strip()
33 | assert text.strip() != ''
34 | return text
35 | except Exception:
36 | return False
37 | return False
38 |
--------------------------------------------------------------------------------
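
A minimal usage sketch of get_newest_ckpt from the file above, with hypothetical checkpoint file names: the entry with the highest (epoch, step) pair is returned.

    # Hypothetical file names; get_newest_ckpt picks the highest (epoch, step) pair.
    ckpts = ["epoch=4-step=1000.ckpt", "epoch=12-step=300.ckpt", "epoch=12-step=900.ckpt"]
    print(get_newest_ckpt(ckpts))  # -> "epoch=12-step=900.ckpt"
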
/GPT_SoVITS/AR/utils/initialize.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Initialize modules for espnet2 neural networks."""
3 | import torch
4 | from typeguard import check_argument_types
5 |
6 |
7 | def initialize(model: torch.nn.Module, init: str):
8 | """Initialize weights of a neural network module.
9 |
10 | Parameters are initialized using the given method or distribution.
11 |
12 | Custom initialization routines can be implemented into submodules
13 | as function `espnet_initialization_fn` within the custom module.
14 |
15 | Args:
16 | model: Target.
17 | init: Method of initialization.
18 | """
19 | assert check_argument_types()
20 | print("init with", init)
21 |
22 | # weight init
23 | for p in model.parameters():
24 | if p.dim() > 1:
25 | if init == "xavier_uniform":
26 | torch.nn.init.xavier_uniform_(p.data)
27 | elif init == "xavier_normal":
28 | torch.nn.init.xavier_normal_(p.data)
29 | elif init == "kaiming_uniform":
30 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu")
31 | elif init == "kaiming_normal":
32 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu")
33 | else:
34 | raise ValueError("Unknown initialization: " + init)
35 | # bias init
36 | for name, p in model.named_parameters():
37 | if ".bias" in name and p.dim() == 1:
38 | p.data.zero_()
39 |
--------------------------------------------------------------------------------
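
A minimal usage sketch of initialize() from the file above, assuming the function is in scope and typeguard 2.x (implied by the file's import) is installed; the wrapped Linear layer is just an arbitrary example module.

    import torch

    net = torch.nn.Sequential(torch.nn.Linear(8, 8))
    initialize(net, "xavier_uniform")  # weights get xavier-uniform init, ".bias" parameters are zeroed
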
/GPT_SoVITS/AR/utils/io.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | import yaml
5 |
6 |
7 | def load_yaml_config(path):
8 | with open(path) as f:
9 | config = yaml.full_load(f)
10 | return config
11 |
12 |
13 | def save_config_to_yaml(config, path):
14 | assert path.endswith(".yaml")
15 | with open(path, "w") as f:
16 | f.write(yaml.dump(config))
17 | f.close()
18 |
19 |
20 | def write_args(args, path):
21 | args_dict = dict(
22 | (name, getattr(args, name)) for name in dir(args) if not name.startswith("_")
23 | )
24 | with open(path, "a") as args_file:
25 | args_file.write("==> torch version: {}\n".format(torch.__version__))
26 | args_file.write(
27 | "==> cudnn version: {}\n".format(torch.backends.cudnn.version())
28 | )
29 | args_file.write("==> Cmd:\n")
30 | args_file.write(str(sys.argv))
31 | args_file.write("\n==> args:\n")
32 | for k, v in sorted(args_dict.items()):
33 | args_file.write(" %s: %s\n" % (str(k), str(v)))
34 | args_file.close()
35 |
--------------------------------------------------------------------------------
/GPT_SoVITS/TTS_infer_pack/__init__.py:
--------------------------------------------------------------------------------
1 | from . import TTS, text_segmentation_method
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 300
4 | batch_size: 8
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 16
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 512
24 | hidden_dim: 512
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 12
28 | dropout: 0
29 | EOS: 1024
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1big.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 300
4 | batch_size: 8
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 16-mixed
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 1024
24 | hidden_dim: 1024
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 16
28 | dropout: 0
29 | EOS: 1024
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1big2.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 300
4 | batch_size: 12
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 16-mixed
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 1024
24 | hidden_dim: 1024
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 6
28 | dropout: 0
29 | EOS: 1024
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1longer.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 20
4 | batch_size: 8
5 | save_every_n_epoch: 1
6 | precision: 16-mixed
7 | gradient_clip: 1.0
8 | optimizer:
9 | lr: 0.01
10 | lr_init: 0.00001
11 | lr_end: 0.0001
12 | warmup_steps: 2000
13 | decay_steps: 40000
14 | data:
15 | max_eval_sample: 8
16 | max_sec: 54
17 | num_workers: 4
18 | pad_val: 1024 # same with EOS in model
19 | model:
20 | vocab_size: 1025
21 | phoneme_vocab_size: 512
22 | embedding_dim: 512
23 | hidden_dim: 512
24 | head: 16
25 | linear_units: 2048
26 | n_layer: 24
27 | dropout: 0
28 | EOS: 1024
29 | random_bert: 0
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1mq.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 100
4 | batch_size: 6
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 32
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 40
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | saving_path: "ckpt/"
22 | resume_checkpoint: null
23 | vocoder_config_path: "quantizer/new_ckpt/config.json"
24 | vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000"
25 | datadir: "/home/liweiche/GigaSpeech/wavs"
26 | metapath: "/home/liweiche/GigaSpeech/train2.json"
27 | val_metapath: "/home/liweiche/GigaSpeech/dev2.json"
28 | sampledir: "logs/"
29 | pretrained_path: null
30 | lr: 0.0001
31 | batch_size: 200.0
32 | train_bucket_size: 8192
33 | training_step: 800000
34 | optim_flat_percent: 0.0
35 | warmup_step: 50
36 | adam_beta1: 0.9
37 | adam_beta2: 0.98
38 | ffd_size: 3072
39 | hidden_size: 768
40 | enc_nlayers: 6
41 | dec_nlayers: 6
42 | nheads: 12
43 | ar_layer: 4
44 | ar_ffd_size: 1024
45 | ar_hidden_size: 256
46 | ar_nheads: 4
47 | aligner_softmax_temp: 1.0
48 | layer_norm_eps: 0.00001
49 | speaker_embed_dropout: 0.05
50 | label_smoothing: 0.0
51 | val_check_interval: 5000
52 | check_val_every_n_epoch: 1
53 | precision: "fp16"
54 | nworkers: 16
55 | distributed: true
56 | accelerator: "ddp"
57 | version: null
58 | accumulate_grad_batches: 1
59 | use_repetition_token: true
60 | use_repetition_gating: false
61 | repetition_penalty: 1.0
62 | sampling_temperature: 1.0
63 | top_k: -1
64 | min_top_k: 3
65 | top_p: 0.8
66 | sample_num: 4
67 | length_penalty_max_length: 15000
68 | length_penalty_max_prob: 0.95
69 | max_input_length: 2048
70 | max_output_length: 2000
71 | sample_rate: 16000
72 | n_codes: 1024
73 | n_cluster_groups: 1
74 | phone_context_window: 4
75 | phoneset_size: 1000
76 | inference:
77 | top_k: 5
78 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s2.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 100,
4 | "eval_interval": 500,
5 | "seed": 1234,
6 | "epochs": 100,
7 | "learning_rate": 0.0001,
8 | "betas": [
9 | 0.8,
10 | 0.99
11 | ],
12 | "eps": 1e-09,
13 | "batch_size": 32,
14 | "fp16_run": true,
15 | "lr_decay": 0.999875,
16 | "segment_size": 20480,
17 | "init_lr_ratio": 1,
18 | "warmup_epochs": 0,
19 | "c_mel": 45,
20 | "c_kl": 1.0,
21 | "text_low_lr_rate": 0.4
22 | },
23 | "data": {
24 | "max_wav_value": 32768.0,
25 | "sampling_rate": 32000,
26 | "filter_length": 2048,
27 | "hop_length": 640,
28 | "win_length": 2048,
29 | "n_mel_channels": 128,
30 | "mel_fmin": 0.0,
31 | "mel_fmax": null,
32 | "add_blank": true,
33 | "n_speakers": 300,
34 | "cleaned_text": true
35 | },
36 | "model": {
37 | "inter_channels": 192,
38 | "hidden_channels": 192,
39 | "filter_channels": 768,
40 | "n_heads": 2,
41 | "n_layers": 6,
42 | "kernel_size": 3,
43 | "p_dropout": 0.1,
44 | "resblock": "1",
45 | "resblock_kernel_sizes": [
46 | 3,
47 | 7,
48 | 11
49 | ],
50 | "resblock_dilation_sizes": [
51 | [
52 | 1,
53 | 3,
54 | 5
55 | ],
56 | [
57 | 1,
58 | 3,
59 | 5
60 | ],
61 | [
62 | 1,
63 | 3,
64 | 5
65 | ]
66 | ],
67 | "upsample_rates": [
68 | 10,
69 | 8,
70 | 2,
71 | 2,
72 | 2
73 | ],
74 | "upsample_initial_channel": 512,
75 | "upsample_kernel_sizes": [
76 | 16,
77 | 16,
78 | 8,
79 | 2,
80 | 2
81 | ],
82 | "n_layers_q": 3,
83 | "use_spectral_norm": false,
84 | "gin_channels": 512,
85 | "semantic_frame_rate": "25hz",
86 | "freeze_quantizer": true
87 | },
88 | "s2_ckpt_dir": "logs/s2/big2k1",
89 | "content_module": "cnhubert"
90 | }
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/train.yaml:
--------------------------------------------------------------------------------
1 | gpu:
2 | n_card: 1
3 | n_process_per_card: 2
4 | io:
5 | text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS
6 | save_every_n_epoch: 1
7 | precision: 16-mixed
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 512
24 | hidden_dim: 512
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 24
28 | dropout: 0
29 | EOS: 1024
30 | random_bert: 0
31 | inference:
32 | top_k: 5
33 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/tts_infer.yaml:
--------------------------------------------------------------------------------
1 | custom:
2 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
3 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
4 | device: cuda
5 | is_half: true
6 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
7 | vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
8 | default:
9 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
10 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
11 | device: cpu
12 | is_half: false
13 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
14 | vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
15 |
--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/__init__.py:
--------------------------------------------------------------------------------
1 | from . import cnhubert, whisper_enc
2 |
3 | content_module_map = {
4 | 'cnhubert': cnhubert,
5 | 'whisper': whisper_enc
6 | }
--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/cnhubert.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import librosa
4 | import torch
5 | import torch.nn.functional as F
6 | import soundfile as sf
7 | import logging
8 |
9 | logging.getLogger("numba").setLevel(logging.WARNING)
10 |
11 | from transformers import (
12 | Wav2Vec2FeatureExtractor,
13 | HubertModel,
14 | )
15 |
16 | import utils
17 | import torch.nn as nn
18 |
19 | cnhubert_base_path = None
20 |
21 |
22 | class CNHubert(nn.Module):
23 | def __init__(self, base_path:str=None):
24 | super().__init__()
25 | if base_path is None:
26 | base_path = cnhubert_base_path
27 | self.model = HubertModel.from_pretrained(base_path)
28 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
29 | base_path
30 | )
31 |
32 |
33 | def forward(self, x):
34 | input_values = self.feature_extractor(
35 | x, return_tensors="pt", sampling_rate=16000
36 | ).input_values.to(x.device)
37 | feats = self.model(input_values)["last_hidden_state"]
38 | return feats
39 |
40 |
41 | # class CNHubertLarge(nn.Module):
42 | # def __init__(self):
43 | # super().__init__()
44 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
45 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
46 | # def forward(self, x):
47 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
48 | # feats = self.model(input_values)["last_hidden_state"]
49 | # return feats
50 | #
51 | # class CVec(nn.Module):
52 | # def __init__(self):
53 | # super().__init__()
54 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
55 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
56 | # def forward(self, x):
57 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
58 | # feats = self.model(input_values)["last_hidden_state"]
59 | # return feats
60 | #
61 | # class cnw2v2base(nn.Module):
62 | # def __init__(self):
63 | # super().__init__()
64 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
65 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
66 | # def forward(self, x):
67 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
68 | # feats = self.model(input_values)["last_hidden_state"]
69 | # return feats
70 |
71 |
72 | def get_model():
73 | model = CNHubert()
74 | model.eval()
75 | return model
76 |
77 |
78 | # def get_large_model():
79 | # model = CNHubertLarge()
80 | # model.eval()
81 | # return model
82 | #
83 | # def get_model_cvec():
84 | # model = CVec()
85 | # model.eval()
86 | # return model
87 | #
88 | # def get_model_cnw2v2base():
89 | # model = cnw2v2base()
90 | # model.eval()
91 | # return model
92 |
93 |
94 | def get_content(hmodel, wav_16k_tensor):
95 | with torch.no_grad():
96 | feats = hmodel(wav_16k_tensor)
97 | return feats.transpose(1, 2)
98 |
99 |
100 | if __name__ == "__main__":
101 | model = get_model()
102 | src_path = "/Users/Shared/原音频2.wav"
103 | wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000)
104 | model = model
105 | wav_16k_tensor = wav_16k_tensor
106 | feats = get_content(model, wav_16k_tensor)
107 | print(feats.shape)
108 |
--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/whisper_enc.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_model():
5 | import whisper
6 |
7 | model = whisper.load_model("small", device="cpu")
8 |
9 | return model.encoder
10 |
11 |
12 | def get_content(model=None, wav_16k_tensor=None):
13 | from whisper import log_mel_spectrogram, pad_or_trim
14 |
15 | dev = next(model.parameters()).device
16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000]
17 | # if torch.cuda.is_available():
18 | # mel = mel.to(torch.float16)
19 | feature_len = mel.shape[-1] // 2
20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频"
21 | with torch.no_grad():
22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[
23 | :1, :feature_len, :
24 | ].transpose(1, 2)
25 | return feature
26 |
--------------------------------------------------------------------------------
/GPT_SoVITS/module/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/module/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/module/losses.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch.nn import functional as F
5 |
6 |
7 | def feature_loss(fmap_r, fmap_g):
8 | loss = 0
9 | for dr, dg in zip(fmap_r, fmap_g):
10 | for rl, gl in zip(dr, dg):
11 | rl = rl.float().detach()
12 | gl = gl.float()
13 | loss += torch.mean(torch.abs(rl - gl))
14 |
15 | return loss * 2
16 |
17 |
18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
19 | loss = 0
20 | r_losses = []
21 | g_losses = []
22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
23 | dr = dr.float()
24 | dg = dg.float()
25 | r_loss = torch.mean((1 - dr) ** 2)
26 | g_loss = torch.mean(dg**2)
27 | loss += r_loss + g_loss
28 | r_losses.append(r_loss.item())
29 | g_losses.append(g_loss.item())
30 |
31 | return loss, r_losses, g_losses
32 |
33 |
34 | def generator_loss(disc_outputs):
35 | loss = 0
36 | gen_losses = []
37 | for dg in disc_outputs:
38 | dg = dg.float()
39 | l = torch.mean((1 - dg) ** 2)
40 | gen_losses.append(l)
41 | loss += l
42 |
43 | return loss, gen_losses
44 |
45 |
46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
47 | """
48 | z_p, logs_q: [b, h, t_t]
49 | m_p, logs_p: [b, h, t_t]
50 | """
51 | z_p = z_p.float()
52 | logs_q = logs_q.float()
53 | m_p = m_p.float()
54 | logs_p = logs_p.float()
55 | z_mask = z_mask.float()
56 |
57 | kl = logs_p - logs_q - 0.5
58 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
59 | kl = torch.sum(kl * z_mask)
60 | l = kl / torch.sum(z_mask)
61 | return l
62 |
63 |
64 | def mle_loss(z, m, logs, logdet, mask):
65 | l = torch.sum(logs) + 0.5 * torch.sum(
66 | torch.exp(-2 * logs) * ((z - m) ** 2)
67 | ) # neg normal likelihood w/o the constant term
68 | l = l - torch.sum(logdet) # log jacobian determinant
69 | l = l / torch.sum(
70 | torch.ones_like(z) * mask
71 | ) # averaging across batch, channel and time axes
72 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term
73 | return l
74 |
--------------------------------------------------------------------------------
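For reference, `kl_loss` above is the masked KL term of VITS-style training: the KL between the flow-mapped posterior sample and the prior N(m_p, exp(logs_p)^2), with the posterior-variance term replaced by the single sample `z_p` (a one-sample Monte-Carlo estimate), summed over channels and averaged over unmasked frames:

```
\mathcal{L}_{\mathrm{KL}}
  = \frac{\sum_{b,t} m_{b,t} \sum_{h}
      \left[ \log\sigma_p - \log\sigma_q - \tfrac{1}{2}
             + \frac{(z - \mu_p)^2}{2\sigma_p^2} \right]}
    {\sum_{b,t} m_{b,t}},
  \qquad \sigma_q = e^{\texttt{logs\_q}},\ \sigma_p = e^{\texttt{logs\_p}},\ z = \texttt{z\_p}
```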
/GPT_SoVITS/my_utils.py:
--------------------------------------------------------------------------------
1 | import ffmpeg
2 | import numpy as np
3 |
4 |
5 | def load_audio(file, sr):
6 | try:
7 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
8 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
9 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
10 | file = (
11 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
12 |         )  # strip stray spaces, quotes and newlines that users often copy along with the path
13 | out, _ = (
14 | ffmpeg.input(file, threads=0)
15 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
16 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
17 | )
18 | except Exception as e:
19 | raise RuntimeError(f"Failed to load audio: {e}")
20 |
21 | return np.frombuffer(out, np.float32).flatten()
22 |
--------------------------------------------------------------------------------
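`load_audio` shells out to the ffmpeg CLI, so ffmpeg must be on `PATH`; usage is a one-liner (the file path below is only an example):

```
from my_utils import load_audio   # assumes GPT_SoVITS/ is on sys.path

audio = load_audio("ref_audio.wav", 32000)   # any ffmpeg-readable file -> mono float32 at 32 kHz
print(audio.shape, audio.dtype)              # (n_samples,) float32
```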
/GPT_SoVITS/prepare_datasets/3-get-semantic.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | inp_text = os.environ.get("inp_text")
4 | exp_name = os.environ.get("exp_name")
5 | i_part = os.environ.get("i_part")
6 | all_parts = os.environ.get("all_parts")
7 | os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES")
8 | opt_dir = os.environ.get("opt_dir")
9 | pretrained_s2G = os.environ.get("pretrained_s2G")
10 | s2config_path = os.environ.get("s2config_path")
11 | is_half = eval(os.environ.get("is_half", "True"))
12 | import math, traceback
13 | import multiprocessing
14 | import sys, pdb
15 |
16 | now_dir = os.getcwd()
17 | sys.path.append(now_dir)
18 | from random import shuffle
19 | import torch.multiprocessing as mp
20 | from glob import glob
21 | from tqdm import tqdm
22 | import logging, librosa, utils, torch
23 | from module.models import SynthesizerTrn
24 |
25 | logging.getLogger("numba").setLevel(logging.WARNING)
26 | # from config import pretrained_s2G
27 |
28 | # inp_text=sys.argv[1]
29 | # exp_name=sys.argv[2]
30 | # i_part=sys.argv[3]
31 | # all_parts=sys.argv[4]
32 | # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[5]
33 | # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
34 |
35 |
36 | hubert_dir = "%s/4-cnhubert" % (opt_dir)
37 | semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
38 | if os.path.exists(semantic_path) == False:
39 | os.makedirs(opt_dir, exist_ok=True)
40 |
41 | if torch.cuda.is_available():
42 | device = "cuda"
43 | # elif torch.backends.mps.is_available():
44 | # device = "mps"
45 | else:
46 | device = "cpu"
47 | hps = utils.get_hparams_from_file(s2config_path)
48 | vq_model = SynthesizerTrn(
49 | hps.data.filter_length // 2 + 1,
50 | hps.train.segment_size // hps.data.hop_length,
51 | n_speakers=hps.data.n_speakers,
52 | **hps.model
53 | )
54 | if is_half == True:
55 | vq_model = vq_model.half().to(device)
56 | else:
57 | vq_model = vq_model.to(device)
58 | vq_model.eval()
59 | # utils.load_checkpoint(utils.latest_checkpoint_path(hps.s2_ckpt_dir, "G_*.pth"), vq_model, None, True)
60 | # utils.load_checkpoint(pretrained_s2G, vq_model, None, True)
61 | print(
62 | vq_model.load_state_dict(
63 | torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False
64 | )
65 | )
66 |
67 | def name2go(wav_name, lines):
68 | hubert_path = "%s/%s.pt" % (hubert_dir, wav_name)
69 | if os.path.exists(hubert_path) == False:
70 | return
71 | ssl_content = torch.load(hubert_path, map_location="cpu")
72 | if is_half == True:
73 | ssl_content = ssl_content.half().to(device)
74 | else:
75 | ssl_content = ssl_content.to(device)
76 | codes = vq_model.extract_latent(ssl_content)
77 | semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()])
78 | lines.append("%s\t%s" % (wav_name, semantic))
79 |
80 | with open(inp_text, "r", encoding="utf8") as f:
81 | lines = f.read().strip("\n").split("\n")
82 |
83 | lines1 = []
84 | for line in lines[int(i_part) :: int(all_parts)]:
85 | # print(line)
86 | try:
87 | # wav_name,text=line.split("\t")
88 | wav_name, spk_name, language, text = line.split("|")
89 | wav_name = os.path.basename(wav_name)
90 | # name2go(name,lines1)
91 | name2go(wav_name, lines1)
92 | except:
93 | print(line, traceback.format_exc())
94 | with open(semantic_path, "w", encoding="utf8") as f:
95 | f.write("\n".join(lines1))
96 |
--------------------------------------------------------------------------------
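`3-get-semantic.py` is driven entirely by environment variables (normally set by the training WebUI). A hedged sketch of running shard 0 of 1 by hand — every path here is a placeholder, and `opt_dir` must already contain the `4-cnhubert/*.pt` features produced by the previous preparation step:

```
import os, subprocess

env = dict(
    os.environ,
    inp_text="data/my_voice.list",    # annotation lines: wav|speaker|language|text
    exp_name="my_voice",
    i_part="0", all_parts="1",        # process shard 0 of 1
    _CUDA_VISIBLE_DEVICES="0",
    opt_dir="logs/my_voice",
    pretrained_s2G="GPT_SoVITS/pretrained_models/s2G488k.pth",
    s2config_path="GPT_SoVITS/configs/s2.json",
    is_half="True",
)
subprocess.run(["python", "GPT_SoVITS/prepare_datasets/3-get-semantic.py"], env=env, check=True)
# Writes logs/my_voice/6-name2semantic-0.tsv, one "wav_name<TAB>semantic ids" row per clip.
```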
/GPT_SoVITS/pretrained_models/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
--------------------------------------------------------------------------------
/GPT_SoVITS/process_ckpt.py:
--------------------------------------------------------------------------------
1 | import traceback
2 | from collections import OrderedDict
3 | from time import time as ttime
4 | import shutil,os
5 | import torch
6 | from tools.i18n.i18n import I18nAuto
7 |
8 | i18n = I18nAuto()
9 |
10 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path
11 | dir=os.path.dirname(path)
12 | name=os.path.basename(path)
13 | tmp_path="%s.pth"%(ttime())
14 | torch.save(fea,tmp_path)
15 | shutil.move(tmp_path,"%s/%s"%(dir,name))
16 |
17 | def savee(ckpt, name, epoch, steps, hps):
18 | try:
19 | opt = OrderedDict()
20 | opt["weight"] = {}
21 | for key in ckpt.keys():
22 | if "enc_q" in key:
23 | continue
24 | opt["weight"][key] = ckpt[key].half()
25 | opt["config"] = hps
26 | opt["info"] = "%sepoch_%siteration" % (epoch, steps)
27 | # torch.save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
28 | my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
29 | return "Success."
30 | except:
31 | return traceback.format_exc()
32 |
--------------------------------------------------------------------------------
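`savee` writes a small dict-style checkpoint: fp16 weights with the `enc_q.*` posterior encoder dropped, plus the hparams and an info string. Reading one back looks like this (illustrative path; `3-get-semantic.py` earlier in this listing loads `pretrained_s2G` through the same `["weight"]` key):

```
import torch

ckpt = torch.load("SoVITS_weights/my_voice_e8_s248.pth", map_location="cpu")  # example path
print(ckpt["info"])        # e.g. "8epoch_248iteration"
hps = ckpt["config"]       # hyperparameters saved alongside the weights
state = ckpt["weight"]     # fp16 tensors, enc_q.* excluded
# vq_model.load_state_dict(state, strict=False)
```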
/GPT_SoVITS/text/__init__.py:
--------------------------------------------------------------------------------
1 | from text.symbols import *
2 |
3 |
4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
5 |
6 | def cleaned_text_to_sequence(cleaned_text):
7 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8 | Args:
9 |       cleaned_text: string of phoneme symbols to convert to a sequence
10 | Returns:
11 | List of integers corresponding to the symbols in the text
12 | '''
13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 | return phones
15 |
16 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from text import chinese, japanese, cleaned_text_to_sequence, symbols, english
2 |
3 | language_module_map = {"zh": chinese, "ja": japanese, "en": english}
4 | special = [
5 | # ("%", "zh", "SP"),
6 | ("¥", "zh", "SP2"),
7 | ("^", "zh", "SP3"),
8 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧
9 | ]
10 |
11 |
12 | def clean_text(text, language):
13 | if(language not in language_module_map):
14 | language="en"
15 | text=" "
16 | for special_s, special_l, target_symbol in special:
17 | if special_s in text and language == special_l:
18 | return clean_special(text, language, special_s, target_symbol)
19 | language_module = language_module_map[language]
20 | norm_text = language_module.text_normalize(text)
21 | if language == "zh":
22 | phones, word2ph = language_module.g2p(norm_text)
23 | assert len(phones) == sum(word2ph)
24 | assert len(norm_text) == len(word2ph)
25 | else:
26 | phones = language_module.g2p(norm_text)
27 | word2ph = None
28 |
29 | for ph in phones:
30 | assert ph in symbols
31 | return phones, word2ph, norm_text
32 |
33 |
34 | def clean_special(text, language, special_s, target_symbol):
35 | """
36 |     Handle special silence-segment (SP) symbols.
37 | """
38 | text = text.replace(special_s, ",")
39 | language_module = language_module_map[language]
40 | norm_text = language_module.text_normalize(text)
41 | phones = language_module.g2p(norm_text)
42 | new_ph = []
43 | for ph in phones[0]:
44 | assert ph in symbols
45 | if ph == ",":
46 | new_ph.append(target_symbol)
47 | else:
48 | new_ph.append(ph)
49 | return new_ph, phones[1], norm_text
50 |
51 |
52 | def text_to_sequence(text, language):
53 |     phones, word2ph, norm_text = clean_text(text, language)
54 | return cleaned_text_to_sequence(phones)
55 |
56 |
57 | if __name__ == "__main__":
58 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh"))
59 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/engdict-hot.rep:
--------------------------------------------------------------------------------
1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1
2 | JSON JH EY1 S AH0 N
--------------------------------------------------------------------------------
/GPT_SoVITS/text/engdict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/text/engdict_cache.pickle
--------------------------------------------------------------------------------
/GPT_SoVITS/text/namedict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/GPT_SoVITS/text/namedict_cache.pickle
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/README.md:
--------------------------------------------------------------------------------
1 | ## Supported NSW (Non-Standard-Word) Normalization
2 |
3 | |NSW type|raw|normalized|
4 | |:--|:-|:-|
5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九|
6 | |cardinal|这块黄金重达324.75克<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日|
9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我|
10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度|
11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨|
13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万|
14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
15 | ## References
16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files)
17 |
--------------------------------------------------------------------------------
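The entry point for all of the rules in this package is the `TextNormalizer` class re-exported by `__init__.py` below (inherited from PaddleSpeech). A usage sketch, assuming `GPT_SoVITS/` is on `sys.path`:

```
from text.zh_normalization import TextNormalizer

tx = TextNormalizer()
for sent in tx.normalize("她出生于86年8月18日,今天的最低气温达到-10°C"):
    print(sent)   # roughly: 她出生于八六年八月十八日, / 今天的最低气温达到零下十度
```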
/GPT_SoVITS/text/zh_normalization/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from text.zh_normalization.text_normlization import *
15 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/chronology.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import DIGITS
17 | from .num import num2str
18 | from .num import verbalize_cardinal
19 | from .num import verbalize_digit
20 |
21 |
22 | def _time_num2str(num_string: str) -> str:
23 | """A special case for verbalizing number in time."""
24 | result = num2str(num_string.lstrip('0'))
25 | if num_string.startswith('0'):
26 | result = DIGITS['0'] + result
27 | return result
28 |
29 |
30 | # 时刻表达式
31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
32 | r':([0-5][0-9])'
33 | r'(:([0-5][0-9]))?')
34 |
35 | # 时间范围,如8:30-12:30
36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
37 | r':([0-5][0-9])'
38 | r'(:([0-5][0-9]))?'
39 | r'(~|-)'
40 | r'([0-1]?[0-9]|2[0-3])'
41 | r':([0-5][0-9])'
42 | r'(:([0-5][0-9]))?')
43 |
44 |
45 | def replace_time(match) -> str:
46 | """
47 | Args:
48 | match (re.Match)
49 | Returns:
50 | str
51 | """
52 |
53 | is_range = len(match.groups()) > 5
54 |
55 | hour = match.group(1)
56 | minute = match.group(2)
57 | second = match.group(4)
58 |
59 | if is_range:
60 | hour_2 = match.group(6)
61 | minute_2 = match.group(7)
62 | second_2 = match.group(9)
63 |
64 | result = f"{num2str(hour)}点"
65 | if minute.lstrip('0'):
66 | if int(minute) == 30:
67 | result += "半"
68 | else:
69 | result += f"{_time_num2str(minute)}分"
70 | if second and second.lstrip('0'):
71 | result += f"{_time_num2str(second)}秒"
72 |
73 | if is_range:
74 | result += "至"
75 | result += f"{num2str(hour_2)}点"
76 | if minute_2.lstrip('0'):
77 |             if int(minute_2) == 30:
78 | result += "半"
79 | else:
80 | result += f"{_time_num2str(minute_2)}分"
81 | if second_2 and second_2.lstrip('0'):
82 | result += f"{_time_num2str(second_2)}秒"
83 |
84 | return result
85 |
86 |
87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年'
88 | r'((0?[1-9]|1[0-2])月)?'
89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?')
90 |
91 |
92 | def replace_date(match) -> str:
93 | """
94 | Args:
95 | match (re.Match)
96 | Returns:
97 | str
98 | """
99 | year = match.group(1)
100 | month = match.group(3)
101 | day = match.group(5)
102 | result = ""
103 | if year:
104 | result += f"{verbalize_digit(year)}年"
105 | if month:
106 | result += f"{verbalize_cardinal(month)}月"
107 | if day:
108 | result += f"{verbalize_cardinal(day)}{match.group(9)}"
109 | return result
110 |
111 |
112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期
113 | RE_DATE2 = re.compile(
114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])')
115 |
116 |
117 | def replace_date2(match) -> str:
118 | """
119 | Args:
120 | match (re.Match)
121 | Returns:
122 | str
123 | """
124 | year = match.group(1)
125 | month = match.group(3)
126 | day = match.group(4)
127 | result = ""
128 | if year:
129 | result += f"{verbalize_digit(year)}年"
130 | if month:
131 | result += f"{verbalize_cardinal(month)}月"
132 | if day:
133 | result += f"{verbalize_cardinal(day)}日"
134 | return result
135 |
--------------------------------------------------------------------------------
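Each regex/replacer pair above is designed to be passed to `re.sub`; `text_normlization.py` chains them into the full pipeline, but they also work standalone (the import again assumes `GPT_SoVITS/` is on `sys.path`):

```
from text.zh_normalization.chronology import RE_TIME, replace_time, RE_DATE, replace_date

print(RE_TIME.sub(replace_time, "等会请在12:05请通知我"))  # -> 等会请在十二点零五分请通知我
print(RE_DATE.sub(replace_date, "她出生于86年8月18日"))    # -> 她出生于八六年八月十八日
```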
/GPT_SoVITS/text/zh_normalization/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 | import string
16 |
17 | from pypinyin.constants import SUPPORT_UCS4
18 |
19 | # 全角半角转换
20 | # 英文字符全角 -> 半角映射表 (num: 52)
21 | F2H_ASCII_LETTERS = {
22 | ord(char) + 65248: ord(char)
23 | for char in string.ascii_letters
24 | }
25 |
26 | # 英文字符半角 -> 全角映射表
27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
28 |
29 | # 数字字符全角 -> 半角映射表 (num: 10)
30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits}
31 | # 数字字符半角 -> 全角映射表
32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}
33 |
34 | # 标点符号全角 -> 半角映射表 (num: 32)
35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
36 | # 标点符号半角 -> 全角映射表
37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
38 |
39 | # 空格 (num: 1)
40 | F2H_SPACE = {'\u3000': ' '}
41 | H2F_SPACE = {' ': '\u3000'}
42 |
43 | # 非"有拼音的汉字"的字符串,可用于NSW提取
44 | if SUPPORT_UCS4:
45 | RE_NSW = re.compile(r'(?:[^'
46 | r'\u3007' # 〇
47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF]
51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F]
52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D]
53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F]
54 | r'])+')
55 | else:
56 | RE_NSW = re.compile( # pragma: no cover
57 | r'(?:[^'
58 | r'\u3007' # 〇
59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
62 | r'])+')
63 |
--------------------------------------------------------------------------------
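The `F2H_*`/`H2F_*` tables map code points to code points, so they plug directly into `str.translate`. A quick sketch of full-width → half-width folding, which is how the normalizer is expected to use them:

```
from text.zh_normalization.constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_PUNCTUATIONS

table = {**F2H_ASCII_LETTERS, **F2H_DIGITS, **F2H_PUNCTUATIONS}
print("GPT-SoVITS 2024!".translate(table))   # -> GPT-SoVITS 2024!
```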
/GPT_SoVITS/text/zh_normalization/phonecode.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import verbalize_digit
17 |
18 | # 规范化固话/手机号码
19 | # 手机
20 | # http://www.jihaoba.com/news/show/13680
21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
22 | # 联通:130、131、132、156、155、186、185、176
23 | # 电信:133、153、189、180、181、177
24 | RE_MOBILE_PHONE = re.compile(
25 |     r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
26 | RE_TELEPHONE = re.compile(
27 |     r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
28 | 
29 | # 全国统一的号码400开头
30 | RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
31 | 
32 | 
33 | def phone2str(phone_string: str, mobile=True) -> str:
34 | if mobile:
35 | sp_parts = phone_string.strip('+').split()
36 | result = ','.join(
37 | [verbalize_digit(part, alt_one=True) for part in sp_parts])
38 | return result
39 | else:
40 | sil_parts = phone_string.split('-')
41 | result = ','.join(
42 | [verbalize_digit(part, alt_one=True) for part in sil_parts])
43 | return result
44 |
45 |
46 | def replace_phone(match) -> str:
47 | """
48 | Args:
49 | match (re.Match)
50 | Returns:
51 | str
52 | """
53 | return phone2str(match.group(0), mobile=False)
54 |
55 |
56 | def replace_mobile(match) -> str:
57 | """
58 | Args:
59 | match (re.Match)
60 | Returns:
61 | str
62 | """
63 | return phone2str(match.group(0))
64 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/quantifier.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import num2str
17 |
18 | # 温度表达式,温度会影响负号的读法
19 | # -3°C 零下三度
20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
21 | measure_dict = {
22 | "cm2": "平方厘米",
23 | "cm²": "平方厘米",
24 | "cm3": "立方厘米",
25 | "cm³": "立方厘米",
26 | "cm": "厘米",
27 | "db": "分贝",
28 | "ds": "毫秒",
29 | "kg": "千克",
30 | "km": "千米",
31 | "m2": "平方米",
32 | "m²": "平方米",
33 | "m³": "立方米",
34 | "m3": "立方米",
35 | "ml": "毫升",
36 | "m": "米",
37 | "mm": "毫米",
38 | "s": "秒"
39 | }
40 |
41 |
42 | def replace_temperature(match) -> str:
43 | """
44 | Args:
45 | match (re.Match)
46 | Returns:
47 | str
48 | """
49 | sign = match.group(1)
50 | temperature = match.group(2)
51 |     unit = match.group(4)
52 | sign: str = "零下" if sign else ""
53 | temperature: str = num2str(temperature)
54 | unit: str = "摄氏度" if unit == "摄氏度" else "度"
55 | result = f"{sign}{temperature}{unit}"
56 | return result
57 |
58 |
59 | def replace_measure(sentence) -> str:
60 | for q_notation in measure_dict:
61 | if q_notation in sentence:
62 | sentence = sentence.replace(q_notation, measure_dict[q_notation])
63 | return sentence
64 |
--------------------------------------------------------------------------------
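Same pattern as `chronology.py` — the temperature rule and the unit dictionary can be exercised on their own:

```
from text.zh_normalization.quantifier import RE_TEMPERATURE, replace_temperature, replace_measure

print(RE_TEMPERATURE.sub(replace_temperature, "今天的最低气温达到-10°C"))  # -> 今天的最低气温达到零下十度
print(replace_measure("全长3km"))                                          # -> 全长3千米
```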
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 RVC-Boss
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Synthesizers/base/Base_TTS_Synthesizer.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | from .Base_TTS_Task import Base_TTS_Task as TTS_Task
4 | import json
5 | from typing import List, Dict, Literal, Optional, Any, Union, Generator, Tuple
6 | from pydantic import BaseModel, Field, model_validator
7 | import numpy as np
8 | from abc import ABC, abstractmethod
9 | from typing import Dict, List, Union, Generator, Tuple
10 | from typing_extensions import Literal
11 | import numpy as np
12 | import wave,io
13 |
14 | class Base_TTS_Synthesizer(ABC):
15 | """
16 | Abstract base class for a Text-To-Speech (TTS) synthesizer.
17 |
18 | Attributes:
19 | ui_config (Dict[str, List]): A dictionary containing UI configuration settings.
20 | debug_mode (bool): Flag to toggle debug mode for additional logging and debugging information.
21 |
22 | """
23 |
24 | ui_config: Dict[str, List] = {}
25 | debug_mode: bool = False
26 |
27 | def __init__(self, **kwargs):
28 | """
29 | Initializes the TTS synthesizer with optional UI configurations and debug mode setting.
30 |
31 | Args:
32 | ui_config (Dict[str, List], optional): Configuration for user interface settings.
33 | debug_mode (bool, optional): Enables or disables debug mode.
34 |
35 | """
36 | self.ui_config = kwargs.get("ui_config", {})
37 | self.debug_mode = kwargs.get("debug_mode", False)
38 |
39 | @abstractmethod
40 | def generate(
41 | self,
42 | task: TTS_Task,
43 | return_type: Literal["filepath", "numpy"] = "numpy",
44 | save_path: Optional[str] = None,
45 | ) -> Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]:
46 | """
47 | Generates speech from a given TTS task.
48 |
49 | Args:
50 | task (TTS_Task): The task containing data and parameters for speech synthesis.
51 | return_type (Literal["filepath", "numpy"], optional): The type of return value, either a file path or audio data.
52 | save_path (str, optional): The path to save the audio file.
53 | Returns:
54 | Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]: Depending on the return_type, returns a file path, a generator of audio data, or other types.
55 |
56 | """
57 | pass
58 |
59 | @abstractmethod
60 | def get_characters(self):
61 | """
62 | Retrieves the available characters and their emotions for the TTS.
63 |
64 | Returns:
65 | Dict[str, List[str]]: A dictionary mapping character names to lists of their emotions.
66 | """
67 | pass
68 |
69 | @abstractmethod
70 | def params_parser(self, data):
71 | """
72 | Parses input data into a TTS_Task.
73 |
74 | Args:
75 | data (Any): The raw input data to be parsed.
76 |
77 | Returns:
78 | TTS_Task: A TTS task object created from the input data.
79 | """
80 | pass
81 |
82 | @abstractmethod
83 | def ms_like_parser(self, data):
84 | """
85 | Parses input data in a Microsoft-like format into a TTS_Task.
86 |
87 | Args:
88 | data (Any): The raw input data to be parsed.
89 |
90 | Returns:
91 | TTS_Task: A TTS task object created from the Microsoft-like formatted input data.
92 | """
93 | pass
94 |
95 |
96 | def get_wave_header_chunk(sample_rate: int, channels: int = 1, sample_width: int = 2):
97 | """
98 | Generate a wave header with no data.
99 |
100 | Args:
101 | sample_rate (int): The sample rate of the audio.
102 | channels (int, optional): The number of audio channels. Defaults to 1.
103 | sample_width (int, optional): The sample width in bytes. Defaults to 2.
104 |
105 | Returns:
106 | bytes: The wave header as bytes.
107 | """
108 | wav_buf = io.BytesIO()
109 | with wave.open(wav_buf, "wb") as vfout:
110 | vfout.setnchannels(channels)
111 | vfout.setsampwidth(sample_width)
112 | vfout.setframerate(sample_rate)
113 |
114 | wav_buf.seek(0)
115 | return wav_buf.read()
116 |
--------------------------------------------------------------------------------
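`get_wave_header_chunk` exists so a server can stream audio: send one WAV header with zero declared data length, then keep appending raw PCM frames. The sketch below shows that pattern; it is an assumption about how the app layer consumes the generator from `generate()`, not code taken from this repo:

```
import numpy as np
from Synthesizers.base import get_wave_header_chunk

def wav_stream(chunks, sample_rate=32000):
    """chunks: iterable of (sr, np.ndarray) pairs, e.g. generate(task, return_type="numpy")."""
    yield get_wave_header_chunk(sample_rate)    # standard 44-byte RIFF header, no frames
    for _, audio in chunks:
        yield audio.astype(np.int16).tobytes()  # 16-bit mono PCM, matching sample_width=2
```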
/Synthesizers/base/__init__.py:
--------------------------------------------------------------------------------
1 | from .Base_TTS_Task import Base_TTS_Task, ParamItem, init_params_config
2 | from .Base_TTS_Synthesizer import Base_TTS_Synthesizer, get_wave_header_chunk
3 | from .config_utils import load_config
--------------------------------------------------------------------------------
/Synthesizers/base/config_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional, Dict, List, Literal
2 | from pydantic import BaseModel
3 | import os, json
4 |
5 | class ConfigItem(BaseModel):
6 | value : Optional[Any] = None
7 | default : Optional[Any] = None
8 | type : Optional[str] = None
9 | description : Optional[str] = None
10 |
11 | def __init__(self, **data):
12 | super().__init__(**data)
13 | if (self.value is None) and self.default is not None:
14 | self.value = self.default
15 |
16 | def is_config_item(item:Dict[str, Any])->bool:
17 | """判断是否为配置项"""
18 | return isinstance(item, dict) and ("value" in item or "default" in item)
19 |
20 | def parse_config_dict(input_config:Dict[str, Any], output_config)->Dict[str, Any]:
21 |
22 | for key, res in input_config.items():
23 | if is_config_item(res):
24 | value = ConfigItem(**res).value
25 | else:
26 | if isinstance(res, dict):
27 | value = parse_config_dict(res, {})
28 | else:
29 | value = res
30 | output_config[key] = value
31 | return output_config
32 |
33 | def load_config(config_path:str)->Dict[str, Any]:
34 | """加载配置文件"""
35 | assert os.path.exists(config_path), f"配置文件不存在: {config_path}"
36 | config:Dict[str, Any] = {}
37 | with open(config_path, 'r', encoding='utf-8') as f:
38 | config = parse_config_dict(json.load(f), {})
39 | return config
40 |
41 |
--------------------------------------------------------------------------------
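`parse_config_dict` collapses `{value/default/type/description}` items to plain values and passes everything else through, so config files can mix documented items with bare values. A self-contained sketch:

```
from Synthesizers.base.config_utils import parse_config_dict

raw = {
    "url": "http://127.0.0.1:5000",                      # bare value: passed through
    "port": {"value": 5000, "description": "API port"},  # config item: collapsed to 5000
    "auth": {"enable": {"default": False}},              # nested item: default fills the value
}
print(parse_config_dict(raw, {}))
# {'url': 'http://127.0.0.1:5000', 'port': 5000, 'auth': {'enable': False}}
```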
/Synthesizers/gsv_fast/__init__.py:
--------------------------------------------------------------------------------
1 | from .GSV_Synthesizer import GSV_Synthesizer as TTS_Synthesizer
2 | from .gsv_task import GSV_TTS_Task as TTS_Task
--------------------------------------------------------------------------------
/Synthesizers/gsv_fast/configs/i18n/locale/zh_CN.json:
--------------------------------------------------------------------------------
1 | {
2 | ", 返回内容:": ", 返回内容:",
3 | "
这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
": "这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路径", 6 | "Sovits模型路径": "Sovits模型路径", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "只有日文", 11 | "all_zh": "只有中文", 12 | "auto": "自动判断", 13 | "auto_cut": "智能切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不并行,越大越快,但是越可能出问题", 15 | "cut0": "仅凭换行切分", 16 | "cut1": "凑四句一切", 17 | "cut2": "凑50字一切", 18 | "cut3": "按中文句号。切", 19 | "cut4": "按英文句号.切", 20 | "cut5": "按标点符号切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json设置(一般不动)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情感列表网址", 29 | "从json中读取": "从json中读取", 30 | "使用前,请确认后端服务已启动。": "使用前,请确认后端服务已启动。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不会有完成提示,没报错就是成功)", 32 | "保存失败!": "保存失败!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端处理后的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "参考音频在3~10秒范围外,请更换!", 38 | "参考音频路径": "参考音频路径", 39 | "发送json格式": "发送json格式", 40 | "发送并开始播放": "发送并开始播放", 41 | "发送请求": "发送请求", 42 | "发送请求到": "发送请求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。", 45 | "基础选项": "基础选项", 46 | "实际输入的参考文本:": "实际输入的参考文本:", 47 | "实际输入的目标文本(切句后):": "实际输入的目标文本(切句后):", 48 | "实际输入的目标文本(每句):": "实际输入的目标文本(每句):", 49 | "实际输入的目标文本:": "实际输入的目标文本:", 50 | "密码": "密码", 51 | "当前人物": "当前人物", 52 | "当前人物变更为: ": "当前人物变更为: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用经典推理模式,部分选项不可用", 54 | "情感列表": "情感列表", 55 | "情感风格": "情感风格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。", 57 | "扫描": "扫描", 58 | "扫描人物列表": "扫描人物列表", 59 | "扫描模型文件夹:": "扫描模型文件夹:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!请把有效文件放置在文件夹下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,当前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示语言", 65 | "文件打开失败,保存失败!": "文件打开失败,保存失败!", 66 | "文本语言": "文本语言", 67 | "是否自动匹配情感": "是否自动匹配情感", 68 | "模型文件夹路径": "模型文件夹路径", 69 | "每句允许最大切分字词数": "每句允许最大切分字词数", 70 | "流式音频": "流式音频", 71 | "添加情感": "添加情感", 72 | "点击查看详细文档": "点击查看详细文档", 73 | "版本": "版本", 74 | "用户名": "用户名", 75 | "种子": "种子", 76 | "简介": "简介", 77 | "缺失某些项,保存失败!": "缺失某些项,保存失败!", 78 | "网址设置": "网址设置", 79 | "自动生成info": "自动生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑问或需要进一步了解,可参考文档:", 81 | "认证信息": "认证信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设", 83 | "语速": "语速", 84 | "请修改后点击下方按钮进行保存": "请修改后点击下方按钮进行保存", 85 | "请求失败,状态码:": "请求失败,状态码:", 86 | "请求失败,请检查URL是否正确": "请求失败,请检查URL是否正确", 87 | "请求完整音频": "请求完整音频", 88 | "请求网址": "请求网址", 89 | "输入文本": "输入文本", 90 | "这是一个由": "这是一个由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "这是展示页面的版本,并未使用后端服务,下面参数无效。", 93 | "选择角色": "选择角色", 94 | "音频输出": "音频输出", 95 | "音频预览": "音频预览", 96 | "项目开源地址:": "项目开源地址:", 97 | "高级选项": "高级选项", 98 | "最大允许长度": "最大允许长度" 99 | } 100 | -------------------------------------------------------------------------------- /Synthesizers/gsv_fast/configs/i18n/locale/zh_TW.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", 返回內容:", 3 | 
"这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
": "這是模型管理介面,為了實現對多段參考音頻分配情緒設計,如果您只有一段可不使用這個介面
若有疑問或需要進一步了解,可參考文件:點擊查看詳細文件。
", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路徑", 6 | "Sovits模型路径": "Sovits模型路徑", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "僅日文", 11 | "all_zh": "僅中文", 12 | "auto": "自動判斷", 13 | "auto_cut": "智慧切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不並行,越大越快,但是越可能出現問題", 15 | "cut0": "僅憑換行切分", 16 | "cut1": "湊四句一切", 17 | "cut2": "湊50字一切", 18 | "cut3": "按中文句號。切", 19 | "cut4": "按英文句號.切", 20 | "cut5": "按標點符號切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json設置(一般不動)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情緒列表網址", 29 | "从json中读取": "從json中讀取", 30 | "使用前,请确认后端服务已启动。": "使用前,請確認後端服務已啟動。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不會有完成提示,沒報錯就是成功)", 32 | "保存失败!": "保存失敗!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端處理後的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!", 38 | "参考音频路径": "參考音頻路徑", 39 | "发送json格式": "發送json格式", 40 | "发送并开始播放": "發送並開始播放", 41 | "发送请求": "發送請求", 42 | "发送请求到": "發送請求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或調節batch size滑條。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或者更換參考音頻(使用模型管理介面)、調節下方batch size滑條。", 45 | "基础选项": "基礎選項", 46 | "实际输入的参考文本:": "實際輸入的參考文本:", 47 | "实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):", 48 | "实际输入的目标文本(每句):": "實際輸入的目標文本(每句):", 49 | "实际输入的目标文本:": "實際輸入的目標文本:", 50 | "密码": "密碼", 51 | "当前人物": "當前人物", 52 | "当前人物变更为: ": "當前人物變更為: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用經典推理模式,部分選項不可用", 54 | "情感列表": "情緒列表", 55 | "情感风格": "情緒風格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "有時掉進黑洞,有時候爬上彩虹。在下一秒鐘,命運如何轉動,沒有人會曉得。我說希望無窮,你猜美夢成空,相信和懷疑,總要決鬥。", 57 | "扫描": "掃描", 58 | "扫描人物列表": "掃描人物列表", 59 | "扫描模型文件夹:": "掃描模型文件夾:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!請把有效文件放置在文件夾下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,當前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示語言", 65 | "文件打开失败,保存失败!": "文件開啟失敗,保存失敗!", 66 | "文本语言": "文本語言", 67 | "是否自动匹配情感": "是否自動匹配情緒", 68 | "模型文件夹路径": "模型文件夾路徑", 69 | "每句允许最大切分字词数": "每句允許最大切分字詞數", 70 | "流式音频": "流式音頻", 71 | "添加情感": "添加情緒", 72 | "点击查看详细文档": "點擊查看詳細文件", 73 | "版本": "版本", 74 | "用户名": "使用者名稱", 75 | "种子": "種子", 76 | "简介": "簡介", 77 | "缺失某些项,保存失败!": "缺失某些項,保存失敗!", 78 | "网址设置": "網址設置", 79 | "自动生成info": "自動生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑問或需要進一步了解,可參考文件:", 81 | "认证信息": "認證信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "認證信息已啟用,您可以在config.json中關閉。\n但是這個功能還沒做好,只是擺設", 83 | "语速": "語速", 84 | "请修改后点击下方按钮进行保存": "請修改後點擊下方按鈕進行保存", 85 | "请求失败,状态码:": "請求失敗,狀態碼:", 86 | "请求失败,请检查URL是否正确": "請求失敗,請檢查URL是否正確", 87 | "请求完整音频": "請求完整音頻", 88 | "请求网址": "請求網址", 89 | "输入文本": "輸入文本", 90 | "这是一个由": "這是一個由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "這是一個配置文件適用於https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一個簡單好用的前後端項目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "這是展示頁面的版本,並未使用後端服務,下面參數無效。", 93 | "选择角色": "選擇角色", 94 | "音频输出": "音頻輸出", 95 | "音频预览": "音頻預覽", 96 | "项目开源地址:": "Github Link:", 97 | "高级选项": "高級選項", 98 | "最大允许长度": "最大允許長度" 99 | } 100 | -------------------------------------------------------------------------------- /Synthesizers/gsv_fast/configs/ui_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "ref_settings": [["ref_audio_path", "prompt_text", 
"prompt_language"]], 3 | "basic_settings": [ 4 | "speed", 5 | 6 | ["text_language", "cut_method", "max_cut_length", "batch_size"] 7 | ], 8 | "advanced_settings": [ 9 | "seed", 10 | "parallel_infer", 11 | ["top_k", "top_p", "temperature", "repetition_penalty"] 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /Synthesizers/gsv_fast/gsv_task.py: -------------------------------------------------------------------------------- 1 | 2 | import os, json, sys 3 | sys.path.append(".") 4 | 5 | from uuid import uuid4 6 | from typing import List, Dict, Literal, Optional, Any, Union 7 | import urllib.parse 8 | import hashlib 9 | 10 | from Synthesizers.base import Base_TTS_Task, ParamItem, init_params_config 11 | 12 | def get_params_config(): 13 | try: 14 | with open(os.path.join("Synthesizers/gsv_fast/configs", "params_config.json"), "r", encoding="utf-8") as f: 15 | return init_params_config(json.load(f)) 16 | except: 17 | raise FileNotFoundError("params_config.json not found or invalid.") 18 | 19 | 20 | params_config = get_params_config() 21 | 22 | from pydantic import BaseModel, Field, model_validator 23 | 24 | class GSV_TTS_Task(Base_TTS_Task): 25 | # character: Optional[str] = None 26 | # emotion: Optional[str] = None 27 | ref_audio_path: Optional[str] = None 28 | prompt_text: Optional[str] = None 29 | prompt_language: Optional[str] = None 30 | text_language: Optional[str] = None 31 | speaker_id: Optional[int] = None 32 | batch_size: Optional[int] = None 33 | top_k: Optional[int] = None 34 | top_p: Optional[float] = None 35 | temperature: Optional[float] = None 36 | cut_method: Optional[str] = None 37 | max_cut_length: Optional[int] = None 38 | seed: Optional[int] = None 39 | save_temp: Optional[bool] = False 40 | parallel_infer : Optional[bool] = True 41 | repetition_penalty : Optional[float] = 1.35 42 | # the gsv_fast model only supports 32000 sample rate 43 | sample_rate: int = 32000 44 | 45 | def __init__(self, other_task: Union[BaseModel, dict, None] = None, **data): 46 | data.setdefault('params_config', params_config) 47 | super().__init__(other_task, **data) 48 | 49 | @property 50 | def md5(self): 51 | m = hashlib.md5() 52 | if self.task_type == "audio": 53 | m.update(self.src.encode()) 54 | elif self.task_type == "ssml": 55 | m.update(self.ssml.encode()) 56 | elif self.task_type == "text": 57 | m.update(self.text.encode()) 58 | m.update(self.text_language.encode()) 59 | m.update(self.character.encode()) 60 | m.update(str(self.speaker_id).encode()) 61 | m.update(str(self.speed).encode()) 62 | m.update(str(self.top_k).encode()) 63 | m.update(str(self.top_p).encode()) 64 | m.update(str(self.temperature).encode()) 65 | m.update(str(self.cut_method).encode()) 66 | m.update(str(self.emotion).encode()) 67 | return m.hexdigest() 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /Synthesizers/remote/Remote_Synthesizer.py: -------------------------------------------------------------------------------- 1 | import io, wave 2 | import os, json, sys 3 | import threading 4 | 5 | from Synthesizers.base import Base_TTS_Synthesizer ,load_config 6 | 7 | from .remote_task import Remote_TTS_Task as TTS_Task, set_based_synthesizer, get_ui_config 8 | import requests 9 | from urllib import parse 10 | from datetime import datetime 11 | from typing import Union, Generator, Tuple, Any, Optional, Dict, Literal 12 | import numpy as np 13 | import soundfile as sf 14 | 15 | class 
Remote_Synthesizer(Base_TTS_Synthesizer): 16 | url :str = "http://127.0.0.1:5000" 17 | tts_endpoint:str = "/tts" 18 | character_endpoint:str = "/character_list" 19 | based_synthesizer :str = "gsv_fast" 20 | class Config: 21 | extra = "ignore" 22 | def __init__(self, config_path:str = None, **kwargs): 23 | super().__init__(**kwargs) 24 | if config_path is None: 25 | config_path = os.path.join(os.path.dirname(__file__), "configs", "config.json") 26 | config_dict = load_config(config_path) 27 | config_dict.update(kwargs) 28 | for key, value in config_dict.items(): 29 | if hasattr(self, key): 30 | setattr(self, key, value) 31 | set_based_synthesizer(self.based_synthesizer) 32 | self.ui_config = get_ui_config(self.based_synthesizer) 33 | 34 | def get_characters(self)-> dict: 35 | url = self.url + self.character_endpoint 36 | res = requests.get(url) 37 | return json.loads(res.text) 38 | 39 | @staticmethod 40 | def stream_audio(url, data: Dict[str, Any]) -> Generator[Tuple[int, np.ndarray], None, None]: 41 | headers = {"Content-Type": "application/json"} 42 | # 发起POST请求,获取响应流 43 | response = requests.post( 44 | url, data=json.dumps(data), headers=headers, stream=True 45 | ) 46 | chunk_size = 1024 47 | # 确保请求成功 48 | if response.status_code == 200: 49 | # 循环读取音频流 50 | for chunk in response.iter_content(chunk_size): 51 | # 将二进制数据转换为numpy数组,这里假设音频数据是16位整数格式 52 | audiodata = np.frombuffer(chunk, dtype=np.int16) 53 | yield 32000, audiodata 54 | else: 55 | raise Exception( 56 | f"Failed to get audio stream, status code: {response.status_code}" 57 | ) 58 | def generate( 59 | self, 60 | task: TTS_Task, 61 | return_type: Literal["filepath", "numpy"] = "numpy", 62 | save_path: Optional[str] = None, 63 | ) -> Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]: 64 | 65 | 66 | url = self.url + self.tts_endpoint 67 | data = task.data 68 | print(return_type) 69 | 70 | if self.debug_mode: 71 | print(f"generate task: \n{data}") 72 | headers = {"Content-Type": "application/json"} 73 | if return_type == "filepath" or ( 74 | return_type == "numpy" and not task.stream 75 | ): 76 | if save_path is None: 77 | save_path = f"tmp_audio/{datetime.now().strftime('%Y%m%d%H%M%S')}.wav" 78 | res = requests.post(url, data=json.dumps(data), headers=headers) 79 | if res.status_code == 200: 80 | with open(save_path, "wb") as f: 81 | f.write(res.content) 82 | if return_type == "filepath": 83 | return save_path 84 | else: 85 | audiodata, sr = sf.read(save_path) 86 | return ((sr, audiodata) for _ in range(1)) 87 | else: 88 | raise Exception(f"remote synthesizer error: {res.text}") 89 | 90 | elif return_type == "numpy" and task.stream: 91 | return self.stream_audio(url, data) 92 | 93 | 94 | def params_parser(self, data) -> TTS_Task: 95 | task = TTS_Task(based_synthesizer=self.based_synthesizer, **data) 96 | return task 97 | 98 | def ms_like_parser(self,data) -> TTS_Task: 99 | task = TTS_Task(based_synthesizer=self.based_synthesizer, **data) 100 | return task 101 | -------------------------------------------------------------------------------- /Synthesizers/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from .Remote_Synthesizer import Remote_Synthesizer as TTS_Synthesizer 2 | from .remote_task import Remote_TTS_Task as TTS_Task -------------------------------------------------------------------------------- /Synthesizers/remote/configs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": 
"http://localhost:5000", 3 | "tts_endpoint": "/tts", 4 | "character_endpoint": "/character_list", 5 | "based_synthesizer": "gsv_fast" 6 | } 7 | -------------------------------------------------------------------------------- /Synthesizers/remote/configs/i18n/locale/zh_CN.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", 返回内容:", 3 | "这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
": "这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路径", 6 | "Sovits模型路径": "Sovits模型路径", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "只有日文", 11 | "all_zh": "只有中文", 12 | "auto": "自动判断", 13 | "auto_cut": "智能切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不并行,越大越快,但是越可能出问题", 15 | "cut0": "仅凭换行切分", 16 | "cut1": "凑四句一切", 17 | "cut2": "凑50字一切", 18 | "cut3": "按中文句号。切", 19 | "cut4": "按英文句号.切", 20 | "cut5": "按标点符号切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json设置(一般不动)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情感列表网址", 29 | "从json中读取": "从json中读取", 30 | "使用前,请确认后端服务已启动。": "使用前,请确认后端服务已启动。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不会有完成提示,没报错就是成功)", 32 | "保存失败!": "保存失败!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端处理后的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "参考音频在3~10秒范围外,请更换!", 38 | "参考音频路径": "参考音频路径", 39 | "发送json格式": "发送json格式", 40 | "发送并开始播放": "发送并开始播放", 41 | "发送请求": "发送请求", 42 | "发送请求到": "发送请求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。", 45 | "基础选项": "基础选项", 46 | "实际输入的参考文本:": "实际输入的参考文本:", 47 | "实际输入的目标文本(切句后):": "实际输入的目标文本(切句后):", 48 | "实际输入的目标文本(每句):": "实际输入的目标文本(每句):", 49 | "实际输入的目标文本:": "实际输入的目标文本:", 50 | "密码": "密码", 51 | "当前人物": "当前人物", 52 | "当前人物变更为: ": "当前人物变更为: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用经典推理模式,部分选项不可用", 54 | "情感列表": "情感列表", 55 | "情感风格": "情感风格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。", 57 | "扫描": "扫描", 58 | "扫描人物列表": "扫描人物列表", 59 | "扫描模型文件夹:": "扫描模型文件夹:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!请把有效文件放置在文件夹下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,当前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示语言", 65 | "文件打开失败,保存失败!": "文件打开失败,保存失败!", 66 | "文本语言": "文本语言", 67 | "是否自动匹配情感": "是否自动匹配情感", 68 | "模型文件夹路径": "模型文件夹路径", 69 | "每句允许最大切分字词数": "每句允许最大切分字词数", 70 | "流式音频": "流式音频", 71 | "添加情感": "添加情感", 72 | "点击查看详细文档": "点击查看详细文档", 73 | "版本": "版本", 74 | "用户名": "用户名", 75 | "种子": "种子", 76 | "简介": "简介", 77 | "缺失某些项,保存失败!": "缺失某些项,保存失败!", 78 | "网址设置": "网址设置", 79 | "自动生成info": "自动生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑问或需要进一步了解,可参考文档:", 81 | "认证信息": "认证信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设", 83 | "语速": "语速", 84 | "请修改后点击下方按钮进行保存": "请修改后点击下方按钮进行保存", 85 | "请求失败,状态码:": "请求失败,状态码:", 86 | "请求失败,请检查URL是否正确": "请求失败,请检查URL是否正确", 87 | "请求完整音频": "请求完整音频", 88 | "请求网址": "请求网址", 89 | "输入文本": "输入文本", 90 | "这是一个由": "这是一个由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "这是展示页面的版本,并未使用后端服务,下面参数无效。", 93 | "选择角色": "选择角色", 94 | "音频输出": "音频输出", 95 | "音频预览": "音频预览", 96 | "项目开源地址:": "项目开源地址:", 97 | "高级选项": "高级选项", 98 | "最大允许长度": "最大允许长度" 99 | } 100 | -------------------------------------------------------------------------------- /Synthesizers/remote/configs/i18n/locale/zh_TW.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", 返回內容:", 3 | 
"这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
": "這是模型管理介面,為了實現對多段參考音頻分配情緒設計,如果您只有一段可不使用這個介面
若有疑問或需要進一步了解,可參考文件:點擊查看詳細文件。
", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路徑", 6 | "Sovits模型路径": "Sovits模型路徑", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "僅日文", 11 | "all_zh": "僅中文", 12 | "auto": "自動判斷", 13 | "auto_cut": "智慧切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不並行,越大越快,但是越可能出現問題", 15 | "cut0": "僅憑換行切分", 16 | "cut1": "湊四句一切", 17 | "cut2": "湊50字一切", 18 | "cut3": "按中文句號。切", 19 | "cut4": "按英文句號.切", 20 | "cut5": "按標點符號切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json設置(一般不動)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情緒列表網址", 29 | "从json中读取": "從json中讀取", 30 | "使用前,请确认后端服务已启动。": "使用前,請確認後端服務已啟動。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不會有完成提示,沒報錯就是成功)", 32 | "保存失败!": "保存失敗!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端處理後的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!", 38 | "参考音频路径": "參考音頻路徑", 39 | "发送json格式": "發送json格式", 40 | "发送并开始播放": "發送並開始播放", 41 | "发送请求": "發送請求", 42 | "发送请求到": "發送請求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或調節batch size滑條。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或者更換參考音頻(使用模型管理介面)、調節下方batch size滑條。", 45 | "基础选项": "基礎選項", 46 | "实际输入的参考文本:": "實際輸入的參考文本:", 47 | "实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):", 48 | "实际输入的目标文本(每句):": "實際輸入的目標文本(每句):", 49 | "实际输入的目标文本:": "實際輸入的目標文本:", 50 | "密码": "密碼", 51 | "当前人物": "當前人物", 52 | "当前人物变更为: ": "當前人物變更為: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用經典推理模式,部分選項不可用", 54 | "情感列表": "情緒列表", 55 | "情感风格": "情緒風格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "有時掉進黑洞,有時候爬上彩虹。在下一秒鐘,命運如何轉動,沒有人會曉得。我說希望無窮,你猜美夢成空,相信和懷疑,總要決鬥。", 57 | "扫描": "掃描", 58 | "扫描人物列表": "掃描人物列表", 59 | "扫描模型文件夹:": "掃描模型文件夾:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!請把有效文件放置在文件夾下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,當前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示語言", 65 | "文件打开失败,保存失败!": "文件開啟失敗,保存失敗!", 66 | "文本语言": "文本語言", 67 | "是否自动匹配情感": "是否自動匹配情緒", 68 | "模型文件夹路径": "模型文件夾路徑", 69 | "每句允许最大切分字词数": "每句允許最大切分字詞數", 70 | "流式音频": "流式音頻", 71 | "添加情感": "添加情緒", 72 | "点击查看详细文档": "點擊查看詳細文件", 73 | "版本": "版本", 74 | "用户名": "使用者名稱", 75 | "种子": "種子", 76 | "简介": "簡介", 77 | "缺失某些项,保存失败!": "缺失某些項,保存失敗!", 78 | "网址设置": "網址設置", 79 | "自动生成info": "自動生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑問或需要進一步了解,可參考文件:", 81 | "认证信息": "認證信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "認證信息已啟用,您可以在config.json中關閉。\n但是這個功能還沒做好,只是擺設", 83 | "语速": "語速", 84 | "请修改后点击下方按钮进行保存": "請修改後點擊下方按鈕進行保存", 85 | "请求失败,状态码:": "請求失敗,狀態碼:", 86 | "请求失败,请检查URL是否正确": "請求失敗,請檢查URL是否正確", 87 | "请求完整音频": "請求完整音頻", 88 | "请求网址": "請求網址", 89 | "输入文本": "輸入文本", 90 | "这是一个由": "這是一個由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "這是一個配置文件適用於https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一個簡單好用的前後端項目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "這是展示頁面的版本,並未使用後端服務,下面參數無效。", 93 | "选择角色": "選擇角色", 94 | "音频输出": "音頻輸出", 95 | "音频预览": "音頻預覽", 96 | "项目开源地址:": "Github Link:", 97 | "高级选项": "高級選項", 98 | "最大允许长度": "最大允許長度" 99 | } 100 | -------------------------------------------------------------------------------- /Synthesizers/remote/configs/params_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | } 4 | 
-------------------------------------------------------------------------------- /Synthesizers/remote/configs/ui_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | } 4 | -------------------------------------------------------------------------------- /Synthesizers/remote/remote_task.py: -------------------------------------------------------------------------------- 1 | 2 | import os, json, sys 3 | sys.path.append(".") 4 | 5 | from uuid import uuid4 6 | from typing import List, Dict, Literal, Optional, Any, Union 7 | import urllib.parse 8 | import hashlib 9 | 10 | from Synthesizers.base import Base_TTS_Task, ParamItem, init_params_config 11 | 12 | global global_based_synthesizer 13 | global_based_synthesizer = None 14 | 15 | def set_based_synthesizer(based_synthesizer:str): 16 | global global_based_synthesizer 17 | global_based_synthesizer = based_synthesizer 18 | 19 | def get_params_config(based_synthesizer:str= None): 20 | assert based_synthesizer is not None, "based_synthesizer is not set, please init the remote synthesizer first." 21 | try: 22 | with open(os.path.join(os.path.dirname(__file__), "configs", "params_config.json"), "r", encoding="utf-8") as f: 23 | res:dict = json.load(f) 24 | with open(os.path.join("Synthesizers", based_synthesizer ,"configs", "params_config.json"), "r", encoding="utf-8") as f: 25 | res.update(json.load(f)) 26 | return init_params_config(res) 27 | except: 28 | raise FileNotFoundError("params_config.json not found or invalid.") 29 | 30 | params_config = None 31 | 32 | def get_ui_config(based_synthesizer:str= None)->Dict[str, Any]: 33 | if based_synthesizer is None: 34 | based_synthesizer = global_based_synthesizer 35 | assert based_synthesizer is not None, "based_synthesizer is not set, please init the remote synthesizer first." 36 | 37 | remote_ui_config_path = os.path.join(os.path.dirname(__file__), "configs", "ui_config.json") 38 | based_ui_config_path = os.path.join("Synthesizers", based_synthesizer ,"configs", "ui_config.json") 39 | 40 | ui_config :Dict[str, Any] = {} 41 | try: 42 | with open(remote_ui_config_path, "r", encoding="utf-8") as f: 43 | ui_config.update(json.load(f)) 44 | with open(based_ui_config_path, "r", encoding="utf-8") as f: 45 | ui_config.update(json.load(f)) 46 | return ui_config 47 | except: 48 | raise FileNotFoundError("ui_config.json not found or invalid.") 49 | 50 | from pydantic import BaseModel, Field, model_validator 51 | from copy import deepcopy 52 | class Remote_TTS_Task(Base_TTS_Task): 53 | 54 | is_remote: Optional[bool] = True 55 | data : dict = {} 56 | 57 | class Config: 58 | extra = "ignore" 59 | 60 | def __init__(self, based_synthesizer:str=None, **data): 61 | 62 | global params_config 63 | based_synthesizer = based_synthesizer if based_synthesizer is not None else global_based_synthesizer 64 | assert based_synthesizer is not None, "based_synthesizer is not set, please init the remote synthesizer first." 
65 | if params_config is None: 66 | params_config = get_params_config(based_synthesizer) 67 | copyed_data = deepcopy(data) 68 | copyed_data.setdefault("params_config",params_config) 69 | super().__init__(**copyed_data) 70 | self.data = data 71 | 72 | @property 73 | def md5(self): 74 | m = hashlib.md5() 75 | m.update(self.data.__str__().encode()) 76 | return m.hexdigest() 77 | 78 | def __str__(self): 79 | content = super().__str__() 80 | return f"{content}" 81 | 82 | 83 | -------------------------------------------------------------------------------- /api_doc.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | This document aims to introduce how to use our Text-to-Speech API, including making requests via GET and POST methods. This API supports converting text into the voice of specified characters and supports different languages and emotional expressions. 4 | 5 | ## Character and Emotion List 6 | 7 | To obtain the supported characters and their corresponding emotions, please visit the following URL: 8 | 9 | - URL: `http://127.0.0.1:5000/character_list` 10 | - Returns: A JSON format list of characters and corresponding emotions 11 | - Method: `GET` 12 | 13 | ``` 14 | { 15 | "Hanabi": [ 16 | "default", 17 | "Normal", 18 | "Yandere", 19 | ], 20 | "Hutao": [ 21 | "default" 22 | ] 23 | } 24 | ``` 25 | 26 | ## Regarding Aliases 27 | 28 | From version 2.2.4, an alias system was added. Detailed allowed aliases can be found in `Inference/params_config.json`. 29 | 30 | ## Text-to-Speech 31 | 32 | - URL: `http://127.0.0.1:5000/tts` 33 | - Returns: Audio on success. Error message on failure. 34 | - Method: `GET`/`POST` 35 | 36 | ### GET Method 37 | 38 | #### Format 39 | 40 | ``` 41 | http://127.0.0.1:5000/tts?character={{characterName}}&text={{text}} 42 | ``` 43 | 44 | - Parameter explanation: 45 | - `character`: The name of the character folder, pay attention to case sensitivity, full/half width, and language (Chinese/English). 46 | - `text`: The text to be converted, URL encoding is recommended. 47 | - Optional parameters include `text_language`, `format`, `top_k`, `top_p`, `batch_size`, `speed`, `temperature`, `emotion`, `save_temp`, and `stream`, detailed explanations are provided in the POST section below. 48 | - From version 2.2.4, an alias system was added, with detailed allowed aliases found in `Inference/params_config.json`. 49 | 50 | ### POST Method 51 | 52 | #### JSON Package Format 53 | 54 | ##### All Parameters 55 | 56 | ``` 57 | { 58 | "method": "POST", 59 | "body": { 60 | "character": "${chaName}", 61 | "emotion": "${Emotion}", 62 | "text": "${speakText}", 63 | "text_language": "${textLanguage}", 64 | "batch_size": ${batch_size}, 65 | "speed": ${speed}, 66 | "top_k": ${topK}, 67 | "top_p": ${topP}, 68 | "temperature": ${temperature}, 69 | "stream": "${stream}", 70 | "format": "${Format}", 71 | "save_temp": "${saveTemp}" 72 | } 73 | } 74 | ``` 75 | 76 | You can omit one or more items. From version 2.2.4, an alias system was introduced, detailed allowed aliases can be found in `Inference/params_config.json`. 77 | 78 | ##### Minimal Data: 79 | 80 | ``` 81 | { 82 | "method": "POST", 83 | "body": { 84 | "text": "${speakText}" 85 | } 86 | } 87 | ``` 88 | 89 | ##### Parameter Explanation 90 | 91 | - **text**: The text to be converted, URL encoding is recommended. 92 | - **character**: Character folder name, pay attention to case sensitivity, full/half width, and language. 
93 | - **emotion**: Character emotion, must be an actually supported emotion of the character, otherwise, the default emotion will be used. 94 | - **text_language**: Text language (auto / zh / en / ja), default is multilingual mixed. 95 | - **top_k**, **top_p**, **temperature**: GPT model parameters, no need to modify if unfamiliar. 96 | 97 | - **batch_size**: How many batches at a time, can be increased for faster processing if you have a powerful computer, integer, default is 1. 98 | - **speed**: Speech speed, default is 1.0. 99 | - **save_temp**: Whether to save temporary files, when true, the backend will save the generated audio, and subsequent identical requests will directly return that data, default is false. 100 | - **stream**: Whether to stream, when true, audio will be returned sentence by sentence, default is false. 101 | - **format**: Format, default is WAV, allows MP3/ WAV/ OGG. 102 | 103 | -------------------------------------------------------------------------------- /common_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_config": { 3 | "locale": { 4 | "default": "auto", 5 | "description": "Locale settings for the application", 6 | "label": "语言", 7 | "type": "string", 8 | "choices": ["auto", "en_US", "zh_CN", "zh_TW"] 9 | }, 10 | "server_port": { 11 | "default": 5000, 12 | "description": "Port number for the application, -1 for auto select", 13 | "label": "服务端口", 14 | "type": "integer" 15 | }, 16 | "server_name": { 17 | "default": "0.0.0.0", 18 | "description": "Host address for the application", 19 | "label": "服务主机", 20 | "type": "string", 21 | "choices": ["127.0.0.1", "0.0.0.0"] 22 | }, 23 | "inbrowser": { 24 | "default": true, 25 | "description": "Flag to indicate if the application is running in browser", 26 | "label": "是否在浏览器中打开", 27 | "type": "boolean" 28 | }, 29 | "synthesizer": { 30 | "default": "gsv_fast", 31 | "description": "Synthesizer used by app.py, 'remote' for using TTS service running on a remote host", 32 | "label": "Web UI 所采用的语音合成器", 33 | "type": "string", 34 | "choices": ["gsv_fast", "remote"] 35 | }, 36 | "also_enable_api": { 37 | "default": true, 38 | "description": "Flag to indicate if API is enabled", 39 | "label": "是否启用API", 40 | "type": "boolean" 41 | }, 42 | "max_text_length": { 43 | "default": -1, 44 | "description": "Maximum length of text to synthesize in Web UI", 45 | "label": "Max Text Length", 46 | "type": "integer" 47 | }, 48 | "is_share": { 49 | "default": false, 50 | "description": "Flag to indicate if sharing is enabled", 51 | "label": "是否分享", 52 | "type": "boolean" 53 | } 54 | }, 55 | "pure_api_config": { 56 | "tts_port": { 57 | "default": 5000, 58 | "description": "Port number for TTS service", 59 | "label": "tts服务端口", 60 | "type": "integer" 61 | }, 62 | "tts_host": { 63 | "default": "0.0.0.0", 64 | "description": "Host address for TTS service", 65 | "label": "tts主机端口", 66 | "type": "string", 67 | "choices": ["127.0.0.1", "0.0.0.0"] 68 | }, 69 | "synthesizer": { 70 | "default": "gsv_fast", 71 | "description": "Synthesizer used by api.py", 72 | "label": "api.py 所采用的语音合成器", 73 | "type": "string", 74 | "choices": ["gsv_fast"] 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | gsvi: 5 | image: breakstring/gsvi:latest # please change the image name and tag base your environment. 
If the tag contains the word 'elite', such as "latest-elite", it indicates that the image does not include the necessary models such as gsvi, UVR5, Damo ASR, etc. You will need to download them yourself and map them into the container. 6 | container_name: gsvi-container 7 | environment: 8 | - is_half=False 9 | - is_share=False 10 | volumes: 11 | - ./output:/workspace/output 12 | - ./logs:/workspace/logs 13 | - ./SoVITS_weights:/workspace/SoVITS_weights 14 | - ./reference:/workspace/reference 15 | working_dir: /workspace 16 | ports: 17 | - "9880:9880" 18 | - "9871:9871" 19 | - "9872:9872" 20 | - "9873:9873" 21 | - "9874:9874" 22 | shm_size: 16G 23 | deploy: 24 | resources: 25 | reservations: 26 | devices: 27 | - driver: nvidia 28 | count: "all" 29 | capabilities: [gpu] 30 | stdin_open: true 31 | tty: true 32 | restart: unless-stopped 33 | -------------------------------------------------------------------------------- /dockerbuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 获取当前日期,格式为 YYYYMMDD 4 | DATE=$(date +%Y%m%d) 5 | # 获取最新的 Git commit 哈希值的前 7 位 6 | COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7) 7 | 8 | # 构建 full 版本的镜像 9 | docker build --build-arg IMAGE_TYPE=full -t breakstring/gsvi:latest . 10 | # 为同一个镜像添加带日期的标签 11 | docker tag breakstring/gsvi:latest breakstring/gsvi:dev-$DATE 12 | # 为同一个镜像添加带当前代码库Commit哈希值的标签 13 | docker tag breakstring/gsvi:latest breakstring/gsvi:dev-$COMMIT_HASH 14 | 15 | 16 | # 构建 elite 版本的镜像(无模型下载步骤,需手工将模型下载安装进容器) 17 | docker build --build-arg IMAGE_TYPE=elite -t breakstring/gsvi:latest-elite . 18 | # 为同一个镜像添加带日期的标签 19 | docker tag breakstring/gsvi:latest-elite breakstring/gsvi:dev-$DATE-elite 20 | # 为同一个镜像添加带当前代码库Commit哈希值的标签 21 | docker tag breakstring/gsvi:latest-elite breakstring/gsvi:dev-$COMMIT_HASH-elite 22 | -------------------------------------------------------------------------------- /docs/cn/Changelog_CN.md: -------------------------------------------------------------------------------- 1 | ### 20240121更新 2 | 3 | 1-config添加is_share,诸如colab等场景可以将此改为True,来使得webui映射到公网 4 | 5 | 2-WebUI添加英文系统英文翻译适配 6 | 7 | 3-cmd-asr自动判断是否已自带damo模型,如不在默认目录上将从modelscope自带下载 8 | 9 | 4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复(过滤长度0的样本等) 10 | 11 | 5-清理TEMP文件夹缓存音频等文件 12 | 13 | 6-大幅削弱合成音频包含参考音频结尾的问题 14 | 15 | ### 20240122更新 16 | 17 | 1-修复过短输出文件返回重复参考音频的问题。 18 | 19 | 2-经测试,英文日文训练原生支持(日文训练需要根目录不含非英文等特殊字符)。 20 | 21 | 3-音频路径检查。如果尝试读取输入错的路径报错路径不存在,而非ffmpeg错误。 22 | 23 | ### 20240123更新 24 | 25 | 1-解决hubert提取nan导致SoVITS/GPT训练报错ZeroDivisionError的问题 26 | 27 | 2-支持推理界面快速切换模型 28 | 29 | 3-优化模型文件排序逻辑 30 | 31 | 4-中文分词使用jieba_fast代替jieba 32 | 33 | ### 20240126更新 34 | 35 | 1-支持输出文本中英混合、日英混合 36 | 37 | 2-输出可选切分模式 38 | 39 | 3-修复uvr5读取到目录自动跳出的问题 40 | 41 | 4-修复多个换行导致推理报错 42 | 43 | 5-去除推理界面大量冗余log 44 | 45 | 6-支持mac训练推理 46 | 47 | 7-自动识别不支持半精度的卡强制单精度。cpu推理下强制单精度。 48 | 49 | ### 20240128更新 50 | 51 | 1-修复数字转汉字念法问题 52 | 53 | 2-修复句首少量字容易吞字的问题 54 | 55 | 3-通过限制排除不合理的参考音频长度 56 | 57 | 4-修复GPT训练不保存ckpt的问题 58 | 59 | 5-完善Dockerfile的下载模型流程 60 | 61 | ### 20240129更新 62 | 63 | 1-16系等半精度训练有问题的显卡把训练配置改为单精度训练 64 | 65 | 2-测试更新可用的colab版本 66 | 67 | 3-修复git clone modelscope funasr仓库+老版本funasr导致接口不对齐报错的问题 68 | 69 | 70 | ### 20240130更新 71 | 72 | 1-所有涉及路径的地方双引号自动去除,小白复制路径带双引号不会报错 73 | 74 | 2-修复中英文标点切割问题和句首句尾补标点的问题 75 | 76 | 3-增加按标点符号切分 77 | 78 | ### 20240201更新 79 | 80 | 1-修复uvr5读取格式错误导致分离失败的问题 81 | 82 | 2-支持中日英混合多种文本自动切分识别语种 83 | 84 | ### 20240202更新 85 | 86 | 1-修复asr路径尾缀带/保存文件名报错 87 | 88 | 2-引入paddlespeech的Normalizer 
https://github.com/RVC-Boss/GPT-SoVITS/pull/377 修复一些问题,例如:xx.xx%(带百分号类),元/吨 会读成 元吨 而不是元每吨,下划线不再会报错 89 | 90 | ### 20240207更新 91 | 92 | 1-修正语种传参混乱导致中文推理效果下降 https://github.com/RVC-Boss/GPT-SoVITS/issues/391 93 | 94 | 2-uvr5适配高版本librosa https://github.com/RVC-Boss/GPT-SoVITS/pull/403 95 | 96 | 3-修复uvr5 inf everywhere报错的问题(is_half传参未转换bool导致恒定半精度推理,16系显卡会inf) https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8 97 | 98 | 4-优化英文文本前端 99 | 100 | 5-修复gradio依赖 101 | 102 | 6-支持三连根目录留空自动读取.list全路径 103 | 104 | 7-集成faster whisper ASR日文英文 105 | 106 | ### 20240208更新 107 | 108 | 1-GPT训练卡死(win10 1909)和https://github.com/RVC-Boss/GPT-SoVITS/issues/232 (系统语言繁体)GPT训练报错,[尝试修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)。 109 | 110 | ### 20240212更新 111 | 112 | 1-faster whisper和funasr逻辑优化。faster whisper转镜像站下载,规避huggingface连不上的问题。 113 | 114 | 2-DPO Loss实验性训练选项开启,通过构造负样本训练缓解GPT重复漏字问题。推理界面公开几个推理参数。 https://github.com/RVC-Boss/GPT-SoVITS/pull/457 115 | 116 | ### 20240214更新 117 | 118 | 1-训练支持中文实验名(原来会报错) 119 | 120 | 2-DPO训练改为可勾选选项而非必须。如勾选batch size自动减半。修复推理界面新参数不传参的问题。 121 | 122 | ### 20240216更新 123 | 124 | 1-支持无参考文本输入 125 | 126 | 2-修复中文文本前端bug https://github.com/RVC-Boss/GPT-SoVITS/issues/475 127 | 128 | ### 20240221更新 129 | 130 | 1-数据处理添加语音降噪选项(降噪为只剩16k采样率,除非底噪很大先不急着用哦。) 131 | 132 | 2-中文日文前端处理优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/559 https://github.com/RVC-Boss/GPT-SoVITS/pull/556 https://github.com/RVC-Boss/GPT-SoVITS/pull/532 https://github.com/RVC-Boss/GPT-SoVITS/pull/507 https://github.com/RVC-Boss/GPT-SoVITS/pull/509 133 | 134 | 3-mac CPU推理更快因此把推理设备从mps改到CPU 135 | 136 | 4-colab修复不开启公网url 137 | 138 | ### 20240306更新 139 | 140 | 1-推理加速50%(RTX3090+pytorch2.2.1+cu11.8+win10+py39 tested)https://github.com/RVC-Boss/GPT-SoVITS/pull/672 141 | 142 | 2-如果用faster whisper非中文ASR不再需要先下中文funasr模型 143 | 144 | 3-修复uvr5去混响模型 是否混响 反的 https://github.com/RVC-Boss/GPT-SoVITS/pull/610 145 | 146 | 4-faster whisper如果无cuda可用自动cpu推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/675 147 | 148 | 5-修改is_half的判断使在Mac上能正常CPU推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/573 149 | 150 | 151 | todolist: 152 | 153 | 1-中文多音字推理优化(有没有人来测试的,欢迎把测试结果写在pr评论区里) https://github.com/RVC-Boss/GPT-SoVITS/pull/488 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /docs/ja/Changelog_JA.md: -------------------------------------------------------------------------------- 1 | ### 20240121 更新 2 | 3 | 1. `config`に`is_share`を追加し、Colab などの環境でこれを`True`に設定すると、webui を公共ネットワークにマッピングできます。 4 | 5 | 2. WebUI に英語システムの英語翻訳を追加しました。 6 | 7 | 3. `cmd-asr`は damo モデルが既に含まれているかどうかを自動的に確認し、デフォルトのパスにない場合は modelscope から自動的にダウンロードします。 8 | 9 | 4. [SoVITS 训练报错 ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 修復を試みます(長さ 0 のサンプルをフィルタリングなど) 10 | 11 | 5. TEMP ファイルフォルダからオーディオやその他のファイルをクリーンアップして最適化します。 12 | 13 | 6. 合成オーディオがリファレンスオーディオの終わりを含む問題を大幅に改善しました。 14 | 15 | ### 20240122 更新 16 | 17 | 1. 短すぎる出力ファイルが重複したリファレンスオーディオを返す問題を修正しました。 18 | 19 | 2. 英語-日本語学習がスムーズに進む QA を完了しました。(ただし、日本語学習はルートディレクトリに英語以外の文字が含まれていない必要があります) 20 | 21 | 3. オーディオパスをチェックします。間違ったパスを読み取ろうとすると、「パスが存在しません」というエラーメッセージが返されます。これは ffmpeg モジュールのエラーではありません。 22 | 23 | ### 20240123 更新 24 | 25 | 1. hubert から nan 抽出による SoVITS/GPT 学習中の ZeroDivisionError 関連エラーを修正しました。 26 | 27 | 2. 推論インターフェースでモデルを素早く切り替えることができるようにサポートしました。 28 | 29 | 3. モデルファイルのソートロジックを最適化しました。 30 | 31 | 4. 中国語の分析に`jieba_fast`を`jieba`に置き換えました。 32 | 33 | ### 20240126 更新 34 | 35 | 1. 
中国語と英語、日本語と英語が混在した出力テキストをサポートします。 36 | 37 | 2. 出力で選択的な分割モードをサポートします。 38 | 39 | 3. uvr5 がディレクトリを読み取り、自動的に終了する問題を修正しました。 40 | 41 | 4. 複数の改行による推論エラーを修正しました。 42 | 43 | 5. 推論インターフェースから不要なログを削除しました。 44 | 45 | 6. MacOS での学習と推論をサポートします。 46 | 47 | 7. 半精度をサポートしていないカードを自動的に識別して単精度を強制し、CPU 推論では単精度を強制します。 48 | 49 | ### 20240128 更新 50 | 51 | 1. 数字を漢字で読む問題を修正しました。 52 | 53 | 2. 文章の先頭の一部の単語が欠落する問題を修正しました。 54 | 55 | 3. 不適切な長さのリファレンスオーディオを制限しました。 56 | 57 | 4. GPT 学習時の ckpt が保存されない問題を修正しました。 58 | 59 | 5. Dockerfile のモデルダウンロードプロセスを改善しました。 60 | 61 | ### 20240129 更新 62 | 63 | 1. 16 系などの半精度学習に問題があるカードは、学習構成を単精度学習に変更しました。 64 | 65 | 2. Colab でも使用可能なバージョンをテストして更新しました。 66 | 67 | 3. `git clone modelscope funasr`リポジトリと古いバージョンの funasr を使用してインターフェースが一致しないエラーを修正しました。 68 | 69 | ### 20240130 更新 70 | 71 | 1. パスと関連する文字列を解析して、二重引用符を自動的に削除します。また、パスをコピーする場合、二重引用符が含まれていてもエラーが発生しません。 72 | 73 | 2. 中国語と英語、日本語と英語の混合出力をサポートします。 74 | 75 | 3. 出力で選択的な分割モードをサポートします。 76 | 77 | todolist: 78 | 79 | 1. 同音異義語(中国語)の推論の最適化 80 | 81 | 2. 英語大文字認識と英語ハイフン [問題](https://github.com/RVC-Boss/GPT-SoVITS/issues/271) 82 | 83 | 3. テキストに%記号が含まれているとエラーが発生し、推論が不可能です。また、「元/吨」が「元吨」ではなく「元每吨」と読まれるなどの問題があります。このような問題を解決するには、どのライブラリを使用する必要があり、それに対する改善を検討しています。 84 | 85 | 4. 中-日-英、中-英、日-英を含む 5 つの言語をサポートすることを目標にしています。 86 | -------------------------------------------------------------------------------- /gsv_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "device": "auto", 3 | "is_half": "auto", 4 | 5 | "models_path": "trained", 6 | "cnhubert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", 7 | "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", 8 | "save_prompt_cache": true, 9 | "prompt_cache_dir": "cache/prompt_cache" 10 | } 11 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | conda install -c conda-forge gcc 3 | conda install -c conda-forge gxx 4 | conda install ffmpeg cmake 5 | conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia 6 | pip install -r requirements.txt 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /pure_api.py: -------------------------------------------------------------------------------- 1 | # 在开头加入路径 2 | import os, sys 3 | import importlib 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 8 | 9 | from src.common_config_manager import __version__, api_config 10 | import soundfile as sf 11 | from fastapi import FastAPI, Request, HTTPException 12 | from fastapi.responses import JSONResponse, FileResponse, StreamingResponse 13 | from fastapi.middleware.cors import CORSMiddleware 14 | import tempfile 15 | import uvicorn 16 | import json 17 | 18 | # 将当前文件所在的目录添加到 sys.path 19 | from Synthesizers.base import Base_TTS_Task, Base_TTS_Synthesizer 20 | 21 | # 创建合成器实例 22 | tts_synthesizer:Base_TTS_Synthesizer = None 23 | 24 | def set_tts_synthesizer(synthesizer:Base_TTS_Synthesizer): 25 | global tts_synthesizer 26 | tts_synthesizer = synthesizer 27 | 28 | # 存储临时文件的字典 29 | temp_files = {} 30 | 31 | async def character_list(request: Request): 32 | res = JSONResponse(tts_synthesizer.get_characters()) 33 | return res 34 | 35 | async def tts(request: Request): 36 | 37 | from time import time as tt 38 | t1 = tt() 39 | print(f"Request Time: {t1}") 40 | 41 
| # 尝试从JSON中获取数据,如果不是JSON,则从查询参数中获取 42 | if request.method == "GET": 43 | data = request.query_params 44 | else: 45 | data = await request.json() 46 | 47 | task:Base_TTS_Task = tts_synthesizer.params_parser(data) 48 | 49 | if task.task_type == "text" and task.text.strip() == "": 50 | return HTTPException(status_code=400, detail="Text is empty") 51 | elif task.task_type == "ssml" and task.ssml.strip() == "": 52 | return HTTPException(status_code=400, detail="SSML is empty") 53 | md5_value = task.md5 54 | if task.stream == False: 55 | # TODO: use SQL instead of dict 56 | if task.save_temp and md5_value in temp_files: 57 | return FileResponse(path=temp_files[md5_value], media_type=f'audio/{task.format}') 58 | else: 59 | # 假设 gen 是你的音频生成器 60 | try: 61 | save_path = tts_synthesizer.generate(task, return_type="filepath") 62 | except Exception as e: 63 | return HTTPException(status_code=500, detail=str(e)) 64 | if task.save_temp: 65 | temp_files[md5_value] = save_path 66 | 67 | t2 = tt() 68 | print(f"total time: {t2-t1}") 69 | # 返回文件响应,FileResponse 会负责将文件发送给客户端 70 | return FileResponse(save_path, media_type=f"audio/{task.format}", filename=os.path.basename(save_path)) 71 | else: 72 | gen = tts_synthesizer.generate(task, return_type="numpy") 73 | return StreamingResponse(gen, media_type='audio/wav') 74 | 75 | 76 | 77 | 78 | if __name__ == "__main__": 79 | # 动态导入合成器模块, 此处可写成 from Synthesizers.xxx import TTS_Synthesizer, TTS_Task 80 | from importlib import import_module 81 | from src.api_utils import get_localhost_ipv4_address 82 | synthesizer_name = api_config.synthesizer 83 | synthesizer_module = import_module(f"Synthesizers.{synthesizer_name}") 84 | TTS_Synthesizer = synthesizer_module.TTS_Synthesizer 85 | TTS_Task = synthesizer_module.TTS_Task 86 | # 初始化合成器的类 87 | tts_synthesizer = TTS_Synthesizer(debug_mode=True) 88 | 89 | # 生成一句话充当测试,减少第一次请求的等待时间 90 | gen = tts_synthesizer.generate(tts_synthesizer.params_parser({"text":"你好,世界"}) ) 91 | next(gen) 92 | 93 | # 打印一些辅助信息 94 | print(f"Backend Version: {__version__}") 95 | tts_host = api_config.tts_host 96 | tts_port = api_config.tts_port 97 | ipv4_address = get_localhost_ipv4_address(tts_host) 98 | ipv4_link = f"http://{ipv4_address}:{tts_port}" 99 | print(f"INFO: Local Network URL: {ipv4_link}") 100 | 101 | app = FastAPI() 102 | 103 | # 设置CORS 104 | app.add_middleware( 105 | CORSMiddleware, 106 | allow_origins=["*"], 107 | allow_credentials=True, 108 | allow_methods=["*"], 109 | allow_headers=["*"], 110 | ) 111 | app.add_api_route('/tts', tts, methods=["GET", "POST"]) 112 | app.add_api_route('/character_list', character_list, methods=["GET"]) 113 | uvicorn.run(app, host=tts_host, port=tts_port) 114 | 115 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | pydub 3 | 4 | pydantic 5 | soundfile 6 | flash-attention 7 | numpy 8 | scipy 9 | tensorboard 10 | librosa==0.9.2 11 | numba 12 | pytorch-lightning 13 | gradio>=4.29 14 | gradio_client 15 | ffmpeg-python 16 | onnxruntime 17 | tqdm 18 | funasr==1.0.0 19 | cn2an 20 | pypinyin 21 | pyopenjtalk 22 | g2p_en 23 | torchaudio 24 | modelscope==1.10.0 25 | sentencepiece 26 | transformers 27 | chardet 28 | PyYAML 29 | psutil 30 | jieba_fast 31 | jieba 32 | LangSegment>=0.3.1 33 | Faster_Whisper 34 | fastapi 35 | uvicorn 36 | wordsegment 37 | srt 38 | 39 | pyloudnorm -------------------------------------------------------------------------------- /src/api_utils.py: 
-------------------------------------------------------------------------------- 1 | import socket 2 | # 便于小白理解 3 | def get_localhost_ipv4_address(host = "127.0.0.1"): 4 | 5 | def get_internal_ip(): 6 | """获取内部IP地址""" 7 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 8 | try: 9 | # 这不会发送真正的数据包 10 | s.connect(('10.253.156.219', 1)) 11 | IP = s.getsockname()[0] 12 | except Exception: 13 | IP = '127.0.0.1' 14 | finally: 15 | s.close() 16 | return IP 17 | 18 | if host == "0.0.0.0": 19 | display_hostname = get_internal_ip() 20 | return display_hostname 21 | else: 22 | return host 23 | 24 | def get_gradio_frp(server_name, server_port, share_token) -> str: 25 | from urllib.parse import urlparse, urlunparse 26 | from gradio import networking 27 | share_url = networking.setup_tunnel( 28 | local_host=server_name, 29 | local_port=server_port, 30 | share_token=share_token, 31 | share_server_address=None, 32 | ) 33 | parsed_url = urlparse(share_url) 34 | share_server_protocol = "https" 35 | share_url = urlunparse( 36 | (share_server_protocol,) + parsed_url[1:] 37 | ) 38 | return share_url -------------------------------------------------------------------------------- /src/common_config_manager.py: -------------------------------------------------------------------------------- 1 | import os, sys, json 2 | from typing import List, Any, Optional ,Dict, Literal 3 | from pydantic import BaseModel, Field, model_validator 4 | 5 | __version__ = "2.6.3" 6 | 7 | from Synthesizers.base import load_config 8 | 9 | 10 | class Api_Config(BaseModel): 11 | config_path:str = None 12 | tts_port: int = 5000 13 | tts_host: str = "0.0.0.0" 14 | synthesizer: str = "gsv_fast" 15 | 16 | 17 | def __init__(self, config_path = None): 18 | super().__init__() 19 | 20 | self.config_path = config_path 21 | assert os.path.exists(self.config_path), f"配置文件不存在: {self.config_path}" 22 | if os.path.exists(self.config_path): 23 | all_config = load_config(self.config_path) 24 | config:dict = all_config.get("common", {}) 25 | for key, value in config.items(): 26 | setattr(self, key, value) 27 | 28 | class App_Config(BaseModel): 29 | 30 | config_path:str = None 31 | locale: str = "auto" 32 | is_share: bool = False 33 | inbrowser: bool = True 34 | server_name: str = "0.0.0.0" 35 | server_port: int = -1 # -1 means auto select 36 | also_enable_api: bool = True 37 | synthesizer: str = "gsv_fast" 38 | max_text_length: int = -1 39 | 40 | @model_validator(mode='after') 41 | def check_locale(self): 42 | # Example: validating locale to be one of a set predefined values or patterns 43 | self.locale = self.locale.replace("-", "_") 44 | return self 45 | 46 | @staticmethod 47 | def check_port(port:int, server_name:str): 48 | url = f"http://{server_name}:{port}" 49 | 50 | 51 | def __init__(self, config_path = None): 52 | super().__init__() 53 | 54 | self.config_path = config_path 55 | assert os.path.exists(self.config_path), f"配置文件不存在: {self.config_path}" 56 | if os.path.exists(self.config_path): 57 | all_config = load_config(self.config_path) 58 | config = all_config.get("app_config", {}) 59 | for key, value in config.items(): 60 | setattr(self, key, value) 61 | 62 | app_config = App_Config("common_config.json") 63 | api_config = Api_Config("common_config.json") 64 | 65 | 66 | -------------------------------------------------------------------------------- /tmp_audio/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | 
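`src/common_config_manager.py` above exposes the module-level singletons `app_config` and `api_config`, both populated from `common_config.json`; other modules in this repository (for example `pure_api.py` and `tools/i18n/i18n.py`) simply import them and read attributes. A minimal usage sketch, assuming it is run from the repository root with `common_config.json` present:

```python
# Read the already-parsed configuration singletons; no re-parsing is needed.
from src.common_config_manager import __version__, app_config, api_config

print("backend version:", __version__)
print("web ui:", app_config.server_name, app_config.server_port, app_config.synthesizer)
print("pure api:", api_config.tts_host, api_config.tts_port, api_config.synthesizer)
```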
-------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/GPT-SoVITS-Inference/a55436e81784d21d158445ca103e1cfb3fb586f3/tools/__init__.py -------------------------------------------------------------------------------- /tools/asr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def check_fw_local_models(): 4 | ''' 5 | 启动时检查本地是否有 Faster Whisper 模型. 6 | ''' 7 | model_size_list = [ 8 | "tiny", "tiny.en", 9 | "base", "base.en", 10 | "small", "small.en", 11 | "medium", "medium.en", 12 | "large", "large-v1", 13 | "large-v2", "large-v3"] 14 | for i, size in enumerate(model_size_list): 15 | if os.path.exists(f'tools/asr/models/faster-whisper-{size}'): 16 | model_size_list[i] = size + '-local' 17 | return model_size_list 18 | 19 | asr_dict = { 20 | "达摩 ASR (中文)": { 21 | 'lang': ['zh'], 22 | 'size': ['large'], 23 | 'path': 'funasr_asr.py', 24 | }, 25 | "Faster Whisper (多语种)": { 26 | 'lang': ['auto', 'zh', 'en', 'ja'], 27 | 'size': check_fw_local_models(), 28 | 'path': 'fasterwhisper_asr.py' 29 | } 30 | } 31 | 32 | -------------------------------------------------------------------------------- /tools/asr/funasr_asr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import argparse 4 | import os 5 | import traceback 6 | from tqdm import tqdm 7 | 8 | from funasr import AutoModel 9 | 10 | path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' 11 | path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch' 12 | path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch' 13 | path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" 14 | path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch" 15 | path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" 16 | 17 | model = AutoModel( 18 | model = path_asr, 19 | model_revision = "v2.0.4", 20 | vad_model = path_vad, 21 | vad_model_revision = "v2.0.4", 22 | punc_model = path_punc, 23 | punc_model_revision = "v2.0.4", 24 | ) 25 | 26 | def only_asr(input_file): 27 | try: 28 | text = model.generate(input=input_file)[0]["text"] 29 | except: 30 | text = '' 31 | print(traceback.format_exc()) 32 | return text 33 | 34 | def execute_asr(input_folder, output_folder, model_size, language): 35 | input_file_names = os.listdir(input_folder) 36 | input_file_names.sort() 37 | 38 | output = [] 39 | output_file_name = os.path.basename(input_folder) 40 | 41 | for file_name in tqdm(input_file_names): 42 | try: 43 | file_path = os.path.join(input_folder, file_name) 44 | text = model.generate(input=file_path)[0]["text"] 45 | output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}") 46 | except: 47 | print(traceback.format_exc()) 48 | 49 | output_folder = output_folder or "output/asr_opt" 50 | os.makedirs(output_folder, exist_ok=True) 51 | output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') 52 | 53 | with open(output_file_path, "w", encoding="utf-8") as f: 54 | f.write("\n".join(output)) 55 | print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") 56 | return output_file_path 57 | 58 | if __name__ == 
'__main__': 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("-i", "--input_folder", type=str, required=True, 61 | help="Path to the folder containing WAV files.") 62 | parser.add_argument("-o", "--output_folder", type=str, required=True, 63 | help="Output folder to store transcriptions.") 64 | parser.add_argument("-s", "--model_size", type=str, default='large', 65 | help="Model Size of FunASR is Large") 66 | parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh'], 67 | help="Language of the audio files.") 68 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 69 | help="fp16 or fp32")#还没接入 70 | 71 | cmd = parser.parse_args() 72 | execute_asr( 73 | input_folder = cmd.input_folder, 74 | output_folder = cmd.output_folder, 75 | model_size = cmd.model_size, 76 | language = cmd.language, 77 | ) 78 | -------------------------------------------------------------------------------- /tools/asr/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /tools/cmd-denoise.py: -------------------------------------------------------------------------------- 1 | import os,argparse 2 | 3 | from modelscope.pipelines import pipeline 4 | from modelscope.utils.constant import Tasks 5 | from tqdm import tqdm 6 | 7 | path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k' 8 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" 9 | ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise) 10 | def execute_denoise(input_folder,output_folder): 11 | os.makedirs(output_folder,exist_ok=True) 12 | # print(input_folder) 13 | # print(list(os.listdir(input_folder).sort())) 14 | for name in tqdm(os.listdir(input_folder)): 15 | ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-i", "--input_folder", type=str, required=True, 20 | help="Path to the folder containing WAV files.") 21 | parser.add_argument("-o", "--output_folder", type=str, required=True, 22 | help="Output folder to store transcriptions.") 23 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 24 | help="fp16 or fp32")#还没接入 25 | cmd = parser.parse_args() 26 | execute_denoise( 27 | input_folder = cmd.input_folder, 28 | output_folder = cmd.output_folder, 29 | ) -------------------------------------------------------------------------------- /tools/denoise-model/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /tools/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import locale 4 | from src.common_config_manager import app_config 5 | 6 | def load_language_list(language, locale_paths): 7 | language_map = {} 8 | for locale_path in locale_paths: 9 | lang_file = os.path.join(locale_path, f"{language}.json") 10 | if os.path.exists(lang_file): 11 | with open(lang_file, 'r', encoding='utf-8') as f: 12 | language_map.update(json.load(f)) 13 | return language_map 14 | 15 | class I18nAuto: 16 | def __init__(self, language=None, locale_paths=[], locale_path="./i18n/locale"): 17 | 
if language in ["auto", None]: 18 | if app_config.locale in ["auto", None, ""]: 19 | language = locale.getdefaultlocale()[0] 20 | else: 21 | language = app_config.locale 22 | if not any(os.path.exists(os.path.join(locale_path, f"{language}.json")) for locale_path in locale_paths): 23 | language = "zh_CN" 24 | self.language = language 25 | if len(locale_paths): 26 | self.language_map = load_language_list(language, locale_paths) 27 | else: 28 | self.language_map = load_language_list(language, [locale_path]) 29 | 30 | def __call__(self, key): 31 | return self.language_map.get(key, key) 32 | 33 | def __repr__(self): 34 | return "Use Language: " + self.language 35 | -------------------------------------------------------------------------------- /tools/i18n/locale_diff.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import OrderedDict 4 | 5 | # dir_path = "./i18n/locale" # The path to the i18n locale directory, you can change it to your own path 6 | dir_path = "./tools/srt_slicer/i18n/locale" 7 | # Define the standard file name 8 | standard_file = os.path.join(dir_path, "zh_CN.json") 9 | 10 | # Find all JSON files in the directory 11 | languages = [ 12 | os.path.join(dir_path, f) 13 | for f in os.listdir(dir_path) 14 | if f.endswith(".json") and f != standard_file 15 | ] 16 | 17 | # Load the standard file 18 | with open(standard_file, "r", encoding="utf-8") as f: 19 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 20 | 21 | # Loop through each language file 22 | for lang_file in languages: 23 | # Load the language file 24 | with open(lang_file, "r", encoding="utf-8") as f: 25 | lang_data = json.load(f, object_pairs_hook=OrderedDict) 26 | 27 | # Find the difference between the language file and the standard file 28 | diff = set(standard_data.keys()) - set(lang_data.keys()) 29 | 30 | miss = set(lang_data.keys()) - set(standard_data.keys()) 31 | 32 | # Add any missing keys to the language file 33 | for key in diff: 34 | lang_data[key] = standard_data[key] 35 | 36 | # Del any extra keys to the language file 37 | for key in miss: 38 | del lang_data[key] 39 | 40 | # Sort the keys of the language file to match the order of the standard file 41 | lang_data = OrderedDict( 42 | sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])) 43 | ) 44 | 45 | # Save the updated language file 46 | with open(lang_file, "w", encoding="utf-8") as f: 47 | json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True) 48 | f.write("\n") 49 | -------------------------------------------------------------------------------- /tools/i18n/scan_i18n.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import json 3 | from collections import OrderedDict 4 | import os 5 | 6 | # locale_path = "./i18n/locale" # The path to the i18n locale directory, you can change it to your own path 7 | # scan_list = ["./", 8 | # "GPT_SoVITS/", 9 | # "tools/" 10 | # ] # The path to the directory you want to scan, you can change it to your own path 11 | # scan_subfolders = False # Whether to scan subfolders 12 | 13 | locale_path = "./tools/srt_slicer/i18n/locale" 14 | scan_list = ["./tools/srt_slicer"] # The path to the directory you want to scan, you can change it to your own path 15 | scan_subfolders = True 16 | 17 | special_words_to_keep = { 18 | "auto": "自动判断", 19 | "zh": "中文", 20 | "en": "英文", 21 | "ja": "日文", 22 | "all_zh": "只有中文", 23 | "all_ja": "只有日文", 24 | 
"auto_cut": "智能切分", 25 | "cut0": "仅凭换行切分", 26 | "cut1": "凑四句一切", 27 | "cut2": "凑50字一切", 28 | "cut3": "按中文句号。切", 29 | "cut4": "按英文句号.切", 30 | "cut5": "按标点符号切", 31 | 32 | } 33 | 34 | 35 | def extract_i18n_strings(node): 36 | i18n_strings = [] 37 | 38 | if ( 39 | isinstance(node, ast.Call) 40 | and isinstance(node.func, ast.Name) 41 | and node.func.id == "i18n" 42 | ): 43 | for arg in node.args: 44 | if isinstance(arg, ast.Str): 45 | i18n_strings.append(arg.s) 46 | 47 | for child_node in ast.iter_child_nodes(node): 48 | i18n_strings.extend(extract_i18n_strings(child_node)) 49 | 50 | return i18n_strings 51 | 52 | strings = [] 53 | 54 | # for each file, parse the code into an AST 55 | # for each AST, extract the i18n strings 56 | def scan_i18n_strings(filename): 57 | with open(filename, "r", encoding="utf-8") as f: 58 | code = f.read() 59 | if "I18nAuto" in code: 60 | tree = ast.parse(code) 61 | i18n_strings = extract_i18n_strings(tree) 62 | print(filename, len(i18n_strings)) 63 | strings.extend(i18n_strings) 64 | 65 | 66 | # scan the directory for all .py files (recursively) 67 | if scan_subfolders: 68 | for folder in scan_list: 69 | for dirpath, dirnames, filenames in os.walk(folder): 70 | for filename in [f for f in filenames if f.endswith(".py")]: 71 | scan_i18n_strings(os.path.join(dirpath, filename)) 72 | else: 73 | for folder in scan_list: 74 | for filename in os.listdir(folder): 75 | if filename.endswith(".py"): 76 | scan_i18n_strings(os.path.join(folder, filename)) 77 | 78 | code_keys = set(strings) 79 | """ 80 | n_i18n.py 81 | gui_v1.py 26 82 | app.py 16 83 | infer-web.py 147 84 | scan_i18n.py 0 85 | i18n.py 0 86 | lib/train/process_ckpt.py 1 87 | """ 88 | print() 89 | print("Total unique:", len(code_keys)) 90 | 91 | 92 | standard_file = os.path.join(locale_path, "zh_CN.json") 93 | try: 94 | with open(standard_file, "r", encoding="utf-8") as f: 95 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 96 | standard_keys = set(standard_data.keys()) 97 | except FileNotFoundError: 98 | standard_keys = set() 99 | # Define the standard file name 100 | unused_keys = standard_keys - code_keys 101 | print("Unused keys:", len(unused_keys)) 102 | for unused_key in unused_keys: 103 | print("\t", unused_key) 104 | 105 | missing_keys = code_keys - standard_keys 106 | print("Missing keys:", len(missing_keys)) 107 | for missing_key in missing_keys: 108 | print("\t", missing_key) 109 | 110 | 111 | 112 | code_keys_dict = OrderedDict() 113 | for s in strings: 114 | if s in special_words_to_keep: 115 | code_keys_dict[s] = special_words_to_keep[s] 116 | else: 117 | code_keys_dict[s] = s 118 | 119 | # write back 120 | os.makedirs(locale_path, exist_ok=True) 121 | with open(standard_file, "w", encoding="utf-8") as f: 122 | json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True) 123 | f.write("\n") 124 | -------------------------------------------------------------------------------- /tools/my_utils.py: -------------------------------------------------------------------------------- 1 | import platform,os,traceback 2 | import ffmpeg 3 | import numpy as np 4 | 5 | 6 | def load_audio(file, sr): 7 | try: 8 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 9 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 10 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
11 | file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 12 | if os.path.exists(file) == False: 13 | raise RuntimeError( 14 | "You input a wrong audio path that does not exists, please fix it!" 15 | ) 16 | out, _ = ( 17 | ffmpeg.input(file, threads=0) 18 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 19 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 20 | ) 21 | except Exception as e: 22 | traceback.print_exc() 23 | raise RuntimeError(f"Failed to load audio: {e}") 24 | 25 | return np.frombuffer(out, np.float32).flatten() 26 | 27 | 28 | def clean_path(path_str): 29 | if platform.system() == 'Windows': 30 | path_str = path_str.replace('/', '\\') 31 | return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ").strip("\u202a") 32 | -------------------------------------------------------------------------------- /tools/normalize_loudness.py: -------------------------------------------------------------------------------- 1 | import soundfile as sf 2 | import numpy as np 3 | from pyloudnorm import Meter, normalize 4 | import os 5 | 6 | def normalize_loudness(audio_path, target_loudness, target_path): 7 | """ 8 | 归一化音频文件的响度到指定的目标响度。 9 | 10 | 参数: 11 | audio_path (str): 原始音频文件的路径。 12 | target_loudness (float): 目标响度值(LUFS)。 13 | target_path (str): 归一化后音频的保存路径。 14 | 15 | 返回: 16 | bool: 归一化操作是否成功。 17 | """ 18 | try: 19 | # 读取音频文件 20 | data, rate = sf.read(audio_path) 21 | 22 | # 创建响度仪表,基于ITU-R BS.1770 23 | meter = Meter(rate) # 采样率 24 | 25 | # 测量音频的响度 26 | loudness = meter.integrated_loudness(data) 27 | 28 | # 响度归一化 29 | normalized_audio = normalize.loudness(data, loudness, target_loudness) 30 | 31 | os.makedirs(os.path.dirname(target_path), exist_ok=True) 32 | # 保存归一化后的音频文件 33 | sf.write(target_path, normalized_audio, rate) 34 | 35 | return True 36 | except Exception as e: 37 | raise e -------------------------------------------------------------------------------- /tools/slice_audio.py: -------------------------------------------------------------------------------- 1 | import os,sys,numpy as np 2 | import traceback 3 | from scipy.io import wavfile 4 | # parent_directory = os.path.dirname(os.path.abspath(__file__)) 5 | # sys.path.append(parent_directory) 6 | from my_utils import load_audio 7 | from slicer2 import Slicer 8 | 9 | def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): 10 | os.makedirs(opt_root,exist_ok=True) 11 | if os.path.isfile(inp): 12 | input=[inp] 13 | elif os.path.isdir(inp): 14 | input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] 15 | else: 16 | return "输入路径存在但既不是文件也不是文件夹" 17 | slicer = Slicer( 18 | sr=32000, # 长音频采样率 19 | threshold= int(threshold), # 音量小于这个值视作静音的备选切割点 20 | min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 21 | min_interval= int(min_interval), # 最短切割间隔 22 | hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) 23 | max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长 24 | ) 25 | _max=float(_max) 26 | alpha=float(alpha) 27 | for inp_path in input[int(i_part)::int(all_part)]: 28 | # print(inp_path) 29 | try: 30 | name = os.path.basename(inp_path) 31 | audio = load_audio(inp_path, 32000) 32 | # print(audio.shape) 33 | for chunk, start, end in slicer.slice(audio): # start和end是帧数 34 | tmp_max = np.abs(chunk).max() 35 | if(tmp_max>1):chunk/=tmp_max 36 | chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk 37 | wavfile.write( 38 | "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), 39 | 32000, 40 | # 
chunk.astype(np.float32), 41 | (chunk * 32767).astype(np.int16), 42 | ) 43 | except: 44 | print(inp_path,"->fail->",traceback.format_exc()) 45 | return "执行完毕,请检查输出文件" 46 | 47 | print(slice(*sys.argv[1:])) 48 | 49 | -------------------------------------------------------------------------------- /tools/srt_slicer/i18n/locale/en_US.json: -------------------------------------------------------------------------------- 1 | { 2 | "List 合并小工具": "List Merge Tool", 3 | "SRT合并切分插件": "SRT Merge and Split Plugin", 4 | "SRT文件": "SRT File", 5 | "SRT编辑界面": "SRT Edit Interface", 6 | "srt文件内容": "SRT File Content", 7 | "上传SRT文件": "Upload SRT File", 8 | "上传文件": "Upload Files", 9 | "两个文件夹不能相同!!!": "The two folders cannot be the same!!!", 10 | "主文件夹": "Main Folder", 11 | "作者: ": "Author: ", 12 | "使用方法": "How to Use", 13 | "保存合并后字幕": "Save Merged Subtitles", 14 | "保存子文件夹名称": "Save Subfolder Name", 15 | "保存文件夹": "Save Folder", 16 | "允许最短长度": "Minimum Allowed Length", 17 | "内容预览": "Content Preview", 18 | "切分与保存": "Split and Save", 19 | "切分完成": "Split Completed", 20 | "切分并保存音频、list": "Split and Save Audio, List", 21 | "切分预览": "Split Preview", 22 | "判定为短间隔时长": "Judged as Short Interval Duration", 23 | "到": " to ", 24 | "前置保留时间": "Preceding Retention Time", 25 | "前置添加静音时间": "Prepend Silence Time", 26 | "句末加句号": "Add Period at the End of Sentence", 27 | "合并后srt文本": "Merged SRT Text", 28 | "合并后的List": "Merged List", 29 | "合并字幕": "Merge Subtitles", 30 | "合并字幕设置": "Subtitle Merge Settings", 31 | "合并文件夹与List": "Merge Folder and List", 32 | "后置保留时间": "Following Retention Time", 33 | "后置添加静音时间": "Append Silence Time", 34 | "扫描文件夹": "Scan Folder", 35 | "找不到字幕!!!": "Subtitles Not Found!!!", 36 | "找不到音频!!!": "Audio Not Found!!!", 37 | "提供SRT文件(可使用剪映或者ASR工具获得)与原始音频文件。": "Provide SRT File (can be obtained via Clip or ASR tools) and Original Audio File.", 38 | "提前合并时间间隔很短的字幕": "Merge Subtitles with Short Intervals in Advance", 39 | "提示": "Tips", 40 | "文件夹路径": "Folder Path", 41 | "最大间隔时间": "Maximum Interval Time", 42 | "最长允许单句长度": "Maximum Allowed Sentence Length", 43 | "根据面板合并短句并过滤你不希望出现的句子。": "Merge short sentences according to the panel and filter out sentences you do not want to appear.", 44 | "次文件夹": "Second Folder", 45 | "正在切分音频": "Splitting Audio", 46 | "正在建设,敬请期待": "Under Construction, Stay Tuned", 47 | "注意:该文件夹已存在": "Warning: The folder already exists", 48 | "角色名称,留空使用主文件夹的": "Role Name, Leave Blank to Use Main Folder's", 49 | "语言": "Language", 50 | "读取文件": "Read File", 51 | "读取本地文件": "Read Local File", 52 | "过滤字幕": "Filter Subtitles", 53 | "过滤带有英文的": "Filter Out English", 54 | "过滤设置": "Filter Settings", 55 | "过滤词语,一行一个": "Filter Words, One Per Line", 56 | "这是一个插件,用于依靠SRT文件得到切分与打标好的音频。": "This is a plugin for obtaining split and tagged audio based on SRT files.", 57 | "随后保存成切分好的音频与list文件。": "Then save as split audio and list files.", 58 | "音频文件": "Audio File", 59 | "音频格式": "Audio Format" 60 | } 61 | -------------------------------------------------------------------------------- /tools/srt_slicer/i18n/locale/zh_CN.json: -------------------------------------------------------------------------------- 1 | { 2 | "List 合并小工具": "List 合并小工具", 3 | "SRT合并切分插件": "SRT合并切分插件", 4 | "SRT文件": "SRT文件", 5 | "SRT编辑界面": "SRT编辑界面", 6 | "srt文件内容": "srt文件内容", 7 | "上传SRT文件": "上传SRT文件", 8 | "上传文件": "上传文件", 9 | "两个文件夹不能相同!!!": "两个文件夹不能相同!!!", 10 | "主文件夹": "主文件夹", 11 | "作者: ": "作者: ", 12 | "使用方法": "使用方法", 13 | "保存合并后字幕": "保存合并后字幕", 14 | "保存子文件夹名称": "保存子文件夹名称", 15 | "保存文件夹": "保存文件夹", 16 | "允许最短长度": "允许最短长度", 17 | "内容预览": "内容预览", 18 | "切分与保存": 
"切分与保存", 19 | "切分完成": "切分完成", 20 | "切分并保存音频、list": "切分并保存音频、list", 21 | "切分预览": "切分预览", 22 | "判定为短间隔时长": "判定为短间隔时长", 23 | "到": "到", 24 | "前置保留时间": "前置保留时间", 25 | "前置添加静音时间": "前置添加静音时间", 26 | "句末加句号": "句末加句号", 27 | "合并后srt文本": "合并后srt文本", 28 | "合并后的List": "合并后的List", 29 | "合并字幕": "合并字幕", 30 | "合并字幕设置": "合并字幕设置", 31 | "合并文件夹与List": "合并文件夹与List", 32 | "后置保留时间": "后置保留时间", 33 | "后置添加静音时间": "后置添加静音时间", 34 | "扫描文件夹": "扫描文件夹", 35 | "找不到字幕!!!": "找不到字幕!!!", 36 | "找不到音频!!!": "找不到音频!!!", 37 | "提供SRT文件(可使用剪映或者ASR工具获得)与原始音频文件。": "提供SRT文件(可使用剪映或者ASR工具获得)与原始音频文件。", 38 | "提前合并时间间隔很短的字幕": "提前合并时间间隔很短的字幕", 39 | "提示": "提示", 40 | "文件夹路径": "文件夹路径", 41 | "最大间隔时间": "最大间隔时间", 42 | "最长允许单句长度": "最长允许单句长度", 43 | "根据面板合并短句并过滤你不希望出现的句子。": "根据面板合并短句并过滤你不希望出现的句子。", 44 | "次文件夹": "次文件夹", 45 | "正在切分音频": "正在切分音频", 46 | "正在建设,敬请期待": "正在建设,敬请期待", 47 | "注意:该文件夹已存在": "注意:该文件夹已存在", 48 | "角色名称,留空使用主文件夹的": "角色名称,留空使用主文件夹的", 49 | "语言": "语言", 50 | "读取文件": "读取文件", 51 | "读取本地文件": "读取本地文件", 52 | "过滤字幕": "过滤字幕", 53 | "过滤带有英文的": "过滤带有英文的", 54 | "过滤设置": "过滤设置", 55 | "过滤词语,一行一个": "过滤词语,一行一个", 56 | "这是一个插件,用于依靠SRT文件得到切分与打标好的音频。": "这是一个插件,用于依靠SRT文件得到切分与打标好的音频。", 57 | "随后保存成切分好的音频与list文件。": "随后保存成切分好的音频与list文件。", 58 | "音频文件": "音频文件", 59 | "音频格式": "音频格式" 60 | } 61 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if 
self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = 
torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not 
None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | 5 | default_param = {} 6 | default_param["bins"] = 768 7 | default_param["unstable_bins"] = 9 # training only 8 | default_param["reduction_bins"] = 762 # training only 9 | default_param["sr"] = 44100 10 | default_param["pre_filter_start"] = 757 11 | default_param["pre_filter_stop"] = 768 12 | default_param["band"] = {} 13 | 14 | 15 | default_param["band"][1] = { 16 | "sr": 11025, 17 | "hl": 128, 18 | "n_fft": 960, 19 | "crop_start": 0, 20 | "crop_stop": 245, 21 | "lpf_start": 61, # inference only 22 | "res_type": "polyphase", 23 | } 24 | 25 | default_param["band"][2] = { 26 | "sr": 44100, 27 | "hl": 512, 28 | "n_fft": 1536, 29 | "crop_start": 24, 30 | "crop_stop": 547, 31 | "hpf_start": 81, # inference only 32 | "res_type": "sinc_best", 33 | } 34 | 35 | 36 | def int_keys(d): 37 | r = {} 38 | for k, v in d: 39 | if k.isdigit(): 40 | k = int(k) 41 | r[k] = v 42 | return r 43 | 44 | 45 | class ModelParameters(object): 46 | def __init__(self, config_path=""): 47 | if ".pth" == pathlib.Path(config_path).suffix: 48 | import zipfile 49 | 50 | with zipfile.ZipFile(config_path, "r") as zip: 51 | self.param = json.loads( 52 | zip.read("param.json"), object_pairs_hook=int_keys 53 | ) 54 | elif ".json" == pathlib.Path(config_path).suffix: 55 | with open(config_path, "r") as f: 56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 57 | else: 58 | self.param = default_param 59 | 60 | for k in [ 61 | "mid_side", 62 | "mid_side_b", 63 | "mid_side_b2", 64 | "stereo_w", 65 | "stereo_n", 66 | "reverse", 67 | ]: 68 | if not k in self.param: 69 | self.param[k] = False 70 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 
5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | 
"hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | 
-------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | 
"hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | 
"hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 
34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets.py: -------------------------------------------------------------------------------- 1 | import layers 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import spec_utils 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 16) 44 | self.stg1_high_band_net = BaseASPPNet(2, 16) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(8, 16) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(16, 32) 51 | 52 | self.out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : 
aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | 
pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | 
input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tools/uvr5/lib/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def load_data(file_name: str = "./lib/name_params.json") -> dict: 9 | with open(file_name, "r") as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def make_padding(width, cropsize, offset): 16 | left = offset 17 | roi_size = cropsize - left * 2 18 | if roi_size == 0: 19 | roi_size = cropsize 20 | right = roi_size - (width % roi_size) + left 21 | 22 | return left, right, roi_size 23 | 24 | 25 | def inference(X_spec, device, model, aggressiveness, data): 26 | """ 27 | data : dic configs 28 | """ 29 | 30 | def _execute( 31 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True 32 | ): 33 | model.eval() 34 | with torch.no_grad(): 35 | preds = [] 36 | 37 | iterations = [n_window] 38 | 39 | total_iterations = sum(iterations) 40 | for i in tqdm(range(n_window)): 41 | start = i * roi_size 42 | X_mag_window = X_mag_pad[ 43 | None, :, :, start : start + data["window_size"] 44 | ] 45 | X_mag_window = torch.from_numpy(X_mag_window) 46 | if is_half: 47 | X_mag_window = X_mag_window.half() 48 | X_mag_window = X_mag_window.to(device) 49 | 50 | pred = model.predict(X_mag_window, aggressiveness) 51 | 52 | pred = pred.detach().cpu().numpy() 53 | preds.append(pred[0]) 54 | 55 | pred = np.concatenate(preds, axis=2) 56 | return pred 57 | 58 | def preprocess(X_spec): 59 | X_mag = np.abs(X_spec) 60 | X_phase = np.angle(X_spec) 61 | 62 | return X_mag, X_phase 63 | 64 | X_mag, X_phase = preprocess(X_spec) 65 | 66 | coef = X_mag.max() 67 | X_mag_pre = X_mag / coef 68 | 69 | n_frame = X_mag_pre.shape[2] 70 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) 71 | n_window = int(np.ceil(n_frame / roi_size)) 72 | 73 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 74 | 75 | if list(model.state_dict().values())[0].dtype == torch.float16: 76 | is_half = True 77 | else: 78 | is_half = False 79 | pred = _execute( 80 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 81 | ) 82 | pred = pred[:, :, :n_frame] 83 | 84 | if data["tta"]: 85 | pad_l += roi_size // 2 86 | pad_r += roi_size // 2 87 | n_window += 1 88 | 89 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 90 | 91 | pred_tta = _execute( 92 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 93 | ) 
94 | pred_tta = pred_tta[:, :, roi_size // 2 :] 95 | pred_tta = pred_tta[:, :, :n_frame] 96 | 97 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) 98 | else: 99 | return pred * coef, X_mag, np.exp(1.0j * X_phase) 100 | 101 | 102 | def _get_name_params(model_path, model_hash): 103 | data = load_data() 104 | flag = False 105 | ModelName = model_path 106 | for type in list(data): 107 | for model in list(data[type][0]): 108 | for i in range(len(data[type][0][model])): 109 | if str(data[type][0][model][i]["hash_name"]) == model_hash: 110 | flag = True 111 | elif str(data[type][0][model][i]["hash_name"]) in ModelName: 112 | flag = True 113 | 114 | if flag: 115 | model_params_auto = data[type][0][model][i]["model_params"] 116 | param_name_auto = data[type][0][model][i]["param_name"] 117 | if type == "equivalent": 118 | return param_name_auto, model_params_auto 119 | else: 120 | flag = False 121 | return param_name_auto, model_params_auto 122 | -------------------------------------------------------------------------------- /tools/uvr5/uvr5_weights/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /webuis/character_manager/i18n/locale/zh_CN.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", 返回内容:", 3 | "这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
": "这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路径", 6 | "Sovits模型路径": "Sovits模型路径", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "只有日文", 11 | "all_zh": "只有中文", 12 | "auto": "自动判断", 13 | "auto_cut": "智能切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不并行,越大越快,但是越可能出问题", 15 | "cut0": "仅凭换行切分", 16 | "cut1": "凑四句一切", 17 | "cut2": "凑50字一切", 18 | "cut3": "按中文句号。切", 19 | "cut4": "按英文句号.切", 20 | "cut5": "按标点符号切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json设置(一般不动)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情感列表网址", 29 | "从json中读取": "从json中读取", 30 | "使用前,请确认后端服务已启动。": "使用前,请确认后端服务已启动。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不会有完成提示,没报错就是成功)", 32 | "保存失败!": "保存失败!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端处理后的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "参考音频在3~10秒范围外,请更换!", 38 | "参考音频路径": "参考音频路径", 39 | "发送json格式": "发送json格式", 40 | "发送并开始播放": "发送并开始播放", 41 | "发送请求": "发送请求", 42 | "发送请求到": "发送请求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。", 45 | "基础选项": "基础选项", 46 | "实际输入的参考文本:": "实际输入的参考文本:", 47 | "实际输入的目标文本(切句后):": "实际输入的目标文本(切句后):", 48 | "实际输入的目标文本(每句):": "实际输入的目标文本(每句):", 49 | "实际输入的目标文本:": "实际输入的目标文本:", 50 | "密码": "密码", 51 | "当前人物": "当前人物", 52 | "当前人物变更为: ": "当前人物变更为: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用经典推理模式,部分选项不可用", 54 | "情感列表": "情感列表", 55 | "情感风格": "情感风格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。", 57 | "扫描": "扫描", 58 | "扫描人物列表": "扫描人物列表", 59 | "扫描模型文件夹:": "扫描模型文件夹:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!请把有效文件放置在文件夹下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,当前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示语言", 65 | "文件打开失败,保存失败!": "文件打开失败,保存失败!", 66 | "文本语言": "文本语言", 67 | "是否自动匹配情感": "是否自动匹配情感", 68 | "模型文件夹路径": "模型文件夹路径", 69 | "每句允许最大切分字词数": "每句允许最大切分字词数", 70 | "流式音频": "流式音频", 71 | "添加情感": "添加情感", 72 | "点击查看详细文档": "点击查看详细文档", 73 | "版本": "版本", 74 | "用户名": "用户名", 75 | "种子": "种子", 76 | "简介": "简介", 77 | "缺失某些项,保存失败!": "缺失某些项,保存失败!", 78 | "网址设置": "网址设置", 79 | "自动生成info": "自动生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑问或需要进一步了解,可参考文档:", 81 | "认证信息": "认证信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设", 83 | "语速": "语速", 84 | "请修改后点击下方按钮进行保存": "请修改后点击下方按钮进行保存", 85 | "请求失败,状态码:": "请求失败,状态码:", 86 | "请求失败,请检查URL是否正确": "请求失败,请检查URL是否正确", 87 | "请求完整音频": "请求完整音频", 88 | "请求网址": "请求网址", 89 | "输入文本": "输入文本", 90 | "这是一个由": "这是一个由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "这是展示页面的版本,并未使用后端服务,下面参数无效。", 93 | "选择角色": "选择角色", 94 | "音频输出": "音频输出", 95 | "音频预览": "音频预览", 96 | "项目开源地址:": "项目开源地址:", 97 | "高级选项": "高级选项", 98 | "最大允许长度": "最大允许长度" 99 | } 100 | -------------------------------------------------------------------------------- /webuis/character_manager/i18n/locale/zh_TW.json: -------------------------------------------------------------------------------- 1 | { 2 | ", 返回内容:": ", 返回內容:", 3 | "这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面
若有疑问或需要进一步了解,可参考文档:点击查看详细文档。
": "這是模型管理介面,為了實現對多段參考音頻分配情緒設計,如果您只有一段可不使用這個介面
若有疑問或需要進一步了解,可參考文件:點擊查看詳細文件。
", 4 | "Endpoint": "Endpoint", 5 | "GPT模型路径": "GPT模型路徑", 6 | "Sovits模型路径": "Sovits模型路徑", 7 | "Temperature": "Temperature", 8 | "Top K": "Top K", 9 | "Top P": "Top P", 10 | "all_ja": "僅日文", 11 | "all_zh": "僅中文", 12 | "auto": "自動判斷", 13 | "auto_cut": "智慧切分", 14 | "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不並行,越大越快,但是越可能出現問題", 15 | "cut0": "僅憑換行切分", 16 | "cut1": "湊四句一切", 17 | "cut2": "湊50字一切", 18 | "cut3": "按中文句號。切", 19 | "cut4": "按英文句號.切", 20 | "cut5": "按標點符號切", 21 | "en": "英文", 22 | "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770", 23 | "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp", 24 | "ja": "日文", 25 | "json设置(一般不动)": "json設置(一般不動)", 26 | "zh": "中文", 27 | "不切": "不切", 28 | "人物情感列表网址": "人物情緒列表網址", 29 | "从json中读取": "從json中讀取", 30 | "使用前,请确认后端服务已启动。": "使用前,請確認後端服務已啟動。", 31 | "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不會有完成提示,沒報錯就是成功)", 32 | "保存失败!": "保存失敗!", 33 | "保存成功!": "保存成功!", 34 | "停止播放": "停止播放", 35 | "切句方式": "切句方式", 36 | "前端处理后的文本(每句):": "前端處理後的文本(每句):", 37 | "参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!", 38 | "参考音频路径": "參考音頻路徑", 39 | "发送json格式": "發送json格式", 40 | "发送并开始播放": "發送並開始播放", 41 | "发送请求": "發送請求", 42 | "发送请求到": "發送請求到", 43 | "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或調節batch size滑條。", 44 | "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或者更換參考音頻(使用模型管理介面)、調節下方batch size滑條。", 45 | "基础选项": "基礎選項", 46 | "实际输入的参考文本:": "實際輸入的參考文本:", 47 | "实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):", 48 | "实际输入的目标文本(每句):": "實際輸入的目標文本(每句):", 49 | "实际输入的目标文本:": "實際輸入的目標文本:", 50 | "密码": "密碼", 51 | "当前人物": "當前人物", 52 | "当前人物变更为: ": "當前人物變更為: ", 53 | "您在使用经典推理模式,部分选项不可用": "您在使用經典推理模式,部分選項不可用", 54 | "情感列表": "情緒列表", 55 | "情感风格": "情緒風格", 56 | "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "有時掉進黑洞,有時候爬上彩虹。在下一秒鐘,命運如何轉動,沒有人會曉得。我說希望無窮,你猜美夢成空,相信和懷疑,總要決鬥。", 57 | "扫描": "掃描", 58 | "扫描人物列表": "掃描人物列表", 59 | "扫描模型文件夹:": "掃描模型文件夾:", 60 | "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!請把有效文件放置在文件夾下!!!", 61 | "提供的推理特化包,当前版本:": "提供的推理特化包,當前版本:", 62 | "提示": "提示", 63 | "提示文本": "提示文本", 64 | "提示语言": "提示語言", 65 | "文件打开失败,保存失败!": "文件開啟失敗,保存失敗!", 66 | "文本语言": "文本語言", 67 | "是否自动匹配情感": "是否自動匹配情緒", 68 | "模型文件夹路径": "模型文件夾路徑", 69 | "每句允许最大切分字词数": "每句允許最大切分字詞數", 70 | "流式音频": "流式音頻", 71 | "添加情感": "添加情緒", 72 | "点击查看详细文档": "點擊查看詳細文件", 73 | "版本": "版本", 74 | "用户名": "使用者名稱", 75 | "种子": "種子", 76 | "简介": "簡介", 77 | "缺失某些项,保存失败!": "缺失某些項,保存失敗!", 78 | "网址设置": "網址設置", 79 | "自动生成info": "自動生成info", 80 | "若有疑问或需要进一步了解,可参考文档:": "若有疑問或需要進一步了解,可參考文件:", 81 | "认证信息": "認證信息", 82 | "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "認證信息已啟用,您可以在config.json中關閉。\n但是這個功能還沒做好,只是擺設", 83 | "语速": "語速", 84 | "请修改后点击下方按钮进行保存": "請修改後點擊下方按鈕進行保存", 85 | "请求失败,状态码:": "請求失敗,狀態碼:", 86 | "请求失败,请检查URL是否正确": "請求失敗,請檢查URL是否正確", 87 | "请求完整音频": "請求完整音頻", 88 | "请求网址": "請求網址", 89 | "输入文本": "輸入文本", 90 | "这是一个由": "這是一個由", 91 | "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "這是一個配置文件適用於https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一個簡單好用的前後端項目", 92 | "这是展示页面的版本,并未使用后端服务,下面参数无效。": "這是展示頁面的版本,並未使用後端服務,下面參數無效。", 93 | "选择角色": "選擇角色", 94 | "音频输出": "音頻輸出", 95 | "音频预览": "音頻預覽", 96 | "项目开源地址:": "Github Link:", 97 | "高级选项": "高級選項", 98 | "最大允许长度": "最大允許長度" 99 | } 100 | --------------------------------------------------------------------------------