├── .dockerignore
├── .gitignore
├── Changelog_CN.md
├── Docker
│   ├── damo.sha256
│   ├── download.py
│   ├── download.sh
│   ├── links.sha256
│   └── links.txt
├── Dockerfile
├── GPT_SoVITS
│   ├── AR
│   │   ├── __init__.py
│   │   ├── data
│   │   │   ├── __init__.py
│   │   │   ├── bucket_sampler.py
│   │   │   ├── data_module.py
│   │   │   └── dataset.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── t2s_lightning_module.py
│   │   │   ├── t2s_lightning_module_onnx.py
│   │   │   ├── t2s_model.py
│   │   │   ├── t2s_model_onnx.py
│   │   │   └── utils.py
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   ├── activation.py
│   │   │   ├── activation_onnx.py
│   │   │   ├── embedding.py
│   │   │   ├── embedding_onnx.py
│   │   │   ├── lr_schedulers.py
│   │   │   ├── optim.py
│   │   │   ├── patched_mha_with_cache.py
│   │   │   ├── patched_mha_with_cache_onnx.py
│   │   │   ├── scaling.py
│   │   │   ├── transformer.py
│   │   │   └── transformer_onnx.py
│   │   ├── text_processing
│   │   │   ├── __init__.py
│   │   │   ├── phonemizer.py
│   │   │   └── symbols.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── initialize.py
│   │       └── io.py
│   ├── configs
│   │   ├── s1.yaml
│   │   ├── s1big.yaml
│   │   ├── s1big2.yaml
│   │   ├── s1longer.yaml
│   │   ├── s1mq.yaml
│   │   ├── s2.json
│   │   └── train.yaml
│   ├── feature_extractor
│   │   ├── __init__.py
│   │   ├── cnhubert.py
│   │   └── whisper_enc.py
│   ├── inference_gui.py
│   ├── inference_webui.py
│   ├── module
│   │   ├── __init__.py
│   │   ├── attentions.py
│   │   ├── attentions_onnx.py
│   │   ├── commons.py
│   │   ├── core_vq.py
│   │   ├── data_utils.py
│   │   ├── losses.py
│   │   ├── mel_processing.py
│   │   ├── models.py
│   │   ├── models_onnx.py
│   │   ├── modules.py
│   │   ├── mrte_model.py
│   │   ├── quantize.py
│   │   └── transforms.py
│   ├── my_utils.py
│   ├── onnx_export.py
│   ├── prepare_datasets
│   │   ├── 1-get-text.py
│   │   ├── 2-get-hubert-wav32k.py
│   │   └── 3-get-semantic.py
│   ├── pretrained_models
│   │   └── .gitignore
│   ├── process_ckpt.py
│   ├── s1_train.py
│   ├── s2_train.py
│   ├── text
│   │   ├── __init__.py
│   │   ├── chinese.py
│   │   ├── cleaner.py
│   │   ├── cmudict-fast.rep
│   │   ├── cmudict.rep
│   │   ├── engdict-hot.rep
│   │   ├── engdict_cache.pickle
│   │   ├── english.py
│   │   ├── japanese.py
│   │   ├── opencpop-strict.txt
│   │   ├── symbols.py
│   │   ├── tone_sandhi.py
│   │   └── zh_normalization
│   │       ├── README.md
│   │       ├── __init__.py
│   │       ├── char_convert.py
│   │       ├── chronology.py
│   │       ├── constants.py
│   │       ├── num.py
│   │       ├── phonecode.py
│   │       ├── quantifier.py
│   │       └── text_normlization.py
│   └── utils.py
├── GPT_SoVITS_Inference.ipynb
├── LICENSE
├── README.md
├── api.py
├── colab_webui.ipynb
├── config.py
├── docker-compose.yaml
├── dockerbuild.sh
├── docs
│   ├── en
│   │   └── README.md
│   ├── ja
│   │   ├── Changelog_JA.md
│   │   └── README.md
│   └── ko
│       ├── Changelog_KO.md
│       └── README.md
├── go-webui.bat
├── go-webui.ps1
├── gpt-sovits_kaggle.ipynb
├── i18n
│   └── locale
│       ├── en_US.json
│       ├── es_ES.json
│       ├── fr_FR.json
│       ├── it_IT.json
│       ├── ja_JP.json
│       ├── ko_KR.json
│       ├── pt_BR.json
│       ├── ru_RU.json
│       ├── tr_TR.json
│       ├── zh_CN.json
│       ├── zh_HK.json
│       ├── zh_SG.json
│       └── zh_TW.json
├── install.sh
├── requirements.txt
├── tools
│   ├── asr
│   │   ├── config.py
│   │   ├── fasterwhisper_asr.py
│   │   ├── funasr_asr.py
│   │   └── models
│   │       └── .gitignore
│   ├── cmd-denoise.py
│   ├── denoise-model
│   │   └── .gitignore
│   ├── i18n
│   │   ├── i18n.py
│   │   ├── locale
│   │   │   ├── en_US.json
│   │   │   ├── es_ES.json
│   │   │   ├── fr_FR.json
│   │   │   ├── it_IT.json
│   │   │   ├── ja_JP.json
│   │   │   ├── ko_KR.json
│   │   │   ├── ru_RU.json
│   │   │   ├── tr_TR.json
│   │   │   ├── zh_CN.json
│   │   │   ├── zh_HK.json
│   │   │   ├── zh_SG.json
│   │   │   └── zh_TW.json
│   │   ├── locale_diff.py
│   │   └── scan_i18n.py
│   ├── my_utils.py
│   ├── slice_audio.py
│   ├── slicer2.py
│   ├── subfix_webui.py
│   └── uvr5
│       ├── lib
│       │   ├── lib_v5
│       │   │   ├── dataset.py
│       │   │   ├── layers.py
│       │   │   ├── layers_123812KB.py
│       │   │   ├── layers_123821KB.py
│       │   │   ├── layers_33966KB.py
│       │   │   ├── layers_537227KB.py
│       │   │   ├── layers_537238KB.py
│       │   │   ├── layers_new.py
│       │   │   ├── model_param_init.py
│       │   │   ├── modelparams
│       │   │   │   ├── 1band_sr16000_hl512.json
│       │   │   │   ├── 1band_sr32000_hl512.json
│       │   │   │   ├── 1band_sr33075_hl384.json
│       │   │   │   ├── 1band_sr44100_hl1024.json
│       │   │   │   ├── 1band_sr44100_hl256.json
│       │   │   │   ├── 1band_sr44100_hl512.json
│       │   │   │   ├── 1band_sr44100_hl512_cut.json
│       │   │   │   ├── 2band_32000.json
│       │   │   │   ├── 2band_44100_lofi.json
│       │   │   │   ├── 2band_48000.json
│       │   │   │   ├── 3band_44100.json
│       │   │   │   ├── 3band_44100_mid.json
│       │   │   │   ├── 3band_44100_msb2.json
│       │   │   │   ├── 4band_44100.json
│       │   │   │   ├── 4band_44100_mid.json
│       │   │   │   ├── 4band_44100_msb.json
│       │   │   │   ├── 4band_44100_msb2.json
│       │   │   │   ├── 4band_44100_reverse.json
│       │   │   │   ├── 4band_44100_sw.json
│       │   │   │   ├── 4band_v2.json
│       │   │   │   ├── 4band_v2_sn.json
│       │   │   │   ├── 4band_v3.json
│       │   │   │   └── ensemble.json
│       │   │   ├── nets.py
│       │   │   ├── nets_123812KB.py
│       │   │   ├── nets_123821KB.py
│       │   │   ├── nets_33966KB.py
│       │   │   ├── nets_537227KB.py
│       │   │   ├── nets_537238KB.py
│       │   │   ├── nets_61968KB.py
│       │   │   ├── nets_new.py
│       │   │   └── spec_utils.py
│       │   ├── name_params.json
│       │   └── utils.py
│       ├── mdxnet.py
│       ├── vr.py
│       └── webui.py
├── vc_webui.py
└── webui.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | docs
2 | logs
3 | output
4 | reference
5 | SoVITS_weights
6 | GPT_weights
7 | TEMP
8 | .git
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__
3 | *.pyc
4 | env
5 | runtime
6 | .idea
7 | output
8 | logs
9 | reference
10 | GPT_weights
11 | SoVITS_weights
12 | TEMP
13 |
14 |
15 |
--------------------------------------------------------------------------------
/Changelog_CN.md:
--------------------------------------------------------------------------------
1 | ### 20240121 Update
2 | 
3 | 1-Added is_share to config; in scenarios such as Colab it can be set to True to expose the WebUI to the public network.
4 | 
5 | 2-Added English UI translation support in the WebUI for English-language systems.
6 | 
7 | 3-cmd-asr automatically checks whether the damo ASR model is already present; if it is not in the default directory, it is downloaded from modelscope.
8 | 
9 | 4-Attempted a fix for [SoVITS training error ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) (filtering out zero-length samples, etc.).
10 | 
11 | 5-Clean cached audio and other files from the TEMP folder.
12 | 
13 | 6-Greatly reduced the issue of synthesized audio containing the tail of the reference audio.
14 | 
15 | ### 20240122 Update
16 | 
17 | 1-Fixed the issue where overly short output files returned the reference audio repeated.
18 | 
19 | 2-Verified native support for English and Japanese training (Japanese training requires a root directory free of non-English special characters).
20 | 
21 | 3-Audio path checking: if a wrong input path is read, the error now reports that the path does not exist instead of raising an ffmpeg error.
22 | 
23 | ### 20240123 Update
24 | 
25 | 1-Fixed NaNs from hubert extraction causing ZeroDivisionError in SoVITS/GPT training.
26 | 
27 | 2-Support quick model switching in the inference UI.
28 | 
29 | 3-Optimized the model file sorting logic.
30 | 
31 | 4-Use jieba_fast instead of jieba for Chinese word segmentation.
32 | 
33 | ### 20240126 Update
34 | 
35 | 1-Support mixed Chinese-English and Japanese-English output text.
36 | 
37 | 2-Optional split modes for the output.
38 | 
39 | 3-Fixed uvr5 exiting automatically when it reads a directory.
40 | 
41 | 4-Fixed inference errors caused by multiple line breaks.
42 | 
43 | 5-Removed a large amount of redundant logging from the inference UI.
44 | 
45 | 6-Support training and inference on Mac.
46 | 
47 | 7-Automatically detect GPUs that do not support half precision and force single precision; single precision is also forced for CPU inference.
48 | 
49 | ### 20240128 Update
50 | 
51 | 1-Fixed how numbers are converted to Chinese readings.
52 | 
53 | 2-Fixed the first few characters of a sentence being easily swallowed.
54 | 
55 | 3-Excluded unreasonable reference audio lengths by enforcing limits.
56 | 
57 | 4-Fixed GPT training not saving checkpoints.
58 | 
59 | 5-Improved the model download flow in the Dockerfile.
60 | 
61 | ### 20240129 Update
62 | 
63 | 1-For GPUs such as the 16 series that have problems with half-precision training, the training config now uses single precision.
64 | 
65 | 2-Tested and updated the available Colab version.
66 | 
67 | 3-Fixed interface mismatch errors caused by git-cloning the modelscope funasr repository with an old funasr version.
68 | 
69 | 
70 | ### 20240130 Update
71 | 
72 | 1-Double quotes are automatically stripped from all path inputs, so beginners who copy paths with quotes no longer get errors.
73 | 
74 | 2-Fixed Chinese/English punctuation splitting and added punctuation at the start and end of sentences.
75 | 
76 | 3-Added splitting by punctuation.
77 | 
78 | ### 20240201 Update
79 | 
80 | 1-Fixed a uvr5 format-reading error that caused separation to fail.
81 | 
82 | 2-Support automatic language detection and splitting for mixed Chinese/Japanese/English text.
83 | 
84 | ### 20240202 Update
85 | 
86 | 1-Fixed the error when an ASR path ending with / is used for the saved filename.
87 | 
88 | 2-Introduced paddlespeech's Normalizer https://github.com/RVC-Boss/GPT-SoVITS/pull/377 to fix several issues, e.g. xx.xx% (percentages), 元/吨 (yuan per ton) being read as 元吨 instead of 元每吨, and underscores no longer causing errors.
89 | 
90 | ### 20240207 Update
91 | 
92 | 1-Fixed mixed-up language parameter passing that degraded Chinese inference quality https://github.com/RVC-Boss/GPT-SoVITS/issues/391
93 | 
94 | 2-Adapted uvr5 to newer librosa versions https://github.com/RVC-Boss/GPT-SoVITS/pull/403
95 | 
96 | 3-Fixed the uvr5 "inf everywhere" error (is_half was not converted to bool, forcing constant half-precision inference, which produces inf on 16-series GPUs) https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8
97 | 
98 | 4-Optimized the English text frontend.
99 | 
100 | 5-Fixed gradio dependencies.
101 | 
102 | 6-When the root directory is left blank in the one-click formatting ("三连") step, the full paths in the .list file are read automatically.
103 | 
104 | 7-Integrated faster whisper ASR for Japanese and English.
105 | 
106 | ### 20240208 Update
107 | 
108 | 1-Attempted a fix for GPT training hanging (Win10 1909) and for https://github.com/RVC-Boss/GPT-SoVITS/issues/232 (Traditional Chinese system language) GPT training errors: [attempted fix](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b).
109 | 
110 | ### 20240212 Update
111 | 
112 | 1-Optimized the faster whisper and funasr logic; faster whisper now downloads from a mirror site to avoid problems connecting to huggingface.
113 | 
114 | 2-Enabled an experimental DPO Loss training option, which mitigates GPT repetition and dropped characters by training on constructed negative samples. Several inference parameters are now exposed in the inference UI. https://github.com/RVC-Boss/GPT-SoVITS/pull/457
115 | 
116 | ### 20240214 Update
117 | 
118 | 1-Training supports Chinese experiment names (this previously caused errors).
119 | 
120 | 2-DPO training is now an optional checkbox instead of mandatory; if checked, the batch size is automatically halved. Fixed new parameters in the inference UI not being passed through.
121 | 
122 | ### 20240216 Update
123 | 
124 | 1-Support input without reference text.
125 | 
126 | 2-Fixed a bug in the Chinese text frontend https://github.com/RVC-Boss/GPT-SoVITS/issues/475
127 | 
128 | ### 20240221 Update
129 | 
130 | 1-Added a speech denoising option to data processing.
131 | 
132 | 2-Optimized Chinese and Japanese frontend processing https://github.com/RVC-Boss/GPT-SoVITS/pull/559 https://github.com/RVC-Boss/GPT-SoVITS/pull/556 https://github.com/RVC-Boss/GPT-SoVITS/pull/532 https://github.com/RVC-Boss/GPT-SoVITS/pull/507 https://github.com/RVC-Boss/GPT-SoVITS/pull/509
133 | 
134 | 3-Mac CPU inference is faster, so the inference device was changed from mps to CPU.
135 | 
136 | 4-Fixed Colab not opening a public URL.
137 | 
138 | todolist:
139 | 
140 | 1-Optimize inference for Chinese polyphonic characters
141 | 
142 | 
143 | 
144 | 
--------------------------------------------------------------------------------
/Docker/damo.sha256:
--------------------------------------------------------------------------------
1 | 5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025 /workspace/tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.pb
2 | b3be75be477f0780277f3bae0fe489f48718f585f3a6e45d7dd1fbb1a4255fc5 /workspace/tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pb
3 | a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916 /workspace/tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pb
--------------------------------------------------------------------------------
/Docker/download.py:
--------------------------------------------------------------------------------
1 | # Download the ModelScope (damo) ASR-related models
2 | from modelscope import snapshot_download
3 | model_dir = snapshot_download('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',revision="v2.0.4")
4 | model_dir = snapshot_download('damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',revision="v2.0.4")
5 | model_dir = snapshot_download('damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',revision="v2.0.4")
6 |
--------------------------------------------------------------------------------
/Docker/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -Eeuo pipefail
4 |
5 | echo "Downloading models..."
6 |
7 | aria2c --disable-ipv6 --input-file /workspace/Docker/links.txt --dir /workspace --continue
8 |
9 | echo "Checking SHA256..."
10 |
11 | parallel --will-cite -a /workspace/Docker/links.sha256 "echo -n {} | sha256sum -c"
12 |
--------------------------------------------------------------------------------
/Docker/links.sha256:
--------------------------------------------------------------------------------
1 | b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1 /workspace/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
2 | fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8 /workspace/GPT_SoVITS/pretrained_models/s2D488k.pth
3 | 020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835 /workspace/GPT_SoVITS/pretrained_models/s2G488k.pth
4 | 24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c /workspace/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
5 | e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8 /workspace/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
6 | 39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05 /workspace/tools/uvr5/uvr5_weights/HP2_all_vocals.pth
7 | 45e6b65199e781b4a6542002699be9f19cd3d1cb7d1558bc2bfbcd84674dfe28 /workspace/tools/uvr5/uvr5_weights/HP3_all_vocals.pth
8 | 5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee /workspace/tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
9 | 8c8fd1582f9aabc363e47af62ddb88df6cae7e064cae75bbf041a067a5e0aee2 /workspace/tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
10 | 01376dd2a571bf3cb9cced680732726d2d732609d09216a610b0d110f133febe /workspace/tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
11 | 56aba59db3bcdd14a14464e62f3129698ecdea62eee0f003b9360923eb3ac79e /workspace/tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
12 | 233bb5c6aaa365e568659a0a81211746fa881f8f47f82d9e864fce1f7692db80 /workspace/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
--------------------------------------------------------------------------------
/Docker/links.txt:
--------------------------------------------------------------------------------
1 | # GPT-SoVITS models
2 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s1bert25hz-2kh-longer-epoch%3D68e-step%3D50232.ckpt
3 | out=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
4 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2D488k.pth
5 | out=GPT_SoVITS/pretrained_models/s2D488k.pth
6 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2G488k.pth
7 | out=GPT_SoVITS/pretrained_models/s2G488k.pth
8 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/config.json
9 | out=GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json
10 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/preprocessor_config.json
11 | out=GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json
12 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/pytorch_model.bin
13 | out=GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
14 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/config.json
15 | out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json
16 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/pytorch_model.bin
17 | out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
18 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/tokenizer.json
19 | out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json
20 | # UVR5
21 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth
22 | out=tools/uvr5/uvr5_weights/HP2_all_vocals.pth
23 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth
24 | out=tools/uvr5/uvr5_weights/HP3_all_vocals.pth
25 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth
26 | out=tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth
27 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth
28 | out=tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth
29 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth
30 | out=tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth
31 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth
32 | out=tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth
33 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
34 | out=tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Base CUDA image
2 | FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04
3 |
4 | LABEL maintainer="breakstring@hotmail.com"
5 | LABEL version="dev-20240209"
6 | LABEL description="Docker image for GPT-SoVITS"
7 |
8 |
9 | # Install 3rd party apps
10 | ENV DEBIAN_FRONTEND=noninteractive
11 | ENV TZ=Etc/UTC
12 | RUN apt-get update && \
13 | apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \
14 | git lfs install && \
15 | rm -rf /var/lib/apt/lists/*
16 |
17 | # Copy only requirements.txt initially to leverage Docker cache
18 | WORKDIR /workspace
19 | COPY requirements.txt /workspace/
20 | RUN pip install --no-cache-dir -r requirements.txt
21 |
22 | # Define a build-time argument for image type
23 | ARG IMAGE_TYPE=full
24 |
25 | # Conditional logic based on the IMAGE_TYPE argument
26 | # Always copy the Docker directory, but only use it if IMAGE_TYPE is not "elite"
27 | COPY ./Docker /workspace/Docker
28 | # The "elite" image type does not include the extra models
29 | RUN if [ "$IMAGE_TYPE" != "elite" ]; then \
30 | chmod +x /workspace/Docker/download.sh && \
31 | /workspace/Docker/download.sh && \
32 | python /workspace/Docker/download.py && \
33 | python -m nltk.downloader averaged_perceptron_tagger cmudict; \
34 | fi
35 |
36 |
37 | # Copy the rest of the application
38 | COPY . /workspace
39 | 
40 | EXPOSE 9871 9872 9873 9874 9880
41 | 
42 | CMD ["python", "webui.py"]
43 | 
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/AR/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/AR/data/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/data_module.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | from pytorch_lightning import LightningDataModule
4 | from AR.data.bucket_sampler import DistributedBucketSampler
5 | from AR.data.dataset import Text2SemanticDataset
6 | from torch.utils.data import DataLoader
7 |
8 |
9 | class Text2SemanticDataModule(LightningDataModule):
10 | def __init__(
11 | self,
12 | config,
13 | train_semantic_path,
14 | train_phoneme_path,
15 | dev_semantic_path=None,
16 | dev_phoneme_path=None,
17 | ):
18 | super().__init__()
19 | self.config = config
20 | self.train_semantic_path = train_semantic_path
21 | self.train_phoneme_path = train_phoneme_path
22 | self.dev_semantic_path = dev_semantic_path
23 | self.dev_phoneme_path = dev_phoneme_path
24 | self.num_workers = self.config["data"]["num_workers"]
25 |
26 | def prepare_data(self):
27 | pass
28 |
29 | def setup(self, stage=None, output_logs=False):
30 | self._train_dataset = Text2SemanticDataset(
31 | phoneme_path=self.train_phoneme_path,
32 | semantic_path=self.train_semantic_path,
33 | max_sec=self.config["data"]["max_sec"],
34 | pad_val=self.config["data"]["pad_val"],
35 | )
36 | self._dev_dataset = self._train_dataset
37 | # self._dev_dataset = Text2SemanticDataset(
38 | # phoneme_path=self.dev_phoneme_path,
39 | # semantic_path=self.dev_semantic_path,
40 | # max_sample=self.config['data']['max_eval_sample'],
41 | # max_sec=self.config['data']['max_sec'],
42 | # pad_val=self.config['data']['pad_val'])
43 |
44 | def train_dataloader(self):
45 |         batch_size = self.config["train"]["batch_size"] // 2 if self.config["train"].get("if_dpo", False) else self.config["train"]["batch_size"]
46 |         batch_size = max(min(batch_size, len(self._train_dataset) // 4), 1)  # keep at least one batch so checkpoints still get saved on tiny datasets
47 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size)
48 | return DataLoader(
49 | self._train_dataset,
50 | batch_size=batch_size,
51 | sampler=sampler,
52 | collate_fn=self._train_dataset.collate,
53 | num_workers=self.num_workers,
54 | persistent_workers=True,
55 | prefetch_factor=16,
56 | )
57 |
58 | def val_dataloader(self):
59 | return DataLoader(
60 | self._dev_dataset,
61 | batch_size=1,
62 | shuffle=False,
63 | collate_fn=self._train_dataset.collate,
64 | num_workers=max(self.num_workers, 12),
65 | persistent_workers=True,
66 | prefetch_factor=16,
67 | )
68 |
69 |     # Is this ever actually used?
70 | def test_dataloader(self):
71 | return DataLoader(
72 | self._dev_dataset,
73 | batch_size=1,
74 | shuffle=False,
75 | collate_fn=self._train_dataset.collate,
76 | )
77 |
--------------------------------------------------------------------------------
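
For orientation, here is a minimal sketch of the nested config dict that Text2SemanticDataModule reads above. It only includes the keys touched in this file; the values and paths are placeholders, and the real settings live in GPT_SoVITS/configs/s1*.yaml.

# Sketch only: placeholder values and paths; assumes GPT_SoVITS/ is on sys.path.
from AR.data.data_module import Text2SemanticDataModule

config = {
    "train": {"batch_size": 8, "if_dpo": False},      # if_dpo=True halves the batch size above
    "data": {"max_sec": 54, "pad_val": 1024, "num_workers": 4},
}
dm = Text2SemanticDataModule(
    config,
    train_semantic_path="logs/exp/6-name2semantic.tsv",  # placeholder path
    train_phoneme_path="logs/exp/2-name2text.txt",       # placeholder path
)
# dm.setup(); dm.train_dataloader()  # requires the prepared dataset files to exist
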
/GPT_SoVITS/AR/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/AR/models/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | import os, sys
4 |
5 | now_dir = os.getcwd()
6 | sys.path.append(now_dir)
7 | from typing import Dict
8 |
9 | import torch
10 | from pytorch_lightning import LightningModule
11 | from AR.models.t2s_model_onnx import Text2SemanticDecoder
12 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule
13 | from AR.modules.optim import ScaledAdam
14 |
15 |
16 | class Text2SemanticLightningModule(LightningModule):
17 | def __init__(self, config, output_dir, is_train=True):
18 | super().__init__()
19 | self.config = config
20 | self.top_k = 3
21 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
22 | pretrained_s1 = config.get("pretrained_s1")
23 | if pretrained_s1 and is_train:
24 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
25 | print(
26 | self.load_state_dict(
27 | torch.load(pretrained_s1, map_location="cpu")["weight"]
28 | )
29 | )
30 | if is_train:
31 | self.automatic_optimization = False
32 | self.save_hyperparameters()
33 | self.eval_dir = output_dir / "eval"
34 | self.eval_dir.mkdir(parents=True, exist_ok=True)
35 |
36 | def training_step(self, batch: Dict, batch_idx: int):
37 | opt = self.optimizers()
38 | scheduler = self.lr_schedulers()
39 | loss, acc = self.model.forward(
40 | batch["phoneme_ids"],
41 | batch["phoneme_ids_len"],
42 | batch["semantic_ids"],
43 | batch["semantic_ids_len"],
44 | batch["bert_feature"],
45 | )
46 | self.manual_backward(loss)
47 | if batch_idx > 0 and batch_idx % 4 == 0:
48 | opt.step()
49 | opt.zero_grad()
50 | scheduler.step()
51 |
52 | self.log(
53 | "total_loss",
54 | loss,
55 | on_step=True,
56 | on_epoch=True,
57 | prog_bar=True,
58 | sync_dist=True,
59 | )
60 | self.log(
61 | "lr",
62 | scheduler.get_last_lr()[0],
63 | on_epoch=True,
64 | prog_bar=True,
65 | sync_dist=True,
66 | )
67 | self.log(
68 | f"top_{self.top_k}_acc",
69 | acc,
70 | on_step=True,
71 | on_epoch=True,
72 | prog_bar=True,
73 | sync_dist=True,
74 | )
75 |
76 | def validation_step(self, batch: Dict, batch_idx: int):
77 | return
78 |
79 | def configure_optimizers(self):
80 | model_parameters = self.model.parameters()
81 | parameters_names = []
82 | parameters_names.append(
83 | [name_param_pair[0] for name_param_pair in self.model.named_parameters()]
84 | )
85 | lm_opt = ScaledAdam(
86 | model_parameters,
87 | lr=0.01,
88 | betas=(0.9, 0.95),
89 | clipping_scale=2.0,
90 | parameters_names=parameters_names,
91 | show_dominant_parameters=False,
92 | clipping_update_period=1000,
93 | )
94 |
95 | return {
96 | "optimizer": lm_opt,
97 | "lr_scheduler": {
98 | "scheduler": WarmupCosineLRSchedule(
99 | lm_opt,
100 | init_lr=self.config["optimizer"]["lr_init"],
101 | peak_lr=self.config["optimizer"]["lr"],
102 | end_lr=self.config["optimizer"]["lr_end"],
103 | warmup_steps=self.config["optimizer"]["warmup_steps"],
104 | total_steps=self.config["optimizer"]["decay_steps"],
105 | )
106 | },
107 | }
108 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/AR/modules/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/embedding.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
2 | import math
3 |
4 | import torch
5 | from torch import nn
6 |
7 |
8 | class TokenEmbedding(nn.Module):
9 | def __init__(
10 | self,
11 | embedding_dim: int,
12 | vocab_size: int,
13 | dropout: float = 0.0,
14 | ):
15 | super().__init__()
16 |
17 | self.vocab_size = vocab_size
18 | self.embedding_dim = embedding_dim
19 |
20 | self.dropout = torch.nn.Dropout(p=dropout)
21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
22 |
23 | @property
24 | def weight(self) -> torch.Tensor:
25 | return self.word_embeddings.weight
26 |
27 | def embedding(self, index: int) -> torch.Tensor:
28 | return self.word_embeddings.weight[index : index + 1]
29 |
30 | def forward(self, x: torch.Tensor):
31 | x = self.word_embeddings(x)
32 | x = self.dropout(x)
33 | return x
34 |
35 |
36 | class SinePositionalEmbedding(nn.Module):
37 | def __init__(
38 | self,
39 | embedding_dim: int,
40 | dropout: float = 0.0,
41 | scale: bool = False,
42 | alpha: bool = False,
43 | ):
44 | super().__init__()
45 | self.embedding_dim = embedding_dim
46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
48 | self.dropout = torch.nn.Dropout(p=dropout)
49 |
50 | self.reverse = False
51 | self.pe = None
52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000))
53 |
54 | def extend_pe(self, x):
55 | """Reset the positional encodings."""
56 | if self.pe is not None:
57 | if self.pe.size(1) >= x.size(1):
58 | if self.pe.dtype != x.dtype or self.pe.device != x.device:
59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device)
60 | return
61 | pe = torch.zeros(x.size(1), self.embedding_dim)
62 | if self.reverse:
63 | position = torch.arange(
64 | x.size(1) - 1, -1, -1.0, dtype=torch.float32
65 | ).unsqueeze(1)
66 | else:
67 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
68 | div_term = torch.exp(
69 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32)
70 | * -(math.log(10000.0) / self.embedding_dim)
71 | )
72 | pe[:, 0::2] = torch.sin(position * div_term)
73 | pe[:, 1::2] = torch.cos(position * div_term)
74 | pe = pe.unsqueeze(0)
75 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach()
76 |
77 | def forward(self, x: torch.Tensor) -> torch.Tensor:
78 | self.extend_pe(x)
79 | output = x.unsqueeze(-1) if x.ndim == 2 else x
80 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)]
81 | return self.dropout(output)
82 |
--------------------------------------------------------------------------------
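
A minimal shape-check sketch for the two modules above (assumes GPT_SoVITS/ is on sys.path; the sizes mirror vocab_size/embedding_dim in configs/s1.yaml):

import torch
from AR.modules.embedding import TokenEmbedding, SinePositionalEmbedding

tok = TokenEmbedding(embedding_dim=512, vocab_size=1025)                  # 1024 semantic tokens + EOS
pos = SinePositionalEmbedding(embedding_dim=512, scale=False, alpha=True)

ids = torch.randint(0, 1025, (2, 100))   # (batch, seq_len)
x = tok(ids)                             # -> (2, 100, 512)
x = pos(x)                               # adds sinusoidal positions, shape unchanged
print(x.shape)                           # torch.Size([2, 100, 512])
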
/GPT_SoVITS/AR/modules/embedding_onnx.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
2 | import math
3 |
4 | import torch
5 | from torch import nn
6 |
7 |
8 | class TokenEmbedding(nn.Module):
9 | def __init__(
10 | self,
11 | embedding_dim: int,
12 | vocab_size: int,
13 | dropout: float = 0.0,
14 | ):
15 | super().__init__()
16 |
17 | self.vocab_size = vocab_size
18 | self.embedding_dim = embedding_dim
19 |
20 | self.dropout = torch.nn.Dropout(p=dropout)
21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
22 |
23 | @property
24 | def weight(self) -> torch.Tensor:
25 | return self.word_embeddings.weight
26 |
27 | def embedding(self, index: int) -> torch.Tensor:
28 | return self.word_embeddings.weight[index : index + 1]
29 |
30 | def forward(self, x: torch.Tensor):
31 | x = self.word_embeddings(x)
32 | x = self.dropout(x)
33 | return x
34 |
35 |
36 | class SinePositionalEmbedding(nn.Module):
37 | def __init__(
38 | self,
39 | embedding_dim: int,
40 | dropout: float = 0.0,
41 | scale: bool = False,
42 | alpha: bool = False,
43 | ):
44 | super().__init__()
45 | self.embedding_dim = embedding_dim
46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
48 | self.dropout = torch.nn.Dropout(p=dropout)
49 | self.reverse = False
50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim))
51 |
52 | def extend_pe(self, x):
53 | position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1)
54 | scpe = (position * self.div_term).unsqueeze(0)
55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0)
56 | pe = pe.contiguous().view(1, -1, self.embedding_dim)
57 | return pe
58 |
59 | def forward(self, x: torch.Tensor) -> torch.Tensor:
60 | pe = self.extend_pe(x)
61 | output = x.unsqueeze(-1) if x.ndim == 2 else x
62 | output = output * self.x_scale + self.alpha * pe
63 | return self.dropout(output)
64 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/lr_schedulers.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | import math
4 |
5 | import torch
6 | from matplotlib import pyplot as plt
7 | from torch import nn
8 | from torch.optim import Adam
9 |
10 |
11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler):
12 | """
13 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers.
14 | """
15 |
16 | def __init__(
17 | self,
18 | optimizer,
19 | init_lr,
20 | peak_lr,
21 | end_lr,
22 | warmup_steps=10000,
23 | total_steps=400000,
24 | current_step=0,
25 | ):
26 | self.init_lr = init_lr
27 | self.peak_lr = peak_lr
28 | self.end_lr = end_lr
29 | self.optimizer = optimizer
30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps
31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps)
32 | self._current_step = current_step
33 | self.lr = init_lr
34 | self.warmup_steps = warmup_steps
35 | self.total_steps = total_steps
36 | self._last_lr = [self.lr]
37 |
38 | def set_lr(self, lr):
39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups]
40 | for g in self.optimizer.param_groups:
41 | # g['lr'] = lr
42 |             g["lr"] = self.end_lr  ### locked: always use end_lr (the computed value is ignored)
43 |
44 | def step(self):
45 | if self._current_step < self.warmup_steps:
46 | lr = self.init_lr + self._warmup_rate * self._current_step
47 |
48 | elif self._current_step > self.total_steps:
49 | lr = self.end_lr
50 |
51 | else:
52 | decay_ratio = (self._current_step - self.warmup_steps) / (
53 | self.total_steps - self.warmup_steps
54 | )
55 | if decay_ratio < 0.0 or decay_ratio > 1.0:
56 | raise RuntimeError(
57 | "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings."
58 | )
59 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
60 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr)
61 |
62 |         self.lr = lr = self.end_lr = 0.002  ### locked: the schedule misbehaved, so the LR is pinned to 0.002 directly
63 | self.set_lr(lr)
64 | self.lr = lr
65 | self._current_step += 1
66 | return self.lr
67 |
68 |
69 | if __name__ == "__main__":
70 | m = nn.Linear(10, 10)
71 | opt = Adam(m.parameters(), lr=1e-4)
72 | s = WarmupCosineLRSchedule(
73 | opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0
74 | )
75 | lrs = []
76 | for i in range(25000):
77 | s.step()
78 | lrs.append(s.lr)
79 | print(s.lr)
80 |
81 | plt.plot(lrs)
82 | plt.plot(range(0, 25000), lrs)
83 | plt.show()
84 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py:
--------------------------------------------------------------------------------
1 | from torch.nn.functional import *
2 | from torch.nn.functional import (
3 | _mha_shape_check,
4 | _canonical_mask,
5 | _none_or_dtype,
6 | _in_projection_packed,
7 | )
8 |
9 | def multi_head_attention_forward_patched(
10 | query,
11 | key,
12 | value,
13 | embed_dim_to_check: int,
14 | num_heads: int,
15 | in_proj_weight,
16 | in_proj_bias: Optional[Tensor],
17 | bias_k: Optional[Tensor],
18 | bias_v: Optional[Tensor],
19 | add_zero_attn: bool,
20 | dropout_p: float,
21 | out_proj_weight: Tensor,
22 | out_proj_bias: Optional[Tensor],
23 | training: bool = True,
24 | key_padding_mask: Optional[Tensor] = None,
25 | need_weights: bool = True,
26 | attn_mask: Optional[Tensor] = None,
27 | use_separate_proj_weight: bool = False,
28 | q_proj_weight: Optional[Tensor] = None,
29 | k_proj_weight: Optional[Tensor] = None,
30 | v_proj_weight: Optional[Tensor] = None,
31 | static_k: Optional[Tensor] = None,
32 | static_v: Optional[Tensor] = None,
33 | average_attn_weights: bool = True,
34 | is_causal: bool = False,
35 | cache=None,
36 | ) -> Tuple[Tensor, Optional[Tensor]]:
37 |
38 | # set up shape vars
39 | _, _, embed_dim = query.shape
40 | attn_mask = _canonical_mask(
41 | mask=attn_mask,
42 | mask_name="attn_mask",
43 | other_type=None,
44 | other_name="",
45 | target_type=query.dtype,
46 | check_other=False,
47 | )
48 | head_dim = embed_dim // num_heads
49 |
50 | proj_qkv = linear(query, in_proj_weight, in_proj_bias)
51 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
52 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2]
53 |
54 | if cache["first_infer"] == 1:
55 | cache["k"][cache["stage"]] = k
56 | cache["v"][cache["stage"]] = v
57 | else:
58 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0)
59 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0)
60 | k = cache["k"][cache["stage"]]
61 | v = cache["v"][cache["stage"]]
62 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"]
63 |
64 | attn_mask = _canonical_mask(
65 | mask=attn_mask,
66 | mask_name="attn_mask",
67 | other_type=None,
68 | other_name="",
69 | target_type=q.dtype,
70 | check_other=False,
71 | )
72 | attn_mask = attn_mask.unsqueeze(0)
73 |
74 | q = q.view(-1, num_heads, head_dim).transpose(0, 1)
75 | k = k.view(-1, num_heads, head_dim).transpose(0, 1)
76 | v = v.view(-1, num_heads, head_dim).transpose(0, 1)
77 |
78 | dropout_p = 0.0
79 | attn_mask = attn_mask.unsqueeze(0)
80 | q = q.view(num_heads, -1, head_dim).unsqueeze(0)
81 | k = k.view(num_heads, -1, head_dim).unsqueeze(0)
82 | v = v.view(num_heads, -1, head_dim).unsqueeze(0)
83 | attn_output = scaled_dot_product_attention(
84 | q, k, v, attn_mask, dropout_p, is_causal
85 | )
86 | attn_output = (
87 | attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim)
88 | )
89 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
90 | attn_output = attn_output.view(-1, 1, attn_output.size(1))
91 |
92 | return attn_output
93 |
--------------------------------------------------------------------------------
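
The cache argument above is a plain dict shared by the decoder's attention layers during incremental (ONNX) decoding. Below is a sketch of the structure this function expects; the field names are taken from the code above, while the layer count is only illustrative.

num_layers = 24                    # illustrative; matches n_layer in configs/s1longer.yaml
cache = {
    "all_stage": num_layers,       # number of attention layers sharing the cache
    "stage": 0,                    # index of the layer currently executing (advanced modulo all_stage)
    "first_infer": 1,              # 1 on the first decoding step, 0 afterwards
    "k": [None] * num_layers,      # per-layer cached key tensors
    "v": [None] * num_layers,      # per-layer cached value tensors
}
# On the first step each layer stores its full k/v; on later steps the cached
# tensors are updated via the torch.cat calls shown above.
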
/GPT_SoVITS/AR/text_processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/AR/text_processing/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/phonemizer.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | import itertools
4 | import re
5 | from typing import Dict
6 | from typing import List
7 |
8 | import regex
9 | from gruut import sentences
10 | from gruut.const import Sentence
11 | from gruut.const import Word
12 | from AR.text_processing.symbols import SYMBOL_TO_ID
13 |
14 |
15 | class GruutPhonemizer:
16 | def __init__(self, language: str):
17 | self._phonemizer = sentences
18 | self.lang = language
19 | self.symbol_to_id = SYMBOL_TO_ID
20 |         self._special_cases_dict: Dict[str, str] = {
21 | r"\.\.\.": "... ",
22 | ";": "; ",
23 | ":": ": ",
24 | ",": ", ",
25 | r"\.": ". ",
26 | "!": "! ",
27 | r"\?": "? ",
28 | "—": "—",
29 | "…": "… ",
30 | "«": "«",
31 | "»": "»",
32 | }
33 | self._punctuation_regexp: str = (
34 | rf"([{''.join(self._special_cases_dict.keys())}])"
35 | )
36 |
37 | def _normalize_punctuation(self, text: str) -> str:
38 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text)
39 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text)
40 | text = regex.sub(r"\pZ+", r" ", text)
41 | return text.strip()
42 |
43 | def _convert_punctuation(self, word: Word) -> str:
44 | if not word.phonemes:
45 | return ""
46 | if word.phonemes[0] in ["‖", "|"]:
47 | return word.text.strip()
48 |
49 | phonemes = "".join(word.phonemes)
50 | # remove modifier characters ˈˌː with regex
51 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes)
52 | return phonemes.strip()
53 |
54 | def phonemize(self, text: str, espeak: bool = False) -> str:
55 | text_to_phonemize: str = self._normalize_punctuation(text)
56 | sents: List[Sentence] = [
57 | sent
58 | for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)
59 | ]
60 | words: List[str] = [
61 | self._convert_punctuation(word) for word in itertools.chain(*sents)
62 | ]
63 | return " ".join(words)
64 |
65 | def transform(self, phonemes):
66 | # convert phonemes to ids
67 | # dictionary is in symbols.py
68 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()]
69 |
70 |
71 | if __name__ == "__main__":
72 | phonemizer = GruutPhonemizer("en-us")
73 | # text -> IPA
74 | phonemes = phonemizer.phonemize("Hello, wor-ld ?")
75 | print("phonemes:", phonemes)
76 | print("len(phonemes):", len(phonemes))
77 | phoneme_ids = phonemizer.transform(phonemes)
78 | print("phoneme_ids:", phoneme_ids)
79 | print("len(phoneme_ids):", len(phoneme_ids))
80 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/symbols.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | PAD = "_"
4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” '
5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
6 | IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
7 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS)
8 | SPACE_ID = SYMBOLS.index(" ")
9 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)}
10 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)}
11 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/utils/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def str2bool(str):
5 | return True if str.lower() == 'true' else False
6 |
7 |
8 | def get_newest_ckpt(string_list):
9 |     # Regex pattern matching the epoch and step numbers in a checkpoint filename
10 | pattern = r'epoch=(\d+)-step=(\d+)\.ckpt'
11 |
12 |     # Extract the numbers from each filename and collect (epoch, step, name) tuples
13 | extracted_info = []
14 | for string in string_list:
15 | match = re.match(pattern, string)
16 | if match:
17 | epoch = int(match.group(1))
18 | step = int(match.group(2))
19 | extracted_info.append((epoch, step, string))
20 |     # Sort by epoch, then by step, in descending order
21 | sorted_info = sorted(
22 | extracted_info, key=lambda x: (x[0], x[1]), reverse=True)
23 |     # Take the newest ckpt filename
24 | newest_ckpt = sorted_info[0][2]
25 | return newest_ckpt
26 |
27 |
28 | # Return the file's first line if it exists and is non-empty, otherwise return False
29 | def check_txt_file(file_path):
30 | try:
31 | with open(file_path, 'r') as file:
32 | text = file.readline().strip()
33 | assert text.strip() != ''
34 | return text
35 | except Exception:
36 | return False
37 | return False
38 |
--------------------------------------------------------------------------------
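
A quick usage sketch for get_newest_ckpt above (the filenames are made up but follow the epoch=..-step=..ckpt pattern the regex expects; assumes GPT_SoVITS/ is on sys.path):

from AR.utils import get_newest_ckpt

names = ["epoch=4-step=1200.ckpt", "epoch=12-step=3550.ckpt", "epoch=12-step=3600.ckpt"]
print(get_newest_ckpt(names))   # -> "epoch=12-step=3600.ckpt" (highest epoch, then highest step)
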
/GPT_SoVITS/AR/utils/initialize.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Initialize modules for espnet2 neural networks."""
3 | import torch
4 | from typeguard import check_argument_types
5 |
6 |
7 | def initialize(model: torch.nn.Module, init: str):
8 | """Initialize weights of a neural network module.
9 |
10 | Parameters are initialized using the given method or distribution.
11 |
12 | Custom initialization routines can be implemented into submodules
13 | as function `espnet_initialization_fn` within the custom module.
14 |
15 | Args:
16 | model: Target.
17 | init: Method of initialization.
18 | """
19 | assert check_argument_types()
20 | print("init with", init)
21 |
22 | # weight init
23 | for p in model.parameters():
24 | if p.dim() > 1:
25 | if init == "xavier_uniform":
26 | torch.nn.init.xavier_uniform_(p.data)
27 | elif init == "xavier_normal":
28 | torch.nn.init.xavier_normal_(p.data)
29 | elif init == "kaiming_uniform":
30 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu")
31 | elif init == "kaiming_normal":
32 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu")
33 | else:
34 | raise ValueError("Unknown initialization: " + init)
35 | # bias init
36 | for name, p in model.named_parameters():
37 | if ".bias" in name and p.dim() == 1:
38 | p.data.zero_()
39 |
--------------------------------------------------------------------------------
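
A small usage sketch for initialize() above (assumes GPT_SoVITS/ is on sys.path and typeguard is installed):

import torch
from AR.utils.initialize import initialize

net = torch.nn.Sequential(torch.nn.Linear(10, 10), torch.nn.ReLU(), torch.nn.Linear(10, 1))
initialize(net, "xavier_uniform")   # also accepts xavier_normal, kaiming_uniform, kaiming_normal
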
/GPT_SoVITS/AR/utils/io.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | import yaml
5 |
6 |
7 | def load_yaml_config(path):
8 | with open(path) as f:
9 | config = yaml.full_load(f)
10 | return config
11 |
12 |
13 | def save_config_to_yaml(config, path):
14 | assert path.endswith(".yaml")
15 | with open(path, "w") as f:
16 | f.write(yaml.dump(config))
17 | f.close()
18 |
19 |
20 | def write_args(args, path):
21 | args_dict = dict(
22 | (name, getattr(args, name)) for name in dir(args) if not name.startswith("_")
23 | )
24 | with open(path, "a") as args_file:
25 | args_file.write("==> torch version: {}\n".format(torch.__version__))
26 | args_file.write(
27 | "==> cudnn version: {}\n".format(torch.backends.cudnn.version())
28 | )
29 | args_file.write("==> Cmd:\n")
30 | args_file.write(str(sys.argv))
31 | args_file.write("\n==> args:\n")
32 | for k, v in sorted(args_dict.items()):
33 | args_file.write(" %s: %s\n" % (str(k), str(v)))
34 | args_file.close()
35 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 300
4 | batch_size: 8
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 16
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 512
24 | hidden_dim: 512
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 12
28 | dropout: 0
29 | EOS: 1024
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1big.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 300
4 | batch_size: 8
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 16-mixed
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 1024
24 | hidden_dim: 1024
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 16
28 | dropout: 0
29 | EOS: 1024
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1big2.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 300
4 | batch_size: 12
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 16-mixed
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 1024
24 | hidden_dim: 1024
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 6
28 | dropout: 0
29 | EOS: 1024
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1longer.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 20
4 | batch_size: 8
5 | save_every_n_epoch: 1
6 | precision: 16-mixed
7 | gradient_clip: 1.0
8 | optimizer:
9 | lr: 0.01
10 | lr_init: 0.00001
11 | lr_end: 0.0001
12 | warmup_steps: 2000
13 | decay_steps: 40000
14 | data:
15 | max_eval_sample: 8
16 | max_sec: 54
17 | num_workers: 4
18 | pad_val: 1024 # same with EOS in model
19 | model:
20 | vocab_size: 1025
21 | phoneme_vocab_size: 512
22 | embedding_dim: 512
23 | hidden_dim: 512
24 | head: 16
25 | linear_units: 2048
26 | n_layer: 24
27 | dropout: 0
28 | EOS: 1024
29 | random_bert: 0
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1mq.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 100
4 | batch_size: 6
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 32
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 40
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | saving_path: "ckpt/"
22 | resume_checkpoint: null
23 | vocoder_config_path: "quantizer/new_ckpt/config.json"
24 | vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000"
25 | datadir: "/home/liweiche/GigaSpeech/wavs"
26 | metapath: "/home/liweiche/GigaSpeech/train2.json"
27 | val_metapath: "/home/liweiche/GigaSpeech/dev2.json"
28 | sampledir: "logs/"
29 | pretrained_path: null
30 | lr: 0.0001
31 | batch_size: 200.0
32 | train_bucket_size: 8192
33 | training_step: 800000
34 | optim_flat_percent: 0.0
35 | warmup_step: 50
36 | adam_beta1: 0.9
37 | adam_beta2: 0.98
38 | ffd_size: 3072
39 | hidden_size: 768
40 | enc_nlayers: 6
41 | dec_nlayers: 6
42 | nheads: 12
43 | ar_layer: 4
44 | ar_ffd_size: 1024
45 | ar_hidden_size: 256
46 | ar_nheads: 4
47 | aligner_softmax_temp: 1.0
48 | layer_norm_eps: 0.00001
49 | speaker_embed_dropout: 0.05
50 | label_smoothing: 0.0
51 | val_check_interval: 5000
52 | check_val_every_n_epoch: 1
53 | precision: "fp16"
54 | nworkers: 16
55 | distributed: true
56 | accelerator: "ddp"
57 | version: null
58 | accumulate_grad_batches: 1
59 | use_repetition_token: true
60 | use_repetition_gating: false
61 | repetition_penalty: 1.0
62 | sampling_temperature: 1.0
63 | top_k: -1
64 | min_top_k: 3
65 | top_p: 0.8
66 | sample_num: 4
67 | length_penalty_max_length: 15000
68 | length_penalty_max_prob: 0.95
69 | max_input_length: 2048
70 | max_output_length: 2000
71 | sample_rate: 16000
72 | n_codes: 1024
73 | n_cluster_groups: 1
74 | phone_context_window: 4
75 | phoneset_size: 1000
76 | inference:
77 | top_k: 5
78 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s2.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 100,
4 | "eval_interval": 500,
5 | "seed": 1234,
6 | "epochs": 100,
7 | "learning_rate": 0.0001,
8 | "betas": [
9 | 0.8,
10 | 0.99
11 | ],
12 | "eps": 1e-09,
13 | "batch_size": 32,
14 | "fp16_run": true,
15 | "lr_decay": 0.999875,
16 | "segment_size": 20480,
17 | "init_lr_ratio": 1,
18 | "warmup_epochs": 0,
19 | "c_mel": 45,
20 | "c_kl": 1.0,
21 | "text_low_lr_rate": 0.4
22 | },
23 | "data": {
24 | "max_wav_value": 32768.0,
25 | "sampling_rate": 32000,
26 | "filter_length": 2048,
27 | "hop_length": 640,
28 | "win_length": 2048,
29 | "n_mel_channels": 128,
30 | "mel_fmin": 0.0,
31 | "mel_fmax": null,
32 | "add_blank": true,
33 | "n_speakers": 300,
34 | "cleaned_text": true
35 | },
36 | "model": {
37 | "inter_channels": 192,
38 | "hidden_channels": 192,
39 | "filter_channels": 768,
40 | "n_heads": 2,
41 | "n_layers": 6,
42 | "kernel_size": 3,
43 | "p_dropout": 0.1,
44 | "resblock": "1",
45 | "resblock_kernel_sizes": [
46 | 3,
47 | 7,
48 | 11
49 | ],
50 | "resblock_dilation_sizes": [
51 | [
52 | 1,
53 | 3,
54 | 5
55 | ],
56 | [
57 | 1,
58 | 3,
59 | 5
60 | ],
61 | [
62 | 1,
63 | 3,
64 | 5
65 | ]
66 | ],
67 | "upsample_rates": [
68 | 10,
69 | 8,
70 | 2,
71 | 2,
72 | 2
73 | ],
74 | "upsample_initial_channel": 512,
75 | "upsample_kernel_sizes": [
76 | 16,
77 | 16,
78 | 8,
79 | 2,
80 | 2
81 | ],
82 | "n_layers_q": 3,
83 | "use_spectral_norm": false,
84 | "gin_channels": 512,
85 | "semantic_frame_rate": "25hz",
86 | "freeze_quantizer": true
87 | },
88 | "s2_ckpt_dir": "logs/s2/big2k1",
89 | "content_module": "cnhubert"
90 | }
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/train.yaml:
--------------------------------------------------------------------------------
1 | gpu:
2 | n_card: 1
3 | n_process_per_card: 2
4 | io:
5 | text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS
6 | save_every_n_epoch: 1
7 | precision: 16-mixed
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 | pad_val: 1024 # same with EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 512
24 | hidden_dim: 512
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 24
28 | dropout: 0
29 | EOS: 1024
30 | random_bert: 0
31 | inference:
32 | top_k: 5
33 |
--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/__init__.py:
--------------------------------------------------------------------------------
1 | from . import cnhubert, whisper_enc
2 |
3 | content_module_map = {
4 | 'cnhubert': cnhubert,
5 | 'whisper': whisper_enc
6 | }
--------------------------------------------------------------------------------
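
A hedged sketch of how this mapping can be used together with the "content_module" field in configs/s2.json; the dispatch below illustrates the pattern rather than copying the training code, and it assumes GPT_SoVITS/ is on sys.path with the repo's dependencies installed:

from feature_extractor import content_module_map

content_module = "cnhubert"                       # matches "content_module" in GPT_SoVITS/configs/s2.json
extractor = content_module_map[content_module]    # -> the cnhubert module
# For cnhubert, set cnhubert.cnhubert_base_path to the pretrained model directory first.
ssl_model = extractor.get_model()
# feats = extractor.get_content(ssl_model, wav_16k_tensor)   # wav_16k_tensor: 16 kHz audio tensor
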
/GPT_SoVITS/feature_extractor/cnhubert.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import librosa
4 | import torch
5 | import torch.nn.functional as F
6 | import soundfile as sf
7 | import logging
8 |
9 | logging.getLogger("numba").setLevel(logging.WARNING)
10 |
11 | from transformers import (
12 | Wav2Vec2FeatureExtractor,
13 | HubertModel,
14 | )
15 |
16 | import utils
17 | import torch.nn as nn
18 |
19 | cnhubert_base_path = None
20 |
21 |
22 | class CNHubert(nn.Module):
23 | def __init__(self):
24 | super().__init__()
25 | self.model = HubertModel.from_pretrained(cnhubert_base_path)
26 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
27 | cnhubert_base_path
28 | )
29 |
30 | def forward(self, x):
31 | input_values = self.feature_extractor(
32 | x, return_tensors="pt", sampling_rate=16000
33 | ).input_values.to(x.device)
34 | feats = self.model(input_values)["last_hidden_state"]
35 | return feats
36 |
37 |
38 | # class CNHubertLarge(nn.Module):
39 | # def __init__(self):
40 | # super().__init__()
41 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
42 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
43 | # def forward(self, x):
44 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
45 | # feats = self.model(input_values)["last_hidden_state"]
46 | # return feats
47 | #
48 | # class CVec(nn.Module):
49 | # def __init__(self):
50 | # super().__init__()
51 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
52 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
53 | # def forward(self, x):
54 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
55 | # feats = self.model(input_values)["last_hidden_state"]
56 | # return feats
57 | #
58 | # class cnw2v2base(nn.Module):
59 | # def __init__(self):
60 | # super().__init__()
61 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
62 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
63 | # def forward(self, x):
64 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
65 | # feats = self.model(input_values)["last_hidden_state"]
66 | # return feats
67 |
68 |
69 | def get_model():
70 | model = CNHubert()
71 | model.eval()
72 | return model
73 |
74 |
75 | # def get_large_model():
76 | # model = CNHubertLarge()
77 | # model.eval()
78 | # return model
79 | #
80 | # def get_model_cvec():
81 | # model = CVec()
82 | # model.eval()
83 | # return model
84 | #
85 | # def get_model_cnw2v2base():
86 | # model = cnw2v2base()
87 | # model.eval()
88 | # return model
89 |
90 |
91 | def get_content(hmodel, wav_16k_tensor):
92 | with torch.no_grad():
93 | feats = hmodel(wav_16k_tensor)
94 | return feats.transpose(1, 2)
95 |
96 |
97 | if __name__ == "__main__":
98 | model = get_model()
99 | src_path = "/Users/Shared/原音频2.wav"
100 | wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000)
101 | model = model
102 | wav_16k_tensor = wav_16k_tensor
103 | feats = get_content(model, wav_16k_tensor)
104 | print(feats.shape)
105 |
--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/whisper_enc.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_model():
5 | import whisper
6 |
7 | model = whisper.load_model("small", device="cpu")
8 |
9 | return model.encoder
10 |
11 |
12 | def get_content(model=None, wav_16k_tensor=None):
13 | from whisper import log_mel_spectrogram, pad_or_trim
14 |
15 | dev = next(model.parameters()).device
16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000]
17 | # if torch.cuda.is_available():
18 | # mel = mel.to(torch.float16)
19 | feature_len = mel.shape[-1] // 2
20 |     assert mel.shape[-1] < 3000, "Input audio is too long; only audio up to 30 seconds is supported"
21 | with torch.no_grad():
22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[
23 | :1, :feature_len, :
24 | ].transpose(1, 2)
25 | return feature
26 |
--------------------------------------------------------------------------------
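
A minimal sketch of calling this extractor (it assumes the openai-whisper package is installed, GPT_SoVITS/ is on sys.path, and the input is mono 16 kHz audio shorter than 30 s, per the assert above):

import torch
from feature_extractor import whisper_enc

encoder = whisper_enc.get_model()                  # loads whisper "small" on CPU and returns its encoder
wav_16k = torch.randn(16000 * 5)                   # placeholder: 5 seconds of 16 kHz audio
feats = whisper_enc.get_content(encoder, wav_16k)  # -> (1, feature_dim, T) after the transpose above
print(feats.shape)
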
/GPT_SoVITS/module/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/module/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/module/losses.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch.nn import functional as F
5 |
6 |
7 | def feature_loss(fmap_r, fmap_g):
8 | loss = 0
9 | for dr, dg in zip(fmap_r, fmap_g):
10 | for rl, gl in zip(dr, dg):
11 | rl = rl.float().detach()
12 | gl = gl.float()
13 | loss += torch.mean(torch.abs(rl - gl))
14 |
15 | return loss * 2
16 |
17 |
18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
19 | loss = 0
20 | r_losses = []
21 | g_losses = []
22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
23 | dr = dr.float()
24 | dg = dg.float()
25 | r_loss = torch.mean((1 - dr) ** 2)
26 | g_loss = torch.mean(dg**2)
27 | loss += r_loss + g_loss
28 | r_losses.append(r_loss.item())
29 | g_losses.append(g_loss.item())
30 |
31 | return loss, r_losses, g_losses
32 |
33 |
34 | def generator_loss(disc_outputs):
35 | loss = 0
36 | gen_losses = []
37 | for dg in disc_outputs:
38 | dg = dg.float()
39 | l = torch.mean((1 - dg) ** 2)
40 | gen_losses.append(l)
41 | loss += l
42 |
43 | return loss, gen_losses
44 |
45 |
46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
47 | """
48 | z_p, logs_q: [b, h, t_t]
49 | m_p, logs_p: [b, h, t_t]
50 | """
51 | z_p = z_p.float()
52 | logs_q = logs_q.float()
53 | m_p = m_p.float()
54 | logs_p = logs_p.float()
55 | z_mask = z_mask.float()
56 |
57 | kl = logs_p - logs_q - 0.5
58 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
59 | kl = torch.sum(kl * z_mask)
60 | l = kl / torch.sum(z_mask)
61 | return l
62 |
63 |
64 | def mle_loss(z, m, logs, logdet, mask):
65 | l = torch.sum(logs) + 0.5 * torch.sum(
66 | torch.exp(-2 * logs) * ((z - m) ** 2)
67 | ) # neg normal likelihood w/o the constant term
68 | l = l - torch.sum(logdet) # log jacobian determinant
69 | l = l / torch.sum(
70 | torch.ones_like(z) * mask
71 | ) # averaging across batch, channel and time axes
72 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term
73 | return l
74 |
--------------------------------------------------------------------------------
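
A quick self-contained sketch of how the GAN losses above are typically combined; the two-discriminator lists and tensor shapes are illustrative assumptions, not the repo's actual training loop.

```python
import sys
sys.path.append("GPT_SoVITS")  # assumption: run from the repo root

import torch
from module.losses import discriminator_loss, generator_loss, feature_loss

# Fake outputs from two discriminators (batch of 4, arbitrary score lengths).
disc_real = [torch.rand(4, 100), torch.rand(4, 50)]
disc_fake = [torch.rand(4, 100), torch.rand(4, 50)]
# One feature map per discriminator, with matching shapes for real and fake.
fmap_real = [[torch.rand(4, 32, 100)], [torch.rand(4, 32, 50)]]
fmap_fake = [[torch.rand(4, 32, 100)], [torch.rand(4, 32, 50)]]

loss_d, real_losses, fake_losses = discriminator_loss(disc_real, disc_fake)
loss_g_adv, per_disc = generator_loss(disc_fake)
loss_fm = feature_loss(fmap_real, fmap_fake)
print(loss_d.item(), loss_g_adv.item(), loss_fm.item())
```
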
/GPT_SoVITS/module/mel_processing.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import random
4 | import torch
5 | from torch import nn
6 | import torch.nn.functional as F
7 | import torch.utils.data
8 | import numpy as np
9 | import librosa
10 | import librosa.util as librosa_util
11 | from librosa.util import normalize, pad_center, tiny
12 | from scipy.signal import get_window
13 | from scipy.io.wavfile import read
14 | from librosa.filters import mel as librosa_mel_fn
15 |
16 | MAX_WAV_VALUE = 32768.0
17 |
18 |
19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
20 | """
21 | PARAMS
22 | ------
23 | C: compression factor
24 | """
25 | return torch.log(torch.clamp(x, min=clip_val) * C)
26 |
27 |
28 | def dynamic_range_decompression_torch(x, C=1):
29 | """
30 | PARAMS
31 | ------
32 | C: compression factor used to compress
33 | """
34 | return torch.exp(x) / C
35 |
36 |
37 | def spectral_normalize_torch(magnitudes):
38 | output = dynamic_range_compression_torch(magnitudes)
39 | return output
40 |
41 |
42 | def spectral_de_normalize_torch(magnitudes):
43 | output = dynamic_range_decompression_torch(magnitudes)
44 | return output
45 |
46 |
47 | mel_basis = {}
48 | hann_window = {}
49 |
50 |
51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
52 | if torch.min(y) < -1.0:
53 | print("min value is ", torch.min(y))
54 | if torch.max(y) > 1.0:
55 | print("max value is ", torch.max(y))
56 |
57 | global hann_window
58 | dtype_device = str(y.dtype) + "_" + str(y.device)
59 | wnsize_dtype_device = str(win_size) + "_" + dtype_device
60 | if wnsize_dtype_device not in hann_window:
61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
62 | dtype=y.dtype, device=y.device
63 | )
64 |
65 | y = torch.nn.functional.pad(
66 | y.unsqueeze(1),
67 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
68 | mode="reflect",
69 | )
70 | y = y.squeeze(1)
71 | spec = torch.stft(
72 | y,
73 | n_fft,
74 | hop_length=hop_size,
75 | win_length=win_size,
76 | window=hann_window[wnsize_dtype_device],
77 | center=center,
78 | pad_mode="reflect",
79 | normalized=False,
80 | onesided=True,
81 | return_complex=False,
82 | )
83 |
84 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
85 | return spec
86 |
87 |
88 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
89 | global mel_basis
90 | dtype_device = str(spec.dtype) + "_" + str(spec.device)
91 | fmax_dtype_device = str(fmax) + "_" + dtype_device
92 | if fmax_dtype_device not in mel_basis:
93 | mel = librosa_mel_fn(
94 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
95 | )
96 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
97 | dtype=spec.dtype, device=spec.device
98 | )
99 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
100 | spec = spectral_normalize_torch(spec)
101 | return spec
102 |
103 |
104 | def mel_spectrogram_torch(
105 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
106 | ):
107 | if torch.min(y) < -1.0:
108 | print("min value is ", torch.min(y))
109 | if torch.max(y) > 1.0:
110 | print("max value is ", torch.max(y))
111 |
112 | global mel_basis, hann_window
113 | dtype_device = str(y.dtype) + "_" + str(y.device)
114 | fmax_dtype_device = str(fmax) + "_" + dtype_device
115 | wnsize_dtype_device = str(win_size) + "_" + dtype_device
116 | if fmax_dtype_device not in mel_basis:
117 | mel = librosa_mel_fn(
118 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
119 | )
120 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
121 | dtype=y.dtype, device=y.device
122 | )
123 | if wnsize_dtype_device not in hann_window:
124 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
125 | dtype=y.dtype, device=y.device
126 | )
127 |
128 | y = torch.nn.functional.pad(
129 | y.unsqueeze(1),
130 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
131 | mode="reflect",
132 | )
133 | y = y.squeeze(1)
134 |
135 | spec = torch.stft(
136 | y,
137 | n_fft,
138 | hop_length=hop_size,
139 | win_length=win_size,
140 | window=hann_window[wnsize_dtype_device],
141 | center=center,
142 | pad_mode="reflect",
143 | normalized=False,
144 | onesided=True,
145 | return_complex=False,
146 | )
147 |
148 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
149 |
150 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
151 | spec = spectral_normalize_torch(spec)
152 |
153 | return spec
154 |
--------------------------------------------------------------------------------
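
A short sketch of the spectrogram helpers above. The STFT/mel parameters (n_fft 2048, hop 640, win 2048, 128 mels, 32 kHz, fmax None) are assumptions meant to mirror the values in configs/s2.json, so check them against your config.

```python
import sys
sys.path.append("GPT_SoVITS")  # assumption: run from the repo root

import torch
from module.mel_processing import (
    spectrogram_torch, spec_to_mel_torch, mel_spectrogram_torch)

y = torch.rand(1, 32000 * 2) * 2 - 1   # 2 s of placeholder audio in [-1, 1], shape (batch, samples)

spec = spectrogram_torch(y, n_fft=2048, sampling_rate=32000, hop_size=640, win_size=2048)
mel = spec_to_mel_torch(spec, n_fft=2048, num_mels=128, sampling_rate=32000, fmin=0, fmax=None)
# Equivalent single call:
mel2 = mel_spectrogram_torch(y, 2048, 128, 32000, 640, 2048, 0, None)
print(spec.shape, mel.shape)            # e.g. torch.Size([1, 1025, 100]) and torch.Size([1, 128, 100])
```
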
/GPT_SoVITS/module/quantize.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | """Residual vector quantizer implementation."""
8 |
9 | from dataclasses import dataclass, field
10 | import math
11 | import typing as tp
12 |
13 | import torch
14 | from torch import nn
15 |
16 | from module.core_vq import ResidualVectorQuantization
17 |
18 |
19 | @dataclass
20 | class QuantizedResult:
21 | quantized: torch.Tensor
22 | codes: torch.Tensor
23 | bandwidth: torch.Tensor # bandwidth in kb/s used, per batch item.
24 | penalty: tp.Optional[torch.Tensor] = None
25 | metrics: dict = field(default_factory=dict)
26 |
27 |
28 | class ResidualVectorQuantizer(nn.Module):
29 | """Residual Vector Quantizer.
30 | Args:
31 | dimension (int): Dimension of the codebooks.
32 | n_q (int): Number of residual vector quantizers used.
33 | bins (int): Codebook size.
34 | decay (float): Decay for exponential moving average over the codebooks.
35 | kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
36 | kmeans_iters (int): Number of iterations used for kmeans initialization.
37 | threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
38 | that have an exponential moving average cluster size less than the specified threshold with
39 | randomly selected vector from the current batch.
40 | """
41 |
42 | def __init__(
43 | self,
44 | dimension: int = 256,
45 | n_q: int = 8,
46 | bins: int = 1024,
47 | decay: float = 0.99,
48 | kmeans_init: bool = True,
49 | kmeans_iters: int = 50,
50 | threshold_ema_dead_code: int = 2,
51 | ):
52 | super().__init__()
53 | self.n_q = n_q
54 | self.dimension = dimension
55 | self.bins = bins
56 | self.decay = decay
57 | self.kmeans_init = kmeans_init
58 | self.kmeans_iters = kmeans_iters
59 | self.threshold_ema_dead_code = threshold_ema_dead_code
60 | self.vq = ResidualVectorQuantization(
61 | dim=self.dimension,
62 | codebook_size=self.bins,
63 | num_quantizers=self.n_q,
64 | decay=self.decay,
65 | kmeans_init=self.kmeans_init,
66 | kmeans_iters=self.kmeans_iters,
67 | threshold_ema_dead_code=self.threshold_ema_dead_code,
68 | )
69 |
70 | def forward(
71 | self,
72 | x: torch.Tensor,
73 | n_q: tp.Optional[int] = None,
74 | layers: tp.Optional[list] = None,
75 | ) -> QuantizedResult:
76 | """Residual vector quantization on the given input tensor.
77 | Args:
78 | x (torch.Tensor): Input tensor.
79 |             n_q (int): Number of quantizers used to quantize. Default: All quantizers.
80 |             layers (list): Layers whose quantized outputs should be returned. Default: None.
81 | Returns:
82 | QuantizedResult:
83 |                 The quantized (or approximately quantized) representation together with
84 |                 the number of quantizers used and the quantized outputs of the requested layers.
85 | """
86 | n_q = n_q if n_q else self.n_q
87 | if layers and max(layers) >= n_q:
88 | raise ValueError(
89 | f"Last layer index in layers: A {max(layers)}. Number of quantizers in RVQ: B {self.n_q}. A must less than B."
90 | )
91 | quantized, codes, commit_loss, quantized_list = self.vq(
92 | x, n_q=n_q, layers=layers
93 | )
94 | return quantized, codes, torch.mean(commit_loss), quantized_list
95 |
96 | def encode(
97 | self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None
98 | ) -> torch.Tensor:
99 | """Encode a given input tensor with the specified sample rate at the given bandwidth.
100 | The RVQ encode method sets the appropriate number of quantizer to use
101 | and returns indices for each quantizer.
102 | Args:
103 | x (torch.Tensor): Input tensor.
104 |             n_q (int): Number of quantizers used to quantize. Default: All quantizers.
105 |             st (int): Layer index from which to start encoding. Default: 0.
106 | """
107 | n_q = n_q if n_q else self.n_q
108 | st = st or 0
109 | codes = self.vq.encode(x, n_q=n_q, st=st)
110 | return codes
111 |
112 | def decode(self, codes: torch.Tensor, st: int = 0) -> torch.Tensor:
113 | """Decode the given codes to the quantized representation.
114 | Args:
115 | codes (torch.Tensor): Input indices for each quantizer.
116 |             st (int): Layer index from which to start decoding. Default: 0.
117 | """
118 | quantized = self.vq.decode(codes, st=st)
119 | return quantized
120 |
--------------------------------------------------------------------------------
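
A hedged usage sketch for the quantizer above; the (batch, dimension, frames) input layout and the random data are illustrative assumptions. Note that despite the annotation, forward() returns a plain tuple rather than a QuantizedResult.

```python
import sys
sys.path.append("GPT_SoVITS")  # assumption: run from the repo root

import torch
from module.quantize import ResidualVectorQuantizer

rvq = ResidualVectorQuantizer(dimension=256, n_q=8, bins=1024)
x = torch.randn(2, 256, 50)      # (batch, dimension, frames) — placeholder features

quantized, codes, commit_loss, quantized_list = rvq(x)   # forward() returns a tuple, not QuantizedResult
codes = rvq.encode(x, n_q=4)     # indices from the first 4 quantizers only
recon = rvq.decode(codes)        # approximate reconstruction from those 4 codebooks
print(quantized.shape, codes.shape, recon.shape)
```
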
/GPT_SoVITS/my_utils.py:
--------------------------------------------------------------------------------
1 | import ffmpeg
2 | import numpy as np
3 |
4 |
5 | def load_audio(file, sr):
6 | try:
7 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
8 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
9 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
10 | file = (
11 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
12 | ) # 防止小白拷路径头尾带了空格和"和回车
13 | out, _ = (
14 | ffmpeg.input(file, threads=0)
15 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
16 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
17 | )
18 | except Exception as e:
19 | raise RuntimeError(f"Failed to load audio: {e}")
20 |
21 | return np.frombuffer(out, np.float32).flatten()
22 |
--------------------------------------------------------------------------------
/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import sys,os
4 | inp_text= os.environ.get("inp_text")
5 | inp_wav_dir= os.environ.get("inp_wav_dir")
6 | exp_name= os.environ.get("exp_name")
7 | i_part= os.environ.get("i_part")
8 | all_parts= os.environ.get("all_parts")
9 | os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES")
10 | from feature_extractor import cnhubert
11 | opt_dir= os.environ.get("opt_dir")
12 | cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir")
13 | is_half=eval(os.environ.get("is_half","True"))
14 |
15 | import pdb,traceback,numpy as np,logging
16 | from scipy.io import wavfile
17 | import librosa,torch
18 | now_dir = os.getcwd()
19 | sys.path.append(now_dir)
20 | from my_utils import load_audio
21 |
22 | # from config import cnhubert_base_path
23 | # cnhubert.cnhubert_base_path=cnhubert_base_path
24 | # inp_text=sys.argv[1]
25 | # inp_wav_dir=sys.argv[2]
26 | # exp_name=sys.argv[3]
27 | # i_part=sys.argv[4]
28 | # all_parts=sys.argv[5]
29 | # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]
30 | # cnhubert.cnhubert_base_path=sys.argv[7]
31 | # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
32 |
33 | from time import time as ttime
34 | import shutil
35 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path
36 | dir=os.path.dirname(path)
37 | name=os.path.basename(path)
38 | # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part)
39 | tmp_path="%s%s.pth"%(ttime(),i_part)
40 | torch.save(fea,tmp_path)
41 | shutil.move(tmp_path,"%s/%s"%(dir,name))
42 |
43 | hubert_dir="%s/4-cnhubert"%(opt_dir)
44 | wav32dir="%s/5-wav32k"%(opt_dir)
45 | os.makedirs(opt_dir,exist_ok=True)
46 | os.makedirs(hubert_dir,exist_ok=True)
47 | os.makedirs(wav32dir,exist_ok=True)
48 |
49 | maxx=0.95
50 | alpha=0.5
51 | if torch.cuda.is_available():
52 | device = "cuda:0"
53 | elif torch.backends.mps.is_available():
54 | device = "mps"
55 | else:
56 | device = "cpu"
57 | model=cnhubert.get_model()
58 | # is_half=False
59 | if(is_half==True):
60 | model=model.half().to(device)
61 | else:
62 | model = model.to(device)
63 |
64 | nan_fails=[]
65 | def name2go(wav_name,wav_path):
66 | hubert_path="%s/%s.pt"%(hubert_dir,wav_name)
67 | if(os.path.exists(hubert_path)):return
68 | tmp_audio = load_audio(wav_path, 32000)
69 | tmp_max = np.abs(tmp_audio).max()
70 | if tmp_max > 2.2:
71 | print("%s-filtered,%s" % (wav_name, tmp_max))
72 | return
73 | tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * tmp_audio
74 | tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha*1145.14)) + ((1 - alpha)*1145.14) * tmp_audio
75 | tmp_audio = librosa.resample(
76 | tmp_audio32b, orig_sr=32000, target_sr=16000
77 | )#不是重采样问题
78 | tensor_wav16 = torch.from_numpy(tmp_audio)
79 | if (is_half == True):
80 | tensor_wav16=tensor_wav16.half().to(device)
81 | else:
82 | tensor_wav16 = tensor_wav16.to(device)
83 | ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()#torch.Size([1, 768, 215])
84 | if np.isnan(ssl.detach().numpy()).sum()!= 0:
85 |         nan_fails.append((wav_name, wav_path))
86 | print("nan filtered:%s"%wav_name)
87 | return
88 | wavfile.write(
89 | "%s/%s"%(wav32dir,wav_name),
90 | 32000,
91 | tmp_audio32.astype("int16"),
92 | )
93 | my_save(ssl,hubert_path )
94 |
95 | with open(inp_text,"r",encoding="utf8")as f:
96 | lines=f.read().strip("\n").split("\n")
97 |
98 | for line in lines[int(i_part)::int(all_parts)]:
99 | try:
100 | # wav_name,text=line.split("\t")
101 | wav_name, spk_name, language, text = line.split("|")
102 | if (inp_wav_dir != "" and inp_wav_dir != None):
103 | wav_name = os.path.basename(wav_name)
104 | wav_path = "%s/%s"%(inp_wav_dir, wav_name)
105 |
106 | else:
107 | wav_path=wav_name
108 | wav_name = os.path.basename(wav_name)
109 | name2go(wav_name,wav_path)
110 | except:
111 | print(line,traceback.format_exc())
112 |
113 | if(len(nan_fails)>0 and is_half==True):
114 | is_half=False
115 | model=model.float()
116 | for wav_name, wav_path in nan_fails:
117 | try:
118 | name2go(wav_name, wav_path)
119 | except:
120 | print(wav_name, traceback.format_exc())
121 |
--------------------------------------------------------------------------------
/GPT_SoVITS/prepare_datasets/3-get-semantic.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | inp_text = os.environ.get("inp_text")
4 | exp_name = os.environ.get("exp_name")
5 | i_part = os.environ.get("i_part")
6 | all_parts = os.environ.get("all_parts")
7 | os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES")
8 | opt_dir = os.environ.get("opt_dir")
9 | pretrained_s2G = os.environ.get("pretrained_s2G")
10 | s2config_path = os.environ.get("s2config_path")
11 | is_half = eval(os.environ.get("is_half", "True"))
12 | import math, traceback
13 | import multiprocessing
14 | import sys, pdb
15 |
16 | now_dir = os.getcwd()
17 | sys.path.append(now_dir)
18 | from random import shuffle
19 | import torch.multiprocessing as mp
20 | from glob import glob
21 | from tqdm import tqdm
22 | import logging, librosa, utils, torch
23 | from module.models import SynthesizerTrn
24 |
25 | logging.getLogger("numba").setLevel(logging.WARNING)
26 | # from config import pretrained_s2G
27 |
28 | # inp_text=sys.argv[1]
29 | # exp_name=sys.argv[2]
30 | # i_part=sys.argv[3]
31 | # all_parts=sys.argv[4]
32 | # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[5]
33 | # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
34 |
35 |
36 | hubert_dir = "%s/4-cnhubert" % (opt_dir)
37 | semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
38 | if os.path.exists(semantic_path) == False:
39 | os.makedirs(opt_dir, exist_ok=True)
40 |
41 | if torch.cuda.is_available():
42 | device = "cuda"
43 | elif torch.backends.mps.is_available():
44 | device = "mps"
45 | else:
46 | device = "cpu"
47 | hps = utils.get_hparams_from_file(s2config_path)
48 | vq_model = SynthesizerTrn(
49 | hps.data.filter_length // 2 + 1,
50 | hps.train.segment_size // hps.data.hop_length,
51 | n_speakers=hps.data.n_speakers,
52 | **hps.model
53 | )
54 | if is_half == True:
55 | vq_model = vq_model.half().to(device)
56 | else:
57 | vq_model = vq_model.to(device)
58 | vq_model.eval()
59 | # utils.load_checkpoint(utils.latest_checkpoint_path(hps.s2_ckpt_dir, "G_*.pth"), vq_model, None, True)
60 | # utils.load_checkpoint(pretrained_s2G, vq_model, None, True)
61 | print(
62 | vq_model.load_state_dict(
63 | torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False
64 | )
65 | )
66 |
67 | def name2go(wav_name, lines):
68 | hubert_path = "%s/%s.pt" % (hubert_dir, wav_name)
69 | if os.path.exists(hubert_path) == False:
70 | return
71 | ssl_content = torch.load(hubert_path, map_location="cpu")
72 | if is_half == True:
73 | ssl_content = ssl_content.half().to(device)
74 | else:
75 | ssl_content = ssl_content.to(device)
76 | codes = vq_model.extract_latent(ssl_content)
77 | semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()])
78 | lines.append("%s\t%s" % (wav_name, semantic))
79 |
80 | with open(inp_text, "r", encoding="utf8") as f:
81 | lines = f.read().strip("\n").split("\n")
82 |
83 | lines1 = []
84 | for line in lines[int(i_part) :: int(all_parts)]:
85 | # print(line)
86 | try:
87 | # wav_name,text=line.split("\t")
88 | wav_name, spk_name, language, text = line.split("|")
89 | wav_name = os.path.basename(wav_name)
90 | # name2go(name,lines1)
91 | name2go(wav_name, lines1)
92 | except:
93 | print(line, traceback.format_exc())
94 | with open(semantic_path, "w", encoding="utf8") as f:
95 | f.write("\n".join(lines1))
96 |
--------------------------------------------------------------------------------
/GPT_SoVITS/pretrained_models/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
--------------------------------------------------------------------------------
/GPT_SoVITS/process_ckpt.py:
--------------------------------------------------------------------------------
1 | import traceback
2 | from collections import OrderedDict
3 | from time import time as ttime
4 | import shutil,os
5 | import torch
6 | from tools.i18n.i18n import I18nAuto
7 |
8 | i18n = I18nAuto()
9 |
10 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path
11 | dir=os.path.dirname(path)
12 | name=os.path.basename(path)
13 | tmp_path="%s.pth"%(ttime())
14 | torch.save(fea,tmp_path)
15 | shutil.move(tmp_path,"%s/%s"%(dir,name))
16 |
17 | def savee(ckpt, name, epoch, steps, hps):
18 | try:
19 | opt = OrderedDict()
20 | opt["weight"] = {}
21 | for key in ckpt.keys():
22 | if "enc_q" in key:
23 | continue
24 | opt["weight"][key] = ckpt[key].half()
25 | opt["config"] = hps
26 | opt["info"] = "%sepoch_%siteration" % (epoch, steps)
27 | # torch.save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
28 | my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
29 | return "Success."
30 | except:
31 | return traceback.format_exc()
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/__init__.py:
--------------------------------------------------------------------------------
1 | from text.symbols import *
2 |
3 |
4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
5 |
6 | def cleaned_text_to_sequence(cleaned_text):
7 | '''Converts a string of cleaned (phonemized) text to a sequence of IDs corresponding to the symbols in the text.
8 | Args:
9 | cleaned_text: string of symbols to convert to a sequence
10 | Returns:
11 | List of integers corresponding to the symbols in the text
12 | '''
13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14 | return phones
15 |
16 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from text import chinese, japanese, cleaned_text_to_sequence, symbols, english
2 |
3 | language_module_map = {"zh": chinese, "ja": japanese, "en": english}
4 | special = [
5 | # ("%", "zh", "SP"),
6 | ("¥", "zh", "SP2"),
7 | ("^", "zh", "SP3"),
8 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧
9 | ]
10 |
11 |
12 | def clean_text(text, language):
13 | if(language not in language_module_map):
14 | language="en"
15 | text=" "
16 | for special_s, special_l, target_symbol in special:
17 | if special_s in text and language == special_l:
18 | return clean_special(text, language, special_s, target_symbol)
19 | language_module = language_module_map[language]
20 | norm_text = language_module.text_normalize(text)
21 | if language == "zh":
22 | phones, word2ph = language_module.g2p(norm_text)
23 | assert len(phones) == sum(word2ph)
24 | assert len(norm_text) == len(word2ph)
25 | else:
26 | phones = language_module.g2p(norm_text)
27 | word2ph = None
28 |
29 | for ph in phones:
30 | assert ph in symbols
31 | return phones, word2ph, norm_text
32 |
33 |
34 | def clean_special(text, language, special_s, target_symbol):
35 | """
36 | 特殊静音段sp符号处理
37 | """
38 | text = text.replace(special_s, ",")
39 | language_module = language_module_map[language]
40 | norm_text = language_module.text_normalize(text)
41 | phones = language_module.g2p(norm_text)
42 | new_ph = []
43 | for ph in phones[0]:
44 | assert ph in symbols
45 | if ph == ",":
46 | new_ph.append(target_symbol)
47 | else:
48 | new_ph.append(ph)
49 | return new_ph, phones[1], norm_text
50 |
51 |
52 | def text_to_sequence(text, language):
53 |     phones, word2ph, norm_text = clean_text(text, language)
54 | return cleaned_text_to_sequence(phones)
55 |
56 |
57 | if __name__ == "__main__":
58 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh"))
59 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/engdict-hot.rep:
--------------------------------------------------------------------------------
1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1
--------------------------------------------------------------------------------
/GPT_SoVITS/text/engdict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/text/engdict_cache.pickle
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/README.md:
--------------------------------------------------------------------------------
1 | ## Supported NSW (Non-Standard-Word) Normalization
2 |
3 | |NSW type|raw|normalized|
4 | |:--|:-|:-|
5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九|
6 | |cardinal|这块黄金重达324.75克<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
7 | |numeric range|12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日|
9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我|
10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度|
11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨|
13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万|
14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
15 | ## References
16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files)
17 |
--------------------------------------------------------------------------------
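
A minimal sketch of driving the normalizer behind this table; the TextNormalizer class and its normalize() method are assumed to match the upstream PaddleSpeech implementation from which this module is derived.

```python
import sys
sys.path.append("GPT_SoVITS")  # assumption: run from the repo root

# Assumption: TextNormalizer/normalize() match the upstream PaddleSpeech module.
from text.zh_normalization.text_normlization import TextNormalizer

tn = TextNormalizer()
print(tn.normalize("今天的最低气温达到-10°C,现场有7/12的观众投出了赞成票"))
# Expected, per the table above: ... 零下十度 ... 十二分之七 ...
```
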
/GPT_SoVITS/text/zh_normalization/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from text.zh_normalization.text_normlization import *
15 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/chronology.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import DIGITS
17 | from .num import num2str
18 | from .num import verbalize_cardinal
19 | from .num import verbalize_digit
20 |
21 |
22 | def _time_num2str(num_string: str) -> str:
23 | """A special case for verbalizing number in time."""
24 | result = num2str(num_string.lstrip('0'))
25 | if num_string.startswith('0'):
26 | result = DIGITS['0'] + result
27 | return result
28 |
29 |
30 | # 时刻表达式
31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
32 | r':([0-5][0-9])'
33 | r'(:([0-5][0-9]))?')
34 |
35 | # 时间范围,如8:30-12:30
36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
37 | r':([0-5][0-9])'
38 | r'(:([0-5][0-9]))?'
39 | r'(~|-)'
40 | r'([0-1]?[0-9]|2[0-3])'
41 | r':([0-5][0-9])'
42 | r'(:([0-5][0-9]))?')
43 |
44 |
45 | def replace_time(match) -> str:
46 | """
47 | Args:
48 | match (re.Match)
49 | Returns:
50 | str
51 | """
52 |
53 | is_range = len(match.groups()) > 5
54 |
55 | hour = match.group(1)
56 | minute = match.group(2)
57 | second = match.group(4)
58 |
59 | if is_range:
60 | hour_2 = match.group(6)
61 | minute_2 = match.group(7)
62 | second_2 = match.group(9)
63 |
64 | result = f"{num2str(hour)}点"
65 | if minute.lstrip('0'):
66 | if int(minute) == 30:
67 | result += "半"
68 | else:
69 | result += f"{_time_num2str(minute)}分"
70 | if second and second.lstrip('0'):
71 | result += f"{_time_num2str(second)}秒"
72 |
73 | if is_range:
74 | result += "至"
75 | result += f"{num2str(hour_2)}点"
76 | if minute_2.lstrip('0'):
77 |             if int(minute_2) == 30:
78 | result += "半"
79 | else:
80 | result += f"{_time_num2str(minute_2)}分"
81 | if second_2 and second_2.lstrip('0'):
82 | result += f"{_time_num2str(second_2)}秒"
83 |
84 | return result
85 |
86 |
87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年'
88 | r'((0?[1-9]|1[0-2])月)?'
89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?')
90 |
91 |
92 | def replace_date(match) -> str:
93 | """
94 | Args:
95 | match (re.Match)
96 | Returns:
97 | str
98 | """
99 | year = match.group(1)
100 | month = match.group(3)
101 | day = match.group(5)
102 | result = ""
103 | if year:
104 | result += f"{verbalize_digit(year)}年"
105 | if month:
106 | result += f"{verbalize_cardinal(month)}月"
107 | if day:
108 | result += f"{verbalize_cardinal(day)}{match.group(9)}"
109 | return result
110 |
111 |
112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期
113 | RE_DATE2 = re.compile(
114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])')
115 |
116 |
117 | def replace_date2(match) -> str:
118 | """
119 | Args:
120 | match (re.Match)
121 | Returns:
122 | str
123 | """
124 | year = match.group(1)
125 | month = match.group(3)
126 | day = match.group(4)
127 | result = ""
128 | if year:
129 | result += f"{verbalize_digit(year)}年"
130 | if month:
131 | result += f"{verbalize_cardinal(month)}月"
132 | if day:
133 | result += f"{verbalize_cardinal(day)}日"
134 | return result
135 |
--------------------------------------------------------------------------------
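
A small sketch exercising the time and date patterns above with `re.sub`; the inputs mirror the zh_normalization README and the commented outputs are my reading of the code, so treat them as illustrative.

```python
import sys
sys.path.append("GPT_SoVITS")  # assumption: run from the repo root

from text.zh_normalization.chronology import (
    RE_TIME, RE_TIME_RANGE, RE_DATE, replace_time, replace_date)

print(RE_TIME.sub(replace_time, "等会请在12:05通知我"))        # -> 等会请在十二点零五分通知我
print(RE_TIME_RANGE.sub(replace_time, "营业时间8:30-12:30"))   # -> 营业时间八点半至十二点半
print(RE_DATE.sub(replace_date, "她出生于86年8月18日"))        # -> 她出生于八六年八月十八日
```
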
/GPT_SoVITS/text/zh_normalization/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 | import string
16 |
17 | from pypinyin.constants import SUPPORT_UCS4
18 |
19 | # 全角半角转换
20 | # 英文字符全角 -> 半角映射表 (num: 52)
21 | F2H_ASCII_LETTERS = {
22 | ord(char) + 65248: ord(char)
23 | for char in string.ascii_letters
24 | }
25 |
26 | # 英文字符半角 -> 全角映射表
27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
28 |
29 | # 数字字符全角 -> 半角映射表 (num: 10)
30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits}
31 | # 数字字符半角 -> 全角映射表
32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}
33 |
34 | # 标点符号全角 -> 半角映射表 (num: 32)
35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
36 | # 标点符号半角 -> 全角映射表
37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
38 |
39 | # 空格 (num: 1)
40 | F2H_SPACE = {'\u3000': ' '}
41 | H2F_SPACE = {' ': '\u3000'}
42 |
43 | # 非"有拼音的汉字"的字符串,可用于NSW提取
44 | if SUPPORT_UCS4:
45 | RE_NSW = re.compile(r'(?:[^'
46 | r'\u3007' # 〇
47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF]
51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F]
52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D]
53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F]
54 | r'])+')
55 | else:
56 | RE_NSW = re.compile( # pragma: no cover
57 | r'(?:[^'
58 | r'\u3007' # 〇
59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
62 | r'])+')
63 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/phonecode.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import verbalize_digit
17 |
18 | # 规范化固话/手机号码
19 | # 手机
20 | # http://www.jihaoba.com/news/show/13680
21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
22 | # 联通:130、131、132、156、155、186、185、176
23 | # 电信:133、153、189、180、181、177
24 | RE_MOBILE_PHONE = re.compile(
25 | r"(? str:
34 | if mobile:
35 | sp_parts = phone_string.strip('+').split()
36 | result = ','.join(
37 | [verbalize_digit(part, alt_one=True) for part in sp_parts])
38 | return result
39 | else:
40 | sil_parts = phone_string.split('-')
41 | result = ','.join(
42 | [verbalize_digit(part, alt_one=True) for part in sil_parts])
43 | return result
44 |
45 |
46 | def replace_phone(match) -> str:
47 | """
48 | Args:
49 | match (re.Match)
50 | Returns:
51 | str
52 | """
53 | return phone2str(match.group(0), mobile=False)
54 |
55 |
56 | def replace_mobile(match) -> str:
57 | """
58 | Args:
59 | match (re.Match)
60 | Returns:
61 | str
62 | """
63 | return phone2str(match.group(0))
64 |
--------------------------------------------------------------------------------
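
A quick sketch of the verbalization helper above; note that with alt_one=True the digit 1 is read as 幺, so the comments describe the shape of the output rather than quoting it exactly.

```python
import sys
sys.path.append("GPT_SoVITS")  # assumption: run from the repo root

from text.zh_normalization.phonecode import phone2str

print(phone2str("0421-33441122", mobile=False))  # landline: digits read out one by one, grouped at the dash
print(phone2str("+86 18544139121"))              # mobile: '+' stripped, space-separated groups read out digit by digit
```
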
/GPT_SoVITS/text/zh_normalization/quantifier.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import num2str
17 |
18 | # 温度表达式,温度会影响负号的读法
19 | # -3°C 零下三度
20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
21 | measure_dict = {
22 | "cm2": "平方厘米",
23 | "cm²": "平方厘米",
24 | "cm3": "立方厘米",
25 | "cm³": "立方厘米",
26 | "cm": "厘米",
27 | "db": "分贝",
28 | "ds": "毫秒",
29 | "kg": "千克",
30 | "km": "千米",
31 | "m2": "平方米",
32 | "m²": "平方米",
33 | "m³": "立方米",
34 | "m3": "立方米",
35 | "ml": "毫升",
36 | "m": "米",
37 | "mm": "毫米",
38 | "s": "秒"
39 | }
40 |
41 |
42 | def replace_temperature(match) -> str:
43 | """
44 | Args:
45 | match (re.Match)
46 | Returns:
47 | str
48 | """
49 | sign = match.group(1)
50 | temperature = match.group(2)
51 |     unit = match.group(4)  # group(3) is the optional decimal part; the unit is group(4)
52 | sign: str = "零下" if sign else ""
53 | temperature: str = num2str(temperature)
54 | unit: str = "摄氏度" if unit == "摄氏度" else "度"
55 | result = f"{sign}{temperature}{unit}"
56 | return result
57 |
58 |
59 | def replace_measure(sentence) -> str:
60 | for q_notation in measure_dict:
61 | if q_notation in sentence:
62 | sentence = sentence.replace(q_notation, measure_dict[q_notation])
63 | return sentence
64 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 RVC-Boss
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | <h1>GPT-SoVITS-VC-WebUI</h1>
4 | 强大的少样本语音转换与语音合成Web用户界面。
5 |
6 | [](https://github.com/RVC-Boss/GPT-SoVITS)
7 |
8 |

9 |
10 | [](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
11 | [](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
12 | [](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)
13 |
14 | [**English**](./docs/en/README.md) | [**中文简体**](./README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md)
15 |
16 |
17 |
18 | ---
19 | ## 本项目新增 Voice Conversion (VC) 变声
20 | ### 特点:
21 | 1. 无需任何训练
22 | 2. 不改任何配置
23 | 3. 支持非固定、任意目标音色变声,无需训练/微调,infer 时目标音色直接作为 prompt 输入
24 | ### 用法:
25 | 1. 按照源项目要求逐一配置,主要是下载好所有预训练模型
26 | 2. 打开终端,执行 python vc_webui.py
27 | ---
28 | ## 变声 demo 试听
29 | 目标音色:ChatGPT 助手音色
30 |
31 | 待转换音色:窃格瓦拉
32 | ### demo 1
33 | 原声:
34 |
35 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/b5a5c3aa-6620-48fc-9c08-9b0711fbd76a
36 |
37 | 变声:
38 |
39 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/6c29db50-471d-4a98-a557-55e356732662
40 |
41 | ### demo 2
42 | 原声:
43 |
44 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/9cb746c8-9d23-4fca-98be-94496af85d14
45 |
46 | 变声:
47 |
48 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/e71279b8-90a9-4dca-9214-31ed1803170a
49 |
50 |
51 | ### demo 3
52 | 原声:
53 |
54 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/ca745165-fc75-44f8-9d92-50097e8d4924
55 |
56 | 变声:
57 |
58 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/7378a676-1f89-4e0b-b931-b921228f7e2c
59 |
60 | ### demo 4
61 | 原声:
62 |
63 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/f2fbc07a-816a-469b-ac27-996b3c2a4cdf
64 |
65 | 变声:
66 |
67 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/dbcb0533-903d-49be-b03b-a8da4189b645
68 |
69 | ---
70 |
71 | ## Star History
72 | [](https://star-history.com/#huangxu1991/GPT-SoVITS-VC&Date)
73 |
--------------------------------------------------------------------------------
/colab_webui.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "include_colab_link": true
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "accelerator": "GPU"
14 | },
15 | "cells": [
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {
19 | "id": "view-in-github",
20 | "colab_type": "text"
21 | },
22 | "source": [
23 | "
"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "source": [
29 | "环境配置 environment"
30 | ],
31 | "metadata": {
32 | "id": "_o6a8GS2lWQM"
33 | }
34 | },
35 | {
36 | "cell_type": "code",
37 | "metadata": {
38 | "id": "e9b7iFV3dm1f"
39 | },
40 | "source": [
41 | "!pip install -q condacolab\n",
42 | "# Setting up condacolab and installing packages\n",
43 | "import condacolab\n",
44 | "condacolab.install_from_url(\"https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh\")\n",
45 | "%cd -q /content\n",
46 | "!git clone https://github.com/RVC-Boss/GPT-SoVITS\n",
47 | "!conda install -y -q -c pytorch -c nvidia cudatoolkit\n",
48 | "%cd -q /content/GPT-SoVITS\n",
49 | "!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n",
50 | "!/usr/local/bin/pip install -r requirements.txt"
51 | ],
52 | "execution_count": null,
53 | "outputs": []
54 | },
55 | {
56 | "cell_type": "code",
57 | "source": [
58 | "# @title Download pretrained models 下载预训练模型\n",
59 | "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
60 | "!mkdir -p /content/GPT-SoVITS/tools/damo_asr/models\n",
61 | "!mkdir -p /content/GPT-SoVITS/tools/uvr5\n",
62 | "%cd /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
63 | "!git clone https://huggingface.co/lj1995/GPT-SoVITS\n",
64 | "%cd /content/GPT-SoVITS/tools/damo_asr/models\n",
65 | "!git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n",
66 | "!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n",
67 | "!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n",
68 | "# @title UVR5 pretrains 安装uvr5模型\n",
69 | "%cd /content/GPT-SoVITS/tools/uvr5\n",
70 | "!git clone https://huggingface.co/Delik/uvr5_weights\n",
71 | "!git config core.sparseCheckout true\n",
72 | "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
73 | ],
74 | "metadata": {
75 | "id": "0NgxXg5sjv7z"
76 | },
77 | "execution_count": null,
78 | "outputs": []
79 | },
80 | {
81 | "cell_type": "code",
82 | "source": [
83 | "# @title launch WebUI 启动WebUI\n",
84 | "!/usr/local/bin/pip install ipykernel\n",
85 | "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
86 | "%cd /content/GPT-SoVITS/\n",
87 | "!/usr/local/bin/python webui.py"
88 | ],
89 | "metadata": {
90 | "id": "4oRGUzkrk8C7"
91 | },
92 | "execution_count": null,
93 | "outputs": []
94 | }
95 | ]
96 | }
97 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import sys,os
2 |
3 | import torch
4 |
5 | # 推理用的指定模型
6 | sovits_path = ""
7 | gpt_path = ""
8 | is_half_str = os.environ.get("is_half", "True")
9 | is_half = True if is_half_str.lower() == 'true' else False
10 | is_share_str = os.environ.get("is_share","False")
11 | is_share= True if is_share_str.lower() == 'true' else False
12 |
13 | cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
14 | bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
15 | pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
16 | pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
17 |
18 | exp_root = "logs"
19 | python_exec = sys.executable or "python"
20 | if torch.cuda.is_available():
21 | infer_device = "cuda"
22 | else:
23 | infer_device = "cpu"
24 |
25 | webui_port_main = 9874
26 | webui_port_uvr5 = 9873
27 | webui_port_infer_tts = 9872
28 | webui_port_subfix = 9871
29 |
30 | api_port = 9880
31 |
32 | if infer_device == "cuda":
33 | gpu_name = torch.cuda.get_device_name(0)
34 | if (
35 | ("16" in gpu_name and "V100" not in gpu_name.upper())
36 | or "P40" in gpu_name.upper()
37 | or "P10" in gpu_name.upper()
38 | or "1060" in gpu_name
39 | or "1070" in gpu_name
40 | or "1080" in gpu_name
41 | ):
42 | is_half=False
43 |
44 | if(infer_device=="cpu"):is_half=False
45 |
46 | class Config:
47 | def __init__(self):
48 | self.sovits_path = sovits_path
49 | self.gpt_path = gpt_path
50 | self.is_half = is_half
51 |
52 | self.cnhubert_path = cnhubert_path
53 | self.bert_path = bert_path
54 | self.pretrained_sovits_path = pretrained_sovits_path
55 | self.pretrained_gpt_path = pretrained_gpt_path
56 |
57 | self.exp_root = exp_root
58 | self.python_exec = python_exec
59 | self.infer_device = infer_device
60 |
61 | self.webui_port_main = webui_port_main
62 | self.webui_port_uvr5 = webui_port_uvr5
63 | self.webui_port_infer_tts = webui_port_infer_tts
64 | self.webui_port_subfix = webui_port_subfix
65 |
66 | self.api_port = api_port
67 |
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: '3.8'
2 |
3 | services:
4 | gpt-sovits:
5 | image: breakstring/gpt-sovits:latest # please change the image name and tag base your environment. If the tag contains the word 'elite', such as "latest-elite", it indicates that the image does not include the necessary models such as GPT-SoVITS, UVR5, Damo ASR, etc. You will need to download them yourself and map them into the container.
6 | container_name: gpt-sovits-container
7 | environment:
8 | - is_half=False
9 | - is_share=False
10 | volumes:
11 | - ./output:/workspace/output
12 | - ./logs:/workspace/logs
13 | - ./SoVITS_weights:/workspace/SoVITS_weights
14 | - ./reference:/workspace/reference
15 | working_dir: /workspace
16 | ports:
17 | - "9880:9880"
18 | - "9871:9871"
19 | - "9872:9872"
20 | - "9873:9873"
21 | - "9874:9874"
22 | shm_size: 16G
23 | deploy:
24 | resources:
25 | reservations:
26 | devices:
27 | - driver: nvidia
28 | count: "all"
29 | capabilities: [gpu]
30 | stdin_open: true
31 | tty: true
32 | restart: unless-stopped
33 |
--------------------------------------------------------------------------------
/dockerbuild.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # 获取当前日期,格式为 YYYYMMDD
4 | DATE=$(date +%Y%m%d)
5 | # 获取最新的 Git commit 哈希值的前 7 位
6 | COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7)
7 |
8 | # 构建 full 版本的镜像
9 | docker build --build-arg IMAGE_TYPE=full -t breakstring/gpt-sovits:latest .
10 | # 为同一个镜像添加带日期的标签
11 | docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$DATE
12 | # 为同一个镜像添加带当前代码库Commit哈希值的标签
13 | docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$COMMIT_HASH
14 |
15 |
16 | # 构建 elite 版本的镜像(无模型下载步骤,需手工将模型下载安装进容器)
17 | docker build --build-arg IMAGE_TYPE=elite -t breakstring/gpt-sovits:latest-elite .
18 | # 为同一个镜像添加带日期的标签
19 | docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$DATE-elite
20 | # 为同一个镜像添加带当前代码库Commit哈希值的标签
21 | docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$COMMIT_HASH-elite
22 |
--------------------------------------------------------------------------------
/docs/en/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | <h1>GPT-SoVITS-VC-WebUI</h1>
4 | A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.
5 |
6 | [](https://github.com/RVC-Boss/GPT-SoVITS)
7 |
8 |

9 |
10 | [](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
11 | [](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
12 | [](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)
13 |
14 | [**English**](./README.md) | [**中文简体**](../../README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md)
15 |
16 |
17 |
18 | ---
19 | ## Voice Conversion (VC) - newly added in this fork
20 | ### Features
21 | 1. No retraining required
22 | 2. Supports non-fixed / arbitrary target speaker voices without any training or fine-tuning. You can run inference directly through the WebUI!
23 | ### Usage
24 | 1. Configure everything as described in the source repo, mainly downloading all the pretrained models;
25 | 2. Open a terminal and run `python vc_webui.py`
26 | ---
27 |
28 | ## Demos
29 |
30 | Note: Target speaker from ChatGPT Assistant Voice
31 |
32 | ### 1. demo 1
33 |
34 | source:
35 |
36 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/39248790-87f5-484f-8f48-532078412a80
37 |
38 | target:
39 |
40 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/d055d970-6b28-44e0-af9c-e0db0dc01e8c
41 |
42 | ### 2. demo 2
43 |
44 | source:
45 |
46 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/08ab4451-bbeb-4940-a04a-a43df94f0e61
47 |
48 | target:
49 |
50 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/e1917fa3-e5e7-4d8e-89f1-e3f0a40fdd18
51 |
52 | ---
53 |
--------------------------------------------------------------------------------
/docs/ja/Changelog_JA.md:
--------------------------------------------------------------------------------
1 | ### 20240121 更新
2 |
3 | 1. `config`に`is_share`を追加し、Colab などの環境でこれを`True`に設定すると、webui を公共ネットワークにマッピングできます。
4 |
5 | 2. WebUI に英語システムの英語翻訳を追加しました。
6 |
7 | 3. `cmd-asr`は damo モデルが既に含まれているかどうかを自動的に確認し、デフォルトのパスにない場合は modelscope から自動的にダウンロードします。
8 |
9 | 4. [SoVITS 训练报错 ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 修復を試みます(長さ 0 のサンプルをフィルタリングなど)
10 |
11 | 5. TEMP ファイルフォルダからオーディオやその他のファイルをクリーンアップして最適化します。
12 |
13 | 6. 合成オーディオがリファレンスオーディオの終わりを含む問題を大幅に改善しました。
14 |
15 | ### 20240122 更新
16 |
17 | 1. 短すぎる出力ファイルが重複したリファレンスオーディオを返す問題を修正しました。
18 |
19 | 2. 英語-日本語学習がスムーズに進む QA を完了しました。(ただし、日本語学習はルートディレクトリに英語以外の文字が含まれていない必要があります)
20 |
21 | 3. オーディオパスをチェックします。間違ったパスを読み取ろうとすると、「パスが存在しません」というエラーメッセージが返されます。これは ffmpeg モジュールのエラーではありません。
22 |
23 | ### 20240123 更新
24 |
25 | 1. hubert から nan 抽出による SoVITS/GPT 学習中の ZeroDivisionError 関連エラーを修正しました。
26 |
27 | 2. 推論インターフェースでモデルを素早く切り替えることができるようにサポートしました。
28 |
29 | 3. モデルファイルのソートロジックを最適化しました。
30 |
31 | 4. 中国語の分析に`jieba_fast`を`jieba`に置き換えました。
32 |
33 | ### 20240126 更新
34 |
35 | 1. 中国語と英語、日本語と英語が混在した出力テキストをサポートします。
36 |
37 | 2. 出力で選択的な分割モードをサポートします。
38 |
39 | 3. uvr5 がディレクトリを読み取り、自動的に終了する問題を修正しました。
40 |
41 | 4. 複数の改行による推論エラーを修正しました。
42 |
43 | 5. 推論インターフェースから不要なログを削除しました。
44 |
45 | 6. MacOS での学習と推論をサポートします。
46 |
47 | 7. 半精度をサポートしていないカードを自動的に識別して単精度を強制し、CPU 推論では単精度を強制します。
48 |
49 | ### 20240128 更新
50 |
51 | 1. 数字を漢字で読む問題を修正しました。
52 |
53 | 2. 文章の先頭の一部の単語が欠落する問題を修正しました。
54 |
55 | 3. 不適切な長さのリファレンスオーディオを制限しました。
56 |
57 | 4. GPT 学習時の ckpt が保存されない問題を修正しました。
58 |
59 | 5. Dockerfile のモデルダウンロードプロセスを改善しました。
60 |
61 | ### 20240129 更新
62 |
63 | 1. 16 系などの半精度学習に問題があるカードは、学習構成を単精度学習に変更しました。
64 |
65 | 2. Colab でも使用可能なバージョンをテストして更新しました。
66 |
67 | 3. `git clone modelscope funasr`リポジトリと古いバージョンの funasr を使用してインターフェースが一致しないエラーを修正しました。
68 |
69 | ### 20240130 更新
70 |
71 | 1. パスと関連する文字列を解析して、二重引用符を自動的に削除します。また、パスをコピーする場合、二重引用符が含まれていてもエラーが発生しません。
72 |
73 | 2. 中国語と英語、日本語と英語の混合出力をサポートします。
74 |
75 | 3. 出力で選択的な分割モードをサポートします。
76 |
77 | todolist:
78 |
79 | 1. 同音異義語(中国語)の推論の最適化
80 |
81 | 2. 英語大文字認識と英語ハイフン [問題](https://github.com/RVC-Boss/GPT-SoVITS/issues/271)
82 |
83 | 3. テキストに%記号が含まれているとエラーが発生し、推論が不可能です。また、「元/吨」が「元吨」ではなく「元每吨」と読まれるなどの問題があります。このような問題を解決するには、どのライブラリを使用する必要があり、それに対する改善を検討しています。
84 |
85 | 4. 中-日-英、中-英、日-英を含む 5 つの言語をサポートすることを目標にしています。
86 |
--------------------------------------------------------------------------------
/docs/ko/Changelog_KO.md:
--------------------------------------------------------------------------------
1 | ### 20240121 업데이트
2 |
3 | 1. `config`에 `is_share` 추가, Colab 등의 환경에서 이를 `True`로 설정하여 webui를 공용 네트워크에 매핑되도록 할 수 있습니다.
4 | 2. WebUI에 영어 번역이 추가되었습니다.
5 | 3. `cmd-asr`은 damo 모델이 이미 포함되어 있는지 자동으로 확인하고, 기본 경로에 없는 경우 modelscope에서 자동 다운로드 되도록 수정하였습니다.
6 | 4. [SoVITS 학습 중 ZeroDivisionError가 발생](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)하는 경우 복구를 시도합니다. (길이가 0인 샘플 필터링 등)
7 | 5. TEMP 파일 폴더에서 오디오 및 기타 파일을 정리하여 최적화합니다.
8 | 6. 합성 오디오가 레퍼런스 오디오의 끝부분을 포함하는 문제를 개선하였습니다.
9 |
10 | ### 20240122 업데이트
11 |
12 | 1. 너무 짧은 출력 파일이 중복된 레퍼런스 오디오를 반환하는 문제 수정하였습니다.
13 | 2. 영어-일본어 학습이 원활하게 진행되는 QA를 완료하였습니다. (다만, 일본어 학습은 루트 디렉토리에 영어 이외의 문자가 없어야 합니다)
14 | 3. 오디오 경로를 검사합니다. 잘못된 경로를 읽으려고 할 때 '경로가 존재하지 않습니다'라는 에러 메시지를 반환하도록 수정하였습니다. 이는 ffmpeg 모듈의 에러가 아닙니다.
15 |
16 | ### 20240123 업데이트
17 |
18 | 1. hubert에서 nan 추출로 인한 SoVITS/GPT 학습 중 ZeroDivisionError 관련 에러를 해결하였습니다.
19 | 2. 추론 인터페이스에서 모델을 빠르게 전환할 수 있도록 지원하도록 수정되었습니다.
20 | 3. 모델 파일 정렬 로직 최적화하였습니다.
21 | 4. 중문 분석에 `jieba_fast`를 `jieba`로 대체하였습니다.
22 |
23 | ### 20240126 업데이트
24 |
25 | 1. 중국어와 영어, 일본어와 영어가 혼합된 출력 텍스트를 지원합니다.
26 | 2. 출력에서 선택적 분할 모드를 지원합니다.
27 | 3. uvr5가 디렉토리를 읽고 자동으로 종료되는 문제를 수정하였습니다.
28 | 4. 여러 줄바꿈으로 인한 추론 오류를 수정하였습니다.
29 | 5. 추론 인터페이스에서 불필요한 로그 제거하였습니다.
30 | 6. MacOS에서의 학습 및 추론을 지원합니다.
31 | 7. 반정밀을 지원하지 않는 카드를 자동으로 식별하여 단일 정밀도를 강제 적용하고, CPU 추론에서 단일 정밀도를 강제 적용합니다.
32 |
33 | ### 20240128 업데이트
34 |
35 | 1. 숫자를 한자로 읽는 문제를 수정했습니다.
36 | 2. 문장 시작 부분의 일부 단어가 누락되는 문제 수정하였습니다.
37 | 3. 부적절한 길이의 레퍼런스 오디오를 제한하였습니다.
38 | 4. GPT 학습 시 ckpt가 저장되지 않는 문제 수정하였습니다.
39 | 5. Dockerfile에서 모델 다운로드 프로세스 개선하였습니다.
40 |
41 | ### 20240129 업데이트
42 |
43 | 1. 반정밀도 훈련에 문제가 있는 16 시리즈 및 기타 그래픽 카드의 훈련 구성을 단정밀도 훈련으로 변경했습니다.
44 | 2. Colab에서도 사용이 가능한 버전을 테스트 및 업데이트 하였습니다.
45 | 3. `git clone modelscope funasr` 저장소와 오래된 버전의 funasr 사용으로 인해 인터페이스가 일치하지 않는 오류를 수정하였습니다.
46 |
47 | ### 20240130 업데이트
48 |
49 | 1. 경로와 관련된 문자열을 파싱하여 큰따옴표를 자동으로 제거합니다. 또한, 경로를 복사하는 경우 큰따옴표가 포함되어도 오류가 발생하지 않습니다.
50 | 2. 중국어 및 영어 문자열의 문장 부호가 잘리는 문제 및 문장의 시작과 끝에 문장 부호가 추가되는 문제를 수정했습니다.
51 | 3. 문장 부호의 수를 확장하였습니다.
52 |
53 | ### 20240201 업데이트
54 |
55 | 1. uvr5가 잘못된 형식으로 읽어들이는 문제를 수정하였습니다.
56 | 2. 중국어, 일본어, 영어가 혼합된 여러 텍스트를 자동으로 분리하여 언어를 인식합니다.
57 |
58 | ### 20240202 업데이트
59 |
60 | 1. asr 경로의 끝에 `/`가 포함되어 있는 경우 오류가 발생하는 문제를 수정하였습니다.
61 | 2. paddlespeech의 Normalizer를 도입하여 [문제를 해결](https://github.com/RVC-Boss/GPT-SoVITS/pull/377)하여, 예를 들어 xx.xx%(백분율), 元/吨이 元吨으로 읽히는 문제를 해결하였습니다. 또한, 밑줄이 더 이상 오류를 발생시키지 않습니다.
62 |
63 | ### 20240207 업데이트
64 |
65 | 1. 언어 전달 매개변수가 혼란스러워져 [중국어 추론 효과가 저하되는 문제](https://github.com/RVC-Boss/GPT-SoVITS/issues/391)를 수정하였습니다.
66 | 2. uvr5가 `inf everywhere` [오류를 반환하는 문제](https://github.com/RVC-Boss/GPT-SoVITS/pull/403)를 수정하였습니다.
67 | 3. uvr5의 `is_half` 매개변수가 bool로 변환되지 않아 항상 반정밀도 추론으로 설정되어 16 시리즈 그래픽 카드에서 `inf`가 반환되는 [문제](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)를 수정하였습니다.
68 | 4. 영어 텍스트 입력을 최적화하였습니다.
69 | 5. gradio 종속성을 지원합니다.
70 | 6. 루트 디렉토리가 비어 있으면 `.list` 전체 경로를 자동으로 읽습니다.
71 | 7. faster whisper ASR 일본어 및 영어를 지원합니다.
72 |
73 | ### 20240208 업데이트
74 |
75 | 1. GPT 학습이 카드에 따라 멈추는 문제와 [GPT 학습 중 ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) 문제를 수정하였습니다.
76 |
77 | ### 20240212 업데이트
78 |
79 | 1. faster whisper 및 funasr 로직을 최적화하였습니다. faster whisper는 이미지 스토어에서 다운로드하여 huggingface에 연결하지 못하는 문제를 회피합니다.
80 | 2. DPO Loss 실험적 학습 옵션을 활성화하여 부정적 샘플을 생성하여 [GPT 반복 및 누락 문자 문제](https://github.com/RVC-Boss/GPT-SoVITS/pull/457)를 완화합니다. 추론 인터페이스에 몇 가지 추론 매개변수를 공개합니다.
81 |
82 | ### 20240214 업데이트
83 |
84 | 1. 학습에서 중국어 실험 이름을 지원합니다. (이전에 오류가 발생했습니다)
85 | 2. DPO 학습을 선택적으로 설정할 수 있도록 변경하였습니다. 배치 크기를 선택하면 자동으로 절반으로 줄어듭니다. 추론 인터페이스에서 새로운 매개변수를 전달하지 않는 문제를 수정하였습니다.
86 |
87 | ### 20240216 업데이트
88 |
89 | 1. 참조 텍스트 입력을 지원합니다.
90 | 2. 프론트엔드에 있던 중국어 텍스트 입력 버그를 수정하였습니다.
91 |
92 | todolist :
93 |
94 | 1. 중국어 다음음자 추론 최적화
95 |
--------------------------------------------------------------------------------
/go-webui.bat:
--------------------------------------------------------------------------------
1 | runtime\python.exe vc_webui.py
2 | pause
3 |
--------------------------------------------------------------------------------
/go-webui.ps1:
--------------------------------------------------------------------------------
1 | $ErrorActionPreference = "SilentlyContinue"
2 | chcp 65001
3 | & "$PSScriptRoot\runtime\python.exe" "$PSScriptRoot\vc_webui.py"
4 | pause
5 |
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | conda install -c conda-forge gcc
3 | conda install -c conda-forge gxx
4 | conda install ffmpeg cmake
5 | conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia
6 | pip install -r requirements.txt
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | tensorboard
4 | librosa==0.9.2
5 | numba==0.56.4
6 | pytorch-lightning
7 | gradio==3.38.0
8 | gradio_client==0.8.1
9 | ffmpeg-python
10 | onnxruntime
11 | tqdm
12 | funasr==1.0.0
13 | cn2an
14 | pypinyin
15 | pyopenjtalk
16 | g2p_en
17 | torchaudio
18 | modelscope==1.10.0
19 | sentencepiece
20 | transformers
21 | chardet
22 | PyYAML
23 | psutil
24 | jieba_fast
25 | jieba
26 | LangSegment>=0.2.0
27 | Faster_Whisper
--------------------------------------------------------------------------------
/tools/asr/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | def check_fw_local_models():
4 | '''
5 |     Check at startup whether Faster Whisper models are available locally.
6 | '''
7 | model_size_list = [
8 | "tiny", "tiny.en",
9 | "base", "base.en",
10 | "small", "small.en",
11 | "medium", "medium.en",
12 | "large", "large-v1",
13 | "large-v2", "large-v3"]
14 | for i, size in enumerate(model_size_list):
15 | if os.path.exists(f'tools/asr/models/faster-whisper-{size}'):
16 | model_size_list[i] = size + '-local'
17 | return model_size_list
18 |
19 | asr_dict = {
20 | "达摩 ASR (中文)": {
21 | 'lang': ['zh'],
22 | 'size': ['large'],
23 | 'path': 'funasr_asr.py',
24 | },
25 | "Faster Whisper (多语种)": {
26 | 'lang': ['auto', 'zh', 'en', 'ja'],
27 | 'size': check_fw_local_models(),
28 | 'path': 'fasterwhisper_asr.py'
29 | }
30 | }
31 |
32 |
--------------------------------------------------------------------------------
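A minimal usage sketch for the ASR config above, assuming it is run from the repository root so the relative `tools/asr/models/...` paths in `check_fw_local_models()` resolve; the printed sizes depend on which models have already been downloaded:

    from tools.asr.config import asr_dict, check_fw_local_models

    # Sizes gain a '-local' suffix when the corresponding model folder exists.
    print(check_fw_local_models())

    # Each entry maps a UI label to the languages, sizes, and worker script it uses.
    fw = asr_dict["Faster Whisper (多语种)"]
    print(fw["lang"], fw["path"])
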
/tools/asr/fasterwhisper_asr.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | os.environ["HF_ENDPOINT"]="https://hf-mirror.com"
4 | import traceback
5 | import requests
6 | from glob import glob
7 |
8 | from faster_whisper import WhisperModel
9 | from tqdm import tqdm
10 |
11 | from tools.asr.config import check_fw_local_models
12 | from tools.asr.funasr_asr import only_asr
13 |
14 | os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
15 |
16 | language_code_list = [
17 | "af", "am", "ar", "as", "az",
18 | "ba", "be", "bg", "bn", "bo",
19 | "br", "bs", "ca", "cs", "cy",
20 | "da", "de", "el", "en", "es",
21 | "et", "eu", "fa", "fi", "fo",
22 | "fr", "gl", "gu", "ha", "haw",
23 | "he", "hi", "hr", "ht", "hu",
24 | "hy", "id", "is", "it", "ja",
25 | "jw", "ka", "kk", "km", "kn",
26 | "ko", "la", "lb", "ln", "lo",
27 | "lt", "lv", "mg", "mi", "mk",
28 | "ml", "mn", "mr", "ms", "mt",
29 | "my", "ne", "nl", "nn", "no",
30 | "oc", "pa", "pl", "ps", "pt",
31 | "ro", "ru", "sa", "sd", "si",
32 | "sk", "sl", "sn", "so", "sq",
33 | "sr", "su", "sv", "sw", "ta",
34 | "te", "tg", "th", "tk", "tl",
35 | "tr", "tt", "uk", "ur", "uz",
36 | "vi", "yi", "yo", "zh", "yue",
37 | "auto"]
38 |
39 | def execute_asr(input_folder, output_folder, model_size, language,precision):
40 | if '-local' in model_size:
41 | model_size = model_size[:-6]
42 | model_path = f'tools/asr/models/faster-whisper-{model_size}'
43 | else:
44 | model_path = model_size
45 | if language == 'auto':
46 |         language = None  # leave unset so the model outputs the most probable language
47 | print("loading faster whisper model:",model_size,model_path)
48 | try:
49 | model = WhisperModel(model_path, device="cuda", compute_type=precision)
50 | except:
51 | return print(traceback.format_exc())
52 | output = []
53 | output_file_name = os.path.basename(input_folder)
54 | output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')
55 |
56 | if not os.path.exists(output_folder):
57 | os.makedirs(output_folder)
58 |
59 | for file in tqdm(glob(os.path.join(input_folder, '**/*.wav'), recursive=True)):
60 | try:
61 | segments, info = model.transcribe(
62 | audio = file,
63 | beam_size = 5,
64 | vad_filter = True,
65 | vad_parameters = dict(min_silence_duration_ms=700),
66 | language = language)
67 | text = ''
68 |
69 | if info.language == "zh":
70 | print("检测为中文文本,转funasr处理")
71 | text = only_asr(file)
72 |
73 | if text == '':
74 | for segment in segments:
75 | text += segment.text
76 | output.append(f"{file}|{output_file_name}|{info.language.upper()}|{text}")
77 | except:
78 | return print(traceback.format_exc())
79 |
80 | with open(output_file_path, "w", encoding="utf-8") as f:
81 | f.write("\n".join(output))
82 | print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
83 | return output_file_path
84 |
85 | if __name__ == '__main__':
86 | parser = argparse.ArgumentParser()
87 | parser.add_argument("-i", "--input_folder", type=str, required=True,
88 | help="Path to the folder containing WAV files.")
89 | parser.add_argument("-o", "--output_folder", type=str, required=True,
90 | help="Output folder to store transcriptions.")
91 | parser.add_argument("-s", "--model_size", type=str, default='large-v3',
92 | choices=check_fw_local_models(),
93 | help="Model Size of Faster Whisper")
94 | parser.add_argument("-l", "--language", type=str, default='ja',
95 | choices=language_code_list,
96 | help="Language of the audio files.")
97 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
98 | help="fp16 or fp32")
99 |
100 | cmd = parser.parse_args()
101 | output_file_path = execute_asr(
102 | input_folder = cmd.input_folder,
103 | output_folder = cmd.output_folder,
104 | model_size = cmd.model_size,
105 | language = cmd.language,
106 | precision = cmd.precision,
107 | )
108 |
--------------------------------------------------------------------------------
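A hedged sketch of calling `execute_asr()` above directly rather than through the CLI. It assumes a CUDA device is available (the script hard-codes `device="cuda"`) and that `my_wavs/` is a hypothetical folder of `.wav` files; note that importing this module also imports `funasr_asr`, which instantiates its models at import time:

    from tools.asr.fasterwhisper_asr import execute_asr

    list_path = execute_asr(
        input_folder="my_wavs",         # hypothetical folder of .wav files
        output_folder="output/asr_opt",
        model_size="large-v3",
        language="ja",                  # or "auto" to let the model detect it
        precision="float16",
    )
    print(list_path)  # path of the generated .list annotation file
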
/tools/asr/funasr_asr.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import argparse
4 | import os
5 | import traceback
6 | from tqdm import tqdm
7 |
8 | from funasr import AutoModel
9 |
10 | path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
11 | path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch'
12 | path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch'
13 | path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
14 | path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
15 | path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
16 |
17 | model = AutoModel(
18 | model = path_asr,
19 | model_revision = "v2.0.4",
20 | vad_model = path_vad,
21 | vad_model_revision = "v2.0.4",
22 | punc_model = path_punc,
23 | punc_model_revision = "v2.0.4",
24 | )
25 |
26 | def only_asr(input_file):
27 | try:
28 | text = model.generate(input=input_file)[0]["text"]
29 | except:
30 | text = ''
31 | print(traceback.format_exc())
32 | return text
33 |
34 | def execute_asr(input_folder, output_folder, model_size, language):
35 | input_file_names = os.listdir(input_folder)
36 | input_file_names.sort()
37 |
38 | output = []
39 | output_file_name = os.path.basename(input_folder)
40 |
41 | for name in tqdm(input_file_names):
42 | try:
43 | text = model.generate(input="%s/%s"%(input_folder, name))[0]["text"]
44 | output.append(f"{input_folder}/{name}|{output_file_name}|{language.upper()}|{text}")
45 | except:
46 | print(traceback.format_exc())
47 |
48 | output_folder = output_folder or "output/asr_opt"
49 | os.makedirs(output_folder, exist_ok=True)
50 | output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')
51 |
52 | with open(output_file_path, "w", encoding="utf-8") as f:
53 | f.write("\n".join(output))
54 | print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
55 | return output_file_path
56 |
57 | if __name__ == '__main__':
58 | parser = argparse.ArgumentParser()
59 | parser.add_argument("-i", "--input_folder", type=str, required=True,
60 | help="Path to the folder containing WAV files.")
61 | parser.add_argument("-o", "--output_folder", type=str, required=True,
62 | help="Output folder to store transcriptions.")
63 | parser.add_argument("-s", "--model_size", type=str, default='large',
64 |                         help="Model size of FunASR (only 'large' is available)")
65 | parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh'],
66 | help="Language of the audio files.")
67 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
68 |                         help="fp16 or fp32")  # not wired up yet
69 |
70 | cmd = parser.parse_args()
71 | execute_asr(
72 | input_folder = cmd.input_folder,
73 | output_folder = cmd.output_folder,
74 | model_size = cmd.model_size,
75 | language = cmd.language,
76 | )
77 |
--------------------------------------------------------------------------------
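A small sketch of `only_asr()` from the module above; importing `funasr_asr` builds the paraformer/VAD/punctuation `AutoModel` at import time, so the first run downloads those models unless local copies already sit under `tools/asr/models/`. The audio path is hypothetical:

    from tools.asr.funasr_asr import only_asr

    text = only_asr("my_wavs/sample.wav")  # hypothetical wav file
    print(text)
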
/tools/asr/models/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
--------------------------------------------------------------------------------
/tools/cmd-denoise.py:
--------------------------------------------------------------------------------
1 | import os,argparse
2 |
3 | from modelscope.pipelines import pipeline
4 | from modelscope.utils.constant import Tasks
5 | from tqdm import tqdm
6 |
7 | path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k'
8 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
9 | ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise)
10 | def execute_denoise(input_folder,output_folder):
11 | os.makedirs(output_folder,exist_ok=True)
12 | # print(input_folder)
13 | # print(list(os.listdir(input_folder).sort()))
14 | for name in tqdm(os.listdir(input_folder)):
15 | ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name))
16 |
17 | if __name__ == '__main__':
18 | parser = argparse.ArgumentParser()
19 | parser.add_argument("-i", "--input_folder", type=str, required=True,
20 | help="Path to the folder containing WAV files.")
21 | parser.add_argument("-o", "--output_folder", type=str, required=True,
22 | help="Output folder to store transcriptions.")
23 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
24 |                         help="fp16 or fp32")  # not wired up yet
25 | cmd = parser.parse_args()
26 | execute_denoise(
27 | input_folder = cmd.input_folder,
28 | output_folder = cmd.output_folder,
29 | )
--------------------------------------------------------------------------------
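Since `cmd-denoise.py` builds its modelscope pipeline at import time (downloading `damo/speech_frcrn_ans_cirm_16k` if no local copy exists under `tools/denoise-model/`), the simplest way to exercise it is as a subprocess from the repository root; folder names here are illustrative:

    import subprocess, sys

    subprocess.run([
        sys.executable, "tools/cmd-denoise.py",
        "-i", "my_wavs",              # hypothetical folder of noisy .wav files
        "-o", "output/denoise_opt",   # denoised copies are written here
    ], check=True)
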
/tools/denoise-model/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/tools/i18n/i18n.py:
--------------------------------------------------------------------------------
1 | import json
2 | import locale
3 | import os
4 |
5 |
6 | def load_language_list(language):
7 | with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f:
8 | language_list = json.load(f)
9 | return language_list
10 |
11 |
12 | class I18nAuto:
13 | def __init__(self, language=None):
14 | if language in ["Auto", None]:
15 | language = locale.getdefaultlocale()[
16 | 0
17 | ] # getlocale can't identify the system's language ((None, None))
18 | if not os.path.exists(f"./i18n/locale/{language}.json"):
19 | language = "en_US"
20 | self.language = language
21 | self.language_map = load_language_list(language)
22 |
23 | def __call__(self, key):
24 | return self.language_map.get(key, key)
25 |
26 | def __repr__(self):
27 | return "Use Language: " + self.language
28 |
--------------------------------------------------------------------------------
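A short usage sketch for `I18nAuto` above. `load_language_list()` reads `./i18n/locale/<lang>.json` relative to the current working directory, so this assumes it is run from the repository root, where `i18n/locale/` exists; unknown locales fall back to `en_US`, and unmapped keys are returned unchanged:

    from tools.i18n.i18n import I18nAuto

    i18n = I18nAuto()        # or I18nAuto("zh_CN") to force a locale
    print(i18n)              # "Use Language: ..."
    print(i18n("some key"))  # translated string, or the key itself if unmapped
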
/tools/i18n/locale_diff.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from collections import OrderedDict
4 |
5 | # Define the standard file name
6 | standard_file = "locale/zh_CN.json"
7 |
8 | # Find all JSON files in the directory
9 | dir_path = "locale/"
10 | languages = [
11 | os.path.join(dir_path, f)
12 | for f in os.listdir(dir_path)
13 | if f.endswith(".json") and f != standard_file
14 | ]
15 |
16 | # Load the standard file
17 | with open(standard_file, "r", encoding="utf-8") as f:
18 | standard_data = json.load(f, object_pairs_hook=OrderedDict)
19 |
20 | # Loop through each language file
21 | for lang_file in languages:
22 | # Load the language file
23 | with open(lang_file, "r", encoding="utf-8") as f:
24 | lang_data = json.load(f, object_pairs_hook=OrderedDict)
25 |
26 |     # Keys present in the standard file but missing from the language file
27 | diff = set(standard_data.keys()) - set(lang_data.keys())
28 |
29 | miss = set(lang_data.keys()) - set(standard_data.keys())
30 |
31 | # Add any missing keys to the language file
32 | for key in diff:
33 | lang_data[key] = key
34 |
35 |     # Delete any extra keys that are not in the standard file
36 | for key in miss:
37 | del lang_data[key]
38 |
39 | # Sort the keys of the language file to match the order of the standard file
40 | lang_data = OrderedDict(
41 | sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
42 | )
43 |
44 | # Save the updated language file
45 | with open(lang_file, "w", encoding="utf-8") as f:
46 | json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
47 | f.write("\n")
48 |
--------------------------------------------------------------------------------
/tools/i18n/scan_i18n.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import glob
3 | import json
4 | from collections import OrderedDict
5 |
6 |
7 | def extract_i18n_strings(node):
8 | i18n_strings = []
9 |
10 | if (
11 | isinstance(node, ast.Call)
12 | and isinstance(node.func, ast.Name)
13 | and node.func.id == "i18n"
14 | ):
15 | for arg in node.args:
16 | if isinstance(arg, ast.Str):
17 | i18n_strings.append(arg.s)
18 |
19 | for child_node in ast.iter_child_nodes(node):
20 | i18n_strings.extend(extract_i18n_strings(child_node))
21 |
22 | return i18n_strings
23 |
24 |
25 | # scan the directory for all .py files (recursively)
26 | # for each file, parse the code into an AST
27 | # for each AST, extract the i18n strings
28 |
29 | strings = []
30 | for filename in glob.iglob("**/*.py", recursive=True):
31 | with open(filename, "r") as f:
32 | code = f.read()
33 | if "I18nAuto" in code:
34 | tree = ast.parse(code)
35 | i18n_strings = extract_i18n_strings(tree)
36 | print(filename, len(i18n_strings))
37 | strings.extend(i18n_strings)
38 | code_keys = set(strings)
39 | """
40 | n_i18n.py
41 | gui_v1.py 26
42 | app.py 16
43 | infer-web.py 147
44 | scan_i18n.py 0
45 | i18n.py 0
46 | lib/train/process_ckpt.py 1
47 | """
48 | print()
49 | print("Total unique:", len(code_keys))
50 |
51 |
52 | standard_file = "i18n/locale/zh_CN.json"
53 | with open(standard_file, "r", encoding="utf-8") as f:
54 | standard_data = json.load(f, object_pairs_hook=OrderedDict)
55 | standard_keys = set(standard_data.keys())
56 |
57 | # Define the standard file name
58 | unused_keys = standard_keys - code_keys
59 | print("Unused keys:", len(unused_keys))
60 | for unused_key in unused_keys:
61 | print("\t", unused_key)
62 |
63 | missing_keys = code_keys - standard_keys
64 | print("Missing keys:", len(missing_keys))
65 | for missing_key in missing_keys:
66 | print("\t", missing_key)
67 |
68 | code_keys_dict = OrderedDict()
69 | for s in strings:
70 | code_keys_dict[s] = s
71 |
72 | # write back
73 | with open(standard_file, "w", encoding="utf-8") as f:
74 | json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
75 | f.write("\n")
76 |
--------------------------------------------------------------------------------
/tools/my_utils.py:
--------------------------------------------------------------------------------
1 | import platform,os,traceback
2 | import ffmpeg
3 | import numpy as np
4 |
5 |
6 | def load_audio(file, sr):
7 | try:
8 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
9 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
10 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
11 |         file = clean_path(file)  # guard against pasted paths with stray spaces, quotes, or newlines
12 |         if not os.path.exists(file):
13 |             raise RuntimeError(
14 |                 "You input a wrong audio path that does not exist, please fix it!"
15 |             )
16 | out, _ = (
17 | ffmpeg.input(file, threads=0)
18 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
19 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
20 | )
21 | except Exception as e:
22 | traceback.print_exc()
23 | raise RuntimeError(f"Failed to load audio: {e}")
24 |
25 | return np.frombuffer(out, np.float32).flatten()
26 |
27 |
28 | def clean_path(path_str):
29 | if platform.system() == 'Windows':
30 | path_str = path_str.replace('/', '\\')
31 | return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
32 |
--------------------------------------------------------------------------------
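`load_audio()` above shells out to the ffmpeg CLI (which therefore must be on PATH) and returns a mono float32 numpy array resampled to the requested rate; a minimal sketch with a hypothetical path:

    from tools.my_utils import load_audio

    audio = load_audio("my_wavs/sample.wav", 32000)  # hypothetical input file
    print(audio.dtype, audio.shape)                  # float32, (num_samples,)
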
/tools/slice_audio.py:
--------------------------------------------------------------------------------
1 | import os,sys,numpy as np
2 | import traceback
3 | from scipy.io import wavfile
4 | # parent_directory = os.path.dirname(os.path.abspath(__file__))
5 | # sys.path.append(parent_directory)
6 | from my_utils import load_audio
7 | from slicer2 import Slicer
8 |
9 | def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part):
10 | os.makedirs(opt_root,exist_ok=True)
11 | if os.path.isfile(inp):
12 | input=[inp]
13 | elif os.path.isdir(inp):
14 | input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))]
15 | else:
16 | return "输入路径存在但既不是文件也不是文件夹"
17 | slicer = Slicer(
18 |         sr=32000,  # sample rate of the long audio
19 |         threshold= int(threshold),  # volume below this value is treated as silence and becomes a candidate cut point
20 |         min_length= int(min_length),  # minimum length of each slice; a too-short leading segment keeps merging with the following ones until it exceeds this value
21 |         min_interval= int(min_interval),  # minimum interval between cuts
22 |         hop_size= int(hop_size),  # hop size for the volume curve; smaller means finer resolution and more computation (finer is not necessarily better)
23 |         max_sil_kept= int(max_sil_kept),  # maximum silence kept around each cut
24 | )
25 | _max=float(_max)
26 | alpha=float(alpha)
27 | for inp_path in input[int(i_part)::int(all_part)]:
28 | # print(inp_path)
29 | try:
30 | name = os.path.basename(inp_path)
31 | audio = load_audio(inp_path, 32000)
32 | # print(audio.shape)
33 |             for chunk, start, end in slicer.slice(audio):  # start and end are frame indices
34 | tmp_max = np.abs(chunk).max()
35 | if(tmp_max>1):chunk/=tmp_max
36 | chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk
37 | wavfile.write(
38 | "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end),
39 | 32000,
40 | # chunk.astype(np.float32),
41 | (chunk * 32767).astype(np.int16),
42 | )
43 | except:
44 | print(inp_path,"->fail->",traceback.format_exc())
45 | return "执行完毕,请检查输出文件"
46 |
47 | print(slice(*sys.argv[1:]))
48 |
49 |
--------------------------------------------------------------------------------
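Because `slice_audio.py` ends with an unguarded `print(slice(*sys.argv[1:]))`, it is meant to be launched as a subprocess with positional arguments, which is how the webui drives it. A hedged sketch from the repository root; the numeric values are illustrative defaults and the paths are hypothetical:

    import subprocess, sys

    subprocess.run([
        sys.executable, "tools/slice_audio.py",
        "my_wavs/long_take.wav",            # input file (a folder also works)
        "output/slicer_opt",                # output folder for the slices
        "-34", "4000", "300", "10", "500",  # threshold, min_length, min_interval, hop_size, max_sil_kept
        "0.9", "0.25",                      # _max, alpha
        "0", "1",                           # i_part, all_part (shard 0 of 1)
    ], check=True)
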
/tools/uvr5/lib/lib_v5/layers.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(
67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68 | ):
69 | super(Decoder, self).__init__()
70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71 | self.dropout = nn.Dropout2d(0.1) if dropout else None
72 |
73 | def __call__(self, x, skip=None):
74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75 | if skip is not None:
76 | skip = spec_utils.crop_center(skip, x)
77 | x = torch.cat([x, skip], dim=1)
78 | h = self.conv(x)
79 |
80 | if self.dropout is not None:
81 | h = self.dropout(h)
82 |
83 | return h
84 |
85 |
86 | class ASPPModule(nn.Module):
87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
88 | super(ASPPModule, self).__init__()
89 | self.conv1 = nn.Sequential(
90 | nn.AdaptiveAvgPool2d((1, None)),
91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92 | )
93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94 | self.conv3 = SeperableConv2DBNActiv(
95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96 | )
97 | self.conv4 = SeperableConv2DBNActiv(
98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99 | )
100 | self.conv5 = SeperableConv2DBNActiv(
101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102 | )
103 | self.bottleneck = nn.Sequential(
104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
105 | )
106 |
107 | def forward(self, x):
108 | _, _, h, w = x.size()
109 | feat1 = F.interpolate(
110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
111 | )
112 | feat2 = self.conv2(x)
113 | feat3 = self.conv3(x)
114 | feat4 = self.conv4(x)
115 | feat5 = self.conv5(x)
116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
117 | bottle = self.bottleneck(out)
118 | return bottle
119 |
--------------------------------------------------------------------------------
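A quick shape check for the `ASPPModule` defined above (the same block recurs in the `layers_*KB.py` variants that follow): five parallel branches are concatenated to `nin * 5` channels and the bottleneck maps them to `nout`, with height and width preserved. The tensor sizes are arbitrary:

    import torch
    from tools.uvr5.lib.lib_v5.layers import ASPPModule

    aspp = ASPPModule(nin=16, nout=32)
    x = torch.randn(1, 16, 64, 128)  # batch, channels, freq bins, frames
    print(aspp(x).shape)             # torch.Size([1, 32, 64, 128])
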
/tools/uvr5/lib/lib_v5/layers_123812KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(
67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68 | ):
69 | super(Decoder, self).__init__()
70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71 | self.dropout = nn.Dropout2d(0.1) if dropout else None
72 |
73 | def __call__(self, x, skip=None):
74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75 | if skip is not None:
76 | skip = spec_utils.crop_center(skip, x)
77 | x = torch.cat([x, skip], dim=1)
78 | h = self.conv(x)
79 |
80 | if self.dropout is not None:
81 | h = self.dropout(h)
82 |
83 | return h
84 |
85 |
86 | class ASPPModule(nn.Module):
87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
88 | super(ASPPModule, self).__init__()
89 | self.conv1 = nn.Sequential(
90 | nn.AdaptiveAvgPool2d((1, None)),
91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92 | )
93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94 | self.conv3 = SeperableConv2DBNActiv(
95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96 | )
97 | self.conv4 = SeperableConv2DBNActiv(
98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99 | )
100 | self.conv5 = SeperableConv2DBNActiv(
101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102 | )
103 | self.bottleneck = nn.Sequential(
104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
105 | )
106 |
107 | def forward(self, x):
108 | _, _, h, w = x.size()
109 | feat1 = F.interpolate(
110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
111 | )
112 | feat2 = self.conv2(x)
113 | feat3 = self.conv3(x)
114 | feat4 = self.conv4(x)
115 | feat5 = self.conv5(x)
116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
117 | bottle = self.bottleneck(out)
118 | return bottle
119 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/layers_123821KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(
67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68 | ):
69 | super(Decoder, self).__init__()
70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71 | self.dropout = nn.Dropout2d(0.1) if dropout else None
72 |
73 | def __call__(self, x, skip=None):
74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75 | if skip is not None:
76 | skip = spec_utils.crop_center(skip, x)
77 | x = torch.cat([x, skip], dim=1)
78 | h = self.conv(x)
79 |
80 | if self.dropout is not None:
81 | h = self.dropout(h)
82 |
83 | return h
84 |
85 |
86 | class ASPPModule(nn.Module):
87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
88 | super(ASPPModule, self).__init__()
89 | self.conv1 = nn.Sequential(
90 | nn.AdaptiveAvgPool2d((1, None)),
91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92 | )
93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94 | self.conv3 = SeperableConv2DBNActiv(
95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96 | )
97 | self.conv4 = SeperableConv2DBNActiv(
98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99 | )
100 | self.conv5 = SeperableConv2DBNActiv(
101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102 | )
103 | self.bottleneck = nn.Sequential(
104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
105 | )
106 |
107 | def forward(self, x):
108 | _, _, h, w = x.size()
109 | feat1 = F.interpolate(
110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
111 | )
112 | feat2 = self.conv2(x)
113 | feat3 = self.conv3(x)
114 | feat4 = self.conv4(x)
115 | feat5 = self.conv5(x)
116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
117 | bottle = self.bottleneck(out)
118 | return bottle
119 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/layers_33966KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(
67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68 | ):
69 | super(Decoder, self).__init__()
70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71 | self.dropout = nn.Dropout2d(0.1) if dropout else None
72 |
73 | def __call__(self, x, skip=None):
74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75 | if skip is not None:
76 | skip = spec_utils.crop_center(skip, x)
77 | x = torch.cat([x, skip], dim=1)
78 | h = self.conv(x)
79 |
80 | if self.dropout is not None:
81 | h = self.dropout(h)
82 |
83 | return h
84 |
85 |
86 | class ASPPModule(nn.Module):
87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
88 | super(ASPPModule, self).__init__()
89 | self.conv1 = nn.Sequential(
90 | nn.AdaptiveAvgPool2d((1, None)),
91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92 | )
93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94 | self.conv3 = SeperableConv2DBNActiv(
95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96 | )
97 | self.conv4 = SeperableConv2DBNActiv(
98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99 | )
100 | self.conv5 = SeperableConv2DBNActiv(
101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102 | )
103 | self.conv6 = SeperableConv2DBNActiv(
104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
105 | )
106 | self.conv7 = SeperableConv2DBNActiv(
107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
108 | )
109 | self.bottleneck = nn.Sequential(
110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
111 | )
112 |
113 | def forward(self, x):
114 | _, _, h, w = x.size()
115 | feat1 = F.interpolate(
116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
117 | )
118 | feat2 = self.conv2(x)
119 | feat3 = self.conv3(x)
120 | feat4 = self.conv4(x)
121 | feat5 = self.conv5(x)
122 | feat6 = self.conv6(x)
123 | feat7 = self.conv7(x)
124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
125 | bottle = self.bottleneck(out)
126 | return bottle
127 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/layers_537227KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(
67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68 | ):
69 | super(Decoder, self).__init__()
70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71 | self.dropout = nn.Dropout2d(0.1) if dropout else None
72 |
73 | def __call__(self, x, skip=None):
74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75 | if skip is not None:
76 | skip = spec_utils.crop_center(skip, x)
77 | x = torch.cat([x, skip], dim=1)
78 | h = self.conv(x)
79 |
80 | if self.dropout is not None:
81 | h = self.dropout(h)
82 |
83 | return h
84 |
85 |
86 | class ASPPModule(nn.Module):
87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
88 | super(ASPPModule, self).__init__()
89 | self.conv1 = nn.Sequential(
90 | nn.AdaptiveAvgPool2d((1, None)),
91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92 | )
93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94 | self.conv3 = SeperableConv2DBNActiv(
95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96 | )
97 | self.conv4 = SeperableConv2DBNActiv(
98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99 | )
100 | self.conv5 = SeperableConv2DBNActiv(
101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102 | )
103 | self.conv6 = SeperableConv2DBNActiv(
104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
105 | )
106 | self.conv7 = SeperableConv2DBNActiv(
107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
108 | )
109 | self.bottleneck = nn.Sequential(
110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
111 | )
112 |
113 | def forward(self, x):
114 | _, _, h, w = x.size()
115 | feat1 = F.interpolate(
116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
117 | )
118 | feat2 = self.conv2(x)
119 | feat3 = self.conv3(x)
120 | feat4 = self.conv4(x)
121 | feat5 = self.conv5(x)
122 | feat6 = self.conv6(x)
123 | feat7 = self.conv7(x)
124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
125 | bottle = self.bottleneck(out)
126 | return bottle
127 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/layers_537238KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(
67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68 | ):
69 | super(Decoder, self).__init__()
70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71 | self.dropout = nn.Dropout2d(0.1) if dropout else None
72 |
73 | def __call__(self, x, skip=None):
74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75 | if skip is not None:
76 | skip = spec_utils.crop_center(skip, x)
77 | x = torch.cat([x, skip], dim=1)
78 | h = self.conv(x)
79 |
80 | if self.dropout is not None:
81 | h = self.dropout(h)
82 |
83 | return h
84 |
85 |
86 | class ASPPModule(nn.Module):
87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
88 | super(ASPPModule, self).__init__()
89 | self.conv1 = nn.Sequential(
90 | nn.AdaptiveAvgPool2d((1, None)),
91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92 | )
93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94 | self.conv3 = SeperableConv2DBNActiv(
95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96 | )
97 | self.conv4 = SeperableConv2DBNActiv(
98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99 | )
100 | self.conv5 = SeperableConv2DBNActiv(
101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102 | )
103 | self.conv6 = SeperableConv2DBNActiv(
104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
105 | )
106 | self.conv7 = SeperableConv2DBNActiv(
107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
108 | )
109 | self.bottleneck = nn.Sequential(
110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
111 | )
112 |
113 | def forward(self, x):
114 | _, _, h, w = x.size()
115 | feat1 = F.interpolate(
116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
117 | )
118 | feat2 = self.conv2(x)
119 | feat3 = self.conv3(x)
120 | feat4 = self.conv4(x)
121 | feat5 = self.conv5(x)
122 | feat6 = self.conv6(x)
123 | feat7 = self.conv7(x)
124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
125 | bottle = self.bottleneck(out)
126 | return bottle
127 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/layers_new.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class Encoder(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
31 | super(Encoder, self).__init__()
32 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
33 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
34 |
35 | def __call__(self, x):
36 | h = self.conv1(x)
37 | h = self.conv2(h)
38 |
39 | return h
40 |
41 |
42 | class Decoder(nn.Module):
43 | def __init__(
44 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
45 | ):
46 | super(Decoder, self).__init__()
47 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
48 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
49 | self.dropout = nn.Dropout2d(0.1) if dropout else None
50 |
51 | def __call__(self, x, skip=None):
52 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
53 |
54 | if skip is not None:
55 | skip = spec_utils.crop_center(skip, x)
56 | x = torch.cat([x, skip], dim=1)
57 |
58 | h = self.conv1(x)
59 | # h = self.conv2(h)
60 |
61 | if self.dropout is not None:
62 | h = self.dropout(h)
63 |
64 | return h
65 |
66 |
67 | class ASPPModule(nn.Module):
68 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
69 | super(ASPPModule, self).__init__()
70 | self.conv1 = nn.Sequential(
71 | nn.AdaptiveAvgPool2d((1, None)),
72 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
73 | )
74 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
75 | self.conv3 = Conv2DBNActiv(
76 | nin, nout, 3, 1, dilations[0], dilations[0], activ=activ
77 | )
78 | self.conv4 = Conv2DBNActiv(
79 | nin, nout, 3, 1, dilations[1], dilations[1], activ=activ
80 | )
81 | self.conv5 = Conv2DBNActiv(
82 | nin, nout, 3, 1, dilations[2], dilations[2], activ=activ
83 | )
84 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
85 | self.dropout = nn.Dropout2d(0.1) if dropout else None
86 |
87 | def forward(self, x):
88 | _, _, h, w = x.size()
89 | feat1 = F.interpolate(
90 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
91 | )
92 | feat2 = self.conv2(x)
93 | feat3 = self.conv3(x)
94 | feat4 = self.conv4(x)
95 | feat5 = self.conv5(x)
96 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
97 | out = self.bottleneck(out)
98 |
99 | if self.dropout is not None:
100 | out = self.dropout(out)
101 |
102 | return out
103 |
104 |
105 | class LSTMModule(nn.Module):
106 | def __init__(self, nin_conv, nin_lstm, nout_lstm):
107 | super(LSTMModule, self).__init__()
108 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
109 | self.lstm = nn.LSTM(
110 | input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True
111 | )
112 | self.dense = nn.Sequential(
113 | nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
114 | )
115 |
116 | def forward(self, x):
117 | N, _, nbins, nframes = x.size()
118 | h = self.conv(x)[:, 0] # N, nbins, nframes
119 | h = h.permute(2, 0, 1) # nframes, N, nbins
120 | h, _ = self.lstm(h)
121 | h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins
122 | h = h.reshape(nframes, N, 1, nbins)
123 | h = h.permute(1, 2, 3, 0)
124 |
125 | return h
126 |
--------------------------------------------------------------------------------
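A similar shape check for `LSTMModule` in `layers_new.py`: the 1x1 conv collapses the channel axis, a bidirectional LSTM runs along the frame axis, and the dense layer maps back to `nin_lstm` bins, giving an `(N, 1, nbins, nframes)` output. The sizes below are arbitrary:

    import torch
    from tools.uvr5.lib.lib_v5.layers_new import LSTMModule

    m = LSTMModule(nin_conv=16, nin_lstm=64, nout_lstm=128)
    x = torch.randn(2, 16, 64, 100)  # batch, channels, nbins, nframes
    print(m(x).shape)                # torch.Size([2, 1, 64, 100])
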
/tools/uvr5/lib/lib_v5/model_param_init.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import pathlib
4 |
5 | default_param = {}
6 | default_param["bins"] = 768
7 | default_param["unstable_bins"] = 9 # training only
8 | default_param["reduction_bins"] = 762 # training only
9 | default_param["sr"] = 44100
10 | default_param["pre_filter_start"] = 757
11 | default_param["pre_filter_stop"] = 768
12 | default_param["band"] = {}
13 |
14 |
15 | default_param["band"][1] = {
16 | "sr": 11025,
17 | "hl": 128,
18 | "n_fft": 960,
19 | "crop_start": 0,
20 | "crop_stop": 245,
21 | "lpf_start": 61, # inference only
22 | "res_type": "polyphase",
23 | }
24 |
25 | default_param["band"][2] = {
26 | "sr": 44100,
27 | "hl": 512,
28 | "n_fft": 1536,
29 | "crop_start": 24,
30 | "crop_stop": 547,
31 | "hpf_start": 81, # inference only
32 | "res_type": "sinc_best",
33 | }
34 |
35 |
36 | def int_keys(d):
37 | r = {}
38 | for k, v in d:
39 | if k.isdigit():
40 | k = int(k)
41 | r[k] = v
42 | return r
43 |
44 |
45 | class ModelParameters(object):
46 | def __init__(self, config_path=""):
47 | if ".pth" == pathlib.Path(config_path).suffix:
48 | import zipfile
49 |
50 | with zipfile.ZipFile(config_path, "r") as zip:
51 | self.param = json.loads(
52 | zip.read("param.json"), object_pairs_hook=int_keys
53 | )
54 | elif ".json" == pathlib.Path(config_path).suffix:
55 | with open(config_path, "r") as f:
56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys)
57 | else:
58 | self.param = default_param
59 |
60 | for k in [
61 | "mid_side",
62 | "mid_side_b",
63 | "mid_side_b2",
64 | "stereo_w",
65 | "stereo_n",
66 | "reverse",
67 | ]:
68 |             if k not in self.param:
69 | self.param[k] = False
70 |
--------------------------------------------------------------------------------
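A small sketch of loading one of the band-split presets listed below through `ModelParameters`; `int_keys` converts the JSON band keys to integers, and the missing stereo/mid-side flags default to False:

    from tools.uvr5.lib.lib_v5.model_param_init import ModelParameters

    mp = ModelParameters("tools/uvr5/lib/lib_v5/modelparams/4band_44100.json")
    print(mp.param["sr"], sorted(mp.param["band"]))  # 44100 [1, 2, 3, 4]
    print(mp.param["mid_side"])                      # False (filled-in default)
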
/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 16000,
8 | "hl": 512,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 16000,
17 | "pre_filter_start": 1023,
18 | "pre_filter_stop": 1024
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 32000,
8 | "hl": 512,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "kaiser_fast"
14 | }
15 | },
16 | "sr": 32000,
17 | "pre_filter_start": 1000,
18 | "pre_filter_stop": 1021
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 33075,
8 | "hl": 384,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 33075,
17 | "pre_filter_start": 1000,
18 | "pre_filter_stop": 1021
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 44100,
8 | "hl": 1024,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 44100,
17 | "pre_filter_start": 1023,
18 | "pre_filter_stop": 1024
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 256,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 44100,
8 | "hl": 256,
9 | "n_fft": 512,
10 | "crop_start": 0,
11 | "crop_stop": 256,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 44100,
17 | "pre_filter_start": 256,
18 | "pre_filter_stop": 256
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 44100,
8 | "hl": 512,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 44100,
17 | "pre_filter_start": 1023,
18 | "pre_filter_stop": 1024
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 44100,
8 | "hl": 512,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 700,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 44100,
17 | "pre_filter_start": 1023,
18 | "pre_filter_stop": 700
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 7,
4 | "reduction_bins": 705,
5 | "band": {
6 | "1": {
7 | "sr": 6000,
8 | "hl": 66,
9 | "n_fft": 512,
10 | "crop_start": 0,
11 | "crop_stop": 240,
12 | "lpf_start": 60,
13 | "lpf_stop": 118,
14 | "res_type": "sinc_fastest"
15 | },
16 | "2": {
17 | "sr": 32000,
18 | "hl": 352,
19 | "n_fft": 1024,
20 | "crop_start": 22,
21 | "crop_stop": 505,
22 | "hpf_start": 44,
23 | "hpf_stop": 23,
24 | "res_type": "sinc_medium"
25 | }
26 | },
27 | "sr": 32000,
28 | "pre_filter_start": 710,
29 | "pre_filter_stop": 731
30 | }
31 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 512,
3 | "unstable_bins": 7,
4 | "reduction_bins": 510,
5 | "band": {
6 | "1": {
7 | "sr": 11025,
8 | "hl": 160,
9 | "n_fft": 768,
10 | "crop_start": 0,
11 | "crop_stop": 192,
12 | "lpf_start": 41,
13 | "lpf_stop": 139,
14 | "res_type": "sinc_fastest"
15 | },
16 | "2": {
17 | "sr": 44100,
18 | "hl": 640,
19 | "n_fft": 1024,
20 | "crop_start": 10,
21 | "crop_stop": 320,
22 | "hpf_start": 47,
23 | "hpf_stop": 15,
24 | "res_type": "sinc_medium"
25 | }
26 | },
27 | "sr": 44100,
28 | "pre_filter_start": 510,
29 | "pre_filter_stop": 512
30 | }
31 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 7,
4 | "reduction_bins": 705,
5 | "band": {
6 | "1": {
7 | "sr": 6000,
8 | "hl": 66,
9 | "n_fft": 512,
10 | "crop_start": 0,
11 | "crop_stop": 240,
12 | "lpf_start": 60,
13 | "lpf_stop": 240,
14 | "res_type": "sinc_fastest"
15 | },
16 | "2": {
17 | "sr": 48000,
18 | "hl": 528,
19 | "n_fft": 1536,
20 | "crop_start": 22,
21 | "crop_stop": 505,
22 | "hpf_start": 82,
23 | "hpf_stop": 22,
24 | "res_type": "sinc_medium"
25 | }
26 | },
27 | "sr": 48000,
28 | "pre_filter_start": 710,
29 | "pre_filter_stop": 731
30 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 5,
4 | "reduction_bins": 733,
5 | "band": {
6 | "1": {
7 | "sr": 11025,
8 | "hl": 128,
9 | "n_fft": 768,
10 | "crop_start": 0,
11 | "crop_stop": 278,
12 | "lpf_start": 28,
13 | "lpf_stop": 140,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 22050,
18 | "hl": 256,
19 | "n_fft": 768,
20 | "crop_start": 14,
21 | "crop_stop": 322,
22 | "hpf_start": 70,
23 | "hpf_stop": 14,
24 | "lpf_start": 283,
25 | "lpf_stop": 314,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 44100,
30 | "hl": 512,
31 | "n_fft": 768,
32 | "crop_start": 131,
33 | "crop_stop": 313,
34 | "hpf_start": 154,
35 | "hpf_stop": 141,
36 | "res_type": "sinc_medium"
37 | }
38 | },
39 | "sr": 44100,
40 | "pre_filter_start": 757,
41 | "pre_filter_stop": 768
42 | }
43 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side": true,
3 | "bins": 768,
4 | "unstable_bins": 5,
5 | "reduction_bins": 733,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 768,
11 | "crop_start": 0,
12 | "crop_stop": 278,
13 | "lpf_start": 28,
14 | "lpf_stop": 140,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 22050,
19 | "hl": 256,
20 | "n_fft": 768,
21 | "crop_start": 14,
22 | "crop_stop": 322,
23 | "hpf_start": 70,
24 | "hpf_stop": 14,
25 | "lpf_start": 283,
26 | "lpf_stop": 314,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 44100,
31 | "hl": 512,
32 | "n_fft": 768,
33 | "crop_start": 131,
34 | "crop_stop": 313,
35 | "hpf_start": 154,
36 | "hpf_stop": 141,
37 | "res_type": "sinc_medium"
38 | }
39 | },
40 | "sr": 44100,
41 | "pre_filter_start": 757,
42 | "pre_filter_stop": 768
43 | }
44 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side_b2": true,
3 | "bins": 640,
4 | "unstable_bins": 7,
5 | "reduction_bins": 565,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 108,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 187,
13 | "lpf_start": 92,
14 | "lpf_stop": 186,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 22050,
19 | "hl": 216,
20 | "n_fft": 768,
21 | "crop_start": 0,
22 | "crop_stop": 212,
23 | "hpf_start": 68,
24 | "hpf_stop": 34,
25 | "lpf_start": 174,
26 | "lpf_stop": 209,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 44100,
31 | "hl": 432,
32 | "n_fft": 640,
33 | "crop_start": 66,
34 | "crop_stop": 307,
35 | "hpf_start": 86,
36 | "hpf_stop": 72,
37 | "res_type": "kaiser_fast"
38 | }
39 | },
40 | "sr": 44100,
41 | "pre_filter_start": 639,
42 | "pre_filter_stop": 640
43 | }
44 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 7,
4 | "reduction_bins": 668,
5 | "band": {
6 | "1": {
7 | "sr": 11025,
8 | "hl": 128,
9 | "n_fft": 1024,
10 | "crop_start": 0,
11 | "crop_stop": 186,
12 | "lpf_start": 37,
13 | "lpf_stop": 73,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 11025,
18 | "hl": 128,
19 | "n_fft": 512,
20 | "crop_start": 4,
21 | "crop_stop": 185,
22 | "hpf_start": 36,
23 | "hpf_stop": 18,
24 | "lpf_start": 93,
25 | "lpf_stop": 185,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 22050,
30 | "hl": 256,
31 | "n_fft": 512,
32 | "crop_start": 46,
33 | "crop_stop": 186,
34 | "hpf_start": 93,
35 | "hpf_stop": 46,
36 | "lpf_start": 164,
37 | "lpf_stop": 186,
38 | "res_type": "polyphase"
39 | },
40 | "4": {
41 | "sr": 44100,
42 | "hl": 512,
43 | "n_fft": 768,
44 | "crop_start": 121,
45 | "crop_stop": 382,
46 | "hpf_start": 138,
47 | "hpf_stop": 123,
48 | "res_type": "sinc_medium"
49 | }
50 | },
51 | "sr": 44100,
52 | "pre_filter_start": 740,
53 | "pre_filter_stop": 768
54 | }
55 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 7,
4 | "mid_side": true,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
56 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side_b": true,
3 | "bins": 768,
4 | "unstable_bins": 7,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side_b": true,
3 | "bins": 768,
4 | "unstable_bins": 7,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json:
--------------------------------------------------------------------------------
1 | {
2 | "reverse": true,
3 | "bins": 768,
4 | "unstable_bins": 7,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json:
--------------------------------------------------------------------------------
1 | {
2 | "stereo_w": true,
3 | "bins": 768,
4 | "unstable_bins": 7,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 672,
3 | "unstable_bins": 8,
4 | "reduction_bins": 637,
5 | "band": {
6 | "1": {
7 | "sr": 7350,
8 | "hl": 80,
9 | "n_fft": 640,
10 | "crop_start": 0,
11 | "crop_stop": 85,
12 | "lpf_start": 25,
13 | "lpf_stop": 53,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 7350,
18 | "hl": 80,
19 | "n_fft": 320,
20 | "crop_start": 4,
21 | "crop_stop": 87,
22 | "hpf_start": 25,
23 | "hpf_stop": 12,
24 | "lpf_start": 31,
25 | "lpf_stop": 62,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 14700,
30 | "hl": 160,
31 | "n_fft": 512,
32 | "crop_start": 17,
33 | "crop_stop": 216,
34 | "hpf_start": 48,
35 | "hpf_stop": 24,
36 | "lpf_start": 139,
37 | "lpf_stop": 210,
38 | "res_type": "polyphase"
39 | },
40 | "4": {
41 | "sr": 44100,
42 | "hl": 480,
43 | "n_fft": 960,
44 | "crop_start": 78,
45 | "crop_stop": 383,
46 | "hpf_start": 130,
47 | "hpf_stop": 86,
48 | "res_type": "kaiser_fast"
49 | }
50 | },
51 | "sr": 44100,
52 | "pre_filter_start": 668,
53 | "pre_filter_stop": 672
54 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 672,
3 | "unstable_bins": 8,
4 | "reduction_bins": 637,
5 | "band": {
6 | "1": {
7 | "sr": 7350,
8 | "hl": 80,
9 | "n_fft": 640,
10 | "crop_start": 0,
11 | "crop_stop": 85,
12 | "lpf_start": 25,
13 | "lpf_stop": 53,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 7350,
18 | "hl": 80,
19 | "n_fft": 320,
20 | "crop_start": 4,
21 | "crop_stop": 87,
22 | "hpf_start": 25,
23 | "hpf_stop": 12,
24 | "lpf_start": 31,
25 | "lpf_stop": 62,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 14700,
30 | "hl": 160,
31 | "n_fft": 512,
32 | "crop_start": 17,
33 | "crop_stop": 216,
34 | "hpf_start": 48,
35 | "hpf_stop": 24,
36 | "lpf_start": 139,
37 | "lpf_stop": 210,
38 | "res_type": "polyphase"
39 | },
40 | "4": {
41 | "sr": 44100,
42 | "hl": 480,
43 | "n_fft": 960,
44 | "crop_start": 78,
45 | "crop_stop": 383,
46 | "hpf_start": 130,
47 | "hpf_stop": 86,
48 | "convert_channels": "stereo_n",
49 | "res_type": "kaiser_fast"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 668,
54 | "pre_filter_stop": 672
55 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 672,
3 | "unstable_bins": 8,
4 | "reduction_bins": 530,
5 | "band": {
6 | "1": {
7 | "sr": 7350,
8 | "hl": 80,
9 | "n_fft": 640,
10 | "crop_start": 0,
11 | "crop_stop": 85,
12 | "lpf_start": 25,
13 | "lpf_stop": 53,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 7350,
18 | "hl": 80,
19 | "n_fft": 320,
20 | "crop_start": 4,
21 | "crop_stop": 87,
22 | "hpf_start": 25,
23 | "hpf_stop": 12,
24 | "lpf_start": 31,
25 | "lpf_stop": 62,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 14700,
30 | "hl": 160,
31 | "n_fft": 512,
32 | "crop_start": 17,
33 | "crop_stop": 216,
34 | "hpf_start": 48,
35 | "hpf_stop": 24,
36 | "lpf_start": 139,
37 | "lpf_stop": 210,
38 | "res_type": "polyphase"
39 | },
40 | "4": {
41 | "sr": 44100,
42 | "hl": 480,
43 | "n_fft": 960,
44 | "crop_start": 78,
45 | "crop_stop": 383,
46 | "hpf_start": 130,
47 | "hpf_stop": 86,
48 | "res_type": "kaiser_fast"
49 | }
50 | },
51 | "sr": 44100,
52 | "pre_filter_start": 668,
53 | "pre_filter_stop": 672
54 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/ensemble.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side_b2": true,
3 | "bins": 1280,
4 | "unstable_bins": 7,
5 | "reduction_bins": 565,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 108,
10 | "n_fft": 2048,
11 | "crop_start": 0,
12 | "crop_stop": 374,
13 | "lpf_start": 92,
14 | "lpf_stop": 186,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 22050,
19 | "hl": 216,
20 | "n_fft": 1536,
21 | "crop_start": 0,
22 | "crop_stop": 424,
23 | "hpf_start": 68,
24 | "hpf_stop": 34,
25 | "lpf_start": 348,
26 | "lpf_stop": 418,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 44100,
31 | "hl": 432,
32 | "n_fft": 1280,
33 | "crop_start": 132,
34 | "crop_stop": 614,
35 | "hpf_start": 172,
36 | "hpf_stop": 144,
37 | "res_type": "polyphase"
38 | }
39 | },
40 | "sr": 44100,
41 | "pre_filter_start": 1280,
42 | "pre_filter_stop": 1280
43 | }
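
Note: the modelparams JSON files above all share one schema. Each numbered entry under "band" describes one analysis band (its target sample rate "sr", STFT hop length "hl" and "n_fft", the retained bin range "crop_start"/"crop_stop", low/high-pass cross-fade bins, and the "res_type" passed to the resampler), while top-level flags such as "mid_side", "mid_side_b2", "reverse", and "stereo_w" select an optional stereo-channel conversion before analysis. The snippet below is a hedged illustration, not repository code: it assumes librosa is available, uses a placeholder "mix.wav", and skips the band merging and pre_filter handling that spec_utils performs.

# Hedged sketch: compute one band's spectrogram from a modelparams JSON.
import json

import librosa

with open("tools/uvr5/lib/lib_v5/modelparams/4band_v2.json") as f:
    mp = json.load(f)

# "mix.wav" is a placeholder; mono only, so the mid_side/stereo flags are ignored here.
y, _ = librosa.load("mix.wav", sr=mp["sr"], mono=True)

band = mp["band"]["3"]  # third band of the 4-band config (14700 Hz)
y_band = librosa.resample(
    y, orig_sr=mp["sr"], target_sr=band["sr"], res_type=band["res_type"]
)
spec = librosa.stft(y_band, n_fft=band["n_fft"], hop_length=band["hl"])
spec = spec[band["crop_start"] : band["crop_stop"]]  # keep only the configured bins
print(spec.shape)
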
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/nets.py:
--------------------------------------------------------------------------------
  1 | from . import layers
2 | import torch
3 | import torch.nn.functional as F
4 | from torch import nn
5 |
6 | from . import spec_utils
7 |
8 |
9 | class BaseASPPNet(nn.Module):
10 | def __init__(self, nin, ch, dilations=(4, 8, 16)):
11 | super(BaseASPPNet, self).__init__()
12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
16 |
17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
18 |
19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
23 |
24 | def __call__(self, x):
25 | h, e1 = self.enc1(x)
26 | h, e2 = self.enc2(h)
27 | h, e3 = self.enc3(h)
28 | h, e4 = self.enc4(h)
29 |
30 | h = self.aspp(h)
31 |
32 | h = self.dec4(h, e4)
33 | h = self.dec3(h, e3)
34 | h = self.dec2(h, e2)
35 | h = self.dec1(h, e1)
36 |
37 | return h
38 |
39 |
40 | class CascadedASPPNet(nn.Module):
41 | def __init__(self, n_fft):
42 | super(CascadedASPPNet, self).__init__()
43 | self.stg1_low_band_net = BaseASPPNet(2, 16)
44 | self.stg1_high_band_net = BaseASPPNet(2, 16)
45 |
46 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
47 | self.stg2_full_band_net = BaseASPPNet(8, 16)
48 |
49 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
50 | self.stg3_full_band_net = BaseASPPNet(16, 32)
51 |
52 | self.out = nn.Conv2d(32, 2, 1, bias=False)
53 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
54 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
55 |
56 | self.max_bin = n_fft // 2
57 | self.output_bin = n_fft // 2 + 1
58 |
59 | self.offset = 128
60 |
61 | def forward(self, x, aggressiveness=None):
62 | mix = x.detach()
63 | x = x.clone()
64 |
65 | x = x[:, :, : self.max_bin]
66 |
67 | bandw = x.size()[2] // 2
68 | aux1 = torch.cat(
69 | [
70 | self.stg1_low_band_net(x[:, :, :bandw]),
71 | self.stg1_high_band_net(x[:, :, bandw:]),
72 | ],
73 | dim=2,
74 | )
75 |
76 | h = torch.cat([x, aux1], dim=1)
77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
78 |
79 | h = torch.cat([x, aux1, aux2], dim=1)
80 | h = self.stg3_full_band_net(self.stg3_bridge(h))
81 |
82 | mask = torch.sigmoid(self.out(h))
83 | mask = F.pad(
84 | input=mask,
85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
86 | mode="replicate",
87 | )
88 |
89 | if self.training:
90 | aux1 = torch.sigmoid(self.aux1_out(aux1))
91 | aux1 = F.pad(
92 | input=aux1,
93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
94 | mode="replicate",
95 | )
96 | aux2 = torch.sigmoid(self.aux2_out(aux2))
97 | aux2 = F.pad(
98 | input=aux2,
99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
100 | mode="replicate",
101 | )
102 | return mask * mix, aux1 * mix, aux2 * mix
103 | else:
104 | if aggressiveness:
105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
106 | mask[:, :, : aggressiveness["split_bin"]],
107 | 1 + aggressiveness["value"] / 3,
108 | )
109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
110 | mask[:, :, aggressiveness["split_bin"] :],
111 | 1 + aggressiveness["value"],
112 | )
113 |
114 | return mask * mix
115 |
116 | def predict(self, x_mag, aggressiveness=None):
117 | h = self.forward(x_mag, aggressiveness)
118 |
119 | if self.offset > 0:
120 | h = h[:, :, :, self.offset : -self.offset]
121 | assert h.size()[3] > 0
122 |
123 | return h
124 |
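
Note: nets.py and the nets_*KB.py variants that follow share the same CascadedASPPNet structure and differ only in channel widths and in which layers_* module they import. The snippet below is a hedged usage sketch, not repository code: it assumes the repo root is on sys.path so the lib_v5 package imports resolve, and it shows the tensor layout predict() expects plus the aggressiveness dict used at inference time (bins below "split_bin" are raised to the power 1 + value/3, bins above it to 1 + value).

# Hedged sketch of driving a CascadedASPPNet variant at inference time.
import torch

from tools.uvr5.lib.lib_v5 import nets_61968KB as nets  # any nets_* variant works the same way

n_fft = 2048
model = nets.CascadedASPPNet(n_fft).eval()

# (batch, stereo channels, frequency bins, time frames); predict() trims
# model.offset (= 128) frames from each side, so supply more than 256 frames.
x_mag = torch.rand(1, 2, n_fft // 2 + 1, 512)

# Larger "value" sharpens the sigmoid mask more; "split_bin" is the bin above
# which the stronger exponent applies.
aggressiveness = {"split_bin": n_fft // 8, "value": 0.3}

with torch.no_grad():
    masked_mag = model.predict(x_mag, aggressiveness)  # mask * input magnitude
print(masked_mag.shape)  # (1, 2, n_fft // 2 + 1, 512 - 2 * 128)
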
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/nets_123812KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import layers_123821KB as layers
6 |
7 |
8 | class BaseASPPNet(nn.Module):
9 | def __init__(self, nin, ch, dilations=(4, 8, 16)):
10 | super(BaseASPPNet, self).__init__()
11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
15 |
16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
17 |
18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
22 |
23 | def __call__(self, x):
24 | h, e1 = self.enc1(x)
25 | h, e2 = self.enc2(h)
26 | h, e3 = self.enc3(h)
27 | h, e4 = self.enc4(h)
28 |
29 | h = self.aspp(h)
30 |
31 | h = self.dec4(h, e4)
32 | h = self.dec3(h, e3)
33 | h = self.dec2(h, e2)
34 | h = self.dec1(h, e1)
35 |
36 | return h
37 |
38 |
39 | class CascadedASPPNet(nn.Module):
40 | def __init__(self, n_fft):
41 | super(CascadedASPPNet, self).__init__()
42 | self.stg1_low_band_net = BaseASPPNet(2, 32)
43 | self.stg1_high_band_net = BaseASPPNet(2, 32)
44 |
45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
46 | self.stg2_full_band_net = BaseASPPNet(16, 32)
47 |
48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
49 | self.stg3_full_band_net = BaseASPPNet(32, 64)
50 |
51 | self.out = nn.Conv2d(64, 2, 1, bias=False)
52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
54 |
55 | self.max_bin = n_fft // 2
56 | self.output_bin = n_fft // 2 + 1
57 |
58 | self.offset = 128
59 |
60 | def forward(self, x, aggressiveness=None):
61 | mix = x.detach()
62 | x = x.clone()
63 |
64 | x = x[:, :, : self.max_bin]
65 |
66 | bandw = x.size()[2] // 2
67 | aux1 = torch.cat(
68 | [
69 | self.stg1_low_band_net(x[:, :, :bandw]),
70 | self.stg1_high_band_net(x[:, :, bandw:]),
71 | ],
72 | dim=2,
73 | )
74 |
75 | h = torch.cat([x, aux1], dim=1)
76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
77 |
78 | h = torch.cat([x, aux1, aux2], dim=1)
79 | h = self.stg3_full_band_net(self.stg3_bridge(h))
80 |
81 | mask = torch.sigmoid(self.out(h))
82 | mask = F.pad(
83 | input=mask,
84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
85 | mode="replicate",
86 | )
87 |
88 | if self.training:
89 | aux1 = torch.sigmoid(self.aux1_out(aux1))
90 | aux1 = F.pad(
91 | input=aux1,
92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
93 | mode="replicate",
94 | )
95 | aux2 = torch.sigmoid(self.aux2_out(aux2))
96 | aux2 = F.pad(
97 | input=aux2,
98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
99 | mode="replicate",
100 | )
101 | return mask * mix, aux1 * mix, aux2 * mix
102 | else:
103 | if aggressiveness:
104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
105 | mask[:, :, : aggressiveness["split_bin"]],
106 | 1 + aggressiveness["value"] / 3,
107 | )
108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
109 | mask[:, :, aggressiveness["split_bin"] :],
110 | 1 + aggressiveness["value"],
111 | )
112 |
113 | return mask * mix
114 |
115 | def predict(self, x_mag, aggressiveness=None):
116 | h = self.forward(x_mag, aggressiveness)
117 |
118 | if self.offset > 0:
119 | h = h[:, :, :, self.offset : -self.offset]
120 | assert h.size()[3] > 0
121 |
122 | return h
123 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/nets_123821KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import layers_123821KB as layers
6 |
7 |
8 | class BaseASPPNet(nn.Module):
9 | def __init__(self, nin, ch, dilations=(4, 8, 16)):
10 | super(BaseASPPNet, self).__init__()
11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
15 |
16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
17 |
18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
22 |
23 | def __call__(self, x):
24 | h, e1 = self.enc1(x)
25 | h, e2 = self.enc2(h)
26 | h, e3 = self.enc3(h)
27 | h, e4 = self.enc4(h)
28 |
29 | h = self.aspp(h)
30 |
31 | h = self.dec4(h, e4)
32 | h = self.dec3(h, e3)
33 | h = self.dec2(h, e2)
34 | h = self.dec1(h, e1)
35 |
36 | return h
37 |
38 |
39 | class CascadedASPPNet(nn.Module):
40 | def __init__(self, n_fft):
41 | super(CascadedASPPNet, self).__init__()
42 | self.stg1_low_band_net = BaseASPPNet(2, 32)
43 | self.stg1_high_band_net = BaseASPPNet(2, 32)
44 |
45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
46 | self.stg2_full_band_net = BaseASPPNet(16, 32)
47 |
48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
49 | self.stg3_full_band_net = BaseASPPNet(32, 64)
50 |
51 | self.out = nn.Conv2d(64, 2, 1, bias=False)
52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
54 |
55 | self.max_bin = n_fft // 2
56 | self.output_bin = n_fft // 2 + 1
57 |
58 | self.offset = 128
59 |
60 | def forward(self, x, aggressiveness=None):
61 | mix = x.detach()
62 | x = x.clone()
63 |
64 | x = x[:, :, : self.max_bin]
65 |
66 | bandw = x.size()[2] // 2
67 | aux1 = torch.cat(
68 | [
69 | self.stg1_low_band_net(x[:, :, :bandw]),
70 | self.stg1_high_band_net(x[:, :, bandw:]),
71 | ],
72 | dim=2,
73 | )
74 |
75 | h = torch.cat([x, aux1], dim=1)
76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
77 |
78 | h = torch.cat([x, aux1, aux2], dim=1)
79 | h = self.stg3_full_band_net(self.stg3_bridge(h))
80 |
81 | mask = torch.sigmoid(self.out(h))
82 | mask = F.pad(
83 | input=mask,
84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
85 | mode="replicate",
86 | )
87 |
88 | if self.training:
89 | aux1 = torch.sigmoid(self.aux1_out(aux1))
90 | aux1 = F.pad(
91 | input=aux1,
92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
93 | mode="replicate",
94 | )
95 | aux2 = torch.sigmoid(self.aux2_out(aux2))
96 | aux2 = F.pad(
97 | input=aux2,
98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
99 | mode="replicate",
100 | )
101 | return mask * mix, aux1 * mix, aux2 * mix
102 | else:
103 | if aggressiveness:
104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
105 | mask[:, :, : aggressiveness["split_bin"]],
106 | 1 + aggressiveness["value"] / 3,
107 | )
108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
109 | mask[:, :, aggressiveness["split_bin"] :],
110 | 1 + aggressiveness["value"],
111 | )
112 |
113 | return mask * mix
114 |
115 | def predict(self, x_mag, aggressiveness=None):
116 | h = self.forward(x_mag, aggressiveness)
117 |
118 | if self.offset > 0:
119 | h = h[:, :, :, self.offset : -self.offset]
120 | assert h.size()[3] > 0
121 |
122 | return h
123 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/nets_33966KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import layers_33966KB as layers
6 |
7 |
8 | class BaseASPPNet(nn.Module):
9 | def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
10 | super(BaseASPPNet, self).__init__()
11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
15 |
16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
17 |
18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
22 |
23 | def __call__(self, x):
24 | h, e1 = self.enc1(x)
25 | h, e2 = self.enc2(h)
26 | h, e3 = self.enc3(h)
27 | h, e4 = self.enc4(h)
28 |
29 | h = self.aspp(h)
30 |
31 | h = self.dec4(h, e4)
32 | h = self.dec3(h, e3)
33 | h = self.dec2(h, e2)
34 | h = self.dec1(h, e1)
35 |
36 | return h
37 |
38 |
39 | class CascadedASPPNet(nn.Module):
40 | def __init__(self, n_fft):
41 | super(CascadedASPPNet, self).__init__()
42 | self.stg1_low_band_net = BaseASPPNet(2, 16)
43 | self.stg1_high_band_net = BaseASPPNet(2, 16)
44 |
45 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
46 | self.stg2_full_band_net = BaseASPPNet(8, 16)
47 |
48 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
49 | self.stg3_full_band_net = BaseASPPNet(16, 32)
50 |
51 | self.out = nn.Conv2d(32, 2, 1, bias=False)
52 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
53 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
54 |
55 | self.max_bin = n_fft // 2
56 | self.output_bin = n_fft // 2 + 1
57 |
58 | self.offset = 128
59 |
60 | def forward(self, x, aggressiveness=None):
61 | mix = x.detach()
62 | x = x.clone()
63 |
64 | x = x[:, :, : self.max_bin]
65 |
66 | bandw = x.size()[2] // 2
67 | aux1 = torch.cat(
68 | [
69 | self.stg1_low_band_net(x[:, :, :bandw]),
70 | self.stg1_high_band_net(x[:, :, bandw:]),
71 | ],
72 | dim=2,
73 | )
74 |
75 | h = torch.cat([x, aux1], dim=1)
76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
77 |
78 | h = torch.cat([x, aux1, aux2], dim=1)
79 | h = self.stg3_full_band_net(self.stg3_bridge(h))
80 |
81 | mask = torch.sigmoid(self.out(h))
82 | mask = F.pad(
83 | input=mask,
84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
85 | mode="replicate",
86 | )
87 |
88 | if self.training:
89 | aux1 = torch.sigmoid(self.aux1_out(aux1))
90 | aux1 = F.pad(
91 | input=aux1,
92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
93 | mode="replicate",
94 | )
95 | aux2 = torch.sigmoid(self.aux2_out(aux2))
96 | aux2 = F.pad(
97 | input=aux2,
98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
99 | mode="replicate",
100 | )
101 | return mask * mix, aux1 * mix, aux2 * mix
102 | else:
103 | if aggressiveness:
104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
105 | mask[:, :, : aggressiveness["split_bin"]],
106 | 1 + aggressiveness["value"] / 3,
107 | )
108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
109 | mask[:, :, aggressiveness["split_bin"] :],
110 | 1 + aggressiveness["value"],
111 | )
112 |
113 | return mask * mix
114 |
115 | def predict(self, x_mag, aggressiveness=None):
116 | h = self.forward(x_mag, aggressiveness)
117 |
118 | if self.offset > 0:
119 | h = h[:, :, :, self.offset : -self.offset]
120 | assert h.size()[3] > 0
121 |
122 | return h
123 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/nets_537227KB.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn.functional as F
4 | from torch import nn
5 |
6 | from . import layers_537238KB as layers
7 |
8 |
9 | class BaseASPPNet(nn.Module):
10 | def __init__(self, nin, ch, dilations=(4, 8, 16)):
11 | super(BaseASPPNet, self).__init__()
12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
16 |
17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
18 |
19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
23 |
24 | def __call__(self, x):
25 | h, e1 = self.enc1(x)
26 | h, e2 = self.enc2(h)
27 | h, e3 = self.enc3(h)
28 | h, e4 = self.enc4(h)
29 |
30 | h = self.aspp(h)
31 |
32 | h = self.dec4(h, e4)
33 | h = self.dec3(h, e3)
34 | h = self.dec2(h, e2)
35 | h = self.dec1(h, e1)
36 |
37 | return h
38 |
39 |
40 | class CascadedASPPNet(nn.Module):
41 | def __init__(self, n_fft):
42 | super(CascadedASPPNet, self).__init__()
43 | self.stg1_low_band_net = BaseASPPNet(2, 64)
44 | self.stg1_high_band_net = BaseASPPNet(2, 64)
45 |
46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
47 | self.stg2_full_band_net = BaseASPPNet(32, 64)
48 |
49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
50 | self.stg3_full_band_net = BaseASPPNet(64, 128)
51 |
52 | self.out = nn.Conv2d(128, 2, 1, bias=False)
53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
55 |
56 | self.max_bin = n_fft // 2
57 | self.output_bin = n_fft // 2 + 1
58 |
59 | self.offset = 128
60 |
61 | def forward(self, x, aggressiveness=None):
62 | mix = x.detach()
63 | x = x.clone()
64 |
65 | x = x[:, :, : self.max_bin]
66 |
67 | bandw = x.size()[2] // 2
68 | aux1 = torch.cat(
69 | [
70 | self.stg1_low_band_net(x[:, :, :bandw]),
71 | self.stg1_high_band_net(x[:, :, bandw:]),
72 | ],
73 | dim=2,
74 | )
75 |
76 | h = torch.cat([x, aux1], dim=1)
77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
78 |
79 | h = torch.cat([x, aux1, aux2], dim=1)
80 | h = self.stg3_full_band_net(self.stg3_bridge(h))
81 |
82 | mask = torch.sigmoid(self.out(h))
83 | mask = F.pad(
84 | input=mask,
85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
86 | mode="replicate",
87 | )
88 |
89 | if self.training:
90 | aux1 = torch.sigmoid(self.aux1_out(aux1))
91 | aux1 = F.pad(
92 | input=aux1,
93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
94 | mode="replicate",
95 | )
96 | aux2 = torch.sigmoid(self.aux2_out(aux2))
97 | aux2 = F.pad(
98 | input=aux2,
99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
100 | mode="replicate",
101 | )
102 | return mask * mix, aux1 * mix, aux2 * mix
103 | else:
104 | if aggressiveness:
105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
106 | mask[:, :, : aggressiveness["split_bin"]],
107 | 1 + aggressiveness["value"] / 3,
108 | )
109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
110 | mask[:, :, aggressiveness["split_bin"] :],
111 | 1 + aggressiveness["value"],
112 | )
113 |
114 | return mask * mix
115 |
116 | def predict(self, x_mag, aggressiveness=None):
117 | h = self.forward(x_mag, aggressiveness)
118 |
119 | if self.offset > 0:
120 | h = h[:, :, :, self.offset : -self.offset]
121 | assert h.size()[3] > 0
122 |
123 | return h
124 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/nets_537238KB.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn.functional as F
4 | from torch import nn
5 |
6 | from . import layers_537238KB as layers
7 |
8 |
9 | class BaseASPPNet(nn.Module):
10 | def __init__(self, nin, ch, dilations=(4, 8, 16)):
11 | super(BaseASPPNet, self).__init__()
12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
16 |
17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
18 |
19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
23 |
24 | def __call__(self, x):
25 | h, e1 = self.enc1(x)
26 | h, e2 = self.enc2(h)
27 | h, e3 = self.enc3(h)
28 | h, e4 = self.enc4(h)
29 |
30 | h = self.aspp(h)
31 |
32 | h = self.dec4(h, e4)
33 | h = self.dec3(h, e3)
34 | h = self.dec2(h, e2)
35 | h = self.dec1(h, e1)
36 |
37 | return h
38 |
39 |
40 | class CascadedASPPNet(nn.Module):
41 | def __init__(self, n_fft):
42 | super(CascadedASPPNet, self).__init__()
43 | self.stg1_low_band_net = BaseASPPNet(2, 64)
44 | self.stg1_high_band_net = BaseASPPNet(2, 64)
45 |
46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
47 | self.stg2_full_band_net = BaseASPPNet(32, 64)
48 |
49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
50 | self.stg3_full_band_net = BaseASPPNet(64, 128)
51 |
52 | self.out = nn.Conv2d(128, 2, 1, bias=False)
53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
55 |
56 | self.max_bin = n_fft // 2
57 | self.output_bin = n_fft // 2 + 1
58 |
59 | self.offset = 128
60 |
61 | def forward(self, x, aggressiveness=None):
62 | mix = x.detach()
63 | x = x.clone()
64 |
65 | x = x[:, :, : self.max_bin]
66 |
67 | bandw = x.size()[2] // 2
68 | aux1 = torch.cat(
69 | [
70 | self.stg1_low_band_net(x[:, :, :bandw]),
71 | self.stg1_high_band_net(x[:, :, bandw:]),
72 | ],
73 | dim=2,
74 | )
75 |
76 | h = torch.cat([x, aux1], dim=1)
77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
78 |
79 | h = torch.cat([x, aux1, aux2], dim=1)
80 | h = self.stg3_full_band_net(self.stg3_bridge(h))
81 |
82 | mask = torch.sigmoid(self.out(h))
83 | mask = F.pad(
84 | input=mask,
85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
86 | mode="replicate",
87 | )
88 |
89 | if self.training:
90 | aux1 = torch.sigmoid(self.aux1_out(aux1))
91 | aux1 = F.pad(
92 | input=aux1,
93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
94 | mode="replicate",
95 | )
96 | aux2 = torch.sigmoid(self.aux2_out(aux2))
97 | aux2 = F.pad(
98 | input=aux2,
99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
100 | mode="replicate",
101 | )
102 | return mask * mix, aux1 * mix, aux2 * mix
103 | else:
104 | if aggressiveness:
105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
106 | mask[:, :, : aggressiveness["split_bin"]],
107 | 1 + aggressiveness["value"] / 3,
108 | )
109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
110 | mask[:, :, aggressiveness["split_bin"] :],
111 | 1 + aggressiveness["value"],
112 | )
113 |
114 | return mask * mix
115 |
116 | def predict(self, x_mag, aggressiveness=None):
117 | h = self.forward(x_mag, aggressiveness)
118 |
119 | if self.offset > 0:
120 | h = h[:, :, :, self.offset : -self.offset]
121 | assert h.size()[3] > 0
122 |
123 | return h
124 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/nets_61968KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import layers_123821KB as layers
6 |
7 |
8 | class BaseASPPNet(nn.Module):
9 | def __init__(self, nin, ch, dilations=(4, 8, 16)):
10 | super(BaseASPPNet, self).__init__()
11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
15 |
16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
17 |
18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
22 |
23 | def __call__(self, x):
24 | h, e1 = self.enc1(x)
25 | h, e2 = self.enc2(h)
26 | h, e3 = self.enc3(h)
27 | h, e4 = self.enc4(h)
28 |
29 | h = self.aspp(h)
30 |
31 | h = self.dec4(h, e4)
32 | h = self.dec3(h, e3)
33 | h = self.dec2(h, e2)
34 | h = self.dec1(h, e1)
35 |
36 | return h
37 |
38 |
39 | class CascadedASPPNet(nn.Module):
40 | def __init__(self, n_fft):
41 | super(CascadedASPPNet, self).__init__()
42 | self.stg1_low_band_net = BaseASPPNet(2, 32)
43 | self.stg1_high_band_net = BaseASPPNet(2, 32)
44 |
45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
46 | self.stg2_full_band_net = BaseASPPNet(16, 32)
47 |
48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
49 | self.stg3_full_band_net = BaseASPPNet(32, 64)
50 |
51 | self.out = nn.Conv2d(64, 2, 1, bias=False)
52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
54 |
55 | self.max_bin = n_fft // 2
56 | self.output_bin = n_fft // 2 + 1
57 |
58 | self.offset = 128
59 |
60 | def forward(self, x, aggressiveness=None):
61 | mix = x.detach()
62 | x = x.clone()
63 |
64 | x = x[:, :, : self.max_bin]
65 |
66 | bandw = x.size()[2] // 2
67 | aux1 = torch.cat(
68 | [
69 | self.stg1_low_band_net(x[:, :, :bandw]),
70 | self.stg1_high_band_net(x[:, :, bandw:]),
71 | ],
72 | dim=2,
73 | )
74 |
75 | h = torch.cat([x, aux1], dim=1)
76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
77 |
78 | h = torch.cat([x, aux1, aux2], dim=1)
79 | h = self.stg3_full_band_net(self.stg3_bridge(h))
80 |
81 | mask = torch.sigmoid(self.out(h))
82 | mask = F.pad(
83 | input=mask,
84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
85 | mode="replicate",
86 | )
87 |
88 | if self.training:
89 | aux1 = torch.sigmoid(self.aux1_out(aux1))
90 | aux1 = F.pad(
91 | input=aux1,
92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
93 | mode="replicate",
94 | )
95 | aux2 = torch.sigmoid(self.aux2_out(aux2))
96 | aux2 = F.pad(
97 | input=aux2,
98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
99 | mode="replicate",
100 | )
101 | return mask * mix, aux1 * mix, aux2 * mix
102 | else:
103 | if aggressiveness:
104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
105 | mask[:, :, : aggressiveness["split_bin"]],
106 | 1 + aggressiveness["value"] / 3,
107 | )
108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
109 | mask[:, :, aggressiveness["split_bin"] :],
110 | 1 + aggressiveness["value"],
111 | )
112 |
113 | return mask * mix
114 |
115 | def predict(self, x_mag, aggressiveness=None):
116 | h = self.forward(x_mag, aggressiveness)
117 |
118 | if self.offset > 0:
119 | h = h[:, :, :, self.offset : -self.offset]
120 | assert h.size()[3] > 0
121 |
122 | return h
123 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/nets_new.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import layers_new
6 |
7 |
8 | class BaseNet(nn.Module):
9 | def __init__(
10 | self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))
11 | ):
12 | super(BaseNet, self).__init__()
13 | self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1)
14 | self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1)
15 | self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1)
16 | self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1)
17 | self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1)
18 |
19 | self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True)
20 |
21 | self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
22 | self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
23 | self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
24 | self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm)
25 | self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)
26 |
27 | def __call__(self, x):
28 | e1 = self.enc1(x)
29 | e2 = self.enc2(e1)
30 | e3 = self.enc3(e2)
31 | e4 = self.enc4(e3)
32 | e5 = self.enc5(e4)
33 |
34 | h = self.aspp(e5)
35 |
36 | h = self.dec4(h, e4)
37 | h = self.dec3(h, e3)
38 | h = self.dec2(h, e2)
39 | h = torch.cat([h, self.lstm_dec2(h)], dim=1)
40 | h = self.dec1(h, e1)
41 |
42 | return h
43 |
44 |
45 | class CascadedNet(nn.Module):
46 | def __init__(self, n_fft, nout=32, nout_lstm=128):
47 | super(CascadedNet, self).__init__()
48 |
49 | self.max_bin = n_fft // 2
50 | self.output_bin = n_fft // 2 + 1
51 | self.nin_lstm = self.max_bin // 2
52 | self.offset = 64
53 |
54 | self.stg1_low_band_net = nn.Sequential(
55 | BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm),
56 | layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
57 | )
58 |
59 | self.stg1_high_band_net = BaseNet(
60 | 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2
61 | )
62 |
63 | self.stg2_low_band_net = nn.Sequential(
64 | BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm),
65 | layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
66 | )
67 | self.stg2_high_band_net = BaseNet(
68 | nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2
69 | )
70 |
71 | self.stg3_full_band_net = BaseNet(
72 | 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm
73 | )
74 |
75 | self.out = nn.Conv2d(nout, 2, 1, bias=False)
76 | self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)
77 |
78 | def forward(self, x):
79 | x = x[:, :, : self.max_bin]
80 |
81 | bandw = x.size()[2] // 2
82 | l1_in = x[:, :, :bandw]
83 | h1_in = x[:, :, bandw:]
84 | l1 = self.stg1_low_band_net(l1_in)
85 | h1 = self.stg1_high_band_net(h1_in)
86 | aux1 = torch.cat([l1, h1], dim=2)
87 |
88 | l2_in = torch.cat([l1_in, l1], dim=1)
89 | h2_in = torch.cat([h1_in, h1], dim=1)
90 | l2 = self.stg2_low_band_net(l2_in)
91 | h2 = self.stg2_high_band_net(h2_in)
92 | aux2 = torch.cat([l2, h2], dim=2)
93 |
94 | f3_in = torch.cat([x, aux1, aux2], dim=1)
95 | f3 = self.stg3_full_band_net(f3_in)
96 |
97 | mask = torch.sigmoid(self.out(f3))
98 | mask = F.pad(
99 | input=mask,
100 | pad=(0, 0, 0, self.output_bin - mask.size()[2]),
101 | mode="replicate",
102 | )
103 |
104 | if self.training:
105 | aux = torch.cat([aux1, aux2], dim=1)
106 | aux = torch.sigmoid(self.aux_out(aux))
107 | aux = F.pad(
108 | input=aux,
109 | pad=(0, 0, 0, self.output_bin - aux.size()[2]),
110 | mode="replicate",
111 | )
112 | return mask, aux
113 | else:
114 | return mask
115 |
116 | def predict_mask(self, x):
117 | mask = self.forward(x)
118 |
119 | if self.offset > 0:
120 | mask = mask[:, :, :, self.offset : -self.offset]
121 | assert mask.size()[3] > 0
122 |
123 | return mask
124 |
125 | def predict(self, x, aggressiveness=None):
126 | mask = self.forward(x)
127 | pred_mag = x * mask
128 |
129 | if self.offset > 0:
130 | pred_mag = pred_mag[:, :, :, self.offset : -self.offset]
131 | assert pred_mag.size()[3] > 0
132 |
133 | return pred_mag
134 |
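
Note: nets_new.py replaces the ASPP-only cascade with BaseNet blocks that feed an LSTM branch into the last decoder, and its forward() returns the mask itself rather than the masked magnitude. A hedged usage sketch follows (same package-import assumption as above; not repository code); note that this variant's predict() accepts an aggressiveness argument but does not use it.

# Hedged sketch of CascadedNet from nets_new.py.
import torch

from tools.uvr5.lib.lib_v5.nets_new import CascadedNet

n_fft = 2048
model = CascadedNet(n_fft, nout=32, nout_lstm=128).eval()
x_mag = torch.rand(1, 2, n_fft // 2 + 1, 512)

with torch.no_grad():
    mask = model.predict_mask(x_mag)  # sigmoid mask, trimmed by offset (= 64)
    pred_mag = model.predict(x_mag)   # x_mag * mask, trimmed the same way
print(mask.shape, pred_mag.shape)
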
--------------------------------------------------------------------------------
/tools/uvr5/lib/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import numpy as np
4 | import torch
5 | from tqdm import tqdm
6 |
7 |
8 | def load_data(file_name: str = "./lib/name_params.json") -> dict:
9 | with open(file_name, "r") as f:
10 | data = json.load(f)
11 |
12 | return data
13 |
14 |
15 | def make_padding(width, cropsize, offset):
16 | left = offset
17 | roi_size = cropsize - left * 2
18 | if roi_size == 0:
19 | roi_size = cropsize
20 | right = roi_size - (width % roi_size) + left
21 |
22 | return left, right, roi_size
23 |
24 |
25 | def inference(X_spec, device, model, aggressiveness, data):
26 | """
27 | data : dic configs
28 | """
29 |
30 | def _execute(
31 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
32 | ):
33 | model.eval()
34 | with torch.no_grad():
35 | preds = []
36 |
37 | iterations = [n_window]
38 |
39 | total_iterations = sum(iterations)
40 | for i in tqdm(range(n_window)):
41 | start = i * roi_size
42 | X_mag_window = X_mag_pad[
43 | None, :, :, start : start + data["window_size"]
44 | ]
45 | X_mag_window = torch.from_numpy(X_mag_window)
46 | if is_half:
47 | X_mag_window = X_mag_window.half()
48 | X_mag_window = X_mag_window.to(device)
49 |
50 | pred = model.predict(X_mag_window, aggressiveness)
51 |
52 | pred = pred.detach().cpu().numpy()
53 | preds.append(pred[0])
54 |
55 | pred = np.concatenate(preds, axis=2)
56 | return pred
57 |
58 | def preprocess(X_spec):
59 | X_mag = np.abs(X_spec)
60 | X_phase = np.angle(X_spec)
61 |
62 | return X_mag, X_phase
63 |
64 | X_mag, X_phase = preprocess(X_spec)
65 |
66 | coef = X_mag.max()
67 | X_mag_pre = X_mag / coef
68 |
69 | n_frame = X_mag_pre.shape[2]
70 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
71 | n_window = int(np.ceil(n_frame / roi_size))
72 |
73 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
74 |
75 | if list(model.state_dict().values())[0].dtype == torch.float16:
76 | is_half = True
77 | else:
78 | is_half = False
79 | pred = _execute(
80 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
81 | )
82 | pred = pred[:, :, :n_frame]
83 |
84 | if data["tta"]:
85 | pad_l += roi_size // 2
86 | pad_r += roi_size // 2
87 | n_window += 1
88 |
89 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
90 |
91 | pred_tta = _execute(
92 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
93 | )
94 | pred_tta = pred_tta[:, :, roi_size // 2 :]
95 | pred_tta = pred_tta[:, :, :n_frame]
96 |
97 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
98 | else:
99 | return pred * coef, X_mag, np.exp(1.0j * X_phase)
100 |
101 |
102 | def _get_name_params(model_path, model_hash):
103 | data = load_data()
104 | flag = False
105 | ModelName = model_path
106 | for type in list(data):
107 | for model in list(data[type][0]):
108 | for i in range(len(data[type][0][model])):
109 | if str(data[type][0][model][i]["hash_name"]) == model_hash:
110 | flag = True
111 | elif str(data[type][0][model][i]["hash_name"]) in ModelName:
112 | flag = True
113 |
114 | if flag:
115 | model_params_auto = data[type][0][model][i]["model_params"]
116 | param_name_auto = data[type][0][model][i]["param_name"]
117 | if type == "equivalent":
118 | return param_name_auto, model_params_auto
119 | else:
120 | flag = False
121 | return param_name_auto, model_params_auto
122 |
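
Note: inference() above slides a fixed window (data["window_size"]) over the padded magnitude spectrogram, runs model.predict() on each window, concatenates the offset-trimmed predictions, and, when data["tta"] is set, averages in a half-window-shifted second pass; it returns the predicted magnitude, the input magnitude, and the complex phase term needed for reconstruction. The end-to-end sketch below is illustrative, not repository code: the file name, window size, and aggressiveness values are placeholder assumptions, no checkpoint is loaded, and a recent librosa with multichannel STFT support is assumed.

# Hedged end-to-end sketch: STFT -> utils.inference() -> inverse STFT.
import librosa
import torch

from tools.uvr5.lib import utils
from tools.uvr5.lib.lib_v5 import nets_61968KB as nets

hop = 1024
y, sr = librosa.load("mix.wav", sr=44100, mono=False)      # placeholder stereo file
spec = librosa.stft(y, n_fft=2048, hop_length=hop)          # (2, 1025, frames)

model = nets.CascadedASPPNet(2048).eval()                   # weights omitted in this sketch
data = {"window_size": 512, "tta": False}
aggressiveness = {"split_bin": 256, "value": 0.3}

pred_mag, x_mag, phase = utils.inference(
    spec, torch.device("cpu"), model, aggressiveness, data
)
separated = librosa.istft(pred_mag * phase, hop_length=hop)  # back to waveform
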
--------------------------------------------------------------------------------