├── .dockerignore ├── .gitignore ├── Changelog_CN.md ├── Docker ├── damo.sha256 ├── download.py ├── download.sh ├── links.sha256 └── links.txt ├── Dockerfile ├── GPT_SoVITS ├── AR │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── bucket_sampler.py │ │ ├── data_module.py │ │ └── dataset.py │ ├── models │ │ ├── __init__.py │ │ ├── t2s_lightning_module.py │ │ ├── t2s_lightning_module_onnx.py │ │ ├── t2s_model.py │ │ ├── t2s_model_onnx.py │ │ └── utils.py │ ├── modules │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── activation_onnx.py │ │ ├── embedding.py │ │ ├── embedding_onnx.py │ │ ├── lr_schedulers.py │ │ ├── optim.py │ │ ├── patched_mha_with_cache.py │ │ ├── patched_mha_with_cache_onnx.py │ │ ├── scaling.py │ │ ├── transformer.py │ │ └── transformer_onnx.py │ ├── text_processing │ │ ├── __init__.py │ │ ├── phonemizer.py │ │ └── symbols.py │ └── utils │ │ ├── __init__.py │ │ ├── initialize.py │ │ └── io.py ├── configs │ ├── s1.yaml │ ├── s1big.yaml │ ├── s1big2.yaml │ ├── s1longer.yaml │ ├── s1mq.yaml │ ├── s2.json │ └── train.yaml ├── feature_extractor │ ├── __init__.py │ ├── cnhubert.py │ └── whisper_enc.py ├── inference_gui.py ├── inference_webui.py ├── module │ ├── __init__.py │ ├── attentions.py │ ├── attentions_onnx.py │ ├── commons.py │ ├── core_vq.py │ ├── data_utils.py │ ├── losses.py │ ├── mel_processing.py │ ├── models.py │ ├── models_onnx.py │ ├── modules.py │ ├── mrte_model.py │ ├── quantize.py │ └── transforms.py ├── my_utils.py ├── onnx_export.py ├── prepare_datasets │ ├── 1-get-text.py │ ├── 2-get-hubert-wav32k.py │ └── 3-get-semantic.py ├── pretrained_models │ └── .gitignore ├── process_ckpt.py ├── s1_train.py ├── s2_train.py ├── text │ ├── __init__.py │ ├── chinese.py │ ├── cleaner.py │ ├── cmudict-fast.rep │ ├── cmudict.rep │ ├── engdict-hot.rep │ ├── engdict_cache.pickle │ ├── english.py │ ├── japanese.py │ ├── opencpop-strict.txt │ ├── symbols.py │ ├── tone_sandhi.py │ └── zh_normalization │ │ ├── README.md │ │ ├── __init__.py │ │ ├── char_convert.py │ │ ├── chronology.py │ │ ├── constants.py │ │ ├── num.py │ │ ├── phonecode.py │ │ ├── quantifier.py │ │ └── text_normlization.py └── utils.py ├── GPT_SoVITS_Inference.ipynb ├── LICENSE ├── README.md ├── api.py ├── colab_webui.ipynb ├── config.py ├── docker-compose.yaml ├── dockerbuild.sh ├── docs ├── en │ └── README.md ├── ja │ ├── Changelog_JA.md │ └── README.md └── ko │ ├── Changelog_KO.md │ └── README.md ├── go-webui.bat ├── go-webui.ps1 ├── gpt-sovits_kaggle.ipynb ├── i18n └── locale │ ├── en_US.json │ ├── es_ES.json │ ├── fr_FR.json │ ├── it_IT.json │ ├── ja_JP.json │ ├── ko_KR.json │ ├── pt_BR.json │ ├── ru_RU.json │ ├── tr_TR.json │ ├── zh_CN.json │ ├── zh_HK.json │ ├── zh_SG.json │ └── zh_TW.json ├── install.sh ├── requirements.txt ├── tools ├── asr │ ├── config.py │ ├── fasterwhisper_asr.py │ ├── funasr_asr.py │ └── models │ │ └── .gitignore ├── cmd-denoise.py ├── denoise-model │ └── .gitignore ├── i18n │ ├── i18n.py │ ├── locale │ │ ├── en_US.json │ │ ├── es_ES.json │ │ ├── fr_FR.json │ │ ├── it_IT.json │ │ ├── ja_JP.json │ │ ├── ko_KR.json │ │ ├── ru_RU.json │ │ ├── tr_TR.json │ │ ├── zh_CN.json │ │ ├── zh_HK.json │ │ ├── zh_SG.json │ │ └── zh_TW.json │ ├── locale_diff.py │ └── scan_i18n.py ├── my_utils.py ├── slice_audio.py ├── slicer2.py ├── subfix_webui.py └── uvr5 │ ├── lib │ ├── lib_v5 │ │ ├── dataset.py │ │ ├── layers.py │ │ ├── layers_123812KB.py │ │ ├── layers_123821KB.py │ │ ├── layers_33966KB.py │ │ ├── layers_537227KB.py │ │ ├── layers_537238KB.py │ │ ├── layers_new.py │ │ ├── 
model_param_init.py │ │ ├── modelparams │ │ │ ├── 1band_sr16000_hl512.json │ │ │ ├── 1band_sr32000_hl512.json │ │ │ ├── 1band_sr33075_hl384.json │ │ │ ├── 1band_sr44100_hl1024.json │ │ │ ├── 1band_sr44100_hl256.json │ │ │ ├── 1band_sr44100_hl512.json │ │ │ ├── 1band_sr44100_hl512_cut.json │ │ │ ├── 2band_32000.json │ │ │ ├── 2band_44100_lofi.json │ │ │ ├── 2band_48000.json │ │ │ ├── 3band_44100.json │ │ │ ├── 3band_44100_mid.json │ │ │ ├── 3band_44100_msb2.json │ │ │ ├── 4band_44100.json │ │ │ ├── 4band_44100_mid.json │ │ │ ├── 4band_44100_msb.json │ │ │ ├── 4band_44100_msb2.json │ │ │ ├── 4band_44100_reverse.json │ │ │ ├── 4band_44100_sw.json │ │ │ ├── 4band_v2.json │ │ │ ├── 4band_v2_sn.json │ │ │ ├── 4band_v3.json │ │ │ └── ensemble.json │ │ ├── nets.py │ │ ├── nets_123812KB.py │ │ ├── nets_123821KB.py │ │ ├── nets_33966KB.py │ │ ├── nets_537227KB.py │ │ ├── nets_537238KB.py │ │ ├── nets_61968KB.py │ │ ├── nets_new.py │ │ └── spec_utils.py │ ├── name_params.json │ └── utils.py │ ├── mdxnet.py │ ├── vr.py │ └── webui.py ├── vc_webui.py └── webui.py /.dockerignore: -------------------------------------------------------------------------------- 1 | docs 2 | logs 3 | output 4 | reference 5 | SoVITS_weights 6 | GPT_weights 7 | TEMP 8 | .git 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | *.pyc 4 | env 5 | runtime 6 | .idea 7 | output 8 | logs 9 | reference 10 | GPT_weights 11 | SoVITS_weights 12 | TEMP 13 | 14 | 15 | -------------------------------------------------------------------------------- /Changelog_CN.md: -------------------------------------------------------------------------------- 1 | ### 20240121更新 2 | 3 | 1-config添加is_share,诸如colab等场景可以将此改为True,来使得webui映射到公网 4 | 5 | 2-WebUI添加英文系统英文翻译适配 6 | 7 | 3-cmd-asr自动判断是否已自带damo模型,如不在默认目录上将从modelscope自带下载 8 | 9 | 4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复(过滤长度0的样本等) 10 | 11 | 5-清理TEMP文件夹缓存音频等文件 12 | 13 | 6-大幅削弱合成音频包含参考音频结尾的问题 14 | 15 | ### 20240122更新 16 | 17 | 1-修复过短输出文件返回重复参考音频的问题。 18 | 19 | 2-经测试,英文日文训练原生支持(日文训练需要根目录不含非英文等特殊字符)。 20 | 21 | 3-音频路径检查。如果尝试读取输入错的路径报错路径不存在,而非ffmpeg错误。 22 | 23 | ### 20240123更新 24 | 25 | 1-解决hubert提取nan导致SoVITS/GPT训练报错ZeroDivisionError的问题 26 | 27 | 2-支持推理界面快速切换模型 28 | 29 | 3-优化模型文件排序逻辑 30 | 31 | 4-中文分词使用jieba_fast代替jieba 32 | 33 | ### 20240126更新 34 | 35 | 1-支持输出文本中英混合、日英混合 36 | 37 | 2-输出可选切分模式 38 | 39 | 3-修复uvr5读取到目录自动跳出的问题 40 | 41 | 4-修复多个换行导致推理报错 42 | 43 | 5-去除推理界面大量冗余log 44 | 45 | 6-支持mac训练推理 46 | 47 | 7-自动识别不支持半精度的卡强制单精度。cpu推理下强制单精度。 48 | 49 | ### 20240128更新 50 | 51 | 1-修复数字转汉字念法问题 52 | 53 | 2-修复句首少量字容易吞字的问题 54 | 55 | 3-通过限制排除不合理的参考音频长度 56 | 57 | 4-修复GPT训练不保存ckpt的问题 58 | 59 | 5-完善Dockerfile的下载模型流程 60 | 61 | ### 20240129更新 62 | 63 | 1-16系等半精度训练有问题的显卡把训练配置改为单精度训练 64 | 65 | 2-测试更新可用的colab版本 66 | 67 | 3-修复git clone modelscope funasr仓库+老版本funasr导致接口不对齐报错的问题 68 | 69 | 70 | ### 20240130更新 71 | 72 | 1-所有涉及路径的地方双引号自动去除,小白复制路径带双引号不会报错 73 | 74 | 2-修复中英文标点切割问题和句首句尾补标点的问题 75 | 76 | 3-增加按标点符号切分 77 | 78 | ### 20240201更新 79 | 80 | 1-修复uvr5读取格式错误导致分离失败的问题 81 | 82 | 2-支持中日英混合多种文本自动切分识别语种 83 | 84 | ### 20240202更新 85 | 86 | 1-修复asr路径尾缀带/保存文件名报错 87 | 88 | 2-引入paddlespeech的Normalizer https://github.com/RVC-Boss/GPT-SoVITS/pull/377 修复一些问题,例如:xx.xx%(带百分号类),元/吨 会读成 元吨 而不是元每吨,下划线不再会报错 89 | 90 | ### 20240207更新 91 | 92 | 1-修正语种传参混乱导致中文推理效果下降 https://github.com/RVC-Boss/GPT-SoVITS/issues/391 93 | 94 | 2-uvr5适配高版本librosa 
https://github.com/RVC-Boss/GPT-SoVITS/pull/403 95 | 96 | 3-修复uvr5 inf everywhere报错的问题(is_half传参未转换bool导致恒定半精度推理,16系显卡会inf) https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8 97 | 98 | 4-优化英文文本前端 99 | 100 | 5-修复gradio依赖 101 | 102 | 6-支持三连根目录留空自动读取.list全路径 103 | 104 | 7-集成faster whisper ASR日文英文 105 | 106 | ### 20240208更新 107 | 108 | 1-GPT训练卡死(win10 1909)和https://github.com/RVC-Boss/GPT-SoVITS/issues/232 (系统语言繁体)GPT训练报错,[尝试修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)。 109 | 110 | ### 20240212更新 111 | 112 | 1-faster whisper和funasr逻辑优化。faster whisper转镜像站下载,规避huggingface连不上的问题。 113 | 114 | 2-DPO Loss实验性训练选项开启,通过构造负样本训练缓解GPT重复漏字问题。推理界面公开几个推理参数。 https://github.com/RVC-Boss/GPT-SoVITS/pull/457 115 | 116 | ### 20240214更新 117 | 118 | 1-训练支持中文实验名(原来会报错) 119 | 120 | 2-DPO训练改为可勾选选项而非必须。如勾选batch size自动减半。修复推理界面新参数不传参的问题。 121 | 122 | ### 20240216更新 123 | 124 | 1-支持无参考文本输入 125 | 126 | 2-修复中文文本前端bug https://github.com/RVC-Boss/GPT-SoVITS/issues/475 127 | 128 | ### 20240221更新 129 | 130 | 1-数据处理添加语音降噪选项 131 | 132 | 2-中文日文前端处理优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/559 https://github.com/RVC-Boss/GPT-SoVITS/pull/556 https://github.com/RVC-Boss/GPT-SoVITS/pull/532 https://github.com/RVC-Boss/GPT-SoVITS/pull/507 https://github.com/RVC-Boss/GPT-SoVITS/pull/509 133 | 134 | 3-mac CPU推理更快因此把推理设备从mps改到CPU 135 | 136 | 4-colab修复不开启公网url 137 | 138 | todolist: 139 | 140 | 1-中文多音字推理优化 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /Docker/damo.sha256: -------------------------------------------------------------------------------- 1 | 5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025 /workspace/tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.pb 2 | b3be75be477f0780277f3bae0fe489f48718f585f3a6e45d7dd1fbb1a4255fc5 /workspace/tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pb 3 | a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916 /workspace/tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pb -------------------------------------------------------------------------------- /Docker/download.py: -------------------------------------------------------------------------------- 1 | # Download moda ASR related models 2 | from modelscope import snapshot_download 3 | model_dir = snapshot_download('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',revision="v2.0.4") 4 | model_dir = snapshot_download('damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',revision="v2.0.4") 5 | model_dir = snapshot_download('damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',revision="v2.0.4") 6 | -------------------------------------------------------------------------------- /Docker/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -Eeuo pipefail 4 | 5 | echo "Downloading models..." 6 | 7 | aria2c --disable-ipv6 --input-file /workspace/Docker/links.txt --dir /workspace --continue 8 | 9 | echo "Checking SHA256..." 
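# Each line of links.sha256 pairs an expected digest with an absolute file path
# under /workspace, so feeding the lines one at a time into `sha256sum -c`
# verifies the downloaded models. A hand-run equivalent for a single entry would
# look roughly like this (digest and path below are placeholders, not real values):
#   echo "<sha256-digest>  /workspace/path/to/model.pth" | sha256sum -c -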
10 | 11 | parallel --will-cite -a /workspace/Docker/links.sha256 "echo -n {} | sha256sum -c" 12 | -------------------------------------------------------------------------------- /Docker/links.sha256: -------------------------------------------------------------------------------- 1 | b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1 /workspace/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt 2 | fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8 /workspace/GPT_SoVITS/pretrained_models/s2D488k.pth 3 | 020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835 /workspace/GPT_SoVITS/pretrained_models/s2G488k.pth 4 | 24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c /workspace/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin 5 | e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8 /workspace/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin 6 | 39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05 /workspace/tools/uvr5/uvr5_weights/HP2_all_vocals.pth 7 | 45e6b65199e781b4a6542002699be9f19cd3d1cb7d1558bc2bfbcd84674dfe28 /workspace/tools/uvr5/uvr5_weights/HP3_all_vocals.pth 8 | 5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee /workspace/tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth 9 | 8c8fd1582f9aabc363e47af62ddb88df6cae7e064cae75bbf041a067a5e0aee2 /workspace/tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth 10 | 01376dd2a571bf3cb9cced680732726d2d732609d09216a610b0d110f133febe /workspace/tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth 11 | 56aba59db3bcdd14a14464e62f3129698ecdea62eee0f003b9360923eb3ac79e /workspace/tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth 12 | 233bb5c6aaa365e568659a0a81211746fa881f8f47f82d9e864fce1f7692db80 /workspace/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx -------------------------------------------------------------------------------- /Docker/links.txt: -------------------------------------------------------------------------------- 1 | # GPT-SoVITS models 2 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s1bert25hz-2kh-longer-epoch%3D68e-step%3D50232.ckpt 3 | out=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt 4 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2D488k.pth 5 | out=GPT_SoVITS/pretrained_models/s2D488k.pth 6 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2G488k.pth 7 | out=GPT_SoVITS/pretrained_models/s2G488k.pth 8 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/config.json 9 | out=GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json 10 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/preprocessor_config.json 11 | out=GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json 12 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/pytorch_model.bin 13 | out=GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin 14 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/config.json 15 | out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json 16 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/pytorch_model.bin 17 | out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin 18 | https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/tokenizer.json 19 | 
out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json 20 | # UVR5 21 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth 22 | out=tools/uvr5/uvr5_weights/HP2_all_vocals.pth 23 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth 24 | out=tools/uvr5/uvr5_weights/HP3_all_vocals.pth 25 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth 26 | out=tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth 27 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth 28 | out=tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth 29 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth 30 | out=tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth 31 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth 32 | out=tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth 33 | https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx 34 | out=tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Base CUDA image 2 | FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04 3 | 4 | LABEL maintainer="breakstring@hotmail.com" 5 | LABEL version="dev-20240209" 6 | LABEL description="Docker image for GPT-SoVITS" 7 | 8 | 9 | # Install 3rd party apps 10 | ENV DEBIAN_FRONTEND=noninteractive 11 | ENV TZ=Etc/UTC 12 | RUN apt-get update && \ 13 | apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \ 14 | git lfs install && \ 15 | rm -rf /var/lib/apt/lists/* 16 | 17 | # Copy only requirements.txt initially to leverage Docker cache 18 | WORKDIR /workspace 19 | COPY requirements.txt /workspace/ 20 | RUN pip install --no-cache-dir -r requirements.txt 21 | 22 | # Define a build-time argument for image type 23 | ARG IMAGE_TYPE=full 24 | 25 | # Conditional logic based on the IMAGE_TYPE argument 26 | # Always copy the Docker directory, but only use it if IMAGE_TYPE is not "elite" 27 | COPY ./Docker /workspace/Docker 28 | # elite 类型的镜像里面不包含额外的模型 29 | RUN if [ "$IMAGE_TYPE" != "elite" ]; then \ 30 | chmod +x /workspace/Docker/download.sh && \ 31 | /workspace/Docker/download.sh && \ 32 | python /workspace/Docker/download.py && \ 33 | python -m nltk.downloader averaged_perceptron_tagger cmudict; \ 34 | fi 35 | 36 | 37 | # Copy the rest of the application 38 | COPY . /workspace 39 | 40 | # Copy the rest of the application 41 | COPY . 
/workspace 42 | 43 | EXPOSE 9871 9872 9873 9874 9880 44 | 45 | CMD ["python", "webui.py"] 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/AR/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/AR/data/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/data_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | from pytorch_lightning import LightningDataModule 4 | from AR.data.bucket_sampler import DistributedBucketSampler 5 | from AR.data.dataset import Text2SemanticDataset 6 | from torch.utils.data import DataLoader 7 | 8 | 9 | class Text2SemanticDataModule(LightningDataModule): 10 | def __init__( 11 | self, 12 | config, 13 | train_semantic_path, 14 | train_phoneme_path, 15 | dev_semantic_path=None, 16 | dev_phoneme_path=None, 17 | ): 18 | super().__init__() 19 | self.config = config 20 | self.train_semantic_path = train_semantic_path 21 | self.train_phoneme_path = train_phoneme_path 22 | self.dev_semantic_path = dev_semantic_path 23 | self.dev_phoneme_path = dev_phoneme_path 24 | self.num_workers = self.config["data"]["num_workers"] 25 | 26 | def prepare_data(self): 27 | pass 28 | 29 | def setup(self, stage=None, output_logs=False): 30 | self._train_dataset = Text2SemanticDataset( 31 | phoneme_path=self.train_phoneme_path, 32 | semantic_path=self.train_semantic_path, 33 | max_sec=self.config["data"]["max_sec"], 34 | pad_val=self.config["data"]["pad_val"], 35 | ) 36 | self._dev_dataset = self._train_dataset 37 | # self._dev_dataset = Text2SemanticDataset( 38 | # phoneme_path=self.dev_phoneme_path, 39 | # semantic_path=self.dev_semantic_path, 40 | # max_sample=self.config['data']['max_eval_sample'], 41 | # max_sec=self.config['data']['max_sec'], 42 | # pad_val=self.config['data']['pad_val']) 43 | 44 | def train_dataloader(self): 45 | batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"] 46 | batch_size = max(min(batch_size,len(self._train_dataset)//4),1)#防止不保存 47 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) 48 | return DataLoader( 49 | self._train_dataset, 50 | batch_size=batch_size, 51 | sampler=sampler, 52 | collate_fn=self._train_dataset.collate, 53 | num_workers=self.num_workers, 54 | persistent_workers=True, 55 | prefetch_factor=16, 56 | ) 57 | 58 | def val_dataloader(self): 59 | return DataLoader( 60 | self._dev_dataset, 61 | batch_size=1, 62 | shuffle=False, 63 | collate_fn=self._train_dataset.collate, 64 | num_workers=max(self.num_workers, 12), 65 | persistent_workers=True, 66 | prefetch_factor=16, 67 | ) 68 | 69 | # 这个会使用到嘛? 
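    # Usage sketch (the paths and Trainer wiring below are illustrative, not taken
    # from this file): the DataModule is meant to be handed to a
    # pytorch_lightning.Trainer together with a Text2SemanticLightningModule.
    #
    #   data_module = Text2SemanticDataModule(
    #       config,
    #       train_semantic_path="logs/<exp>/6-name2semantic.tsv",  # assumed layout
    #       train_phoneme_path="logs/<exp>/2-name2text.txt",       # assumed layout
    #   )
    #   trainer = Trainer(max_epochs=config["train"]["epochs"])
    #   trainer.fit(lightning_module, data_module)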
70 | def test_dataloader(self): 71 | return DataLoader( 72 | self._dev_dataset, 73 | batch_size=1, 74 | shuffle=False, 75 | collate_fn=self._train_dataset.collate, 76 | ) 77 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/AR/models/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import os, sys 4 | 5 | now_dir = os.getcwd() 6 | sys.path.append(now_dir) 7 | from typing import Dict 8 | 9 | import torch 10 | from pytorch_lightning import LightningModule 11 | from AR.models.t2s_model_onnx import Text2SemanticDecoder 12 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule 13 | from AR.modules.optim import ScaledAdam 14 | 15 | 16 | class Text2SemanticLightningModule(LightningModule): 17 | def __init__(self, config, output_dir, is_train=True): 18 | super().__init__() 19 | self.config = config 20 | self.top_k = 3 21 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) 22 | pretrained_s1 = config.get("pretrained_s1") 23 | if pretrained_s1 and is_train: 24 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) 25 | print( 26 | self.load_state_dict( 27 | torch.load(pretrained_s1, map_location="cpu")["weight"] 28 | ) 29 | ) 30 | if is_train: 31 | self.automatic_optimization = False 32 | self.save_hyperparameters() 33 | self.eval_dir = output_dir / "eval" 34 | self.eval_dir.mkdir(parents=True, exist_ok=True) 35 | 36 | def training_step(self, batch: Dict, batch_idx: int): 37 | opt = self.optimizers() 38 | scheduler = self.lr_schedulers() 39 | loss, acc = self.model.forward( 40 | batch["phoneme_ids"], 41 | batch["phoneme_ids_len"], 42 | batch["semantic_ids"], 43 | batch["semantic_ids_len"], 44 | batch["bert_feature"], 45 | ) 46 | self.manual_backward(loss) 47 | if batch_idx > 0 and batch_idx % 4 == 0: 48 | opt.step() 49 | opt.zero_grad() 50 | scheduler.step() 51 | 52 | self.log( 53 | "total_loss", 54 | loss, 55 | on_step=True, 56 | on_epoch=True, 57 | prog_bar=True, 58 | sync_dist=True, 59 | ) 60 | self.log( 61 | "lr", 62 | scheduler.get_last_lr()[0], 63 | on_epoch=True, 64 | prog_bar=True, 65 | sync_dist=True, 66 | ) 67 | self.log( 68 | f"top_{self.top_k}_acc", 69 | acc, 70 | on_step=True, 71 | on_epoch=True, 72 | prog_bar=True, 73 | sync_dist=True, 74 | ) 75 | 76 | def validation_step(self, batch: Dict, batch_idx: int): 77 | return 78 | 79 | def configure_optimizers(self): 80 | model_parameters = self.model.parameters() 81 | parameters_names = [] 82 | parameters_names.append( 83 | [name_param_pair[0] for name_param_pair in self.model.named_parameters()] 84 | ) 85 | lm_opt = ScaledAdam( 86 | model_parameters, 87 | lr=0.01, 88 | betas=(0.9, 0.95), 89 | clipping_scale=2.0, 90 | parameters_names=parameters_names, 91 | show_dominant_parameters=False, 92 | clipping_update_period=1000, 93 | ) 94 | 95 | return { 96 | "optimizer": lm_opt, 97 | "lr_scheduler": { 98 | "scheduler": WarmupCosineLRSchedule( 99 | lm_opt, 100 | 
init_lr=self.config["optimizer"]["lr_init"], 101 | peak_lr=self.config["optimizer"]["lr"], 102 | end_lr=self.config["optimizer"]["lr_end"], 103 | warmup_steps=self.config["optimizer"]["warmup_steps"], 104 | total_steps=self.config["optimizer"]["decay_steps"], 105 | ) 106 | }, 107 | } 108 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/AR/modules/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/embedding.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | 50 | self.reverse = False 51 | self.pe = None 52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000)) 53 | 54 | def extend_pe(self, x): 55 | """Reset the positional encodings.""" 56 | if self.pe is not None: 57 | if self.pe.size(1) >= x.size(1): 58 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 60 | return 61 | pe = torch.zeros(x.size(1), self.embedding_dim) 62 | if self.reverse: 63 | position = torch.arange( 64 | x.size(1) - 1, -1, -1.0, dtype=torch.float32 65 | ).unsqueeze(1) 66 | else: 67 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) 68 | div_term = torch.exp( 69 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) 70 | * -(math.log(10000.0) / self.embedding_dim) 71 | ) 72 | pe[:, 0::2] = torch.sin(position * div_term) 73 | pe[:, 1::2] = torch.cos(position * div_term) 74 | pe = pe.unsqueeze(0) 75 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach() 76 | 77 | def forward(self, x: torch.Tensor) -> torch.Tensor: 78 | self.extend_pe(x) 79 | output = x.unsqueeze(-1) if x.ndim == 2 else x 80 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] 81 | return self.dropout(output) 82 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/embedding_onnx.py: 
-------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | self.reverse = False 50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) 51 | 52 | def extend_pe(self, x): 53 | position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1) 54 | scpe = (position * self.div_term).unsqueeze(0) 55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) 56 | pe = pe.contiguous().view(1, -1, self.embedding_dim) 57 | return pe 58 | 59 | def forward(self, x: torch.Tensor) -> torch.Tensor: 60 | pe = self.extend_pe(x) 61 | output = x.unsqueeze(-1) if x.ndim == 2 else x 62 | output = output * self.x_scale + self.alpha * pe 63 | return self.dropout(output) 64 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import math 4 | 5 | import torch 6 | from matplotlib import pyplot as plt 7 | from torch import nn 8 | from torch.optim import Adam 9 | 10 | 11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): 12 | """ 13 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. 
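    In outline, the schedule this class is written to follow is:

        t <  warmup_steps:                lr = init_lr + (peak_lr - init_lr) * t / warmup_steps
        warmup_steps <= t <= total_steps: lr = end_lr + 0.5 * (1 + cos(pi * d)) * (peak_lr - end_lr),
                                          where d = (t - warmup_steps) / (total_steps - warmup_steps)
        t >  total_steps:                 lr = end_lr

    Note that step() as written below then pins the learning rate to a constant
    0.002, so the warmup/cosine values above are computed but not actually applied.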
14 | """ 15 | 16 | def __init__( 17 | self, 18 | optimizer, 19 | init_lr, 20 | peak_lr, 21 | end_lr, 22 | warmup_steps=10000, 23 | total_steps=400000, 24 | current_step=0, 25 | ): 26 | self.init_lr = init_lr 27 | self.peak_lr = peak_lr 28 | self.end_lr = end_lr 29 | self.optimizer = optimizer 30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps 31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) 32 | self._current_step = current_step 33 | self.lr = init_lr 34 | self.warmup_steps = warmup_steps 35 | self.total_steps = total_steps 36 | self._last_lr = [self.lr] 37 | 38 | def set_lr(self, lr): 39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups] 40 | for g in self.optimizer.param_groups: 41 | # g['lr'] = lr 42 | g["lr"] = self.end_lr ###锁定用线性 43 | 44 | def step(self): 45 | if self._current_step < self.warmup_steps: 46 | lr = self.init_lr + self._warmup_rate * self._current_step 47 | 48 | elif self._current_step > self.total_steps: 49 | lr = self.end_lr 50 | 51 | else: 52 | decay_ratio = (self._current_step - self.warmup_steps) / ( 53 | self.total_steps - self.warmup_steps 54 | ) 55 | if decay_ratio < 0.0 or decay_ratio > 1.0: 56 | raise RuntimeError( 57 | "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings." 58 | ) 59 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) 60 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) 61 | 62 | self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! 63 | self.set_lr(lr) 64 | self.lr = lr 65 | self._current_step += 1 66 | return self.lr 67 | 68 | 69 | if __name__ == "__main__": 70 | m = nn.Linear(10, 10) 71 | opt = Adam(m.parameters(), lr=1e-4) 72 | s = WarmupCosineLRSchedule( 73 | opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0 74 | ) 75 | lrs = [] 76 | for i in range(25000): 77 | s.step() 78 | lrs.append(s.lr) 79 | print(s.lr) 80 | 81 | plt.plot(lrs) 82 | plt.plot(range(0, 25000), lrs) 83 | plt.show() 84 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py: -------------------------------------------------------------------------------- 1 | from torch.nn.functional import * 2 | from torch.nn.functional import ( 3 | _mha_shape_check, 4 | _canonical_mask, 5 | _none_or_dtype, 6 | _in_projection_packed, 7 | ) 8 | 9 | def multi_head_attention_forward_patched( 10 | query, 11 | key, 12 | value, 13 | embed_dim_to_check: int, 14 | num_heads: int, 15 | in_proj_weight, 16 | in_proj_bias: Optional[Tensor], 17 | bias_k: Optional[Tensor], 18 | bias_v: Optional[Tensor], 19 | add_zero_attn: bool, 20 | dropout_p: float, 21 | out_proj_weight: Tensor, 22 | out_proj_bias: Optional[Tensor], 23 | training: bool = True, 24 | key_padding_mask: Optional[Tensor] = None, 25 | need_weights: bool = True, 26 | attn_mask: Optional[Tensor] = None, 27 | use_separate_proj_weight: bool = False, 28 | q_proj_weight: Optional[Tensor] = None, 29 | k_proj_weight: Optional[Tensor] = None, 30 | v_proj_weight: Optional[Tensor] = None, 31 | static_k: Optional[Tensor] = None, 32 | static_v: Optional[Tensor] = None, 33 | average_attn_weights: bool = True, 34 | is_causal: bool = False, 35 | cache=None, 36 | ) -> Tuple[Tensor, Optional[Tensor]]: 37 | 38 | # set up shape vars 39 | _, _, embed_dim = query.shape 40 | attn_mask = _canonical_mask( 41 | mask=attn_mask, 42 | mask_name="attn_mask", 43 | other_type=None, 44 | other_name="", 45 | target_type=query.dtype, 46 | check_other=False, 47 | ) 48 | head_dim = embed_dim // 
num_heads 49 | 50 | proj_qkv = linear(query, in_proj_weight, in_proj_bias) 51 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() 52 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] 53 | 54 | if cache["first_infer"] == 1: 55 | cache["k"][cache["stage"]] = k 56 | cache["v"][cache["stage"]] = v 57 | else: 58 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) 59 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) 60 | k = cache["k"][cache["stage"]] 61 | v = cache["v"][cache["stage"]] 62 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] 63 | 64 | attn_mask = _canonical_mask( 65 | mask=attn_mask, 66 | mask_name="attn_mask", 67 | other_type=None, 68 | other_name="", 69 | target_type=q.dtype, 70 | check_other=False, 71 | ) 72 | attn_mask = attn_mask.unsqueeze(0) 73 | 74 | q = q.view(-1, num_heads, head_dim).transpose(0, 1) 75 | k = k.view(-1, num_heads, head_dim).transpose(0, 1) 76 | v = v.view(-1, num_heads, head_dim).transpose(0, 1) 77 | 78 | dropout_p = 0.0 79 | attn_mask = attn_mask.unsqueeze(0) 80 | q = q.view(num_heads, -1, head_dim).unsqueeze(0) 81 | k = k.view(num_heads, -1, head_dim).unsqueeze(0) 82 | v = v.view(num_heads, -1, head_dim).unsqueeze(0) 83 | attn_output = scaled_dot_product_attention( 84 | q, k, v, attn_mask, dropout_p, is_causal 85 | ) 86 | attn_output = ( 87 | attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) 88 | ) 89 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias) 90 | attn_output = attn_output.view(-1, 1, attn_output.size(1)) 91 | 92 | return attn_output 93 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/AR/text_processing/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import re 5 | from typing import Dict 6 | from typing import List 7 | 8 | import regex 9 | from gruut import sentences 10 | from gruut.const import Sentence 11 | from gruut.const import Word 12 | from AR.text_processing.symbols import SYMBOL_TO_ID 13 | 14 | 15 | class GruutPhonemizer: 16 | def __init__(self, language: str): 17 | self._phonemizer = sentences 18 | self.lang = language 19 | self.symbol_to_id = SYMBOL_TO_ID 20 | self._special_cases_dict: Dict[str] = { 21 | r"\.\.\.": "... ", 22 | ";": "; ", 23 | ":": ": ", 24 | ",": ", ", 25 | r"\.": ". ", 26 | "!": "! ", 27 | r"\?": "? 
", 28 | "—": "—", 29 | "…": "… ", 30 | "«": "«", 31 | "»": "»", 32 | } 33 | self._punctuation_regexp: str = ( 34 | rf"([{''.join(self._special_cases_dict.keys())}])" 35 | ) 36 | 37 | def _normalize_punctuation(self, text: str) -> str: 38 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) 39 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) 40 | text = regex.sub(r"\pZ+", r" ", text) 41 | return text.strip() 42 | 43 | def _convert_punctuation(self, word: Word) -> str: 44 | if not word.phonemes: 45 | return "" 46 | if word.phonemes[0] in ["‖", "|"]: 47 | return word.text.strip() 48 | 49 | phonemes = "".join(word.phonemes) 50 | # remove modifier characters ˈˌː with regex 51 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) 52 | return phonemes.strip() 53 | 54 | def phonemize(self, text: str, espeak: bool = False) -> str: 55 | text_to_phonemize: str = self._normalize_punctuation(text) 56 | sents: List[Sentence] = [ 57 | sent 58 | for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) 59 | ] 60 | words: List[str] = [ 61 | self._convert_punctuation(word) for word in itertools.chain(*sents) 62 | ] 63 | return " ".join(words) 64 | 65 | def transform(self, phonemes): 66 | # convert phonemes to ids 67 | # dictionary is in symbols.py 68 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] 69 | 70 | 71 | if __name__ == "__main__": 72 | phonemizer = GruutPhonemizer("en-us") 73 | # text -> IPA 74 | phonemes = phonemizer.phonemize("Hello, wor-ld ?") 75 | print("phonemes:", phonemes) 76 | print("len(phonemes):", len(phonemes)) 77 | phoneme_ids = phonemizer.transform(phonemes) 78 | print("phoneme_ids:", phoneme_ids) 79 | print("len(phoneme_ids):", len(phoneme_ids)) 80 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/symbols.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | PAD = "_" 4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” ' 5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 6 | IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 7 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) 8 | SPACE_ID = SYMBOLS.index(" ") 9 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} 10 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} 11 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def str2bool(str): 5 | return True if str.lower() == 'true' else False 6 | 7 | 8 | def get_newest_ckpt(string_list): 9 | # 定义一个正则表达式模式,用于匹配字符串中的数字 10 | pattern = r'epoch=(\d+)-step=(\d+)\.ckpt' 11 | 12 | # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 13 | extracted_info = [] 14 | for string in string_list: 15 | match = re.match(pattern, string) 16 | if match: 17 | epoch = int(match.group(1)) 18 | step = int(match.group(2)) 19 | extracted_info.append((epoch, step, string)) 20 | # 按照 epoch 后面的数字和 step 后面的数字进行排序 21 | sorted_info = sorted( 22 | extracted_info, key=lambda x: (x[0], x[1]), reverse=True) 23 | # 获取最新的 ckpt 文件名 24 | newest_ckpt = sorted_info[0][2] 25 | return newest_ckpt 26 | 27 | 28 | # 文本存在且不为空时 
return True 29 | def check_txt_file(file_path): 30 | try: 31 | with open(file_path, 'r') as file: 32 | text = file.readline().strip() 33 | assert text.strip() != '' 34 | return text 35 | except Exception: 36 | return False 37 | return False 38 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/initialize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Initialize modules for espnet2 neural networks.""" 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | 7 | def initialize(model: torch.nn.Module, init: str): 8 | """Initialize weights of a neural network module. 9 | 10 | Parameters are initialized using the given method or distribution. 11 | 12 | Custom initialization routines can be implemented into submodules 13 | as function `espnet_initialization_fn` within the custom module. 14 | 15 | Args: 16 | model: Target. 17 | init: Method of initialization. 18 | """ 19 | assert check_argument_types() 20 | print("init with", init) 21 | 22 | # weight init 23 | for p in model.parameters(): 24 | if p.dim() > 1: 25 | if init == "xavier_uniform": 26 | torch.nn.init.xavier_uniform_(p.data) 27 | elif init == "xavier_normal": 28 | torch.nn.init.xavier_normal_(p.data) 29 | elif init == "kaiming_uniform": 30 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") 31 | elif init == "kaiming_normal": 32 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") 33 | else: 34 | raise ValueError("Unknown initialization: " + init) 35 | # bias init 36 | for name, p in model.named_parameters(): 37 | if ".bias" in name and p.dim() == 1: 38 | p.data.zero_() 39 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | import yaml 5 | 6 | 7 | def load_yaml_config(path): 8 | with open(path) as f: 9 | config = yaml.full_load(f) 10 | return config 11 | 12 | 13 | def save_config_to_yaml(config, path): 14 | assert path.endswith(".yaml") 15 | with open(path, "w") as f: 16 | f.write(yaml.dump(config)) 17 | f.close() 18 | 19 | 20 | def write_args(args, path): 21 | args_dict = dict( 22 | (name, getattr(args, name)) for name in dir(args) if not name.startswith("_") 23 | ) 24 | with open(path, "a") as args_file: 25 | args_file.write("==> torch version: {}\n".format(torch.__version__)) 26 | args_file.write( 27 | "==> cudnn version: {}\n".format(torch.backends.cudnn.version()) 28 | ) 29 | args_file.write("==> Cmd:\n") 30 | args_file.write(str(sys.argv)) 31 | args_file.write("\n==> args:\n") 32 | for k, v in sorted(args_dict.items()): 33 | args_file.write(" %s: %s\n" % (str(k), str(v))) 34 | args_file.close() 35 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | 
linear_units: 2048 27 | n_layer: 12 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1big.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 16 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1big2.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 12 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 6 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1longer.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 20 4 | batch_size: 8 5 | save_every_n_epoch: 1 6 | precision: 16-mixed 7 | gradient_clip: 1.0 8 | optimizer: 9 | lr: 0.01 10 | lr_init: 0.00001 11 | lr_end: 0.0001 12 | warmup_steps: 2000 13 | decay_steps: 40000 14 | data: 15 | max_eval_sample: 8 16 | max_sec: 54 17 | num_workers: 4 18 | pad_val: 1024 # same with EOS in model 19 | model: 20 | vocab_size: 1025 21 | phoneme_vocab_size: 512 22 | embedding_dim: 512 23 | hidden_dim: 512 24 | head: 16 25 | linear_units: 2048 26 | n_layer: 24 27 | dropout: 0 28 | EOS: 1024 29 | random_bert: 0 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1mq.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 100 4 | batch_size: 6 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 32 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 40 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | saving_path: "ckpt/" 22 | resume_checkpoint: null 23 | vocoder_config_path: "quantizer/new_ckpt/config.json" 24 | vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000" 25 | datadir: "/home/liweiche/GigaSpeech/wavs" 26 | metapath: "/home/liweiche/GigaSpeech/train2.json" 27 | val_metapath: "/home/liweiche/GigaSpeech/dev2.json" 28 | sampledir: "logs/" 29 | pretrained_path: null 30 | lr: 0.0001 31 
| batch_size: 200.0 32 | train_bucket_size: 8192 33 | training_step: 800000 34 | optim_flat_percent: 0.0 35 | warmup_step: 50 36 | adam_beta1: 0.9 37 | adam_beta2: 0.98 38 | ffd_size: 3072 39 | hidden_size: 768 40 | enc_nlayers: 6 41 | dec_nlayers: 6 42 | nheads: 12 43 | ar_layer: 4 44 | ar_ffd_size: 1024 45 | ar_hidden_size: 256 46 | ar_nheads: 4 47 | aligner_softmax_temp: 1.0 48 | layer_norm_eps: 0.00001 49 | speaker_embed_dropout: 0.05 50 | label_smoothing: 0.0 51 | val_check_interval: 5000 52 | check_val_every_n_epoch: 1 53 | precision: "fp16" 54 | nworkers: 16 55 | distributed: true 56 | accelerator: "ddp" 57 | version: null 58 | accumulate_grad_batches: 1 59 | use_repetition_token: true 60 | use_repetition_gating: false 61 | repetition_penalty: 1.0 62 | sampling_temperature: 1.0 63 | top_k: -1 64 | min_top_k: 3 65 | top_p: 0.8 66 | sample_num: 4 67 | length_penalty_max_length: 15000 68 | length_penalty_max_prob: 0.95 69 | max_input_length: 2048 70 | max_output_length: 2000 71 | sample_rate: 16000 72 | n_codes: 1024 73 | n_cluster_groups: 1 74 | phone_context_window: 4 75 | phoneset_size: 1000 76 | inference: 77 | top_k: 5 78 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 500, 5 | "seed": 1234, 6 | "epochs": 100, 7 | "learning_rate": 0.0001, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 32, 14 | "fp16_run": true, 15 | "lr_decay": 0.999875, 16 | "segment_size": 20480, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "text_low_lr_rate": 0.4 22 | }, 23 | "data": { 24 | "max_wav_value": 32768.0, 25 | "sampling_rate": 32000, 26 | "filter_length": 2048, 27 | "hop_length": 640, 28 | "win_length": 2048, 29 | "n_mel_channels": 128, 30 | "mel_fmin": 0.0, 31 | "mel_fmax": null, 32 | "add_blank": true, 33 | "n_speakers": 300, 34 | "cleaned_text": true 35 | }, 36 | "model": { 37 | "inter_channels": 192, 38 | "hidden_channels": 192, 39 | "filter_channels": 768, 40 | "n_heads": 2, 41 | "n_layers": 6, 42 | "kernel_size": 3, 43 | "p_dropout": 0.1, 44 | "resblock": "1", 45 | "resblock_kernel_sizes": [ 46 | 3, 47 | 7, 48 | 11 49 | ], 50 | "resblock_dilation_sizes": [ 51 | [ 52 | 1, 53 | 3, 54 | 5 55 | ], 56 | [ 57 | 1, 58 | 3, 59 | 5 60 | ], 61 | [ 62 | 1, 63 | 3, 64 | 5 65 | ] 66 | ], 67 | "upsample_rates": [ 68 | 10, 69 | 8, 70 | 2, 71 | 2, 72 | 2 73 | ], 74 | "upsample_initial_channel": 512, 75 | "upsample_kernel_sizes": [ 76 | 16, 77 | 16, 78 | 8, 79 | 2, 80 | 2 81 | ], 82 | "n_layers_q": 3, 83 | "use_spectral_norm": false, 84 | "gin_channels": 512, 85 | "semantic_frame_rate": "25hz", 86 | "freeze_quantizer": true 87 | }, 88 | "s2_ckpt_dir": "logs/s2/big2k1", 89 | "content_module": "cnhubert" 90 | } -------------------------------------------------------------------------------- /GPT_SoVITS/configs/train.yaml: -------------------------------------------------------------------------------- 1 | gpu: 2 | n_card: 1 3 | n_process_per_card: 2 4 | io: 5 | text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | 
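  # These sizes mirror configs/s1longer.yaml (24 transformer layers, 512-dim
  # embeddings/hidden states); when fine-tuning from a pretrained s1 checkpoint
  # they generally have to match that checkpoint, or its state_dict will not load.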
vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 24 28 | dropout: 0 29 | EOS: 1024 30 | random_bert: 0 31 | inference: 32 | top_k: 5 33 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cnhubert, whisper_enc 2 | 3 | content_module_map = { 4 | 'cnhubert': cnhubert, 5 | 'whisper': whisper_enc 6 | } -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/cnhubert.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import librosa 4 | import torch 5 | import torch.nn.functional as F 6 | import soundfile as sf 7 | import logging 8 | 9 | logging.getLogger("numba").setLevel(logging.WARNING) 10 | 11 | from transformers import ( 12 | Wav2Vec2FeatureExtractor, 13 | HubertModel, 14 | ) 15 | 16 | import utils 17 | import torch.nn as nn 18 | 19 | cnhubert_base_path = None 20 | 21 | 22 | class CNHubert(nn.Module): 23 | def __init__(self): 24 | super().__init__() 25 | self.model = HubertModel.from_pretrained(cnhubert_base_path) 26 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( 27 | cnhubert_base_path 28 | ) 29 | 30 | def forward(self, x): 31 | input_values = self.feature_extractor( 32 | x, return_tensors="pt", sampling_rate=16000 33 | ).input_values.to(x.device) 34 | feats = self.model(input_values)["last_hidden_state"] 35 | return feats 36 | 37 | 38 | # class CNHubertLarge(nn.Module): 39 | # def __init__(self): 40 | # super().__init__() 41 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 42 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 43 | # def forward(self, x): 44 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 45 | # feats = self.model(input_values)["last_hidden_state"] 46 | # return feats 47 | # 48 | # class CVec(nn.Module): 49 | # def __init__(self): 50 | # super().__init__() 51 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 52 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 53 | # def forward(self, x): 54 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 55 | # feats = self.model(input_values)["last_hidden_state"] 56 | # return feats 57 | # 58 | # class cnw2v2base(nn.Module): 59 | # def __init__(self): 60 | # super().__init__() 61 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 62 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 63 | # def forward(self, x): 64 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 65 | # feats = self.model(input_values)["last_hidden_state"] 66 | # return feats 67 | 68 | 69 | def get_model(): 70 | model = CNHubert() 71 | model.eval() 72 | return model 73 | 74 | 75 | # def get_large_model(): 76 | # model = CNHubertLarge() 77 | # model.eval() 78 | # return model 79 | # 80 | # def get_model_cvec(): 81 | # 
model = CVec() 82 | # model.eval() 83 | # return model 84 | # 85 | # def get_model_cnw2v2base(): 86 | # model = cnw2v2base() 87 | # model.eval() 88 | # return model 89 | 90 | 91 | def get_content(hmodel, wav_16k_tensor): 92 | with torch.no_grad(): 93 | feats = hmodel(wav_16k_tensor) 94 | return feats.transpose(1, 2) 95 | 96 | 97 | if __name__ == "__main__": 98 | model = get_model() 99 | src_path = "/Users/Shared/原音频2.wav" 100 | wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000) 101 | model = model 102 | wav_16k_tensor = wav_16k_tensor 103 | feats = get_content(model, wav_16k_tensor) 104 | print(feats.shape) 105 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/whisper_enc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_model(): 5 | import whisper 6 | 7 | model = whisper.load_model("small", device="cpu") 8 | 9 | return model.encoder 10 | 11 | 12 | def get_content(model=None, wav_16k_tensor=None): 13 | from whisper import log_mel_spectrogram, pad_or_trim 14 | 15 | dev = next(model.parameters()).device 16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] 17 | # if torch.cuda.is_available(): 18 | # mel = mel.to(torch.float16) 19 | feature_len = mel.shape[-1] // 2 20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" 21 | with torch.no_grad(): 22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[ 23 | :1, :feature_len, : 24 | ].transpose(1, 2) 25 | return feature 26 | -------------------------------------------------------------------------------- /GPT_SoVITS/module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/module/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/module/losses.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | 6 | 7 | def feature_loss(fmap_r, fmap_g): 8 | loss = 0 9 | for dr, dg in zip(fmap_r, fmap_g): 10 | for rl, gl in zip(dr, dg): 11 | rl = rl.float().detach() 12 | gl = gl.float() 13 | loss += torch.mean(torch.abs(rl - gl)) 14 | 15 | return loss * 2 16 | 17 | 18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 19 | loss = 0 20 | r_losses = [] 21 | g_losses = [] 22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 23 | dr = dr.float() 24 | dg = dg.float() 25 | r_loss = torch.mean((1 - dr) ** 2) 26 | g_loss = torch.mean(dg**2) 27 | loss += r_loss + g_loss 28 | r_losses.append(r_loss.item()) 29 | g_losses.append(g_loss.item()) 30 | 31 | return loss, r_losses, g_losses 32 | 33 | 34 | def generator_loss(disc_outputs): 35 | loss = 0 36 | gen_losses = [] 37 | for dg in disc_outputs: 38 | dg = dg.float() 39 | l = torch.mean((1 - dg) ** 2) 40 | gen_losses.append(l) 41 | loss += l 42 | 43 | return loss, gen_losses 44 | 45 | 46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 47 | """ 48 | z_p, logs_q: [b, h, t_t] 49 | m_p, logs_p: [b, h, t_t] 50 | """ 51 | z_p = z_p.float() 52 | logs_q = logs_q.float() 53 | m_p = m_p.float() 54 | logs_p = logs_p.float() 55 | z_mask = z_mask.float() 56 | 57 | kl = logs_p - logs_q - 0.5 58 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 59 | kl = torch.sum(kl * z_mask) 60 | l = kl / torch.sum(z_mask) 61 | 
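    # The per-element term above is the usual VITS-style Monte-Carlo KL estimate
    # between the posterior q = N(m_q, exp(logs_q)^2) (from which z_p was sampled)
    # and the prior p = N(m_p, exp(logs_p)^2):
    #   logs_p - logs_q - 1/2 + (z_p - m_p)^2 / (2 * exp(2 * logs_p)),
    # where the posterior's own quadratic term has been replaced by its
    # expectation of 1/2; the masked sum is then normalized by the mask size.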
return l 62 | 63 | 64 | def mle_loss(z, m, logs, logdet, mask): 65 | l = torch.sum(logs) + 0.5 * torch.sum( 66 | torch.exp(-2 * logs) * ((z - m) ** 2) 67 | ) # neg normal likelihood w/o the constant term 68 | l = l - torch.sum(logdet) # log jacobian determinant 69 | l = l / torch.sum( 70 | torch.ones_like(z) * mask 71 | ) # averaging across batch, channel and time axes 72 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term 73 | return l 74 | -------------------------------------------------------------------------------- /GPT_SoVITS/module/mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import numpy as np 9 | import librosa 10 | import librosa.util as librosa_util 11 | from librosa.util import normalize, pad_center, tiny 12 | from scipy.signal import get_window 13 | from scipy.io.wavfile import read 14 | from librosa.filters import mel as librosa_mel_fn 15 | 16 | MAX_WAV_VALUE = 32768.0 17 | 18 | 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 20 | """ 21 | PARAMS 22 | ------ 23 | C: compression factor 24 | """ 25 | return torch.log(torch.clamp(x, min=clip_val) * C) 26 | 27 | 28 | def dynamic_range_decompression_torch(x, C=1): 29 | """ 30 | PARAMS 31 | ------ 32 | C: compression factor used to compress 33 | """ 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 52 | if torch.min(y) < -1.0: 53 | print("min value is ", torch.min(y)) 54 | if torch.max(y) > 1.0: 55 | print("max value is ", torch.max(y)) 56 | 57 | global hann_window 58 | dtype_device = str(y.dtype) + "_" + str(y.device) 59 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 62 | dtype=y.dtype, device=y.device 63 | ) 64 | 65 | y = torch.nn.functional.pad( 66 | y.unsqueeze(1), 67 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 68 | mode="reflect", 69 | ) 70 | y = y.squeeze(1) 71 | spec = torch.stft( 72 | y, 73 | n_fft, 74 | hop_length=hop_size, 75 | win_length=win_size, 76 | window=hann_window[wnsize_dtype_device], 77 | center=center, 78 | pad_mode="reflect", 79 | normalized=False, 80 | onesided=True, 81 | return_complex=False, 82 | ) 83 | 84 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 85 | return spec 86 | 87 | 88 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 89 | global mel_basis 90 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 91 | fmax_dtype_device = str(fmax) + "_" + dtype_device 92 | if fmax_dtype_device not in mel_basis: 93 | mel = librosa_mel_fn( 94 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 95 | ) 96 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 97 | dtype=spec.dtype, device=spec.device 98 | ) 99 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 100 | spec = spectral_normalize_torch(spec) 101 | return spec 102 | 103 | 104 | def mel_spectrogram_torch( 105 | 
y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 106 | ): 107 | if torch.min(y) < -1.0: 108 | print("min value is ", torch.min(y)) 109 | if torch.max(y) > 1.0: 110 | print("max value is ", torch.max(y)) 111 | 112 | global mel_basis, hann_window 113 | dtype_device = str(y.dtype) + "_" + str(y.device) 114 | fmax_dtype_device = str(fmax) + "_" + dtype_device 115 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 116 | if fmax_dtype_device not in mel_basis: 117 | mel = librosa_mel_fn( 118 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 119 | ) 120 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 121 | dtype=y.dtype, device=y.device 122 | ) 123 | if wnsize_dtype_device not in hann_window: 124 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 125 | dtype=y.dtype, device=y.device 126 | ) 127 | 128 | y = torch.nn.functional.pad( 129 | y.unsqueeze(1), 130 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 131 | mode="reflect", 132 | ) 133 | y = y.squeeze(1) 134 | 135 | spec = torch.stft( 136 | y, 137 | n_fft, 138 | hop_length=hop_size, 139 | win_length=win_size, 140 | window=hann_window[wnsize_dtype_device], 141 | center=center, 142 | pad_mode="reflect", 143 | normalized=False, 144 | onesided=True, 145 | return_complex=False, 146 | ) 147 | 148 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 149 | 150 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 151 | spec = spectral_normalize_torch(spec) 152 | 153 | return spec 154 | -------------------------------------------------------------------------------- /GPT_SoVITS/module/quantize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Residual vector quantizer implementation.""" 8 | 9 | from dataclasses import dataclass, field 10 | import math 11 | import typing as tp 12 | 13 | import torch 14 | from torch import nn 15 | 16 | from module.core_vq import ResidualVectorQuantization 17 | 18 | 19 | @dataclass 20 | class QuantizedResult: 21 | quantized: torch.Tensor 22 | codes: torch.Tensor 23 | bandwidth: torch.Tensor # bandwidth in kb/s used, per batch item. 24 | penalty: tp.Optional[torch.Tensor] = None 25 | metrics: dict = field(default_factory=dict) 26 | 27 | 28 | class ResidualVectorQuantizer(nn.Module): 29 | """Residual Vector Quantizer. 30 | Args: 31 | dimension (int): Dimension of the codebooks. 32 | n_q (int): Number of residual vector quantizers used. 33 | bins (int): Codebook size. 34 | decay (float): Decay for exponential moving average over the codebooks. 35 | kmeans_init (bool): Whether to use kmeans to initialize the codebooks. 36 | kmeans_iters (int): Number of iterations used for kmeans initialization. 37 | threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes 38 | that have an exponential moving average cluster size less than the specified threshold with 39 | randomly selected vector from the current batch. 
40 | """ 41 | 42 | def __init__( 43 | self, 44 | dimension: int = 256, 45 | n_q: int = 8, 46 | bins: int = 1024, 47 | decay: float = 0.99, 48 | kmeans_init: bool = True, 49 | kmeans_iters: int = 50, 50 | threshold_ema_dead_code: int = 2, 51 | ): 52 | super().__init__() 53 | self.n_q = n_q 54 | self.dimension = dimension 55 | self.bins = bins 56 | self.decay = decay 57 | self.kmeans_init = kmeans_init 58 | self.kmeans_iters = kmeans_iters 59 | self.threshold_ema_dead_code = threshold_ema_dead_code 60 | self.vq = ResidualVectorQuantization( 61 | dim=self.dimension, 62 | codebook_size=self.bins, 63 | num_quantizers=self.n_q, 64 | decay=self.decay, 65 | kmeans_init=self.kmeans_init, 66 | kmeans_iters=self.kmeans_iters, 67 | threshold_ema_dead_code=self.threshold_ema_dead_code, 68 | ) 69 | 70 | def forward( 71 | self, 72 | x: torch.Tensor, 73 | n_q: tp.Optional[int] = None, 74 | layers: tp.Optional[list] = None, 75 | ) -> QuantizedResult: 76 | """Residual vector quantization on the given input tensor. 77 | Args: 78 | x (torch.Tensor): Input tensor. 79 | n_q (int): Number of quantizer used to quantize. Default: All quantizers. 80 | layers (list): Layer that need to return quantized. Defalt: None. 81 | Returns: 82 | QuantizedResult: 83 | The quantized (or approximately quantized) representation with 84 | the associated numbert quantizers and layer quantized required to return. 85 | """ 86 | n_q = n_q if n_q else self.n_q 87 | if layers and max(layers) >= n_q: 88 | raise ValueError( 89 | f"Last layer index in layers: A {max(layers)}. Number of quantizers in RVQ: B {self.n_q}. A must less than B." 90 | ) 91 | quantized, codes, commit_loss, quantized_list = self.vq( 92 | x, n_q=n_q, layers=layers 93 | ) 94 | return quantized, codes, torch.mean(commit_loss), quantized_list 95 | 96 | def encode( 97 | self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None 98 | ) -> torch.Tensor: 99 | """Encode a given input tensor with the specified sample rate at the given bandwidth. 100 | The RVQ encode method sets the appropriate number of quantizer to use 101 | and returns indices for each quantizer. 102 | Args: 103 | x (torch.Tensor): Input tensor. 104 | n_q (int): Number of quantizer used to quantize. Default: All quantizers. 105 | st (int): Start to encode input from which layers. Default: 0. 106 | """ 107 | n_q = n_q if n_q else self.n_q 108 | st = st or 0 109 | codes = self.vq.encode(x, n_q=n_q, st=st) 110 | return codes 111 | 112 | def decode(self, codes: torch.Tensor, st: int = 0) -> torch.Tensor: 113 | """Decode the given codes to the quantized representation. 114 | Args: 115 | codes (torch.Tensor): Input indices for each quantizer. 116 | st (int): Start to decode input codes from which layers. Default: 0. 117 | """ 118 | quantized = self.vq.decode(codes, st=st) 119 | return quantized 120 | -------------------------------------------------------------------------------- /GPT_SoVITS/my_utils.py: -------------------------------------------------------------------------------- 1 | import ffmpeg 2 | import numpy as np 3 | 4 | 5 | def load_audio(file, sr): 6 | try: 7 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 8 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 9 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
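        # Rough sketch of the command the ffmpeg-python call below builds (flags taken
        # from the .input()/.output() arguments; exact ordering may differ):
        #   ffmpeg -nostdin -threads 0 -i <file> -f f32le -acodec pcm_f32le -ac 1 -ar <sr> -
        # The little-endian float32 stream captured from stdout is then turned into a
        # mono 1-D numpy array by np.frombuffer() at the end of this function.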
10 | file = ( 11 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 12 | ) # 防止小白拷路径头尾带了空格和"和回车 13 | out, _ = ( 14 | ffmpeg.input(file, threads=0) 15 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 16 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 17 | ) 18 | except Exception as e: 19 | raise RuntimeError(f"Failed to load audio: {e}") 20 | 21 | return np.frombuffer(out, np.float32).flatten() 22 | -------------------------------------------------------------------------------- /GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys,os 4 | inp_text= os.environ.get("inp_text") 5 | inp_wav_dir= os.environ.get("inp_wav_dir") 6 | exp_name= os.environ.get("exp_name") 7 | i_part= os.environ.get("i_part") 8 | all_parts= os.environ.get("all_parts") 9 | os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") 10 | from feature_extractor import cnhubert 11 | opt_dir= os.environ.get("opt_dir") 12 | cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir") 13 | is_half=eval(os.environ.get("is_half","True")) 14 | 15 | import pdb,traceback,numpy as np,logging 16 | from scipy.io import wavfile 17 | import librosa,torch 18 | now_dir = os.getcwd() 19 | sys.path.append(now_dir) 20 | from my_utils import load_audio 21 | 22 | # from config import cnhubert_base_path 23 | # cnhubert.cnhubert_base_path=cnhubert_base_path 24 | # inp_text=sys.argv[1] 25 | # inp_wav_dir=sys.argv[2] 26 | # exp_name=sys.argv[3] 27 | # i_part=sys.argv[4] 28 | # all_parts=sys.argv[5] 29 | # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6] 30 | # cnhubert.cnhubert_base_path=sys.argv[7] 31 | # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name 32 | 33 | from time import time as ttime 34 | import shutil 35 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path 36 | dir=os.path.dirname(path) 37 | name=os.path.basename(path) 38 | # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) 39 | tmp_path="%s%s.pth"%(ttime(),i_part) 40 | torch.save(fea,tmp_path) 41 | shutil.move(tmp_path,"%s/%s"%(dir,name)) 42 | 43 | hubert_dir="%s/4-cnhubert"%(opt_dir) 44 | wav32dir="%s/5-wav32k"%(opt_dir) 45 | os.makedirs(opt_dir,exist_ok=True) 46 | os.makedirs(hubert_dir,exist_ok=True) 47 | os.makedirs(wav32dir,exist_ok=True) 48 | 49 | maxx=0.95 50 | alpha=0.5 51 | if torch.cuda.is_available(): 52 | device = "cuda:0" 53 | elif torch.backends.mps.is_available(): 54 | device = "mps" 55 | else: 56 | device = "cpu" 57 | model=cnhubert.get_model() 58 | # is_half=False 59 | if(is_half==True): 60 | model=model.half().to(device) 61 | else: 62 | model = model.to(device) 63 | 64 | nan_fails=[] 65 | def name2go(wav_name,wav_path): 66 | hubert_path="%s/%s.pt"%(hubert_dir,wav_name) 67 | if(os.path.exists(hubert_path)):return 68 | tmp_audio = load_audio(wav_path, 32000) 69 | tmp_max = np.abs(tmp_audio).max() 70 | if tmp_max > 2.2: 71 | print("%s-filtered,%s" % (wav_name, tmp_max)) 72 | return 73 | tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * tmp_audio 74 | tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha*1145.14)) + ((1 - alpha)*1145.14) * tmp_audio 75 | tmp_audio = librosa.resample( 76 | tmp_audio32b, orig_sr=32000, target_sr=16000 77 | )#不是重采样问题 78 | tensor_wav16 = torch.from_numpy(tmp_audio) 79 | if (is_half == True): 80 | tensor_wav16=tensor_wav16.half().to(device) 81 | else: 82 | tensor_wav16 
= tensor_wav16.to(device) 83 | ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()#torch.Size([1, 768, 215]) 84 | if np.isnan(ssl.detach().numpy()).sum()!= 0: 85 | nan_fails.append(wav_name) 86 | print("nan filtered:%s"%wav_name) 87 | return 88 | wavfile.write( 89 | "%s/%s"%(wav32dir,wav_name), 90 | 32000, 91 | tmp_audio32.astype("int16"), 92 | ) 93 | my_save(ssl,hubert_path ) 94 | 95 | with open(inp_text,"r",encoding="utf8")as f: 96 | lines=f.read().strip("\n").split("\n") 97 | 98 | for line in lines[int(i_part)::int(all_parts)]: 99 | try: 100 | # wav_name,text=line.split("\t") 101 | wav_name, spk_name, language, text = line.split("|") 102 | if (inp_wav_dir != "" and inp_wav_dir != None): 103 | wav_name = os.path.basename(wav_name) 104 | wav_path = "%s/%s"%(inp_wav_dir, wav_name) 105 | 106 | else: 107 | wav_path=wav_name 108 | wav_name = os.path.basename(wav_name) 109 | name2go(wav_name,wav_path) 110 | except: 111 | print(line,traceback.format_exc()) 112 | 113 | if(len(nan_fails)>0 and is_half==True): 114 | is_half=False 115 | model=model.float() 116 | for wav_name in nan_fails: 117 | try: 118 | name2go(wav_name) 119 | except: 120 | print(wav_name,traceback.format_exc()) 121 | -------------------------------------------------------------------------------- /GPT_SoVITS/prepare_datasets/3-get-semantic.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | inp_text = os.environ.get("inp_text") 4 | exp_name = os.environ.get("exp_name") 5 | i_part = os.environ.get("i_part") 6 | all_parts = os.environ.get("all_parts") 7 | os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES") 8 | opt_dir = os.environ.get("opt_dir") 9 | pretrained_s2G = os.environ.get("pretrained_s2G") 10 | s2config_path = os.environ.get("s2config_path") 11 | is_half = eval(os.environ.get("is_half", "True")) 12 | import math, traceback 13 | import multiprocessing 14 | import sys, pdb 15 | 16 | now_dir = os.getcwd() 17 | sys.path.append(now_dir) 18 | from random import shuffle 19 | import torch.multiprocessing as mp 20 | from glob import glob 21 | from tqdm import tqdm 22 | import logging, librosa, utils, torch 23 | from module.models import SynthesizerTrn 24 | 25 | logging.getLogger("numba").setLevel(logging.WARNING) 26 | # from config import pretrained_s2G 27 | 28 | # inp_text=sys.argv[1] 29 | # exp_name=sys.argv[2] 30 | # i_part=sys.argv[3] 31 | # all_parts=sys.argv[4] 32 | # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[5] 33 | # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name 34 | 35 | 36 | hubert_dir = "%s/4-cnhubert" % (opt_dir) 37 | semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) 38 | if os.path.exists(semantic_path) == False: 39 | os.makedirs(opt_dir, exist_ok=True) 40 | 41 | if torch.cuda.is_available(): 42 | device = "cuda" 43 | elif torch.backends.mps.is_available(): 44 | device = "mps" 45 | else: 46 | device = "cpu" 47 | hps = utils.get_hparams_from_file(s2config_path) 48 | vq_model = SynthesizerTrn( 49 | hps.data.filter_length // 2 + 1, 50 | hps.train.segment_size // hps.data.hop_length, 51 | n_speakers=hps.data.n_speakers, 52 | **hps.model 53 | ) 54 | if is_half == True: 55 | vq_model = vq_model.half().to(device) 56 | else: 57 | vq_model = vq_model.to(device) 58 | vq_model.eval() 59 | # utils.load_checkpoint(utils.latest_checkpoint_path(hps.s2_ckpt_dir, "G_*.pth"), vq_model, None, True) 60 | # utils.load_checkpoint(pretrained_s2G, vq_model, None, True) 61 | print( 62 | 
vq_model.load_state_dict( 63 | torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False 64 | ) 65 | ) 66 | 67 | def name2go(wav_name, lines): 68 | hubert_path = "%s/%s.pt" % (hubert_dir, wav_name) 69 | if os.path.exists(hubert_path) == False: 70 | return 71 | ssl_content = torch.load(hubert_path, map_location="cpu") 72 | if is_half == True: 73 | ssl_content = ssl_content.half().to(device) 74 | else: 75 | ssl_content = ssl_content.to(device) 76 | codes = vq_model.extract_latent(ssl_content) 77 | semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()]) 78 | lines.append("%s\t%s" % (wav_name, semantic)) 79 | 80 | with open(inp_text, "r", encoding="utf8") as f: 81 | lines = f.read().strip("\n").split("\n") 82 | 83 | lines1 = [] 84 | for line in lines[int(i_part) :: int(all_parts)]: 85 | # print(line) 86 | try: 87 | # wav_name,text=line.split("\t") 88 | wav_name, spk_name, language, text = line.split("|") 89 | wav_name = os.path.basename(wav_name) 90 | # name2go(name,lines1) 91 | name2go(wav_name, lines1) 92 | except: 93 | print(line, traceback.format_exc()) 94 | with open(semantic_path, "w", encoding="utf8") as f: 95 | f.write("\n".join(lines1)) 96 | -------------------------------------------------------------------------------- /GPT_SoVITS/pretrained_models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /GPT_SoVITS/process_ckpt.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from collections import OrderedDict 3 | from time import time as ttime 4 | import shutil,os 5 | import torch 6 | from tools.i18n.i18n import I18nAuto 7 | 8 | i18n = I18nAuto() 9 | 10 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path 11 | dir=os.path.dirname(path) 12 | name=os.path.basename(path) 13 | tmp_path="%s.pth"%(ttime()) 14 | torch.save(fea,tmp_path) 15 | shutil.move(tmp_path,"%s/%s"%(dir,name)) 16 | 17 | def savee(ckpt, name, epoch, steps, hps): 18 | try: 19 | opt = OrderedDict() 20 | opt["weight"] = {} 21 | for key in ckpt.keys(): 22 | if "enc_q" in key: 23 | continue 24 | opt["weight"][key] = ckpt[key].half() 25 | opt["config"] = hps 26 | opt["info"] = "%sepoch_%siteration" % (epoch, steps) 27 | # torch.save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) 28 | my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) 29 | return "Success." 30 | except: 31 | return traceback.format_exc() 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/__init__.py: -------------------------------------------------------------------------------- 1 | from text.symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | def cleaned_text_to_sequence(cleaned_text): 7 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
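    Note: every element of `cleaned_text` must already be a symbol listed in
    `text.symbols`; anything else raises a KeyError from the `_symbol_to_id`
    lookup below, so the language cleaners are expected to emit only known symbols.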
8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | return phones 15 | 16 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from text import chinese, japanese, cleaned_text_to_sequence, symbols, english 2 | 3 | language_module_map = {"zh": chinese, "ja": japanese, "en": english} 4 | special = [ 5 | # ("%", "zh", "SP"), 6 | ("¥", "zh", "SP2"), 7 | ("^", "zh", "SP3"), 8 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 9 | ] 10 | 11 | 12 | def clean_text(text, language): 13 | if(language not in language_module_map): 14 | language="en" 15 | text=" " 16 | for special_s, special_l, target_symbol in special: 17 | if special_s in text and language == special_l: 18 | return clean_special(text, language, special_s, target_symbol) 19 | language_module = language_module_map[language] 20 | norm_text = language_module.text_normalize(text) 21 | if language == "zh": 22 | phones, word2ph = language_module.g2p(norm_text) 23 | assert len(phones) == sum(word2ph) 24 | assert len(norm_text) == len(word2ph) 25 | else: 26 | phones = language_module.g2p(norm_text) 27 | word2ph = None 28 | 29 | for ph in phones: 30 | assert ph in symbols 31 | return phones, word2ph, norm_text 32 | 33 | 34 | def clean_special(text, language, special_s, target_symbol): 35 | """ 36 | 特殊静音段sp符号处理 37 | """ 38 | text = text.replace(special_s, ",") 39 | language_module = language_module_map[language] 40 | norm_text = language_module.text_normalize(text) 41 | phones = language_module.g2p(norm_text) 42 | new_ph = [] 43 | for ph in phones[0]: 44 | assert ph in symbols 45 | if ph == ",": 46 | new_ph.append(target_symbol) 47 | else: 48 | new_ph.append(ph) 49 | return new_ph, phones[1], norm_text 50 | 51 | 52 | def text_to_sequence(text, language): 53 | phones = clean_text(text) 54 | return cleaned_text_to_sequence(phones) 55 | 56 | 57 | if __name__ == "__main__": 58 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) 59 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/engdict-hot.rep: -------------------------------------------------------------------------------- 1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1 -------------------------------------------------------------------------------- /GPT_SoVITS/text/engdict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangxu1991/GPT-SoVITS-VC/414130d059c869bdfff3f0581a510b38912012f4/GPT_SoVITS/text/engdict_cache.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我| 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度| 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from text.zh_normalization.text_normlization import * 15 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip('0')) 25 | if num_string.startswith('0'): 26 | result = DIGITS['0'] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' 32 | r':([0-5][0-9])' 33 | r'(:([0-5][0-9]))?') 34 | 35 | # 时间范围,如8:30-12:30 36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' 37 | r':([0-5][0-9])' 38 | r'(:([0-5][0-9]))?' 
39 | r'(~|-)' 40 | r'([0-1]?[0-9]|2[0-3])' 41 | r':([0-5][0-9])' 42 | r'(:([0-5][0-9]))?') 43 | 44 | 45 | def replace_time(match) -> str: 46 | """ 47 | Args: 48 | match (re.Match) 49 | Returns: 50 | str 51 | """ 52 | 53 | is_range = len(match.groups()) > 5 54 | 55 | hour = match.group(1) 56 | minute = match.group(2) 57 | second = match.group(4) 58 | 59 | if is_range: 60 | hour_2 = match.group(6) 61 | minute_2 = match.group(7) 62 | second_2 = match.group(9) 63 | 64 | result = f"{num2str(hour)}点" 65 | if minute.lstrip('0'): 66 | if int(minute) == 30: 67 | result += "半" 68 | else: 69 | result += f"{_time_num2str(minute)}分" 70 | if second and second.lstrip('0'): 71 | result += f"{_time_num2str(second)}秒" 72 | 73 | if is_range: 74 | result += "至" 75 | result += f"{num2str(hour_2)}点" 76 | if minute_2.lstrip('0'): 77 | if int(minute) == 30: 78 | result += "半" 79 | else: 80 | result += f"{_time_num2str(minute_2)}分" 81 | if second_2 and second_2.lstrip('0'): 82 | result += f"{_time_num2str(second_2)}秒" 83 | 84 | return result 85 | 86 | 87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年' 88 | r'((0?[1-9]|1[0-2])月)?' 89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') 90 | 91 | 92 | def replace_date(match) -> str: 93 | """ 94 | Args: 95 | match (re.Match) 96 | Returns: 97 | str 98 | """ 99 | year = match.group(1) 100 | month = match.group(3) 101 | day = match.group(5) 102 | result = "" 103 | if year: 104 | result += f"{verbalize_digit(year)}年" 105 | if month: 106 | result += f"{verbalize_cardinal(month)}月" 107 | if day: 108 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 109 | return result 110 | 111 | 112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 113 | RE_DATE2 = re.compile( 114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') 115 | 116 | 117 | def replace_date2(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | year = match.group(1) 125 | month = match.group(3) 126 | day = match.group(4) 127 | result = "" 128 | if year: 129 | result += f"{verbalize_digit(year)}年" 130 | if month: 131 | result += f"{verbalize_cardinal(month)}月" 132 | if day: 133 | result += f"{verbalize_cardinal(day)}日" 134 | return result 135 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
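# Illustrative note (a sketch, not taken from the upstream sources): the F2H_* /
# H2F_* dicts defined below are plain str.translate() mapping tables for folding
# full-width characters to half-width and back, e.g.
#   "ＧＰＴ２０２４".translate(F2H_ASCII_LETTERS).translate(F2H_DIGITS) == "GPT2024"
# while RE_NSW matches runs of non-Chinese characters that the normalizer later
# verbalizes as non-standard words (NSW).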
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = { 22 | ord(char) + 65248: ord(char) 23 | for char in string.ascii_letters 24 | } 25 | 26 | # 英文字符半角 -> 全角映射表 27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 28 | 29 | # 数字字符全角 -> 半角映射表 (num: 10) 30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 31 | # 数字字符半角 -> 全角映射表 32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 33 | 34 | # 标点符号全角 -> 半角映射表 (num: 32) 35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 36 | # 标点符号半角 -> 全角映射表 37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 38 | 39 | # 空格 (num: 1) 40 | F2H_SPACE = {'\u3000': ' '} 41 | H2F_SPACE = {' ': '\u3000'} 42 | 43 | # 非"有拼音的汉字"的字符串,可用于NSW提取 44 | if SUPPORT_UCS4: 45 | RE_NSW = re.compile(r'(?:[^' 46 | r'\u3007' # 〇 47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] 51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] 52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] 53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] 54 | r'])+') 55 | else: 56 | RE_NSW = re.compile( # pragma: no cover 57 | r'(?:[^' 58 | r'\u3007' # 〇 59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 62 | r'])+') 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile( 25 | r"(? 
str: 34 | if mobile: 35 | sp_parts = phone_string.strip('+').split() 36 | result = ','.join( 37 | [verbalize_digit(part, alt_one=True) for part in sp_parts]) 38 | return result 39 | else: 40 | sil_parts = phone_string.split('-') 41 | result = ','.join( 42 | [verbalize_digit(part, alt_one=True) for part in sil_parts]) 43 | return result 44 | 45 | 46 | def replace_phone(match) -> str: 47 | """ 48 | Args: 49 | match (re.Match) 50 | Returns: 51 | str 52 | """ 53 | return phone2str(match.group(0), mobile=False) 54 | 55 | 56 | def replace_mobile(match) -> str: 57 | """ 58 | Args: 59 | match (re.Match) 60 | Returns: 61 | str 62 | """ 63 | return phone2str(match.group(0)) 64 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒" 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 RVC-Boss 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

# GPT-SoVITS-VC-WebUI

4 | 强大的少样本语音转换与语音合成Web用户界面。

5 | 6 | [![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS) 7 | 8 |
9 | 10 | [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb) 11 | [![Licence](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) 12 | [![Huggingface](https://img.shields.io/badge/🤗%20-Models%20Repo-yellow.svg?style=for-the-badge)](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) 13 | 14 | [**English**](./docs/en/README.md) | [**中文简体**](./README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) 15 | 16 |
17 | 18 | --- 19 | ## 本项目新增 Voice Conversion (VC) 变声 20 | ### 特点: 21 | 1. 无需任何训练 22 | 2. 不改任何配置 23 | 3. 支持非固定、任意目标音色变声,无需训练/微调,infer 时目标音色直接作为 prompt 输入 24 | ### 用法: 25 | 1. 按照源项目要求逐一配置,主要是下载好所有预训练模型 26 | 2. 打开终端,执行 python vc_webui.py 27 | --- 28 | ## 变声 demo 试听 29 | 目标音色:ChatGPT 助手音色 30 | 31 | 待转换音色:窃格瓦拉 32 | ### demo 1 33 | 原声: 34 | 35 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/b5a5c3aa-6620-48fc-9c08-9b0711fbd76a 36 | 37 | 变声: 38 | 39 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/6c29db50-471d-4a98-a557-55e356732662 40 | 41 | ### demo 2 42 | 原声: 43 | 44 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/9cb746c8-9d23-4fca-98be-94496af85d14 45 | 46 | 变声: 47 | 48 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/e71279b8-90a9-4dca-9214-31ed1803170a 49 | 50 | 51 | ### demo 3 52 | 原声: 53 | 54 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/ca745165-fc75-44f8-9d92-50097e8d4924 55 | 56 | 变声: 57 | 58 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/7378a676-1f89-4e0b-b931-b921228f7e2c 59 | 60 | ### demo 4 61 | 原声: 62 | 63 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/f2fbc07a-816a-469b-ac27-996b3c2a4cdf 64 | 65 | 变声: 66 | 67 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/dbcb0533-903d-49be-b03b-a8da4189b645 68 | 69 | --- 70 | 71 | ## Star History 72 | [![Star History Chart](https://api.star-history.com/svg?repos=huangxu1991/GPT-SoVITS-VC&type=Date)](https://star-history.com/#huangxu1991/GPT-SoVITS-VC&Date) 73 | -------------------------------------------------------------------------------- /colab_webui.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "include_colab_link": true 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "accelerator": "GPU" 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "view-in-github", 20 | "colab_type": "text" 21 | }, 22 | "source": [ 23 | "\"Open" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "source": [ 29 | "环境配置 environment" 30 | ], 31 | "metadata": { 32 | "id": "_o6a8GS2lWQM" 33 | } 34 | }, 35 | { 36 | "cell_type": "code", 37 | "metadata": { 38 | "id": "e9b7iFV3dm1f" 39 | }, 40 | "source": [ 41 | "!pip install -q condacolab\n", 42 | "# Setting up condacolab and installing packages\n", 43 | "import condacolab\n", 44 | "condacolab.install_from_url(\"https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh\")\n", 45 | "%cd -q /content\n", 46 | "!git clone https://github.com/RVC-Boss/GPT-SoVITS\n", 47 | "!conda install -y -q -c pytorch -c nvidia cudatoolkit\n", 48 | "%cd -q /content/GPT-SoVITS\n", 49 | "!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n", 50 | "!/usr/local/bin/pip install -r requirements.txt" 51 | ], 52 | "execution_count": null, 53 | "outputs": [] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "source": [ 58 | "# @title Download pretrained models 下载预训练模型\n", 59 | "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", 60 | "!mkdir -p /content/GPT-SoVITS/tools/damo_asr/models\n", 61 | "!mkdir -p /content/GPT-SoVITS/tools/uvr5\n", 62 | "%cd /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", 63 | "!git clone https://huggingface.co/lj1995/GPT-SoVITS\n", 64 | "%cd /content/GPT-SoVITS/tools/damo_asr/models\n", 65 | "!git clone 
https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git\n", 66 | "!git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git\n", 67 | "!git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git\n", 68 | "# @title UVR5 pretrains 安装uvr5模型\n", 69 | "%cd /content/GPT-SoVITS/tools/uvr5\n", 70 | "!git clone https://huggingface.co/Delik/uvr5_weights\n", 71 | "!git config core.sparseCheckout true\n", 72 | "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/" 73 | ], 74 | "metadata": { 75 | "id": "0NgxXg5sjv7z" 76 | }, 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "source": [ 83 | "# @title launch WebUI 启动WebUI\n", 84 | "!/usr/local/bin/pip install ipykernel\n", 85 | "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n", 86 | "%cd /content/GPT-SoVITS/\n", 87 | "!/usr/local/bin/python webui.py" 88 | ], 89 | "metadata": { 90 | "id": "4oRGUzkrk8C7" 91 | }, 92 | "execution_count": null, 93 | "outputs": [] 94 | } 95 | ] 96 | } 97 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | import torch 4 | 5 | # 推理用的指定模型 6 | sovits_path = "" 7 | gpt_path = "" 8 | is_half_str = os.environ.get("is_half", "True") 9 | is_half = True if is_half_str.lower() == 'true' else False 10 | is_share_str = os.environ.get("is_share","False") 11 | is_share= True if is_share_str.lower() == 'true' else False 12 | 13 | cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" 14 | bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" 15 | pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth" 16 | pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" 17 | 18 | exp_root = "logs" 19 | python_exec = sys.executable or "python" 20 | if torch.cuda.is_available(): 21 | infer_device = "cuda" 22 | else: 23 | infer_device = "cpu" 24 | 25 | webui_port_main = 9874 26 | webui_port_uvr5 = 9873 27 | webui_port_infer_tts = 9872 28 | webui_port_subfix = 9871 29 | 30 | api_port = 9880 31 | 32 | if infer_device == "cuda": 33 | gpu_name = torch.cuda.get_device_name(0) 34 | if ( 35 | ("16" in gpu_name and "V100" not in gpu_name.upper()) 36 | or "P40" in gpu_name.upper() 37 | or "P10" in gpu_name.upper() 38 | or "1060" in gpu_name 39 | or "1070" in gpu_name 40 | or "1080" in gpu_name 41 | ): 42 | is_half=False 43 | 44 | if(infer_device=="cpu"):is_half=False 45 | 46 | class Config: 47 | def __init__(self): 48 | self.sovits_path = sovits_path 49 | self.gpt_path = gpt_path 50 | self.is_half = is_half 51 | 52 | self.cnhubert_path = cnhubert_path 53 | self.bert_path = bert_path 54 | self.pretrained_sovits_path = pretrained_sovits_path 55 | self.pretrained_gpt_path = pretrained_gpt_path 56 | 57 | self.exp_root = exp_root 58 | self.python_exec = python_exec 59 | self.infer_device = infer_device 60 | 61 | self.webui_port_main = webui_port_main 62 | self.webui_port_uvr5 = webui_port_uvr5 63 | self.webui_port_infer_tts = webui_port_infer_tts 64 | self.webui_port_subfix = webui_port_subfix 65 | 66 | self.api_port = api_port 67 | -------------------------------------------------------------------------------- /docker-compose.yaml: 
-------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | gpt-sovits: 5 | image: breakstring/gpt-sovits:latest # please change the image name and tag base your environment. If the tag contains the word 'elite', such as "latest-elite", it indicates that the image does not include the necessary models such as GPT-SoVITS, UVR5, Damo ASR, etc. You will need to download them yourself and map them into the container. 6 | container_name: gpt-sovits-container 7 | environment: 8 | - is_half=False 9 | - is_share=False 10 | volumes: 11 | - ./output:/workspace/output 12 | - ./logs:/workspace/logs 13 | - ./SoVITS_weights:/workspace/SoVITS_weights 14 | - ./reference:/workspace/reference 15 | working_dir: /workspace 16 | ports: 17 | - "9880:9880" 18 | - "9871:9871" 19 | - "9872:9872" 20 | - "9873:9873" 21 | - "9874:9874" 22 | shm_size: 16G 23 | deploy: 24 | resources: 25 | reservations: 26 | devices: 27 | - driver: nvidia 28 | count: "all" 29 | capabilities: [gpu] 30 | stdin_open: true 31 | tty: true 32 | restart: unless-stopped 33 | -------------------------------------------------------------------------------- /dockerbuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 获取当前日期,格式为 YYYYMMDD 4 | DATE=$(date +%Y%m%d) 5 | # 获取最新的 Git commit 哈希值的前 7 位 6 | COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7) 7 | 8 | # 构建 full 版本的镜像 9 | docker build --build-arg IMAGE_TYPE=full -t breakstring/gpt-sovits:latest . 10 | # 为同一个镜像添加带日期的标签 11 | docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$DATE 12 | # 为同一个镜像添加带当前代码库Commit哈希值的标签 13 | docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$COMMIT_HASH 14 | 15 | 16 | # 构建 elite 版本的镜像(无模型下载步骤,需手工将模型下载安装进容器) 17 | docker build --build-arg IMAGE_TYPE=elite -t breakstring/gpt-sovits:latest-elite . 18 | # 为同一个镜像添加带日期的标签 19 | docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$DATE-elite 20 | # 为同一个镜像添加带当前代码库Commit哈希值的标签 21 | docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$COMMIT_HASH-elite 22 | -------------------------------------------------------------------------------- /docs/en/README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

# GPT-SoVITS-VC-WebUI

4 | A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.

5 | 6 | [![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS) 7 | 8 |
9 | 10 | [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb) 11 | [![Licence](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE) 12 | [![Huggingface](https://img.shields.io/badge/🤗%20-Models%20Repo-yellow.svg?style=for-the-badge)](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) 13 | 14 | [**English**](./README.md) | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) 15 | 16 |
17 | 18 | --- 19 | ## Voice Conversion (VC) - new added in this fork 20 | ### Features 21 | 1. Don't need any retrain 22 | 2. Support non-fixed/any tartget speaker vocie, and don't need any training or finetuning. You can infer directly through webui! 23 | ### Usages 24 | 1. please config following source repo steps; 25 | 2. open terminal, and execute `python vc_webui.py` 26 | --- 27 | 28 | ## Demos 29 | 30 | Note: Target speaker from ChatGPT Assistant Voice 31 | 32 | ### 1. demo 1 33 | 34 | source: 35 | 36 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/39248790-87f5-484f-8f48-532078412a80 37 | 38 | target: 39 | 40 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/d055d970-6b28-44e0-af9c-e0db0dc01e8c 41 | 42 | ### 2. demo 2 43 | 44 | source: 45 | 46 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/08ab4451-bbeb-4940-a04a-a43df94f0e61 47 | 48 | target: 49 | 50 | https://github.com/huangxu1991/GPT-SoVITS-VC/assets/40886464/e1917fa3-e5e7-4d8e-89f1-e3f0a40fdd18 51 | 52 | --- 53 | -------------------------------------------------------------------------------- /docs/ja/Changelog_JA.md: -------------------------------------------------------------------------------- 1 | ### 20240121 更新 2 | 3 | 1. `config`に`is_share`を追加し、Colab などの環境でこれを`True`に設定すると、webui を公共ネットワークにマッピングできます。 4 | 5 | 2. WebUI に英語システムの英語翻訳を追加しました。 6 | 7 | 3. `cmd-asr`は damo モデルが既に含まれているかどうかを自動的に確認し、デフォルトのパスにない場合は modelscope から自動的にダウンロードします。 8 | 9 | 4. [SoVITS 训练报错 ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 修復を試みます(長さ 0 のサンプルをフィルタリングなど) 10 | 11 | 5. TEMP ファイルフォルダからオーディオやその他のファイルをクリーンアップして最適化します。 12 | 13 | 6. 合成オーディオがリファレンスオーディオの終わりを含む問題を大幅に改善しました。 14 | 15 | ### 20240122 更新 16 | 17 | 1. 短すぎる出力ファイルが重複したリファレンスオーディオを返す問題を修正しました。 18 | 19 | 2. 英語-日本語学習がスムーズに進む QA を完了しました。(ただし、日本語学習はルートディレクトリに英語以外の文字が含まれていない必要があります) 20 | 21 | 3. オーディオパスをチェックします。間違ったパスを読み取ろうとすると、「パスが存在しません」というエラーメッセージが返されます。これは ffmpeg モジュールのエラーではありません。 22 | 23 | ### 20240123 更新 24 | 25 | 1. hubert から nan 抽出による SoVITS/GPT 学習中の ZeroDivisionError 関連エラーを修正しました。 26 | 27 | 2. 推論インターフェースでモデルを素早く切り替えることができるようにサポートしました。 28 | 29 | 3. モデルファイルのソートロジックを最適化しました。 30 | 31 | 4. 中国語の分析に`jieba_fast`を`jieba`に置き換えました。 32 | 33 | ### 20240126 更新 34 | 35 | 1. 中国語と英語、日本語と英語が混在した出力テキストをサポートします。 36 | 37 | 2. 出力で選択的な分割モードをサポートします。 38 | 39 | 3. uvr5 がディレクトリを読み取り、自動的に終了する問題を修正しました。 40 | 41 | 4. 複数の改行による推論エラーを修正しました。 42 | 43 | 5. 推論インターフェースから不要なログを削除しました。 44 | 45 | 6. MacOS での学習と推論をサポートします。 46 | 47 | 7. 半精度をサポートしていないカードを自動的に識別して単精度を強制し、CPU 推論では単精度を強制します。 48 | 49 | ### 20240128 更新 50 | 51 | 1. 数字を漢字で読む問題を修正しました。 52 | 53 | 2. 文章の先頭の一部の単語が欠落する問題を修正しました。 54 | 55 | 3. 不適切な長さのリファレンスオーディオを制限しました。 56 | 57 | 4. GPT 学習時の ckpt が保存されない問題を修正しました。 58 | 59 | 5. Dockerfile のモデルダウンロードプロセスを改善しました。 60 | 61 | ### 20240129 更新 62 | 63 | 1. 16 系などの半精度学習に問題があるカードは、学習構成を単精度学習に変更しました。 64 | 65 | 2. Colab でも使用可能なバージョンをテストして更新しました。 66 | 67 | 3. `git clone modelscope funasr`リポジトリと古いバージョンの funasr を使用してインターフェースが一致しないエラーを修正しました。 68 | 69 | ### 20240130 更新 70 | 71 | 1. パスと関連する文字列を解析して、二重引用符を自動的に削除します。また、パスをコピーする場合、二重引用符が含まれていてもエラーが発生しません。 72 | 73 | 2. 中国語と英語、日本語と英語の混合出力をサポートします。 74 | 75 | 3. 出力で選択的な分割モードをサポートします。 76 | 77 | todolist: 78 | 79 | 1. 同音異義語(中国語)の推論の最適化 80 | 81 | 2. 英語大文字認識と英語ハイフン [問題](https://github.com/RVC-Boss/GPT-SoVITS/issues/271) 82 | 83 | 3. テキストに%記号が含まれているとエラーが発生し、推論が不可能です。また、「元/吨」が「元吨」ではなく「元每吨」と読まれるなどの問題があります。このような問題を解決するには、どのライブラリを使用する必要があり、それに対する改善を検討しています。 84 | 85 | 4. 
中-日-英、中-英、日-英を含む 5 つの言語をサポートすることを目標にしています。 86 | -------------------------------------------------------------------------------- /docs/ko/Changelog_KO.md: -------------------------------------------------------------------------------- 1 | ### 20240121 업데이트 2 | 3 | 1. `config`에 `is_share` 추가, Colab 등의 환경에서 이를 `True`로 설정하여 webui를 공용 네트워크에 매핑되도록 할 수 있습니다. 4 | 2. WebUI에 영어 번역이 추가되었습니다. 5 | 3. `cmd-asr`은 damo 모델이 이미 포함되어 있는지 자동으로 확인하고, 기본 경로에 없는 경우 modelscope에서 자동 다운로드 되도록 수정하였습니다. 6 | 4. [SoVITS 학습 중 ZeroDivisionError가 발생](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)하는 경우 복구를 시도합니다. (길이가 0인 샘플 필터링 등) 7 | 5. TEMP 파일 폴더에서 오디오 및 기타 파일을 정리하여 최적화합니다. 8 | 6. 합성 오디오가 레퍼런스 오디오의 끝부분을 포함하는 문제를 개선하였습니다. 9 | 10 | ### 20240122 업데이트 11 | 12 | 1. 너무 짧은 출력 파일이 중복된 레퍼런스 오디오를 반환하는 문제 수정하였습니다. 13 | 2. 영어-일본어 학습이 원활하게 진행되는 QA를 완료하였습니다. (다만, 일본어 학습은 루트 디렉토리에 영어 이외의 문자가 없어야 합니다) 14 | 3. 오디오 경로를 검사합니다. 잘못된 경로를 읽으려고 할 때 '경로가 존재하지 않습니다'라는 에러 메시지를 반환하도록 수정하였습니다. 이는 ffmpeg 모듈의 에러가 아닙니다. 15 | 16 | ### 20240123 업데이트 17 | 18 | 1. hubert에서 nan 추출로 인한 SoVITS/GPT 학습 중 ZeroDivisionError 관련 에러를 해결하였습니다. 19 | 2. 추론 인터페이스에서 모델을 빠르게 전환할 수 있도록 지원하도록 수정되었습니다. 20 | 3. 모델 파일 정렬 로직 최적화하였습니다. 21 | 4. 중문 분석에 `jieba_fast`를 `jieba`로 대체하였습니다. 22 | 23 | ### 20240126 업데이트 24 | 25 | 1. 중국어와 영어, 일본어와 영어가 혼합된 출력 텍스트를 지원합니다. 26 | 2. 출력에서 선택적 분할 모드를 지원합니다. 27 | 3. uvr5가 디렉토리를 읽고 자동으로 종료되는 문제를 수정하였습니다. 28 | 4. 여러 줄바꿈으로 인한 추론 오류를 수정하였습니다. 29 | 5. 추론 인터페이스에서 불필요한 로그 제거하였습니다. 30 | 6. MacOS에서의 학습 및 추론을 지원합니다. 31 | 7. 반정밀을 지원하지 않는 카드를 자동으로 식별하여 단일 정밀도를 강제 적용하고, CPU 추론에서 단일 정밀도를 강제 적용합니다. 32 | 33 | ### 20240128 업데이트 34 | 35 | 1. 숫자를 한자로 읽는 문제를 수정했습니다. 36 | 2. 문장 시작 부분의 일부 단어가 누락되는 문제 수정하였습니다. 37 | 3. 부적절한 길이의 레퍼런스 오디오를 제한하였습니다. 38 | 4. GPT 학습 시 ckpt가 저장되지 않는 문제 수정하였습니다. 39 | 5. Dockerfile에서 모델 다운로드 프로세스 개선하였습니다. 40 | 41 | ### 20240129 업데이트 42 | 43 | 1. 반정밀도 훈련에 문제가 있는 16 시리즈 및 기타 그래픽 카드의 훈련 구성을 단정밀도 훈련으로 변경했습니다. 44 | 2. Colab에서도 사용이 가능한 버전을 테스트 및 업데이트 하였습니다. 45 | 3. `git clone modelscope funasr` 저장소와 오래된 버전의 funasr 사용으로 인해 인터페이스가 일치하지 않는 오류를 수정하였습니다. 46 | 47 | ### 20240130 업데이트 48 | 49 | 1. 경로와 관련된 문자열을 파싱하여 큰따옴표를 자동으로 제거합니다. 또한, 경로를 복사하는 경우 큰따옴표가 포함되어도 오류가 발생하지 않습니다. 50 | 2. 중국어 및 영어 문자열의 문장 부호가 잘리는 문제 및 문장의 시작과 끝에 문장 부호가 추가되는 문제를 수정했습니다. 51 | 3. 문장 부호의 수를 확장하였습니다. 52 | 53 | ### 20240201 업데이트 54 | 55 | 1. uvr5가 잘못된 형식으로 읽어들이는 문제를 수정하였습니다. 56 | 2. 중국어, 일본어, 영어가 혼합된 여러 텍스트를 자동으로 분리하여 언어를 인식합니다. 57 | 58 | ### 20240202 업데이트 59 | 60 | 1. asr 경로의 끝에 `/`가 포함되어 있는 경우 오류가 발생하는 문제를 수정하였습니다. 61 | 2. paddlespeech의 Normalizer를 도입하여 [문제를 해결](https://github.com/RVC-Boss/GPT-SoVITS/pull/377)하여, 예를 들어 xx.xx%(백분율), 元/吨이 元吨으로 읽히는 문제를 해결하였습니다. 또한, 밑줄이 더 이상 오류를 발생시키지 않습니다. 62 | 63 | ### 20240207 업데이트 64 | 65 | 1. 언어 전달 매개변수가 혼란스러워져 [중국어 추론 효과가 저하되는 문제](https://github.com/RVC-Boss/GPT-SoVITS/issues/391)를 수정하였습니다. 66 | 2. uvr5가 `inf everywhere` [오류를 반환하는 문제](https://github.com/RVC-Boss/GPT-SoVITS/pull/403)를 수정하였습니다. 67 | 3. uvr5의 `is_half` 매개변수가 bool로 변환되지 않아 항상 반정밀도 추론으로 설정되어 16 시리즈 그래픽 카드에서 `inf`가 반환되는 [문제](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)를 수정하였습니다. 68 | 4. 영어 텍스트 입력을 최적화하였습니다. 69 | 5. gradio 종속성을 지원합니다. 70 | 6. 루트 디렉토리가 비어 있으면 `.list` 전체 경로를 자동으로 읽습니다. 71 | 7. faster whisper ASR 일본어 및 영어를 지원합니다. 72 | 73 | ### 20240208 업데이트 74 | 75 | 1. GPT 학습이 카드에 따라 멈추는 문제와 [GPT 학습 중 ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) 문제를 수정하였습니다. 76 | 77 | ### 20240212 업데이트 78 | 79 | 1. faster whisper 및 funasr 로직을 최적화하였습니다. 
faster whisper는 이미지 스토어에서 다운로드하여 huggingface에 연결하지 못하는 문제를 회피합니다. 80 | 2. DPO Loss 실험적 학습 옵션을 활성화하여 부정적 샘플을 생성하여 [GPT 반복 및 누락 문자 문제](https://github.com/RVC-Boss/GPT-SoVITS/pull/457)를 완화합니다. 추론 인터페이스에 몇 가지 추론 매개변수를 공개합니다. 81 | 82 | ### 20240214 업데이트 83 | 84 | 1. 학습에서 중국어 실험 이름을 지원합니다. (이전에 오류가 발생했습니다) 85 | 2. DPO 학습을 선택적으로 설정할 수 있도록 변경하였습니다. 배치 크기를 선택하면 자동으로 절반으로 줄어듭니다. 추론 인터페이스에서 새로운 매개변수를 전달하지 않는 문제를 수정하였습니다. 86 | 87 | ### 20240216 업데이트 88 | 89 | 1. 참조 텍스트 입력을 지원합니다. 90 | 2. 프론트엔드에 있던 중국어 텍스트 입력 버그를 수정하였습니다. 91 | 92 | todolist : 93 | 94 | 1. 중국어 다음음자 추론 최적화 95 | -------------------------------------------------------------------------------- /go-webui.bat: -------------------------------------------------------------------------------- 1 | runtime\python.exe vc_webui.py 2 | pause 3 | -------------------------------------------------------------------------------- /go-webui.ps1: -------------------------------------------------------------------------------- 1 | $ErrorActionPreference = "SilentlyContinue" 2 | chcp 65001 3 | & "$PSScriptRoot\runtime\python.exe" "$PSScriptRoot\vc_webui.py" 4 | pause 5 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | conda install -c conda-forge gcc 3 | conda install -c conda-forge gxx 4 | conda install ffmpeg cmake 5 | conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia 6 | pip install -r requirements.txt 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | tensorboard 4 | librosa==0.9.2 5 | numba==0.56.4 6 | pytorch-lightning 7 | gradio==3.38.0 8 | gradio_client==0.8.1 9 | ffmpeg-python 10 | onnxruntime 11 | tqdm 12 | funasr==1.0.0 13 | cn2an 14 | pypinyin 15 | pyopenjtalk 16 | g2p_en 17 | torchaudio 18 | modelscope==1.10.0 19 | sentencepiece 20 | transformers 21 | chardet 22 | PyYAML 23 | psutil 24 | jieba_fast 25 | jieba 26 | LangSegment>=0.2.0 27 | Faster_Whisper -------------------------------------------------------------------------------- /tools/asr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def check_fw_local_models(): 4 | ''' 5 | 启动时检查本地是否有 Faster Whisper 模型. 
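    (In English: at start-up, look under tools/asr/models/ for already-downloaded
    Faster Whisper checkpoints; any size found locally is reported with a
    "-local" suffix so the WebUI can offer the local copy instead of re-downloading.)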
6 | ''' 7 | model_size_list = [ 8 | "tiny", "tiny.en", 9 | "base", "base.en", 10 | "small", "small.en", 11 | "medium", "medium.en", 12 | "large", "large-v1", 13 | "large-v2", "large-v3"] 14 | for i, size in enumerate(model_size_list): 15 | if os.path.exists(f'tools/asr/models/faster-whisper-{size}'): 16 | model_size_list[i] = size + '-local' 17 | return model_size_list 18 | 19 | asr_dict = { 20 | "达摩 ASR (中文)": { 21 | 'lang': ['zh'], 22 | 'size': ['large'], 23 | 'path': 'funasr_asr.py', 24 | }, 25 | "Faster Whisper (多语种)": { 26 | 'lang': ['auto', 'zh', 'en', 'ja'], 27 | 'size': check_fw_local_models(), 28 | 'path': 'fasterwhisper_asr.py' 29 | } 30 | } 31 | 32 | -------------------------------------------------------------------------------- /tools/asr/fasterwhisper_asr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | os.environ["HF_ENDPOINT"]="https://hf-mirror.com" 4 | import traceback 5 | import requests 6 | from glob import glob 7 | 8 | from faster_whisper import WhisperModel 9 | from tqdm import tqdm 10 | 11 | from tools.asr.config import check_fw_local_models 12 | from tools.asr.funasr_asr import only_asr 13 | 14 | os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" 15 | 16 | language_code_list = [ 17 | "af", "am", "ar", "as", "az", 18 | "ba", "be", "bg", "bn", "bo", 19 | "br", "bs", "ca", "cs", "cy", 20 | "da", "de", "el", "en", "es", 21 | "et", "eu", "fa", "fi", "fo", 22 | "fr", "gl", "gu", "ha", "haw", 23 | "he", "hi", "hr", "ht", "hu", 24 | "hy", "id", "is", "it", "ja", 25 | "jw", "ka", "kk", "km", "kn", 26 | "ko", "la", "lb", "ln", "lo", 27 | "lt", "lv", "mg", "mi", "mk", 28 | "ml", "mn", "mr", "ms", "mt", 29 | "my", "ne", "nl", "nn", "no", 30 | "oc", "pa", "pl", "ps", "pt", 31 | "ro", "ru", "sa", "sd", "si", 32 | "sk", "sl", "sn", "so", "sq", 33 | "sr", "su", "sv", "sw", "ta", 34 | "te", "tg", "th", "tk", "tl", 35 | "tr", "tt", "uk", "ur", "uz", 36 | "vi", "yi", "yo", "zh", "yue", 37 | "auto"] 38 | 39 | def execute_asr(input_folder, output_folder, model_size, language,precision): 40 | if '-local' in model_size: 41 | model_size = model_size[:-6] 42 | model_path = f'tools/asr/models/faster-whisper-{model_size}' 43 | else: 44 | model_path = model_size 45 | if language == 'auto': 46 | language = None #不设置语种由模型自动输出概率最高的语种 47 | print("loading faster whisper model:",model_size,model_path) 48 | try: 49 | model = WhisperModel(model_path, device="cuda", compute_type=precision) 50 | except: 51 | return print(traceback.format_exc()) 52 | output = [] 53 | output_file_name = os.path.basename(input_folder) 54 | output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') 55 | 56 | if not os.path.exists(output_folder): 57 | os.makedirs(output_folder) 58 | 59 | for file in tqdm(glob(os.path.join(input_folder, '**/*.wav'), recursive=True)): 60 | try: 61 | segments, info = model.transcribe( 62 | audio = file, 63 | beam_size = 5, 64 | vad_filter = True, 65 | vad_parameters = dict(min_silence_duration_ms=700), 66 | language = language) 67 | text = '' 68 | 69 | if info.language == "zh": 70 | print("检测为中文文本,转funasr处理") 71 | text = only_asr(file) 72 | 73 | if text == '': 74 | for segment in segments: 75 | text += segment.text 76 | output.append(f"{file}|{output_file_name}|{info.language.upper()}|{text}") 77 | except: 78 | return print(traceback.format_exc()) 79 | 80 | with open(output_file_path, "w", encoding="utf-8") as f: 81 | f.write("\n".join(output)) 82 | print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") 83 | return 
output_file_path 84 | 85 | if __name__ == '__main__': 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument("-i", "--input_folder", type=str, required=True, 88 | help="Path to the folder containing WAV files.") 89 | parser.add_argument("-o", "--output_folder", type=str, required=True, 90 | help="Output folder to store transcriptions.") 91 | parser.add_argument("-s", "--model_size", type=str, default='large-v3', 92 | choices=check_fw_local_models(), 93 | help="Model Size of Faster Whisper") 94 | parser.add_argument("-l", "--language", type=str, default='ja', 95 | choices=language_code_list, 96 | help="Language of the audio files.") 97 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 98 | help="fp16 or fp32") 99 | 100 | cmd = parser.parse_args() 101 | output_file_path = execute_asr( 102 | input_folder = cmd.input_folder, 103 | output_folder = cmd.output_folder, 104 | model_size = cmd.model_size, 105 | language = cmd.language, 106 | precision = cmd.precision, 107 | ) 108 | -------------------------------------------------------------------------------- /tools/asr/funasr_asr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import argparse 4 | import os 5 | import traceback 6 | from tqdm import tqdm 7 | 8 | from funasr import AutoModel 9 | 10 | path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' 11 | path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch' 12 | path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch' 13 | path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" 14 | path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch" 15 | path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" 16 | 17 | model = AutoModel( 18 | model = path_asr, 19 | model_revision = "v2.0.4", 20 | vad_model = path_vad, 21 | vad_model_revision = "v2.0.4", 22 | punc_model = path_punc, 23 | punc_model_revision = "v2.0.4", 24 | ) 25 | 26 | def only_asr(input_file): 27 | try: 28 | text = model.generate(input=input_file)[0]["text"] 29 | except: 30 | text = '' 31 | print(traceback.format_exc()) 32 | return text 33 | 34 | def execute_asr(input_folder, output_folder, model_size, language): 35 | input_file_names = os.listdir(input_folder) 36 | input_file_names.sort() 37 | 38 | output = [] 39 | output_file_name = os.path.basename(input_folder) 40 | 41 | for name in tqdm(input_file_names): 42 | try: 43 | text = model.generate(input="%s/%s"%(input_folder, name))[0]["text"] 44 | output.append(f"{input_folder}/{name}|{output_file_name}|{language.upper()}|{text}") 45 | except: 46 | print(traceback.format_exc()) 47 | 48 | output_folder = output_folder or "output/asr_opt" 49 | os.makedirs(output_folder, exist_ok=True) 50 | output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') 51 | 52 | with open(output_file_path, "w", encoding="utf-8") as f: 53 | f.write("\n".join(output)) 54 | print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") 55 | return output_file_path 56 | 57 | if __name__ == '__main__': 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("-i", "--input_folder", type=str, required=True, 60 | help="Path to the folder containing WAV files.") 61 | parser.add_argument("-o", "--output_folder", type=str, 
required=True, 62 | help="Output folder to store transcriptions.") 63 | parser.add_argument("-s", "--model_size", type=str, default='large', 64 | help="Model Size of FunASR is Large") 65 | parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh'], 66 | help="Language of the audio files.") 67 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 68 | help="fp16 or fp32")#还没接入 69 | 70 | cmd = parser.parse_args() 71 | execute_asr( 72 | input_folder = cmd.input_folder, 73 | output_folder = cmd.output_folder, 74 | model_size = cmd.model_size, 75 | language = cmd.language, 76 | ) 77 | -------------------------------------------------------------------------------- /tools/asr/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /tools/cmd-denoise.py: -------------------------------------------------------------------------------- 1 | import os,argparse 2 | 3 | from modelscope.pipelines import pipeline 4 | from modelscope.utils.constant import Tasks 5 | from tqdm import tqdm 6 | 7 | path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k' 8 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" 9 | ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise) 10 | def execute_denoise(input_folder,output_folder): 11 | os.makedirs(output_folder,exist_ok=True) 12 | # print(input_folder) 13 | # print(list(os.listdir(input_folder).sort())) 14 | for name in tqdm(os.listdir(input_folder)): 15 | ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-i", "--input_folder", type=str, required=True, 20 | help="Path to the folder containing WAV files.") 21 | parser.add_argument("-o", "--output_folder", type=str, required=True, 22 | help="Output folder to store transcriptions.") 23 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 24 | help="fp16 or fp32")#还没接入 25 | cmd = parser.parse_args() 26 | execute_denoise( 27 | input_folder = cmd.input_folder, 28 | output_folder = cmd.output_folder, 29 | ) -------------------------------------------------------------------------------- /tools/denoise-model/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /tools/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | 6 | def load_language_list(language): 7 | with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f: 8 | language_list = json.load(f) 9 | return language_list 10 | 11 | 12 | class I18nAuto: 13 | def __init__(self, language=None): 14 | if language in ["Auto", None]: 15 | language = locale.getdefaultlocale()[ 16 | 0 17 | ] # getlocale can't identify the system's language ((None, None)) 18 | if not os.path.exists(f"./i18n/locale/{language}.json"): 19 | language = "en_US" 20 | self.language = language 21 | self.language_map = load_language_list(language) 22 | 23 | def __call__(self, key): 24 | return self.language_map.get(key, key) 25 | 26 | def __repr__(self): 27 | return "Use Language: " + self.language 28 | 
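For reference, a minimal usage sketch of `I18nAuto` (not taken from the repository): it assumes the process is launched from the project root, since the class resolves `./i18n/locale/{language}.json` relative to the current working directory, and it assumes the module is importable as `tools.i18n.i18n`; the lookup key below is a hypothetical example.

from tools.i18n.i18n import I18nAuto

i18n = I18nAuto()              # detects the system locale; falls back to en_US if no matching JSON exists
print(i18n)                    # e.g. "Use Language: en_US"
print(i18n("some.ui.label"))   # hypothetical key: returns the mapped translation, or the key itself when unmapped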
-------------------------------------------------------------------------------- /tools/i18n/locale_diff.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import OrderedDict 4 | 5 | # Define the standard file name 6 | standard_file = "locale/zh_CN.json" 7 | 8 | # Find all JSON files in the directory 9 | dir_path = "locale/" 10 | languages = [ 11 | os.path.join(dir_path, f) 12 | for f in os.listdir(dir_path) 13 | if f.endswith(".json") and f != standard_file 14 | ] 15 | 16 | # Load the standard file 17 | with open(standard_file, "r", encoding="utf-8") as f: 18 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 19 | 20 | # Loop through each language file 21 | for lang_file in languages: 22 | # Load the language file 23 | with open(lang_file, "r", encoding="utf-8") as f: 24 | lang_data = json.load(f, object_pairs_hook=OrderedDict) 25 | 26 | # Find the difference between the language file and the standard file 27 | diff = set(standard_data.keys()) - set(lang_data.keys()) 28 | 29 | miss = set(lang_data.keys()) - set(standard_data.keys()) 30 | 31 | # Add any missing keys to the language file 32 | for key in diff: 33 | lang_data[key] = key 34 | 35 | # Del any extra keys to the language file 36 | for key in miss: 37 | del lang_data[key] 38 | 39 | # Sort the keys of the language file to match the order of the standard file 40 | lang_data = OrderedDict( 41 | sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])) 42 | ) 43 | 44 | # Save the updated language file 45 | with open(lang_file, "w", encoding="utf-8") as f: 46 | json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True) 47 | f.write("\n") 48 | -------------------------------------------------------------------------------- /tools/i18n/scan_i18n.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import glob 3 | import json 4 | from collections import OrderedDict 5 | 6 | 7 | def extract_i18n_strings(node): 8 | i18n_strings = [] 9 | 10 | if ( 11 | isinstance(node, ast.Call) 12 | and isinstance(node.func, ast.Name) 13 | and node.func.id == "i18n" 14 | ): 15 | for arg in node.args: 16 | if isinstance(arg, ast.Str): 17 | i18n_strings.append(arg.s) 18 | 19 | for child_node in ast.iter_child_nodes(node): 20 | i18n_strings.extend(extract_i18n_strings(child_node)) 21 | 22 | return i18n_strings 23 | 24 | 25 | # scan the directory for all .py files (recursively) 26 | # for each file, parse the code into an AST 27 | # for each AST, extract the i18n strings 28 | 29 | strings = [] 30 | for filename in glob.iglob("**/*.py", recursive=True): 31 | with open(filename, "r") as f: 32 | code = f.read() 33 | if "I18nAuto" in code: 34 | tree = ast.parse(code) 35 | i18n_strings = extract_i18n_strings(tree) 36 | print(filename, len(i18n_strings)) 37 | strings.extend(i18n_strings) 38 | code_keys = set(strings) 39 | """ 40 | n_i18n.py 41 | gui_v1.py 26 42 | app.py 16 43 | infer-web.py 147 44 | scan_i18n.py 0 45 | i18n.py 0 46 | lib/train/process_ckpt.py 1 47 | """ 48 | print() 49 | print("Total unique:", len(code_keys)) 50 | 51 | 52 | standard_file = "i18n/locale/zh_CN.json" 53 | with open(standard_file, "r", encoding="utf-8") as f: 54 | standard_data = json.load(f, object_pairs_hook=OrderedDict) 55 | standard_keys = set(standard_data.keys()) 56 | 57 | # Define the standard file name 58 | unused_keys = standard_keys - code_keys 59 | print("Unused keys:", len(unused_keys)) 60 | for 
unused_key in unused_keys: 61 | print("\t", unused_key) 62 | 63 | missing_keys = code_keys - standard_keys 64 | print("Missing keys:", len(missing_keys)) 65 | for missing_key in missing_keys: 66 | print("\t", missing_key) 67 | 68 | code_keys_dict = OrderedDict() 69 | for s in strings: 70 | code_keys_dict[s] = s 71 | 72 | # write back 73 | with open(standard_file, "w", encoding="utf-8") as f: 74 | json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True) 75 | f.write("\n") 76 | -------------------------------------------------------------------------------- /tools/my_utils.py: -------------------------------------------------------------------------------- 1 | import platform,os,traceback 2 | import ffmpeg 3 | import numpy as np 4 | 5 | 6 | def load_audio(file, sr): 7 | try: 8 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 9 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 10 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 11 | file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 12 | if os.path.exists(file) == False: 13 | raise RuntimeError( 14 | "You input a wrong audio path that does not exists, please fix it!" 15 | ) 16 | out, _ = ( 17 | ffmpeg.input(file, threads=0) 18 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 19 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 20 | ) 21 | except Exception as e: 22 | traceback.print_exc() 23 | raise RuntimeError(f"Failed to load audio: {e}") 24 | 25 | return np.frombuffer(out, np.float32).flatten() 26 | 27 | 28 | def clean_path(path_str): 29 | if platform.system() == 'Windows': 30 | path_str = path_str.replace('/', '\\') 31 | return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 32 | -------------------------------------------------------------------------------- /tools/slice_audio.py: -------------------------------------------------------------------------------- 1 | import os,sys,numpy as np 2 | import traceback 3 | from scipy.io import wavfile 4 | # parent_directory = os.path.dirname(os.path.abspath(__file__)) 5 | # sys.path.append(parent_directory) 6 | from my_utils import load_audio 7 | from slicer2 import Slicer 8 | 9 | def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): 10 | os.makedirs(opt_root,exist_ok=True) 11 | if os.path.isfile(inp): 12 | input=[inp] 13 | elif os.path.isdir(inp): 14 | input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] 15 | else: 16 | return "输入路径存在但既不是文件也不是文件夹" 17 | slicer = Slicer( 18 | sr=32000, # 长音频采样率 19 | threshold= int(threshold), # 音量小于这个值视作静音的备选切割点 20 | min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 21 | min_interval= int(min_interval), # 最短切割间隔 22 | hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) 23 | max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长 24 | ) 25 | _max=float(_max) 26 | alpha=float(alpha) 27 | for inp_path in input[int(i_part)::int(all_part)]: 28 | # print(inp_path) 29 | try: 30 | name = os.path.basename(inp_path) 31 | audio = load_audio(inp_path, 32000) 32 | # print(audio.shape) 33 | for chunk, start, end in slicer.slice(audio): # start和end是帧数 34 | tmp_max = np.abs(chunk).max() 35 | if(tmp_max>1):chunk/=tmp_max 36 | chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk 37 | wavfile.write( 38 | "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), 39 | 32000, 40 | # chunk.astype(np.float32), 41 | (chunk * 
32767).astype(np.int16), 42 | ) 43 | except: 44 | print(inp_path,"->fail->",traceback.format_exc()) 45 | return "执行完毕,请检查输出文件" 46 | 47 | print(slice(*sys.argv[1:])) 48 | 49 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = 
self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), 
mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | 
feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | 
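# Note: conv5, conv6 and conv7 all reuse dilations[2]; the last two entries of the five-element dilations default are never referenced.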
self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_537227KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], 
activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_537238KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, 
dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class Encoder(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 31 | super(Encoder, self).__init__() 32 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) 33 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 34 | 35 | def __call__(self, x): 36 | h = self.conv1(x) 37 | h = self.conv2(h) 38 | 39 | return h 40 | 41 | 42 | class Decoder(nn.Module): 43 | def __init__( 44 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 45 | ): 46 | super(Decoder, self).__init__() 47 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 48 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 49 | self.dropout = nn.Dropout2d(0.1) if dropout else None 50 | 51 | def __call__(self, x, skip=None): 52 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 53 | 54 | if skip is not None: 55 | skip = spec_utils.crop_center(skip, x) 56 | x = torch.cat([x, skip], dim=1) 57 | 58 | h = self.conv1(x) 59 | # h = self.conv2(h) 60 | 61 | if self.dropout is not None: 62 | h = self.dropout(h) 63 | 64 | return h 65 | 66 | 67 | class ASPPModule(nn.Module): 68 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): 69 | super(ASPPModule, self).__init__() 70 | 
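# Five parallel branches: a frequency-pooled 1x1 branch, a plain 1x1 branch and three dilated 3x3 convolutions (plain Conv2DBNActiv here, not the separable variant used in the older layers_* files); their concatenation is fused by a 1x1 bottleneck, optionally followed by Dropout2d.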
self.conv1 = nn.Sequential( 71 | nn.AdaptiveAvgPool2d((1, None)), 72 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), 73 | ) 74 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) 75 | self.conv3 = Conv2DBNActiv( 76 | nin, nout, 3, 1, dilations[0], dilations[0], activ=activ 77 | ) 78 | self.conv4 = Conv2DBNActiv( 79 | nin, nout, 3, 1, dilations[1], dilations[1], activ=activ 80 | ) 81 | self.conv5 = Conv2DBNActiv( 82 | nin, nout, 3, 1, dilations[2], dilations[2], activ=activ 83 | ) 84 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) 85 | self.dropout = nn.Dropout2d(0.1) if dropout else None 86 | 87 | def forward(self, x): 88 | _, _, h, w = x.size() 89 | feat1 = F.interpolate( 90 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 91 | ) 92 | feat2 = self.conv2(x) 93 | feat3 = self.conv3(x) 94 | feat4 = self.conv4(x) 95 | feat5 = self.conv5(x) 96 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 97 | out = self.bottleneck(out) 98 | 99 | if self.dropout is not None: 100 | out = self.dropout(out) 101 | 102 | return out 103 | 104 | 105 | class LSTMModule(nn.Module): 106 | def __init__(self, nin_conv, nin_lstm, nout_lstm): 107 | super(LSTMModule, self).__init__() 108 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) 109 | self.lstm = nn.LSTM( 110 | input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True 111 | ) 112 | self.dense = nn.Sequential( 113 | nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() 114 | ) 115 | 116 | def forward(self, x): 117 | N, _, nbins, nframes = x.size() 118 | h = self.conv(x)[:, 0] # N, nbins, nframes 119 | h = h.permute(2, 0, 1) # nframes, N, nbins 120 | h, _ = self.lstm(h) 121 | h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins 122 | h = h.reshape(nframes, N, 1, nbins) 123 | h = h.permute(1, 2, 3, 0) 124 | 125 | return h 126 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | 5 | default_param = {} 6 | default_param["bins"] = 768 7 | default_param["unstable_bins"] = 9 # training only 8 | default_param["reduction_bins"] = 762 # training only 9 | default_param["sr"] = 44100 10 | default_param["pre_filter_start"] = 757 11 | default_param["pre_filter_stop"] = 768 12 | default_param["band"] = {} 13 | 14 | 15 | default_param["band"][1] = { 16 | "sr": 11025, 17 | "hl": 128, 18 | "n_fft": 960, 19 | "crop_start": 0, 20 | "crop_stop": 245, 21 | "lpf_start": 61, # inference only 22 | "res_type": "polyphase", 23 | } 24 | 25 | default_param["band"][2] = { 26 | "sr": 44100, 27 | "hl": 512, 28 | "n_fft": 1536, 29 | "crop_start": 24, 30 | "crop_stop": 547, 31 | "hpf_start": 81, # inference only 32 | "res_type": "sinc_best", 33 | } 34 | 35 | 36 | def int_keys(d): 37 | r = {} 38 | for k, v in d: 39 | if k.isdigit(): 40 | k = int(k) 41 | r[k] = v 42 | return r 43 | 44 | 45 | class ModelParameters(object): 46 | def __init__(self, config_path=""): 47 | if ".pth" == pathlib.Path(config_path).suffix: 48 | import zipfile 49 | 50 | with zipfile.ZipFile(config_path, "r") as zip: 51 | self.param = json.loads( 52 | zip.read("param.json"), object_pairs_hook=int_keys 53 | ) 54 | elif ".json" == pathlib.Path(config_path).suffix: 55 | with open(config_path, "r") as f: 56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 57 | else: 58 | self.param = default_param 59 | 60 | for k 
in [ 61 | "mid_side", 62 | "mid_side_b", 63 | "mid_side_b2", 64 | "stereo_w", 65 | "stereo_n", 66 | "reverse", 67 | ]: 68 | if not k in self.param: 69 | self.param[k] = False 70 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 
1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 
322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 
121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 
256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 
80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 
| "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets.py: -------------------------------------------------------------------------------- 1 | import layers 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import spec_utils 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 16) 44 | self.stg1_high_band_net = BaseASPPNet(2, 16) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(8, 16) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(16, 32) 51 | 52 | self.out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - 
aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 
85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = 
self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_33966KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16, 32)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 16) 43 | self.stg1_high_band_net = BaseASPPNet(2, 16) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(8, 16) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(16, 32) 50 | 51 | self.out = nn.Conv2d(32, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | 
], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_537227KB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import layers_537238KB as layers 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 64) 44 | self.stg1_high_band_net = BaseASPPNet(2, 64) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(32, 64) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(64, 128) 51 | 52 | self.out = nn.Conv2d(128, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = 
x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_537238KB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . 
import layers_537238KB as layers 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 64) 44 | self.stg1_high_band_net = BaseASPPNet(2, 64) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(32, 64) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(64, 128) 51 | 52 | self.out = nn.Conv2d(128, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- 
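The nets_*KB.py files above (and nets_61968KB.py below) are variants of the same CascadedASPPNet: they differ only in channel widths and in which layers_* module they import, while the forward pass and the inference-time "aggressiveness" post-processing are identical. As a quick illustration of that shared post-processing step, the following is a minimal, self-contained PyTorch sketch; apply_aggressiveness, the tensor shapes, and the sample values are illustrative only and not part of the repo's API.

import torch

def apply_aggressiveness(mask: torch.Tensor, split_bin: int, value: float) -> torch.Tensor:
    # Sharpen a sigmoid mask the way the CascadedASPPNet variants do at inference:
    # below split_bin the exponent is softened (value / 3); at and above split_bin
    # the full value is used, so higher-frequency bins are suppressed more strongly.
    mask = mask.clone()
    mask[:, :, :split_bin] = torch.pow(mask[:, :, :split_bin], 1 + value / 3)
    mask[:, :, split_bin:] = torch.pow(mask[:, :, split_bin:], 1 + value)
    return mask

# Illustrative shapes: (batch, stereo channels, frequency bins, time frames).
mask = torch.sigmoid(torch.randn(1, 2, 1024, 256))
spec_mag = torch.rand(1, 2, 1024, 256)
separated = apply_aggressiveness(mask, split_bin=600, value=0.3) * spec_mag

Since mask values lie in [0, 1], exponents greater than 1 shrink low-confidence bins toward zero while leaving bins near 1 mostly unchanged, which is presumably the intent behind the "aggressiveness" name.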
/tools/uvr5/lib/lib_v5/nets_61968KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = 
h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_new 6 | 7 | 8 | class BaseNet(nn.Module): 9 | def __init__( 10 | self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) 11 | ): 12 | super(BaseNet, self).__init__() 13 | self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) 14 | self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) 15 | self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) 16 | self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) 17 | self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) 18 | 19 | self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) 20 | 21 | self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) 22 | self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) 23 | self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) 24 | self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) 25 | self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) 26 | 27 | def __call__(self, x): 28 | e1 = self.enc1(x) 29 | e2 = self.enc2(e1) 30 | e3 = self.enc3(e2) 31 | e4 = self.enc4(e3) 32 | e5 = self.enc5(e4) 33 | 34 | h = self.aspp(e5) 35 | 36 | h = self.dec4(h, e4) 37 | h = self.dec3(h, e3) 38 | h = self.dec2(h, e2) 39 | h = torch.cat([h, self.lstm_dec2(h)], dim=1) 40 | h = self.dec1(h, e1) 41 | 42 | return h 43 | 44 | 45 | class CascadedNet(nn.Module): 46 | def __init__(self, n_fft, nout=32, nout_lstm=128): 47 | super(CascadedNet, self).__init__() 48 | 49 | self.max_bin = n_fft // 2 50 | self.output_bin = n_fft // 2 + 1 51 | self.nin_lstm = self.max_bin // 2 52 | self.offset = 64 53 | 54 | self.stg1_low_band_net = nn.Sequential( 55 | BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), 56 | layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), 57 | ) 58 | 59 | self.stg1_high_band_net = BaseNet( 60 | 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 61 | ) 62 | 63 | self.stg2_low_band_net = nn.Sequential( 64 | BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), 65 | layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), 66 | ) 67 | self.stg2_high_band_net = BaseNet( 68 | nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 69 | ) 70 | 71 | self.stg3_full_band_net = BaseNet( 72 | 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm 73 | ) 74 | 75 | self.out = nn.Conv2d(nout, 2, 1, bias=False) 76 | self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) 77 | 78 | def forward(self, x): 79 | x = x[:, :, : self.max_bin] 80 | 81 | bandw = x.size()[2] // 2 82 | l1_in = x[:, :, :bandw] 83 | h1_in = x[:, :, bandw:] 84 | l1 = self.stg1_low_band_net(l1_in) 85 | h1 = self.stg1_high_band_net(h1_in) 86 | aux1 = torch.cat([l1, h1], dim=2) 87 | 88 | l2_in = torch.cat([l1_in, l1], dim=1) 89 | h2_in = torch.cat([h1_in, h1], dim=1) 90 | l2 = self.stg2_low_band_net(l2_in) 91 | h2 = self.stg2_high_band_net(h2_in) 92 | aux2 = torch.cat([l2, h2], dim=2) 93 | 94 | f3_in = torch.cat([x, aux1, aux2], dim=1) 95 | f3 = self.stg3_full_band_net(f3_in) 96 | 97 | mask = torch.sigmoid(self.out(f3)) 98 | mask = F.pad( 99 | input=mask, 100 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 101 | mode="replicate", 102 | ) 103 | 
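        # The sigmoid mask above covers only the cropped max_bin (= n_fft // 2) rows,
        # so the replicate pad extends it to output_bin (= n_fft // 2 + 1) to match
        # the full-height input spectrogram. In training mode, aux1 and aux2 are also
        # combined into an auxiliary mask and returned alongside the main one.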
104 | if self.training: 105 | aux = torch.cat([aux1, aux2], dim=1) 106 | aux = torch.sigmoid(self.aux_out(aux)) 107 | aux = F.pad( 108 | input=aux, 109 | pad=(0, 0, 0, self.output_bin - aux.size()[2]), 110 | mode="replicate", 111 | ) 112 | return mask, aux 113 | else: 114 | return mask 115 | 116 | def predict_mask(self, x): 117 | mask = self.forward(x) 118 | 119 | if self.offset > 0: 120 | mask = mask[:, :, :, self.offset : -self.offset] 121 | assert mask.size()[3] > 0 122 | 123 | return mask 124 | 125 | def predict(self, x, aggressiveness=None): 126 | mask = self.forward(x) 127 | pred_mag = x * mask 128 | 129 | if self.offset > 0: 130 | pred_mag = pred_mag[:, :, :, self.offset : -self.offset] 131 | assert pred_mag.size()[3] > 0 132 | 133 | return pred_mag 134 | -------------------------------------------------------------------------------- /tools/uvr5/lib/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def load_data(file_name: str = "./lib/name_params.json") -> dict: 9 | with open(file_name, "r") as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def make_padding(width, cropsize, offset): 16 | left = offset 17 | roi_size = cropsize - left * 2 18 | if roi_size == 0: 19 | roi_size = cropsize 20 | right = roi_size - (width % roi_size) + left 21 | 22 | return left, right, roi_size 23 | 24 | 25 | def inference(X_spec, device, model, aggressiveness, data): 26 | """ 27 | data : dic configs 28 | """ 29 | 30 | def _execute( 31 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True 32 | ): 33 | model.eval() 34 | with torch.no_grad(): 35 | preds = [] 36 | 37 | iterations = [n_window] 38 | 39 | total_iterations = sum(iterations) 40 | for i in tqdm(range(n_window)): 41 | start = i * roi_size 42 | X_mag_window = X_mag_pad[ 43 | None, :, :, start : start + data["window_size"] 44 | ] 45 | X_mag_window = torch.from_numpy(X_mag_window) 46 | if is_half: 47 | X_mag_window = X_mag_window.half() 48 | X_mag_window = X_mag_window.to(device) 49 | 50 | pred = model.predict(X_mag_window, aggressiveness) 51 | 52 | pred = pred.detach().cpu().numpy() 53 | preds.append(pred[0]) 54 | 55 | pred = np.concatenate(preds, axis=2) 56 | return pred 57 | 58 | def preprocess(X_spec): 59 | X_mag = np.abs(X_spec) 60 | X_phase = np.angle(X_spec) 61 | 62 | return X_mag, X_phase 63 | 64 | X_mag, X_phase = preprocess(X_spec) 65 | 66 | coef = X_mag.max() 67 | X_mag_pre = X_mag / coef 68 | 69 | n_frame = X_mag_pre.shape[2] 70 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) 71 | n_window = int(np.ceil(n_frame / roi_size)) 72 | 73 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 74 | 75 | if list(model.state_dict().values())[0].dtype == torch.float16: 76 | is_half = True 77 | else: 78 | is_half = False 79 | pred = _execute( 80 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 81 | ) 82 | pred = pred[:, :, :n_frame] 83 | 84 | if data["tta"]: 85 | pad_l += roi_size // 2 86 | pad_r += roi_size // 2 87 | n_window += 1 88 | 89 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 90 | 91 | pred_tta = _execute( 92 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 93 | ) 94 | pred_tta = pred_tta[:, :, roi_size // 2 :] 95 | pred_tta = pred_tta[:, :, :n_frame] 96 | 97 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * 
X_phase) 98 | else: 99 | return pred * coef, X_mag, np.exp(1.0j * X_phase) 100 | 101 | 102 | def _get_name_params(model_path, model_hash): 103 | data = load_data() 104 | flag = False 105 | ModelName = model_path 106 | for type in list(data): 107 | for model in list(data[type][0]): 108 | for i in range(len(data[type][0][model])): 109 | if str(data[type][0][model][i]["hash_name"]) == model_hash: 110 | flag = True 111 | elif str(data[type][0][model][i]["hash_name"]) in ModelName: 112 | flag = True 113 | 114 | if flag: 115 | model_params_auto = data[type][0][model][i]["model_params"] 116 | param_name_auto = data[type][0][model][i]["param_name"] 117 | if type == "equivalent": 118 | return param_name_auto, model_params_auto 119 | else: 120 | flag = False 121 | return param_name_auto, model_params_auto 122 | --------------------------------------------------------------------------------
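The windowed inference in tools/uvr5/lib/utils.py relies on make_padding to pad the magnitude spectrogram so that, after each prediction window is trimmed by the model's offset on both sides, the trimmed windows tile the whole frame range. The following standalone check of that arithmetic reuses the make_padding logic shown above; the frame count, window size, and offset are made-up example values.

import numpy as np

def make_padding(width, cropsize, offset):
    # Same logic as in tools/uvr5/lib/utils.py: each window of cropsize frames
    # loses offset frames on both sides, leaving roi_size usable frames.
    left = offset
    roi_size = cropsize - left * 2
    if roi_size == 0:
        roi_size = cropsize
    right = roi_size - (width % roi_size) + left
    return left, right, roi_size

# Made-up sizes: 1000 frames, 512-frame windows, a model offset of 128.
n_frame, window_size, offset = 1000, 512, 128
pad_l, pad_r, roi_size = make_padding(n_frame, window_size, offset)
n_window = int(np.ceil(n_frame / roi_size))
print(pad_l, pad_r, roi_size, n_window)  # 128 152 256 4

Each window spans window_size padded frames but only its central roi_size frames survive the offset trim, so ceil(n_frame / roi_size) windows, each starting roi_size frames after the previous one, are enough to cover the padded input; inference() then concatenates the trimmed predictions and cuts the result back to n_frame frames.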