├── .dockerignore ├── .github ├── build_windows_packages.ps1 └── workflows │ ├── build_windows_packages.yaml │ └── docker-publish.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── Colab-Inference.ipynb ├── Colab-WebUI.ipynb ├── Docker ├── install_wrapper.sh └── miniconda_install.sh ├── Dockerfile ├── GPT_SoVITS ├── AR │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── bucket_sampler.py │ │ ├── data_module.py │ │ └── dataset.py │ ├── models │ │ ├── __init__.py │ │ ├── t2s_lightning_module.py │ │ ├── t2s_lightning_module_onnx.py │ │ ├── t2s_model.py │ │ ├── t2s_model_onnx.py │ │ └── utils.py │ ├── modules │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── activation_onnx.py │ │ ├── embedding.py │ │ ├── embedding_onnx.py │ │ ├── lr_schedulers.py │ │ ├── optim.py │ │ ├── patched_mha_with_cache.py │ │ ├── patched_mha_with_cache_onnx.py │ │ ├── scaling.py │ │ ├── transformer.py │ │ └── transformer_onnx.py │ ├── text_processing │ │ ├── __init__.py │ │ ├── phonemizer.py │ │ └── symbols.py │ └── utils │ │ ├── __init__.py │ │ ├── initialize.py │ │ └── io.py ├── BigVGAN │ ├── LICENSE │ ├── README.md │ ├── activations.py │ ├── alias_free_activation │ │ ├── cuda │ │ │ ├── __init__.py │ │ │ ├── activation1d.py │ │ │ ├── anti_alias_activation.cpp │ │ │ ├── anti_alias_activation_cuda.cu │ │ │ ├── build │ │ │ │ └── _ │ │ │ ├── compat.h │ │ │ ├── load.py │ │ │ └── type_shim.h │ │ └── torch │ │ │ ├── __init__.py │ │ │ ├── act.py │ │ │ ├── filter.py │ │ │ └── resample.py │ ├── bigvgan.py │ ├── configs │ │ ├── bigvgan_22khz_80band.json │ │ ├── bigvgan_24khz_100band.json │ │ ├── bigvgan_base_22khz_80band.json │ │ ├── bigvgan_base_24khz_100band.json │ │ ├── bigvgan_v2_22khz_80band_256x.json │ │ ├── bigvgan_v2_22khz_80band_fmax8k_256x.json │ │ ├── bigvgan_v2_24khz_100band_256x.json │ │ ├── bigvgan_v2_44khz_128band_256x.json │ │ └── bigvgan_v2_44khz_128band_512x.json │ ├── discriminators.py │ ├── env.py │ ├── incl_licenses │ │ ├── LICENSE_1 │ │ ├── LICENSE_2 │ │ ├── LICENSE_3 │ │ ├── LICENSE_4 │ │ ├── LICENSE_5 │ │ ├── LICENSE_6 │ │ ├── LICENSE_7 │ │ └── LICENSE_8 │ ├── inference.py │ ├── inference_e2e.py │ ├── loss.py │ ├── meldataset.py │ ├── nv-modelcard++ │ │ ├── .gitkeep │ │ ├── bias.md │ │ ├── explainability.md │ │ ├── overview.md │ │ ├── privacy.md │ │ └── safety.md │ ├── requirements.txt │ ├── tests │ │ ├── test_activation.py │ │ ├── test_activation_snake_beta.py │ │ └── test_cuda_vs_torch_model.py │ ├── train.py │ └── utils0.py ├── TTS_infer_pack │ ├── TTS.py │ ├── TextPreprocessor.py │ ├── __init__.py │ └── text_segmentation_method.py ├── configs │ ├── .gitignore │ ├── s1.yaml │ ├── s1big.yaml │ ├── s1big2.yaml │ ├── s1longer-v2.yaml │ ├── s1longer.yaml │ ├── s1mq.yaml │ ├── s2.json │ ├── train.yaml │ └── tts_infer.yaml ├── download.py ├── export_torch_script.py ├── export_torch_script_v3.py ├── f5_tts │ └── model │ │ ├── __init__.py │ │ ├── backbones │ │ ├── README.md │ │ ├── dit.py │ │ ├── mmdit.py │ │ └── unett.py │ │ └── modules.py ├── feature_extractor │ ├── __init__.py │ ├── cnhubert.py │ └── whisper_enc.py ├── inference_cli.py ├── inference_gui.py ├── inference_webui.py ├── inference_webui_fast.py ├── module │ ├── __init__.py │ ├── attentions.py │ ├── attentions_onnx.py │ ├── commons.py │ ├── core_vq.py │ ├── data_utils.py │ ├── losses.py │ ├── mel_processing.py │ ├── models.py │ ├── models_onnx.py │ ├── modules.py │ ├── mrte_model.py │ ├── quantize.py │ └── transforms.py ├── onnx_export.py ├── prepare_datasets │ ├── 1-get-text.py │ ├── 2-get-hubert-wav32k.py │ └── 3-get-semantic.py ├── 
pretrained_models │ └── .gitignore ├── process_ckpt.py ├── s1_train.py ├── s2_train.py ├── s2_train_v3.py ├── s2_train_v3_lora.py ├── text │ ├── .gitignore │ ├── LangSegmenter │ │ ├── __init__.py │ │ └── langsegmenter.py │ ├── __init__.py │ ├── cantonese.py │ ├── chinese.py │ ├── chinese2.py │ ├── cleaner.py │ ├── cmudict-fast.rep │ ├── cmudict.rep │ ├── en_normalization │ │ └── expend.py │ ├── engdict-hot.rep │ ├── engdict_cache.pickle │ ├── english.py │ ├── g2pw │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── g2pw.py │ │ ├── onnx_api.py │ │ ├── polyphonic-fix.rep │ │ ├── polyphonic.pickle │ │ ├── polyphonic.rep │ │ └── utils.py │ ├── ja_userdic │ │ └── userdict.csv │ ├── japanese.py │ ├── korean.py │ ├── namedict_cache.pickle │ ├── opencpop-strict.txt │ ├── symbols.py │ ├── symbols2.py │ ├── tone_sandhi.py │ └── zh_normalization │ │ ├── README.md │ │ ├── __init__.py │ │ ├── char_convert.py │ │ ├── chronology.py │ │ ├── constants.py │ │ ├── num.py │ │ ├── phonecode.py │ │ ├── quantifier.py │ │ └── text_normlization.py └── utils.py ├── LICENSE ├── README.md ├── api.py ├── api_v2.py ├── config.py ├── docker-compose.yaml ├── docker_build.sh ├── docs ├── cn │ ├── Changelog_CN.md │ └── README.md ├── en │ └── Changelog_EN.md ├── ja │ ├── Changelog_JA.md │ └── README.md ├── ko │ ├── Changelog_KO.md │ └── README.md └── tr │ ├── Changelog_TR.md │ └── README.md ├── extra-req.txt ├── go-webui.bat ├── go-webui.ps1 ├── gpt-sovits_kaggle.ipynb ├── install.sh ├── requirements.txt ├── tools ├── AP_BWE_main │ ├── 24kto48k │ │ └── readme.txt │ ├── LICENSE │ ├── README.md │ ├── datasets1 │ │ ├── __init__.py │ │ └── dataset.py │ └── models │ │ ├── __init__.py │ │ └── model.py ├── __init__.py ├── asr │ ├── config.py │ ├── fasterwhisper_asr.py │ ├── funasr_asr.py │ └── models │ │ └── .gitignore ├── audio_sr.py ├── cmd-denoise.py ├── denoise-model │ └── .gitignore ├── i18n │ ├── i18n.py │ ├── locale │ │ ├── en_US.json │ │ ├── es_ES.json │ │ ├── fr_FR.json │ │ ├── it_IT.json │ │ ├── ja_JP.json │ │ ├── ko_KR.json │ │ ├── pt_BR.json │ │ ├── ru_RU.json │ │ ├── tr_TR.json │ │ ├── zh_CN.json │ │ ├── zh_HK.json │ │ ├── zh_SG.json │ │ └── zh_TW.json │ └── scan_i18n.py ├── my_utils.py ├── slice_audio.py ├── slicer2.py ├── subfix_webui.py └── uvr5 │ ├── bs_roformer │ ├── __init__.py │ ├── attend.py │ ├── bs_roformer.py │ └── mel_band_roformer.py │ ├── bsroformer.py │ ├── lib │ ├── lib_v5 │ │ ├── dataset.py │ │ ├── layers.py │ │ ├── layers_123812KB.py │ │ ├── layers_123821KB.py │ │ ├── layers_33966KB.py │ │ ├── layers_537227KB.py │ │ ├── layers_537238KB.py │ │ ├── layers_new.py │ │ ├── model_param_init.py │ │ ├── modelparams │ │ │ ├── 1band_sr16000_hl512.json │ │ │ ├── 1band_sr32000_hl512.json │ │ │ ├── 1band_sr33075_hl384.json │ │ │ ├── 1band_sr44100_hl1024.json │ │ │ ├── 1band_sr44100_hl256.json │ │ │ ├── 1band_sr44100_hl512.json │ │ │ ├── 1band_sr44100_hl512_cut.json │ │ │ ├── 2band_32000.json │ │ │ ├── 2band_44100_lofi.json │ │ │ ├── 2band_48000.json │ │ │ ├── 3band_44100.json │ │ │ ├── 3band_44100_mid.json │ │ │ ├── 3band_44100_msb2.json │ │ │ ├── 4band_44100.json │ │ │ ├── 4band_44100_mid.json │ │ │ ├── 4band_44100_msb.json │ │ │ ├── 4band_44100_msb2.json │ │ │ ├── 4band_44100_reverse.json │ │ │ ├── 4band_44100_sw.json │ │ │ ├── 4band_v2.json │ │ │ ├── 4band_v2_sn.json │ │ │ ├── 4band_v3.json │ │ │ └── ensemble.json │ │ ├── nets.py │ │ ├── nets_123812KB.py │ │ ├── nets_123821KB.py │ │ ├── nets_33966KB.py │ │ ├── nets_537227KB.py │ │ ├── nets_537238KB.py │ │ ├── nets_61968KB.py │ │ ├── nets_new.py │ │ └── 
spec_utils.py │ ├── name_params.json │ └── utils.py │ ├── mdxnet.py │ ├── uvr5_weights │ └── .gitignore │ ├── vr.py │ └── webui.py └── webui.py /.github/workflows/build_windows_packages.yaml: -------------------------------------------------------------------------------- 1 | name: Build and Upload Windows Package 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | date: 7 | description: "Date suffix (optional)" 8 | required: false 9 | default: "" 10 | suffix: 11 | description: "Package name suffix (optional)" 12 | required: false 13 | default: "" 14 | 15 | jobs: 16 | build: 17 | runs-on: windows-latest 18 | strategy: 19 | matrix: 20 | torch_cuda: [cu124, cu128] 21 | env: 22 | TORCH_CUDA: ${{ matrix.torch_cuda }} 23 | MODELSCOPE_USERNAME: ${{ secrets.MODELSCOPE_USERNAME }} 24 | MODELSCOPE_TOKEN: ${{ secrets.MODELSCOPE_TOKEN }} 25 | HUGGINGFACE_USERNAME: ${{ secrets.HUGGINGFACE_USERNAME }} 26 | HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} 27 | DATE_SUFFIX: ${{ github.event.inputs.date }} 28 | PKG_SUFFIX: ${{ github.event.inputs.suffix }} 29 | 30 | steps: 31 | - name: Checkout 32 | uses: actions/checkout@v4 33 | 34 | - name: Run Build and Upload Script 35 | shell: pwsh 36 | run: | 37 | Move-Item .github/build_windows_packages.ps1 ../build_windows_packages.ps1 38 | ../build_windows_packages.ps1 -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autoupdate_schedule: monthly 3 | 4 | repos: 5 | - repo: https://github.com/astral-sh/ruff-pre-commit 6 | rev: v0.11.7 7 | hooks: 8 | # Run the linter. 9 | - id: ruff 10 | types_or: [ python, pyi ] 11 | args: [ --fix ] 12 | # Run the formatter. 13 | - id: ruff-format 14 | types_or: [ python, pyi ] 15 | args: [ --line-length, "120", --target-version, "py310" ] 16 | -------------------------------------------------------------------------------- /Colab-WebUI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "view-in-github" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# GPT-SoVITS WebUI" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "_o6a8GS2lWQM" 24 | }, 25 | "source": [ 26 | "## Env Setup (Run Once Only)\n", 27 | "## 环境配置, 只需运行一次" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "### 1." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "%%writefile /content/setup.sh\n", 44 | "set -e\n", 45 | "\n", 46 | "cd /content\n", 47 | "\n", 48 | "git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", 49 | "\n", 50 | "cd GPT-SoVITS\n", 51 | "\n", 52 | "if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n", 53 | " :\n", 54 | "else\n", 55 | " conda create -n GPTSoVITS python=3.10 -y\n", 56 | "fi\n", 57 | "\n", 58 | "source activate GPTSoVITS\n", 59 | "\n", 60 | "pip install ipykernel\n", 61 | "\n", 62 | "bash install.sh --device CU126 --source HF --download-uvr5" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "### 2." 
70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "%pip install -q condacolab\n", 79 | "import condacolab\n", 80 | "condacolab.install_from_url(\"https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh\")\n", 81 | "!cd /content && bash setup.sh" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Launch WebUI\n", 89 | "## 启动 WebUI" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "id": "4oRGUzkrk8C7" 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "!cd /content/GPT-SoVITS && source activate GPTSoVITS && export is_share=True && python webui.py" 101 | ] 102 | } 103 | ], 104 | "metadata": { 105 | "accelerator": "GPU", 106 | "colab": { 107 | "include_colab_link": true, 108 | "provenance": [] 109 | }, 110 | "kernelspec": { 111 | "display_name": "Python 3", 112 | "name": "python3" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 0 117 | } 118 | -------------------------------------------------------------------------------- /Docker/install_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" 4 | 5 | cd "$SCRIPT_DIR" || exit 1 6 | 7 | cd .. || exit 1 8 | 9 | set -e 10 | 11 | source "$HOME/miniconda3/etc/profile.d/conda.sh" 12 | 13 | mkdir -p GPT_SoVITS 14 | 15 | mkdir -p GPT_SoVITS/text 16 | 17 | ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models 18 | 19 | ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel 20 | 21 | bash install.sh --device "CU${CUDA_VERSION//./}" --source HF 22 | 23 | pip cache purge 24 | 25 | pip show torch 26 | 27 | rm -rf /tmp/* /var/tmp/* 28 | 29 | rm -rf "$HOME/miniconda3/pkgs" 30 | 31 | mkdir -p "$HOME/miniconda3/pkgs" 32 | 33 | rm -rf /root/.conda /root/.cache 34 | -------------------------------------------------------------------------------- /Docker/miniconda_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" 6 | 7 | cd "$SCRIPT_DIR" || exit 1 8 | 9 | cd .. || exit 1 10 | 11 | if [ -d "$HOME/miniconda3" ]; then 12 | exit 0 13 | fi 14 | 15 | WORKFLOW=${WORKFLOW:-"false"} 16 | TARGETPLATFORM=${TARGETPLATFORM:-"linux/amd64"} 17 | 18 | if [ "$WORKFLOW" = "true" ]; then 19 | WGET_CMD=(wget -nv --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404) 20 | else 21 | WGET_CMD=(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404) 22 | fi 23 | 24 | if [ "$TARGETPLATFORM" = "linux/amd64" ]; then 25 | "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-x86_64.sh 26 | elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then 27 | "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-aarch64.sh 28 | else 29 | exit 1 30 | fi 31 | 32 | LOG_PATH="/tmp/miniconda-install.log" 33 | 34 | bash miniconda.sh -b -p "$HOME/miniconda3" >"$LOG_PATH" 2>&1 35 | 36 | if [ $? 
-eq 0 ]; then 37 | echo "== Miniconda Installed ==" 38 | else 39 | echo "Failed to Install miniconda" 40 | tail -n 50 "$LOG_PATH" 41 | exit 1 42 | fi 43 | 44 | rm miniconda.sh 45 | 46 | source "$HOME/miniconda3/etc/profile.d/conda.sh" 47 | 48 | "$HOME/miniconda3/bin/conda" config --add channels conda-forge 49 | 50 | "$HOME/miniconda3/bin/conda" update -q --all -y 1>/dev/null 51 | 52 | "$HOME/miniconda3/bin/conda" install python=3.11 -q -y 53 | 54 | "$HOME/miniconda3/bin/conda" install gcc=14 gxx ffmpeg cmake make unzip -q -y 55 | 56 | if [ "$CUDA_VERSION" = "12.8" ]; then 57 | "$HOME/miniconda3/bin/pip" install torch torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu128 58 | elif [ "$CUDA_VERSION" = "12.6" ]; then 59 | "$HOME/miniconda3/bin/pip" install torch==2.6 torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu126 60 | fi 61 | 62 | "$HOME/miniconda3/bin/pip" cache purge 63 | 64 | rm $LOG_PATH 65 | 66 | rm -rf "$HOME/miniconda3/pkgs" 67 | 68 | mkdir -p "$HOME/miniconda3/pkgs" 69 | 70 | rm -rf "$HOME/.conda" "$HOME/.cache" 71 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION=12.6 2 | ARG TORCH_BASE=full 3 | 4 | FROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE} 5 | 6 | LABEL maintainer="XXXXRT" 7 | LABEL version="V4" 8 | LABEL description="Docker image for GPT-SoVITS" 9 | 10 | ARG CUDA_VERSION=12.6 11 | 12 | ENV CUDA_VERSION=${CUDA_VERSION} 13 | 14 | SHELL ["/bin/bash", "-c"] 15 | 16 | WORKDIR /workspace/GPT-SoVITS 17 | 18 | COPY Docker /workspace/GPT-SoVITS/Docker/ 19 | 20 | ARG LITE=false 21 | ENV LITE=${LITE} 22 | 23 | ARG WORKFLOW=false 24 | ENV WORKFLOW=${WORKFLOW} 25 | 26 | ARG TARGETPLATFORM 27 | ENV TARGETPLATFORM=${TARGETPLATFORM} 28 | 29 | RUN bash Docker/miniconda_install.sh 30 | 31 | COPY extra-req.txt /workspace/GPT-SoVITS/ 32 | 33 | COPY requirements.txt /workspace/GPT-SoVITS/ 34 | 35 | COPY install.sh /workspace/GPT-SoVITS/ 36 | 37 | RUN bash Docker/install_wrapper.sh 38 | 39 | EXPOSE 9871 9872 9873 9874 9880 40 | 41 | ENV PYTHONPATH="/workspace/GPT-SoVITS" 42 | 43 | RUN conda init bash && echo "conda activate base" >> ~/.bashrc 44 | 45 | WORKDIR /workspace 46 | 47 | RUN rm -rf /workspace/GPT-SoVITS 48 | 49 | WORKDIR /workspace/GPT-SoVITS 50 | 51 | COPY . 
/workspace/GPT-SoVITS 52 | 53 | CMD ["/bin/bash", "-c", "\ 54 | rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \ 55 | rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \ 56 | rm -rf /workspace/GPT-SoVITS/tools/asr/models && \ 57 | rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \ 58 | ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \ 59 | ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \ 60 | ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \ 61 | ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \ 62 | exec bash"] -------------------------------------------------------------------------------- /GPT_SoVITS/AR/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/data/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/data_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | from pytorch_lightning import LightningDataModule 4 | from torch.utils.data import DataLoader 5 | 6 | from AR.data.bucket_sampler import DistributedBucketSampler 7 | from AR.data.dataset import Text2SemanticDataset 8 | 9 | 10 | class Text2SemanticDataModule(LightningDataModule): 11 | def __init__( 12 | self, 13 | config, 14 | train_semantic_path, 15 | train_phoneme_path, 16 | dev_semantic_path=None, 17 | dev_phoneme_path=None, 18 | ): 19 | super().__init__() 20 | self.config = config 21 | self.train_semantic_path = train_semantic_path 22 | self.train_phoneme_path = train_phoneme_path 23 | self.dev_semantic_path = dev_semantic_path 24 | self.dev_phoneme_path = dev_phoneme_path 25 | self.num_workers = self.config["data"]["num_workers"] 26 | 27 | def prepare_data(self): 28 | pass 29 | 30 | def setup(self, stage=None, output_logs=False): 31 | self._train_dataset = Text2SemanticDataset( 32 | phoneme_path=self.train_phoneme_path, 33 | semantic_path=self.train_semantic_path, 34 | max_sec=self.config["data"]["max_sec"], 35 | pad_val=self.config["data"]["pad_val"], 36 | ) 37 | self._dev_dataset = self._train_dataset 38 | # self._dev_dataset = Text2SemanticDataset( 39 | # phoneme_path=self.dev_phoneme_path, 40 | # semantic_path=self.dev_semantic_path, 41 | # max_sample=self.config['data']['max_eval_sample'], 42 | # max_sec=self.config['data']['max_sec'], 43 | # pad_val=self.config['data']['pad_val']) 44 | 45 | def train_dataloader(self): 46 | batch_size = ( 47 | self.config["train"]["batch_size"] // 2 48 | if self.config["train"].get("if_dpo", False) is True 49 | else self.config["train"]["batch_size"] 50 | ) 51 | batch_size = max(min(batch_size, len(self._train_dataset) // 4), 1) # 防止不保存 52 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) 53 | return DataLoader( 54 | self._train_dataset, 55 | 
batch_size=batch_size, 56 | sampler=sampler, 57 | collate_fn=self._train_dataset.collate, 58 | num_workers=self.num_workers, 59 | persistent_workers=True, 60 | prefetch_factor=16, 61 | ) 62 | 63 | def val_dataloader(self): 64 | return DataLoader( 65 | self._dev_dataset, 66 | batch_size=1, 67 | shuffle=False, 68 | collate_fn=self._train_dataset.collate, 69 | num_workers=max(self.num_workers, 12), 70 | persistent_workers=True, 71 | prefetch_factor=16, 72 | ) 73 | 74 | # 这个会使用到嘛? 75 | def test_dataloader(self): 76 | return DataLoader( 77 | self._dev_dataset, 78 | batch_size=1, 79 | shuffle=False, 80 | collate_fn=self._train_dataset.collate, 81 | ) 82 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/models/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import os 4 | import sys 5 | 6 | now_dir = os.getcwd() 7 | sys.path.append(now_dir) 8 | from typing import Dict 9 | 10 | import torch 11 | from pytorch_lightning import LightningModule 12 | 13 | from AR.models.t2s_model_onnx import Text2SemanticDecoder 14 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule 15 | from AR.modules.optim import ScaledAdam 16 | 17 | 18 | class Text2SemanticLightningModule(LightningModule): 19 | def __init__(self, config, output_dir, is_train=True): 20 | super().__init__() 21 | self.config = config 22 | self.top_k = 3 23 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) 24 | pretrained_s1 = config.get("pretrained_s1") 25 | if pretrained_s1 and is_train: 26 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) 27 | print( 28 | self.load_state_dict( 29 | torch.load( 30 | pretrained_s1, 31 | map_location="cpu", 32 | )["weight"], 33 | ), 34 | ) 35 | if is_train: 36 | self.automatic_optimization = False 37 | self.save_hyperparameters() 38 | self.eval_dir = output_dir / "eval" 39 | self.eval_dir.mkdir(parents=True, exist_ok=True) 40 | 41 | def training_step(self, batch: Dict, batch_idx: int): 42 | opt = self.optimizers() 43 | scheduler = self.lr_schedulers() 44 | loss, acc = self.model.forward( 45 | batch["phoneme_ids"], 46 | batch["phoneme_ids_len"], 47 | batch["semantic_ids"], 48 | batch["semantic_ids_len"], 49 | batch["bert_feature"], 50 | ) 51 | self.manual_backward(loss) 52 | if batch_idx > 0 and batch_idx % 4 == 0: 53 | opt.step() 54 | opt.zero_grad() 55 | scheduler.step() 56 | 57 | self.log( 58 | "total_loss", 59 | loss, 60 | on_step=True, 61 | on_epoch=True, 62 | prog_bar=True, 63 | sync_dist=True, 64 | ) 65 | self.log( 66 | "lr", 67 | scheduler.get_last_lr()[0], 68 | on_epoch=True, 69 | prog_bar=True, 70 | sync_dist=True, 71 | ) 72 | self.log( 73 | f"top_{self.top_k}_acc", 74 | acc, 75 | on_step=True, 76 | on_epoch=True, 77 | prog_bar=True, 78 | sync_dist=True, 79 | ) 80 | 81 | def validation_step(self, batch: Dict, batch_idx: int): 82 | return 83 | 84 | def configure_optimizers(self): 85 | model_parameters = self.model.parameters() 86 | 
parameters_names = [] 87 | parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()]) 88 | lm_opt = ScaledAdam( 89 | model_parameters, 90 | lr=0.01, 91 | betas=(0.9, 0.95), 92 | clipping_scale=2.0, 93 | parameters_names=parameters_names, 94 | show_dominant_parameters=False, 95 | clipping_update_period=1000, 96 | ) 97 | 98 | return { 99 | "optimizer": lm_opt, 100 | "lr_scheduler": { 101 | "scheduler": WarmupCosineLRSchedule( 102 | lm_opt, 103 | init_lr=self.config["optimizer"]["lr_init"], 104 | peak_lr=self.config["optimizer"]["lr"], 105 | end_lr=self.config["optimizer"]["lr_end"], 106 | warmup_steps=self.config["optimizer"]["warmup_steps"], 107 | total_steps=self.config["optimizer"]["decay_steps"], 108 | ) 109 | }, 110 | } 111 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/modules/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/embedding.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | 50 | self.reverse = False 51 | self.pe = None 52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000)) 53 | 54 | def extend_pe(self, x): 55 | """Reset the positional encodings.""" 56 | if self.pe is not None: 57 | if self.pe.size(1) >= x.size(1): 58 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 60 | return 61 | pe = torch.zeros(x.size(1), self.embedding_dim) 62 | if self.reverse: 63 | position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1) 64 | else: 65 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) 66 | div_term = torch.exp( 67 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embedding_dim) 68 | ) 69 | pe[:, 0::2] = torch.sin(position * div_term) 70 | pe[:, 1::2] = torch.cos(position * div_term) 71 | pe = pe.unsqueeze(0) 72 | self.pe = 
pe.to(device=x.device, dtype=x.dtype).detach() 73 | 74 | def forward(self, x: torch.Tensor) -> torch.Tensor: 75 | self.extend_pe(x) 76 | output = x.unsqueeze(-1) if x.ndim == 2 else x 77 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] 78 | return self.dropout(output) 79 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/embedding_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | self.reverse = False 50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) 51 | 52 | def extend_pe(self, x): 53 | position = torch.cumsum(torch.ones_like(x[:, :, 0]), dim=1).transpose(0, 1) 54 | scpe = (position * self.div_term).unsqueeze(0) 55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) 56 | pe = pe.contiguous().view(1, -1, self.embedding_dim) 57 | return pe 58 | 59 | def forward(self, x: torch.Tensor) -> torch.Tensor: 60 | pe = self.extend_pe(x) 61 | output = x.unsqueeze(-1) if x.ndim == 2 else x 62 | output = output * self.x_scale + self.alpha * pe 63 | return self.dropout(output) 64 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import math 4 | 5 | import torch 6 | from matplotlib import pyplot as plt 7 | from torch import nn 8 | from torch.optim import Adam 9 | 10 | 11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): 12 | """ 13 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. 
14 | """ 15 | 16 | def __init__( 17 | self, 18 | optimizer, 19 | init_lr, 20 | peak_lr, 21 | end_lr, 22 | warmup_steps=10000, 23 | total_steps=400000, 24 | current_step=0, 25 | ): 26 | self.init_lr = init_lr 27 | self.peak_lr = peak_lr 28 | self.end_lr = end_lr 29 | self.optimizer = optimizer 30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps 31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) 32 | self._current_step = current_step 33 | self.lr = init_lr 34 | self.warmup_steps = warmup_steps 35 | self.total_steps = total_steps 36 | self._last_lr = [self.lr] 37 | 38 | def set_lr(self, lr): 39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups] 40 | for g in self.optimizer.param_groups: 41 | # g['lr'] = lr 42 | g["lr"] = self.end_lr ###锁定用线性 43 | 44 | def step(self): 45 | if self._current_step < self.warmup_steps: 46 | lr = self.init_lr + self._warmup_rate * self._current_step 47 | 48 | elif self._current_step > self.total_steps: 49 | lr = self.end_lr 50 | 51 | else: 52 | decay_ratio = (self._current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps) 53 | if decay_ratio < 0.0 or decay_ratio > 1.0: 54 | raise RuntimeError("Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings.") 55 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) 56 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) 57 | 58 | self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! 59 | self.set_lr(lr) 60 | self.lr = lr 61 | self._current_step += 1 62 | return self.lr 63 | 64 | 65 | if __name__ == "__main__": 66 | m = nn.Linear(10, 10) 67 | opt = Adam(m.parameters(), lr=1e-4) 68 | s = WarmupCosineLRSchedule( 69 | opt, 70 | 1e-6, 71 | 2e-4, 72 | 1e-6, 73 | warmup_steps=2000, 74 | total_steps=20000, 75 | current_step=0, 76 | ) 77 | lrs = [] 78 | for i in range(25000): 79 | s.step() 80 | lrs.append(s.lr) 81 | print(s.lr) 82 | 83 | plt.plot(lrs) 84 | plt.plot(range(0, 25000), lrs) 85 | plt.show() 86 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py: -------------------------------------------------------------------------------- 1 | from torch.nn.functional import * 2 | from torch.nn.functional import ( 3 | _canonical_mask, 4 | ) 5 | 6 | 7 | def multi_head_attention_forward_patched( 8 | query, 9 | key, 10 | value, 11 | embed_dim_to_check: int, 12 | num_heads: int, 13 | in_proj_weight, 14 | in_proj_bias: Optional[Tensor], 15 | bias_k: Optional[Tensor], 16 | bias_v: Optional[Tensor], 17 | add_zero_attn: bool, 18 | dropout_p: float, 19 | out_proj_weight: Tensor, 20 | out_proj_bias: Optional[Tensor], 21 | training: bool = True, 22 | key_padding_mask: Optional[Tensor] = None, 23 | need_weights: bool = True, 24 | attn_mask: Optional[Tensor] = None, 25 | use_separate_proj_weight: bool = False, 26 | q_proj_weight: Optional[Tensor] = None, 27 | k_proj_weight: Optional[Tensor] = None, 28 | v_proj_weight: Optional[Tensor] = None, 29 | static_k: Optional[Tensor] = None, 30 | static_v: Optional[Tensor] = None, 31 | average_attn_weights: bool = True, 32 | is_causal: bool = False, 33 | cache=None, 34 | ) -> Tuple[Tensor, Optional[Tensor]]: 35 | # set up shape vars 36 | _, _, embed_dim = query.shape 37 | attn_mask = _canonical_mask( 38 | mask=attn_mask, 39 | mask_name="attn_mask", 40 | other_type=None, 41 | other_name="", 42 | target_type=query.dtype, 43 | check_other=False, 44 | ) 45 | head_dim = embed_dim // num_heads 46 | 47 | proj_qkv = linear(query, in_proj_weight, 
in_proj_bias) 48 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() 49 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] 50 | 51 | if cache["first_infer"] == 1: 52 | cache["k"][cache["stage"]] = k 53 | cache["v"][cache["stage"]] = v 54 | else: 55 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) 56 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) 57 | k = cache["k"][cache["stage"]] 58 | v = cache["v"][cache["stage"]] 59 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] 60 | 61 | attn_mask = _canonical_mask( 62 | mask=attn_mask, 63 | mask_name="attn_mask", 64 | other_type=None, 65 | other_name="", 66 | target_type=q.dtype, 67 | check_other=False, 68 | ) 69 | attn_mask = attn_mask.unsqueeze(0) 70 | 71 | q = q.view(-1, num_heads, head_dim).transpose(0, 1) 72 | k = k.view(-1, num_heads, head_dim).transpose(0, 1) 73 | v = v.view(-1, num_heads, head_dim).transpose(0, 1) 74 | 75 | dropout_p = 0.0 76 | attn_mask = attn_mask.unsqueeze(0) 77 | q = q.view(num_heads, -1, head_dim).unsqueeze(0) 78 | k = k.view(num_heads, -1, head_dim).unsqueeze(0) 79 | v = v.view(num_heads, -1, head_dim).unsqueeze(0) 80 | attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal) 81 | attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) 82 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias) 83 | attn_output = attn_output.view(-1, 1, attn_output.size(1)) 84 | 85 | return attn_output 86 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/text_processing/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import re 5 | from typing import Dict 6 | from typing import List 7 | 8 | import regex 9 | from gruut import sentences 10 | from gruut.const import Sentence 11 | from gruut.const import Word 12 | from AR.text_processing.symbols import SYMBOL_TO_ID 13 | 14 | 15 | class GruutPhonemizer: 16 | def __init__(self, language: str): 17 | self._phonemizer = sentences 18 | self.lang = language 19 | self.symbol_to_id = SYMBOL_TO_ID 20 | self._special_cases_dict: Dict[str] = { 21 | r"\.\.\.": "... ", 22 | ";": "; ", 23 | ":": ": ", 24 | ",": ", ", 25 | r"\.": ". ", 26 | "!": "! ", 27 | r"\?": "? 
", 28 | "—": "—", 29 | "…": "… ", 30 | "«": "«", 31 | "»": "»", 32 | } 33 | self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict.keys())}])" 34 | 35 | def _normalize_punctuation(self, text: str) -> str: 36 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) 37 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) 38 | text = regex.sub(r"\pZ+", r" ", text) 39 | return text.strip() 40 | 41 | def _convert_punctuation(self, word: Word) -> str: 42 | if not word.phonemes: 43 | return "" 44 | if word.phonemes[0] in ["‖", "|"]: 45 | return word.text.strip() 46 | 47 | phonemes = "".join(word.phonemes) 48 | # remove modifier characters ˈˌː with regex 49 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) 50 | return phonemes.strip() 51 | 52 | def phonemize(self, text: str, espeak: bool = False) -> str: 53 | text_to_phonemize: str = self._normalize_punctuation(text) 54 | sents: List[Sentence] = [sent for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)] 55 | words: List[str] = [self._convert_punctuation(word) for word in itertools.chain(*sents)] 56 | return " ".join(words) 57 | 58 | def transform(self, phonemes): 59 | # convert phonemes to ids 60 | # dictionary is in symbols.py 61 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] 62 | 63 | 64 | if __name__ == "__main__": 65 | phonemizer = GruutPhonemizer("en-us") 66 | # text -> IPA 67 | phonemes = phonemizer.phonemize("Hello, wor-ld ?") 68 | print("phonemes:", phonemes) 69 | print("len(phonemes):", len(phonemes)) 70 | phoneme_ids = phonemizer.transform(phonemes) 71 | print("phoneme_ids:", phoneme_ids) 72 | print("len(phoneme_ids):", len(phoneme_ids)) 73 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/symbols.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | PAD = "_" 4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” ' 5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 6 | IPA_LETTERS = ( 7 | "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 8 | ) 9 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) 10 | SPACE_ID = SYMBOLS.index(" ") 11 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} 12 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} 13 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def str2bool(str): 5 | return True if str.lower() == "true" else False 6 | 7 | 8 | def get_newest_ckpt(string_list): 9 | # 定义一个正则表达式模式,用于匹配字符串中的数字 10 | pattern = r"epoch=(\d+)-step=(\d+)\.ckpt" 11 | 12 | # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 13 | extracted_info = [] 14 | for string in string_list: 15 | match = re.match(pattern, string) 16 | if match: 17 | epoch = int(match.group(1)) 18 | step = int(match.group(2)) 19 | extracted_info.append((epoch, step, string)) 20 | # 按照 epoch 后面的数字和 step 后面的数字进行排序 21 | sorted_info = sorted(extracted_info, key=lambda x: (x[0], x[1]), reverse=True) 22 | # 获取最新的 ckpt 文件名 23 | newest_ckpt = sorted_info[0][2] 24 | return newest_ckpt 25 | 26 | 27 | # 文本存在且不为空时 return True 28 | def 
check_txt_file(file_path): 29 | try: 30 | with open(file_path, "r") as file: 31 | text = file.readline().strip() 32 | assert text.strip() != "" 33 | return text 34 | except Exception: 35 | return False 36 | return False 37 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/initialize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Initialize modules for espnet2 neural networks.""" 3 | 4 | import torch 5 | from typeguard import check_argument_types 6 | 7 | 8 | def initialize(model: torch.nn.Module, init: str): 9 | """Initialize weights of a neural network module. 10 | 11 | Parameters are initialized using the given method or distribution. 12 | 13 | Custom initialization routines can be implemented into submodules 14 | as function `espnet_initialization_fn` within the custom module. 15 | 16 | Args: 17 | model: Target. 18 | init: Method of initialization. 19 | """ 20 | assert check_argument_types() 21 | print("init with", init) 22 | 23 | # weight init 24 | for p in model.parameters(): 25 | if p.dim() > 1: 26 | if init == "xavier_uniform": 27 | torch.nn.init.xavier_uniform_(p.data) 28 | elif init == "xavier_normal": 29 | torch.nn.init.xavier_normal_(p.data) 30 | elif init == "kaiming_uniform": 31 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") 32 | elif init == "kaiming_normal": 33 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") 34 | else: 35 | raise ValueError("Unknown initialization: " + init) 36 | # bias init 37 | for name, p in model.named_parameters(): 38 | if ".bias" in name and p.dim() == 1: 39 | p.data.zero_() 40 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | import yaml 5 | 6 | 7 | def load_yaml_config(path): 8 | with open(path) as f: 9 | config = yaml.full_load(f) 10 | return config 11 | 12 | 13 | def save_config_to_yaml(config, path): 14 | assert path.endswith(".yaml") 15 | with open(path, "w") as f: 16 | f.write(yaml.dump(config)) 17 | f.close() 18 | 19 | 20 | def write_args(args, path): 21 | args_dict = dict((name, getattr(args, name)) for name in dir(args) if not name.startswith("_")) 22 | with open(path, "a") as args_file: 23 | args_file.write("==> torch version: {}\n".format(torch.__version__)) 24 | args_file.write("==> cudnn version: {}\n".format(torch.backends.cudnn.version())) 25 | args_file.write("==> Cmd:\n") 26 | args_file.write(str(sys.argv)) 27 | args_file.write("\n==> args:\n") 28 | for k, v in sorted(args_dict.items()): 29 | args_file.write(" %s: %s\n" % (str(k), str(v))) 30 | args_file.close() 31 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 NVIDIA CORPORATION. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import torch 5 | import torch.nn as nn 6 | from alias_free_activation.torch.resample import UpSample1d, DownSample1d 7 | 8 | # load fused CUDA kernel: this enables importing anti_alias_activation_cuda 9 | from alias_free_activation.cuda import load 10 | 11 | anti_alias_activation_cuda = load.load() 12 | 13 | 14 | class FusedAntiAliasActivation(torch.autograd.Function): 15 | """ 16 | Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs. 17 | The hyperparameters are hard-coded in the kernel to maximize speed. 18 | NOTE: The fused kernel is incorrect for Activation1d with different hyperparameters.
19 | """ 20 | 21 | @staticmethod 22 | def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta): 23 | activation_results = anti_alias_activation_cuda.forward(inputs, up_ftr, down_ftr, alpha, beta) 24 | 25 | return activation_results 26 | 27 | @staticmethod 28 | def backward(ctx, output_grads): 29 | raise NotImplementedError 30 | return output_grads, None, None 31 | 32 | 33 | class Activation1d(nn.Module): 34 | def __init__( 35 | self, 36 | activation, 37 | up_ratio: int = 2, 38 | down_ratio: int = 2, 39 | up_kernel_size: int = 12, 40 | down_kernel_size: int = 12, 41 | fused: bool = True, 42 | ): 43 | super().__init__() 44 | self.up_ratio = up_ratio 45 | self.down_ratio = down_ratio 46 | self.act = activation 47 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 48 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 49 | 50 | self.fused = fused # Whether to use fused CUDA kernel or not 51 | 52 | def forward(self, x): 53 | if not self.fused: 54 | x = self.upsample(x) 55 | x = self.act(x) 56 | x = self.downsample(x) 57 | return x 58 | else: 59 | if self.act.__class__.__name__ == "Snake": 60 | beta = self.act.alpha.data # Snake uses same params for alpha and beta 61 | else: 62 | beta = self.act.beta.data # Snakebeta uses different params for alpha and beta 63 | alpha = self.act.alpha.data 64 | if not self.act.alpha_logscale: # Exp baked into cuda kernel, cancel it out with a log 65 | alpha = torch.log(alpha) 66 | beta = torch.log(beta) 67 | 68 | x = FusedAntiAliasActivation.apply(x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta) 69 | return x 70 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <torch/extension.h> 18 | 19 | extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta); 20 | 21 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 22 | m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)"); 23 | } -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/build/_: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import pathlib 6 | import subprocess 7 | 8 | from torch.utils import cpp_extension 9 | 10 | """ 11 | Setting this param to a list has a problem of generating different compilation commands (with a different order of architectures) and leading to recompilation of fused kernels. 12 | Set it to an empty string to avoid recompilation and assign arch flags explicitly in extra_cuda_cflags below. 13 | """ 14 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 15 | 16 | 17 | def load(): 18 | # Check if CUDA 11 is installed for compute capability 8.0 19 | cc_flag = [] 20 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 21 | if int(bare_metal_major) >= 11: 22 | cc_flag.append("-gencode") 23 | cc_flag.append("arch=compute_80,code=sm_80") 24 | 25 | # Build path 26 | srcpath = pathlib.Path(__file__).parent.absolute() 27 | buildpath = srcpath / "build" 28 | _create_build_dir(buildpath) 29 | 30 | # Helper function to build the kernels.
31 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 32 | return cpp_extension.load( 33 | name=name, 34 | sources=sources, 35 | build_directory=buildpath, 36 | extra_cflags=[ 37 | "-O3", 38 | ], 39 | extra_cuda_cflags=[ 40 | "-O3", 41 | "-gencode", 42 | "arch=compute_70,code=sm_70", 43 | "--use_fast_math", 44 | ] 45 | + extra_cuda_flags 46 | + cc_flag, 47 | verbose=True, 48 | ) 49 | 50 | extra_cuda_flags = [ 51 | "-U__CUDA_NO_HALF_OPERATORS__", 52 | "-U__CUDA_NO_HALF_CONVERSIONS__", 53 | "--expt-relaxed-constexpr", 54 | "--expt-extended-lambda", 55 | ] 56 | 57 | sources = [ 58 | srcpath / "anti_alias_activation.cpp", 59 | srcpath / "anti_alias_activation_cuda.cu", 60 | ] 61 | anti_alias_activation_cuda = _cpp_extention_load_helper("anti_alias_activation_cuda", sources, extra_cuda_flags) 62 | 63 | return anti_alias_activation_cuda 64 | 65 | 66 | def _get_cuda_bare_metal_version(cuda_dir): 67 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) 68 | output = raw_output.split() 69 | release_idx = output.index("release") + 1 70 | release = output[release_idx].split(".") 71 | bare_metal_major = release[0] 72 | bare_metal_minor = release[1][0] 73 | 74 | return raw_output, bare_metal_major, bare_metal_minor 75 | 76 | 77 | def _create_build_dir(buildpath): 78 | try: 79 | os.mkdir(buildpath) 80 | except OSError: 81 | if not os.path.isdir(buildpath): 82 | print(f"Creation of the build directory {buildpath} failed") 83 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * 7 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from .resample import UpSample1d, DownSample1d 6 | 7 | 8 | class Activation1d(nn.Module): 9 | def __init__( 10 | self, 11 | activation, 12 | up_ratio: int = 2, 13 | down_ratio: int = 2, 14 | up_kernel_size: int = 12, 15 | down_kernel_size: int = 12, 16 | ): 17 | super().__init__() 18 | self.up_ratio = up_ratio 19 | self.down_ratio = down_ratio 20 | self.act = activation 21 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 22 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 23 | 24 | # x: [B,C,T] 25 | def forward(self, x): 26 | x = self.upsample(x) 27 | x = self.act(x) 28 | x = self.downsample(x) 29 | 30 | return x 31 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import math 8 | 9 | if "sinc" in dir(torch): 10 | sinc = torch.sinc 11 | else: 12 | # This code is adopted from adefossez's julius.core.sinc under the MIT License 13 | # https://adefossez.github.io/julius/julius/core.html 14 | # LICENSE is in incl_licenses directory. 15 | def sinc(x: torch.Tensor): 16 | """ 17 | Implementation of sinc, i.e. sin(pi * x) / (pi * x) 18 | __Warning__: Different to julius.sinc, the input is multiplied by `pi`! 19 | """ 20 | return torch.where( 21 | x == 0, 22 | torch.tensor(1.0, device=x.device, dtype=x.dtype), 23 | torch.sin(math.pi * x) / math.pi / x, 24 | ) 25 | 26 | 27 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License 28 | # https://adefossez.github.io/julius/julius/lowpass.html 29 | # LICENSE is in incl_licenses directory. 30 | def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] 31 | even = kernel_size % 2 == 0 32 | half_size = kernel_size // 2 33 | 34 | # For kaiser window 35 | delta_f = 4 * half_width 36 | A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 37 | if A > 50.0: 38 | beta = 0.1102 * (A - 8.7) 39 | elif A >= 21.0: 40 | beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0) 41 | else: 42 | beta = 0.0 43 | window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) 44 | 45 | # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio 46 | if even: 47 | time = torch.arange(-half_size, half_size) + 0.5 48 | else: 49 | time = torch.arange(kernel_size) - half_size 50 | if cutoff == 0: 51 | filter_ = torch.zeros_like(time) 52 | else: 53 | filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) 54 | """ 55 | Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal. 56 | """ 57 | filter_ /= filter_.sum() 58 | filter = filter_.view(1, 1, kernel_size) 59 | 60 | return filter 61 | 62 | 63 | class LowPassFilter1d(nn.Module): 64 | def __init__( 65 | self, 66 | cutoff=0.5, 67 | half_width=0.6, 68 | stride: int = 1, 69 | padding: bool = True, 70 | padding_mode: str = "replicate", 71 | kernel_size: int = 12, 72 | ): 73 | """ 74 | kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible. 
75 | """ 76 | super().__init__() 77 | if cutoff < -0.0: 78 | raise ValueError("Minimum cutoff must be larger than zero.") 79 | if cutoff > 0.5: 80 | raise ValueError("A cutoff above 0.5 does not make sense.") 81 | self.kernel_size = kernel_size 82 | self.even = kernel_size % 2 == 0 83 | self.pad_left = kernel_size // 2 - int(self.even) 84 | self.pad_right = kernel_size // 2 85 | self.stride = stride 86 | self.padding = padding 87 | self.padding_mode = padding_mode 88 | filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) 89 | self.register_buffer("filter", filter) 90 | 91 | # Input [B, C, T] 92 | def forward(self, x): 93 | _, C, _ = x.shape 94 | 95 | if self.padding: 96 | x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode) 97 | out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 98 | 99 | return out 100 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 15 | self.stride = ratio 16 | self.pad = self.kernel_size // ratio - 1 17 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 18 | self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 19 | filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size) 20 | self.register_buffer("filter", filter) 21 | 22 | # x: [B, C, T] 23 | def forward(self, x): 24 | _, C, _ = x.shape 25 | 26 | x = F.pad(x, (self.pad, self.pad), mode="replicate") 27 | x = self.ratio * F.conv_transpose1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 28 | x = x[..., self.pad_left : -self.pad_right] 29 | 30 | return x 31 | 32 | 33 | class DownSample1d(nn.Module): 34 | def __init__(self, ratio=2, kernel_size=None): 35 | super().__init__() 36 | self.ratio = ratio 37 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 38 | self.lowpass = LowPassFilter1d( 39 | cutoff=0.5 / ratio, 40 | half_width=0.6 / ratio, 41 | stride=ratio, 42 | kernel_size=self.kernel_size, 43 | ) 44 | 45 | def forward(self, x): 46 | xx = self.lowpass(x) 47 | 48 | return xx 49 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_22khz_80band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], 
[512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 80, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 22050, 33 | 34 | "fmin": 0, 35 | "fmax": 8000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_24khz_100band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 100, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 24000, 33 | 34 | "fmin": 0, 35 | "fmax": 12000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_base_22khz_80band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 80, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 22050, 33 | 34 | "fmin": 0, 35 | "fmax": 8000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_base_24khz_100band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | 
"upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 100, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 24000, 33 | 34 | "fmin": 0, 35 | "fmax": 12000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_22khz_80band_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 80, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 22050, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_22khz_80band_fmax8k_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | 
"cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 80, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 22050, 49 | 50 | "fmin": 0, 51 | "fmax": 8000, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_24khz_100band_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 100, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 24000, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_44khz_128band_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 
1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 128, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 44100, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_44khz_128band_512x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,4,2,2,2,2], 12 | "upsample_kernel_sizes": [16,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 128, 43 | "num_freq": 2049, 44 | "n_fft": 2048, 45 | "hop_size": 512, 46 | "win_size": 2048, 47 | 48 | "sampling_rate": 44100, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/env.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 
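# In short (summary comment, not part of the upstream file): AttrDict exposes the
# loaded JSON config through attribute access (h.sampling_rate instead of
# h["sampling_rate"]), which is how the inference scripts consume it, and
# build_env() copies the chosen config into the given output directory under
# config_name, so the settings are stored alongside the run's artifacts
# (inference.py and inference_e2e.py later read that config.json from the
# checkpoint's directory).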
3 | 4 | import os 5 | import shutil 6 | 7 | 8 | class AttrDict(dict): 9 | def __init__(self, *args, **kwargs): 10 | super(AttrDict, self).__init__(*args, **kwargs) 11 | self.__dict__ = self 12 | 13 | 14 | def build_env(config, config_name, path): 15 | t_path = os.path.join(path, config_name) 16 | if config != t_path: 17 | os.makedirs(path, exist_ok=True) 18 | shutil.copyfile(config, os.path.join(path, config_name)) 19 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_1: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_2: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Edward Dixon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_4: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Seungwon Park 박승원 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_5: -------------------------------------------------------------------------------- 1 | Copyright 2020 Alexandre Défossez 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 4 | associated documentation files (the "Software"), to deal in the Software without restriction, 5 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 6 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 7 | furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or 10 | substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 13 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 14 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 15 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_6: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023-present, Descript 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_7: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Charactr Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_8: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Amphion 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/inference.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | 6 | import os 7 | import argparse 8 | import json 9 | import torch 10 | import librosa 11 | from utils import load_checkpoint 12 | from meldataset import get_mel_spectrogram 13 | from scipy.io.wavfile import write 14 | from env import AttrDict 15 | from meldataset import MAX_WAV_VALUE 16 | from bigvgan import BigVGAN as Generator 17 | 18 | h = None 19 | device = None 20 | torch.backends.cudnn.benchmark = False 21 | 22 | 23 | def inference(a, h): 24 | generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device) 25 | 26 | state_dict_g = load_checkpoint(a.checkpoint_file, device) 27 | generator.load_state_dict(state_dict_g["generator"]) 28 | 29 | filelist = os.listdir(a.input_wavs_dir) 30 | 31 | os.makedirs(a.output_dir, exist_ok=True) 32 | 33 | generator.eval() 34 | generator.remove_weight_norm() 35 | with torch.no_grad(): 36 | for i, filname in enumerate(filelist): 37 | # Load the ground truth audio and resample if necessary 38 | wav, sr = librosa.load(os.path.join(a.input_wavs_dir, filname), sr=h.sampling_rate, mono=True) 39 | wav = torch.FloatTensor(wav).to(device) 40 | # Compute mel spectrogram from the ground truth audio 41 | x = get_mel_spectrogram(wav.unsqueeze(0), generator.h) 42 | 43 | y_g_hat = generator(x) 44 | 45 | audio = y_g_hat.squeeze() 46 | audio = audio * MAX_WAV_VALUE 47 | audio = audio.cpu().numpy().astype("int16") 48 | 49 | output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + "_generated.wav") 50 | write(output_file, h.sampling_rate, audio) 51 | print(output_file) 52 | 53 | 54 | def main(): 55 | print("Initializing Inference Process..") 56 | 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("--input_wavs_dir", default="test_files") 59 | parser.add_argument("--output_dir", 
default="generated_files") 60 | parser.add_argument("--checkpoint_file", required=True) 61 | parser.add_argument("--use_cuda_kernel", action="store_true", default=False) 62 | 63 | a = parser.parse_args() 64 | 65 | config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json") 66 | with open(config_file) as f: 67 | data = f.read() 68 | 69 | global h 70 | json_config = json.loads(data) 71 | h = AttrDict(json_config) 72 | 73 | torch.manual_seed(h.seed) 74 | global device 75 | if torch.cuda.is_available(): 76 | torch.cuda.manual_seed(h.seed) 77 | device = torch.device("cuda") 78 | else: 79 | device = torch.device("cpu") 80 | 81 | inference(a, h) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/inference_e2e.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | 6 | import glob 7 | import os 8 | import numpy as np 9 | import argparse 10 | import json 11 | import torch 12 | from scipy.io.wavfile import write 13 | from env import AttrDict 14 | from meldataset import MAX_WAV_VALUE 15 | from bigvgan import BigVGAN as Generator 16 | 17 | h = None 18 | device = None 19 | torch.backends.cudnn.benchmark = False 20 | 21 | 22 | def load_checkpoint(filepath, device): 23 | assert os.path.isfile(filepath) 24 | print(f"Loading '{filepath}'") 25 | checkpoint_dict = torch.load(filepath, map_location=device) 26 | print("Complete.") 27 | return checkpoint_dict 28 | 29 | 30 | def scan_checkpoint(cp_dir, prefix): 31 | pattern = os.path.join(cp_dir, prefix + "*") 32 | cp_list = glob.glob(pattern) 33 | if len(cp_list) == 0: 34 | return "" 35 | return sorted(cp_list)[-1] 36 | 37 | 38 | def inference(a, h): 39 | generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device) 40 | 41 | state_dict_g = load_checkpoint(a.checkpoint_file, device) 42 | generator.load_state_dict(state_dict_g["generator"]) 43 | 44 | filelist = os.listdir(a.input_mels_dir) 45 | 46 | os.makedirs(a.output_dir, exist_ok=True) 47 | 48 | generator.eval() 49 | generator.remove_weight_norm() 50 | with torch.no_grad(): 51 | for i, filname in enumerate(filelist): 52 | # Load the mel spectrogram in .npy format 53 | x = np.load(os.path.join(a.input_mels_dir, filname)) 54 | x = torch.FloatTensor(x).to(device) 55 | if len(x.shape) == 2: 56 | x = x.unsqueeze(0) 57 | 58 | y_g_hat = generator(x) 59 | 60 | audio = y_g_hat.squeeze() 61 | audio = audio * MAX_WAV_VALUE 62 | audio = audio.cpu().numpy().astype("int16") 63 | 64 | output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + "_generated_e2e.wav") 65 | write(output_file, h.sampling_rate, audio) 66 | print(output_file) 67 | 68 | 69 | def main(): 70 | print("Initializing Inference Process..") 71 | 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument("--input_mels_dir", default="test_mel_files") 74 | parser.add_argument("--output_dir", default="generated_files_from_mel") 75 | parser.add_argument("--checkpoint_file", required=True) 76 | parser.add_argument("--use_cuda_kernel", action="store_true", default=False) 77 | 78 | a = parser.parse_args() 79 | 80 | config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json") 81 | with open(config_file) as f: 82 | data = f.read() 83 | 84 | global h 85 | 
json_config = json.loads(data) 86 | h = AttrDict(json_config) 87 | 88 | torch.manual_seed(h.seed) 89 | global device 90 | if torch.cuda.is_available(): 91 | torch.cuda.manual_seed(h.seed) 92 | device = torch.device("cuda") 93 | else: 94 | device = torch.device("cpu") 95 | 96 | inference(a, h) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/bias.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :--------------------------------------------------------------------------------------------------------- | :--------------------------------------------------- | 3 | | Participation considerations from adversely impacted groups protected classes in model design and testing: | None | 4 | | Measures taken to mitigate against unwanted bias: | No measures taken to mitigate against unwanted bias. | 5 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/explainability.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :---------------------------------------------------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 3 | | Intended Application & Domain: | Generating waveform from mel spectrogram. | 4 | | Model Type: | Convolutional Neural Network (CNN) | 5 | | Intended Users: | This model is intended for developers to synthesize and generate waveforms from the AI-generated mel spectrograms. | 6 | | Output: | Audio Waveform | 7 | | Describe how the model works: | Model generates audio waveform corresponding to the input mel spectrogram. | 8 | | Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | Not Applicable | 9 | | Technical Limitations: | This may not perform well on synthetically-generated mel spectrograms that deviate significantly from the profile of mel spectrograms on which this was trained. | 10 | | Verified to have met prescribed NVIDIA quality standards: | Yes | 11 | | Performance Metrics: | Perceptual Evaluation of Speech Quality (PESQ), Virtual Speech Quality Objective Listener (VISQOL), Multi-resolution STFT (MRSTFT), Mel cepstral distortion (MCD), Periodicity RMSE, Voice/Unvoiced F1 Score (V/UV F1) | 12 | | Potential Known Risks: | This model may generate low-quality or distorted soundwaves. | 13 | | Licensing: | https://github.com/NVIDIA/BigVGAN/blob/main/LICENSE | 14 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/privacy.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :------------------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------- | 3 | | Generatable or reverse engineerable personal information? 
| None | 4 | | Protected class data used to create this model? | None | 5 | | Was consent obtained for any personal data used? | Not Applicable (No Personal Data) | 6 | | How often is dataset reviewed? | Before Release | 7 | | Is a mechanism in place to honor data subject right of access or deletion of personal data? | Not Applicable | 8 | | If personal collected for the development of the model, was it collected directly by NVIDIA? | Not Applicable | 9 | | If personal collected for the development of the model by NVIDIA, do you maintain or have access to disclosures made to data subjects? | Not Applicable | 10 | | If personal collected for the development of this AI model, was it minimized to only what was required? | Not Applicable | 11 | | Is data in dataset traceable? | Yes | 12 | | Is there provenance for all datasets used in training? | Yes | 13 | | Does data labeling (annotation, metadata) comply with privacy laws? | Yes | 14 | | Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. | 15 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/safety.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :---------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 3 | | Model Application(s): | Synethic Audio Generation | 4 | | Describe the life critical impact (if present). | Not Applicable | 5 | | Use Case Restrictions: | None | 6 | | Model and dataset restrictions: | The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to. | 7 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | numpy 3 | librosa>=0.8.1 4 | scipy 5 | tensorboard 6 | soundfile 7 | matplotlib 8 | pesq 9 | auraloss 10 | tqdm 11 | nnAudio 12 | ninja 13 | huggingface_hub>=0.23.4 -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/tests/test_activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import sys 6 | 7 | # to import modules from parent_dir 8 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(parent_dir) 10 | 11 | import torch 12 | from alias_free_activation.cuda import activation1d 13 | from activations import Snake 14 | 15 | 16 | def test_load_fused_kernels(): 17 | try: 18 | print("[Success] load_fused_kernels") 19 | except ImportError as e: 20 | print("[Fail] load_fused_kernels") 21 | raise e 22 | 23 | 24 | def test_anti_alias_activation(): 25 | data = torch.rand((10, 10, 200), device="cuda") 26 | 27 | # Check activations.Snake cuda vs. 
torch 28 | fused_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=True).cuda() 29 | fused_activation_output = fused_anti_alias_activation(data) 30 | 31 | torch_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=False).cuda() 32 | torch_activation_output = torch_anti_alias_activation(data) 33 | 34 | test_result = (fused_activation_output - torch_activation_output).abs() 35 | 36 | while test_result.dim() != 1: 37 | test_result = test_result.mean(dim=-1) 38 | 39 | diff = test_result.mean(dim=-1) 40 | 41 | if diff <= 1e-3: 42 | print( 43 | f"\n[Success] test_fused_anti_alias_activation" 44 | f"\n > mean_difference={diff}" 45 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}" 46 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 47 | ) 48 | else: 49 | print( 50 | f"\n[Fail] test_fused_anti_alias_activation" 51 | f"\n > mean_difference={diff}, " 52 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, " 53 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 54 | ) 55 | 56 | 57 | if __name__ == "__main__": 58 | from alias_free_activation.cuda import load 59 | 60 | load.load() 61 | test_load_fused_kernels() 62 | test_anti_alias_activation() 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import sys 6 | 7 | # to import modules from parent_dir 8 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(parent_dir) 10 | 11 | import torch 12 | from alias_free_activation.cuda import activation1d 13 | from activations import SnakeBeta 14 | 15 | 16 | def test_load_fused_kernels(): 17 | try: 18 | print("[Success] load_fused_kernels") 19 | except ImportError as e: 20 | print("[Fail] load_fused_kernels") 21 | raise e 22 | 23 | 24 | def test_anti_alias_activation(): 25 | data = torch.rand((10, 10, 200), device="cuda") 26 | 27 | # Check activations, Snake CUDA vs. 
Torch 28 | fused_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=True).cuda() 29 | fused_activation_output = fused_anti_alias_activation(data) 30 | 31 | torch_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=False).cuda() 32 | torch_activation_output = torch_anti_alias_activation(data) 33 | 34 | test_result = (fused_activation_output - torch_activation_output).abs() 35 | 36 | while test_result.dim() != 1: 37 | test_result = test_result.mean(dim=-1) 38 | 39 | diff = test_result.mean(dim=-1) 40 | 41 | if diff <= 1e-3: 42 | print( 43 | f"\n[Success] test_fused_anti_alias_activation" 44 | f"\n > mean_difference={diff}" 45 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}" 46 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 47 | ) 48 | else: 49 | print( 50 | f"\n[Fail] test_fused_anti_alias_activation" 51 | f"\n > mean_difference={diff}, " 52 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, " 53 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 54 | ) 55 | 56 | 57 | if __name__ == "__main__": 58 | from alias_free_activation.cuda import load 59 | 60 | load.load() 61 | test_load_fused_kernels() 62 | test_anti_alias_activation() 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/utils0.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import glob 5 | import os 6 | import matplotlib 7 | import torch 8 | from torch.nn.utils import weight_norm 9 | 10 | matplotlib.use("Agg") 11 | import matplotlib.pylab as plt 12 | from .meldataset import MAX_WAV_VALUE 13 | from scipy.io.wavfile import write 14 | 15 | 16 | def plot_spectrogram(spectrogram): 17 | fig, ax = plt.subplots(figsize=(10, 2)) 18 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 19 | plt.colorbar(im, ax=ax) 20 | 21 | fig.canvas.draw() 22 | plt.close() 23 | 24 | return fig 25 | 26 | 27 | def plot_spectrogram_clipped(spectrogram, clip_max=2.0): 28 | fig, ax = plt.subplots(figsize=(10, 2)) 29 | im = ax.imshow( 30 | spectrogram, 31 | aspect="auto", 32 | origin="lower", 33 | interpolation="none", 34 | vmin=1e-6, 35 | vmax=clip_max, 36 | ) 37 | plt.colorbar(im, ax=ax) 38 | 39 | fig.canvas.draw() 40 | plt.close() 41 | 42 | return fig 43 | 44 | 45 | def init_weights(m, mean=0.0, std=0.01): 46 | classname = m.__class__.__name__ 47 | if classname.find("Conv") != -1: 48 | m.weight.data.normal_(mean, std) 49 | 50 | 51 | def apply_weight_norm(m): 52 | classname = m.__class__.__name__ 53 | if classname.find("Conv") != -1: 54 | weight_norm(m) 55 | 56 | 57 | def get_padding(kernel_size, dilation=1): 58 | return int((kernel_size * dilation - dilation) / 2) 59 | 60 | 61 | def load_checkpoint(filepath, device): 62 | assert os.path.isfile(filepath) 63 | print(f"Loading '{filepath}'") 64 | checkpoint_dict = torch.load(filepath, map_location=device) 65 | print("Complete.") 66 | return checkpoint_dict 67 | 68 | 69 | def save_checkpoint(filepath, obj): 70 | print(f"Saving checkpoint to {filepath}") 71 | torch.save(obj, filepath) 72 | print("Complete.") 73 | 74 | 75 | def scan_checkpoint(cp_dir, prefix, renamed_file=None): 76 | # Fallback to original scanning logic first 77 | pattern = os.path.join(cp_dir, prefix + "????????") 78 | cp_list = glob.glob(pattern) 79 | 80 | if 
len(cp_list) > 0: 81 | last_checkpoint_path = sorted(cp_list)[-1] 82 | print(f"[INFO] Resuming from checkpoint: '{last_checkpoint_path}'") 83 | return last_checkpoint_path 84 | 85 | # If no pattern-based checkpoints are found, check for renamed file 86 | if renamed_file: 87 | renamed_path = os.path.join(cp_dir, renamed_file) 88 | if os.path.isfile(renamed_path): 89 | print(f"[INFO] Resuming from renamed checkpoint: '{renamed_file}'") 90 | return renamed_path 91 | 92 | return None 93 | 94 | 95 | def save_audio(audio, path, sr): 96 | # wav: torch with 1d shape 97 | audio = audio * MAX_WAV_VALUE 98 | audio = audio.cpu().numpy().astype("int16") 99 | write(path, sr, audio) 100 | -------------------------------------------------------------------------------- /GPT_SoVITS/TTS_infer_pack/__init__.py: -------------------------------------------------------------------------------- 1 | from . import TTS, text_segmentation_method 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/.gitignore: -------------------------------------------------------------------------------- 1 | *.yaml -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 12 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1big.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 16 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1big2.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 12 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 6 28 | dropout: 0 29 | 
EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1longer-v2.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 20 4 | batch_size: 8 5 | save_every_n_epoch: 1 6 | precision: 16-mixed 7 | gradient_clip: 1.0 8 | optimizer: 9 | lr: 0.01 10 | lr_init: 0.00001 11 | lr_end: 0.0001 12 | warmup_steps: 2000 13 | decay_steps: 40000 14 | data: 15 | max_eval_sample: 8 16 | max_sec: 54 17 | num_workers: 4 18 | pad_val: 1024 # same with EOS in model 19 | model: 20 | vocab_size: 1025 21 | phoneme_vocab_size: 732 22 | embedding_dim: 512 23 | hidden_dim: 512 24 | head: 16 25 | linear_units: 2048 26 | n_layer: 24 27 | dropout: 0 28 | EOS: 1024 29 | random_bert: 0 30 | inference: 31 | top_k: 15 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1longer.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 20 4 | batch_size: 8 5 | save_every_n_epoch: 1 6 | precision: 16-mixed 7 | gradient_clip: 1.0 8 | optimizer: 9 | lr: 0.01 10 | lr_init: 0.00001 11 | lr_end: 0.0001 12 | warmup_steps: 2000 13 | decay_steps: 40000 14 | data: 15 | max_eval_sample: 8 16 | max_sec: 54 17 | num_workers: 4 18 | pad_val: 1024 # same with EOS in model 19 | model: 20 | vocab_size: 1025 21 | phoneme_vocab_size: 512 22 | embedding_dim: 512 23 | hidden_dim: 512 24 | head: 16 25 | linear_units: 2048 26 | n_layer: 24 27 | dropout: 0 28 | EOS: 1024 29 | random_bert: 0 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1mq.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 100 4 | batch_size: 6 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 32 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 40 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | saving_path: "ckpt/" 22 | resume_checkpoint: null 23 | vocoder_config_path: "quantizer/new_ckpt/config.json" 24 | vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000" 25 | datadir: "/home/liweiche/GigaSpeech/wavs" 26 | metapath: "/home/liweiche/GigaSpeech/train2.json" 27 | val_metapath: "/home/liweiche/GigaSpeech/dev2.json" 28 | sampledir: "logs/" 29 | pretrained_path: null 30 | lr: 0.0001 31 | batch_size: 200.0 32 | train_bucket_size: 8192 33 | training_step: 800000 34 | optim_flat_percent: 0.0 35 | warmup_step: 50 36 | adam_beta1: 0.9 37 | adam_beta2: 0.98 38 | ffd_size: 3072 39 | hidden_size: 768 40 | enc_nlayers: 6 41 | dec_nlayers: 6 42 | nheads: 12 43 | ar_layer: 4 44 | ar_ffd_size: 1024 45 | ar_hidden_size: 256 46 | ar_nheads: 4 47 | aligner_softmax_temp: 1.0 48 | layer_norm_eps: 0.00001 49 | speaker_embed_dropout: 0.05 50 | label_smoothing: 0.0 51 | val_check_interval: 5000 52 | check_val_every_n_epoch: 1 53 | precision: "fp16" 54 | nworkers: 16 55 | distributed: true 56 | accelerator: "ddp" 57 | version: null 58 | accumulate_grad_batches: 1 59 | use_repetition_token: true 60 | use_repetition_gating: false 61 | repetition_penalty: 1.0 62 | sampling_temperature: 1.0 63 | top_k: -1 64 | min_top_k: 3 65 | top_p: 0.8 66 | sample_num: 4 67 | 
length_penalty_max_length: 15000 68 | length_penalty_max_prob: 0.95 69 | max_input_length: 2048 70 | max_output_length: 2000 71 | sample_rate: 16000 72 | n_codes: 1024 73 | n_cluster_groups: 1 74 | phone_context_window: 4 75 | phoneset_size: 1000 76 | inference: 77 | top_k: 5 78 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 500, 5 | "seed": 1234, 6 | "epochs": 100, 7 | "learning_rate": 0.0001, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 32, 14 | "fp16_run": true, 15 | "lr_decay": 0.999875, 16 | "segment_size": 20480, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "text_low_lr_rate": 0.4, 22 | "grad_ckpt": false 23 | }, 24 | "data": { 25 | "max_wav_value": 32768.0, 26 | "sampling_rate": 32000, 27 | "filter_length": 2048, 28 | "hop_length": 640, 29 | "win_length": 2048, 30 | "n_mel_channels": 128, 31 | "mel_fmin": 0.0, 32 | "mel_fmax": null, 33 | "add_blank": true, 34 | "n_speakers": 300, 35 | "cleaned_text": true 36 | }, 37 | "model": { 38 | "inter_channels": 192, 39 | "hidden_channels": 192, 40 | "filter_channels": 768, 41 | "n_heads": 2, 42 | "n_layers": 6, 43 | "kernel_size": 3, 44 | "p_dropout": 0.1, 45 | "resblock": "1", 46 | "resblock_kernel_sizes": [ 47 | 3, 48 | 7, 49 | 11 50 | ], 51 | "resblock_dilation_sizes": [ 52 | [ 53 | 1, 54 | 3, 55 | 5 56 | ], 57 | [ 58 | 1, 59 | 3, 60 | 5 61 | ], 62 | [ 63 | 1, 64 | 3, 65 | 5 66 | ] 67 | ], 68 | "upsample_rates": [ 69 | 10, 70 | 8, 71 | 2, 72 | 2, 73 | 2 74 | ], 75 | "upsample_initial_channel": 512, 76 | "upsample_kernel_sizes": [ 77 | 16, 78 | 16, 79 | 8, 80 | 2, 81 | 2 82 | ], 83 | "n_layers_q": 3, 84 | "use_spectral_norm": false, 85 | "gin_channels": 512, 86 | "semantic_frame_rate": "25hz", 87 | "freeze_quantizer": true 88 | }, 89 | "s2_ckpt_dir": "logs/s2/big2k1", 90 | "content_module": "cnhubert" 91 | } -------------------------------------------------------------------------------- /GPT_SoVITS/configs/train.yaml: -------------------------------------------------------------------------------- 1 | gpu: 2 | n_card: 1 3 | n_process_per_card: 2 4 | io: 5 | text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 24 28 | dropout: 0 29 | EOS: 1024 30 | random_bert: 0 31 | inference: 32 | top_k: 5 33 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/tts_infer.yaml: -------------------------------------------------------------------------------- 1 | custom: 2 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 3 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base 4 | device: cuda 5 | is_half: true 6 | t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt 7 | version: v2 8 | vits_weights_path: 
GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth 9 | v1: 10 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 11 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base 12 | device: cpu 13 | is_half: false 14 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt 15 | version: v1 16 | vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth 17 | v2: 18 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 19 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base 20 | device: cpu 21 | is_half: false 22 | t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt 23 | version: v2 24 | vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth 25 | v3: 26 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 27 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base 28 | device: cpu 29 | is_half: false 30 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt 31 | version: v3 32 | vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth 33 | v4: 34 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 35 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base 36 | device: cpu 37 | is_half: false 38 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt 39 | version: v4 40 | vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth 41 | -------------------------------------------------------------------------------- /GPT_SoVITS/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | now_dir = os.getcwd() 5 | sys.path.insert(0, now_dir) 6 | from text.g2pw import G2PWPinyin 7 | 8 | g2pw = G2PWPinyin( 9 | model_dir="GPT_SoVITS/text/G2PWModel", 10 | model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", 11 | v_to_u=False, 12 | neutral_tone_with_five=True, 13 | ) 14 | -------------------------------------------------------------------------------- /GPT_SoVITS/f5_tts/model/__init__.py: -------------------------------------------------------------------------------- 1 | # from f5_tts.model.cfm import CFM 2 | # 3 | # from f5_tts.model.backbones.unett import UNetT 4 | from GPT_SoVITS.f5_tts.model.backbones.dit import DiT 5 | # from f5_tts.model.backbones.dit import DiTNoCond 6 | # from f5_tts.model.backbones.dit import DiTNoCondNoT 7 | # from f5_tts.model.backbones.mmdit import MMDiT 8 | 9 | # from f5_tts.model.trainer import Trainer 10 | 11 | 12 | # __all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"] 13 | # __all__ = ["CFM", "UNetT", "DiTNoCond","DiT", "MMDiT"] 14 | -------------------------------------------------------------------------------- /GPT_SoVITS/f5_tts/model/backbones/README.md: -------------------------------------------------------------------------------- 1 | ## Backbones quick introduction 2 | 3 | 4 | ### unett.py 5 | - flat unet transformer 6 | - structure same as in e2-tts & voicebox paper except using rotary pos emb 7 | - update: allow possible abs pos emb & convnextv2 blocks for embedded text before concat 8 | 9 | ### dit.py 10 | - adaln-zero dit 11 | - embedded timestep as condition 12 | - concatted noised_input + masked_cond + embedded_text, linear proj in 13 | - possible abs pos emb & convnextv2 blocks for embedded text before concat 14 | - possible long 
skip connection (first layer to last layer) 15 | 16 | ### mmdit.py 17 | - sd3 structure 18 | - timestep as condition 19 | - left stream: text embedded and applied a abs pos emb 20 | - right stream: masked_cond & noised_input concatted and with same conv pos emb as unett 21 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cnhubert, whisper_enc 2 | 3 | content_module_map = {"cnhubert": cnhubert, "whisper": whisper_enc} 4 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/cnhubert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from transformers import logging as tf_logging 4 | 5 | tf_logging.set_verbosity_error() 6 | 7 | import logging 8 | 9 | logging.getLogger("numba").setLevel(logging.WARNING) 10 | 11 | from transformers import ( 12 | Wav2Vec2FeatureExtractor, 13 | HubertModel, 14 | ) 15 | 16 | import utils 17 | import torch.nn as nn 18 | 19 | cnhubert_base_path = None 20 | 21 | 22 | class CNHubert(nn.Module): 23 | def __init__(self, base_path: str = None): 24 | super().__init__() 25 | if base_path is None: 26 | base_path = cnhubert_base_path 27 | if os.path.exists(base_path): 28 | ... 29 | else: 30 | raise FileNotFoundError(base_path) 31 | self.model = HubertModel.from_pretrained(base_path, local_files_only=True) 32 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(base_path, local_files_only=True) 33 | 34 | def forward(self, x): 35 | input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 36 | feats = self.model(input_values)["last_hidden_state"] 37 | return feats 38 | 39 | 40 | # class CNHubertLarge(nn.Module): 41 | # def __init__(self): 42 | # super().__init__() 43 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 44 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 45 | # def forward(self, x): 46 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 47 | # feats = self.model(input_values)["last_hidden_state"] 48 | # return feats 49 | # 50 | # class CVec(nn.Module): 51 | # def __init__(self): 52 | # super().__init__() 53 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 54 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 55 | # def forward(self, x): 56 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 57 | # feats = self.model(input_values)["last_hidden_state"] 58 | # return feats 59 | # 60 | # class cnw2v2base(nn.Module): 61 | # def __init__(self): 62 | # super().__init__() 63 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 64 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 65 | # def forward(self, x): 66 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 67 | # feats = self.model(input_values)["last_hidden_state"] 68 | # return feats 69 | 70 | 71 | def 
get_model(): 72 | model = CNHubert() 73 | model.eval() 74 | return model 75 | 76 | 77 | # def get_large_model(): 78 | # model = CNHubertLarge() 79 | # model.eval() 80 | # return model 81 | # 82 | # def get_model_cvec(): 83 | # model = CVec() 84 | # model.eval() 85 | # return model 86 | # 87 | # def get_model_cnw2v2base(): 88 | # model = cnw2v2base() 89 | # model.eval() 90 | # return model 91 | 92 | 93 | def get_content(hmodel, wav_16k_tensor): 94 | with torch.no_grad(): 95 | feats = hmodel(wav_16k_tensor) 96 | return feats.transpose(1, 2) 97 | 98 | 99 | if __name__ == "__main__": 100 | model = get_model() 101 | src_path = "/Users/Shared/原音频2.wav" 102 | wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000) 103 | model = model 104 | wav_16k_tensor = wav_16k_tensor 105 | feats = get_content(model, wav_16k_tensor) 106 | print(feats.shape) 107 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/whisper_enc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_model(): 5 | import whisper 6 | 7 | model = whisper.load_model("small", device="cpu") 8 | 9 | return model.encoder 10 | 11 | 12 | def get_content(model=None, wav_16k_tensor=None): 13 | from whisper import log_mel_spectrogram, pad_or_trim 14 | 15 | dev = next(model.parameters()).device 16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] 17 | # if torch.cuda.is_available(): 18 | # mel = mel.to(torch.float16) 19 | feature_len = mel.shape[-1] // 2 20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" 21 | with torch.no_grad(): 22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[:1, :feature_len, :].transpose(1, 2) 23 | return feature 24 | -------------------------------------------------------------------------------- /GPT_SoVITS/inference_cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import soundfile as sf 4 | 5 | from tools.i18n.i18n import I18nAuto 6 | from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav 7 | 8 | i18n = I18nAuto() 9 | 10 | 11 | def synthesize( 12 | GPT_model_path, 13 | SoVITS_model_path, 14 | ref_audio_path, 15 | ref_text_path, 16 | ref_language, 17 | target_text_path, 18 | target_language, 19 | output_path, 20 | ): 21 | # Read reference text 22 | with open(ref_text_path, "r", encoding="utf-8") as file: 23 | ref_text = file.read() 24 | 25 | # Read target text 26 | with open(target_text_path, "r", encoding="utf-8") as file: 27 | target_text = file.read() 28 | 29 | # Change model weights 30 | change_gpt_weights(gpt_path=GPT_model_path) 31 | change_sovits_weights(sovits_path=SoVITS_model_path) 32 | 33 | # Synthesize audio 34 | synthesis_result = get_tts_wav( 35 | ref_wav_path=ref_audio_path, 36 | prompt_text=ref_text, 37 | prompt_language=i18n(ref_language), 38 | text=target_text, 39 | text_language=i18n(target_language), 40 | top_p=1, 41 | temperature=1, 42 | ) 43 | 44 | result_list = list(synthesis_result) 45 | 46 | if result_list: 47 | last_sampling_rate, last_audio_data = result_list[-1] 48 | output_wav_path = os.path.join(output_path, "output.wav") 49 | sf.write(output_wav_path, last_audio_data, last_sampling_rate) 50 | print(f"Audio saved to {output_wav_path}") 51 | 52 | 53 | def main(): 54 | parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") 55 | parser.add_argument("--gpt_model", required=True, help="Path to the 
GPT model file") 56 | parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") 57 | parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") 58 | parser.add_argument("--ref_text", required=True, help="Path to the reference text file") 59 | parser.add_argument( 60 | "--ref_language", required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio" 61 | ) 62 | parser.add_argument("--target_text", required=True, help="Path to the target text file") 63 | parser.add_argument( 64 | "--target_language", 65 | required=True, 66 | choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], 67 | help="Language of the target text", 68 | ) 69 | parser.add_argument("--output_path", required=True, help="Path to the output directory") 70 | 71 | args = parser.parse_args() 72 | 73 | synthesize( 74 | args.gpt_model, 75 | args.sovits_model, 76 | args.ref_audio, 77 | args.ref_text, 78 | args.ref_language, 79 | args.target_text, 80 | args.target_language, 81 | args.output_path, 82 | ) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /GPT_SoVITS/module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/module/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/module/losses.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | def feature_loss(fmap_r, fmap_g): 7 | loss = 0 8 | for dr, dg in zip(fmap_r, fmap_g): 9 | for rl, gl in zip(dr, dg): 10 | rl = rl.float().detach() 11 | gl = gl.float() 12 | loss += torch.mean(torch.abs(rl - gl)) 13 | 14 | return loss * 2 15 | 16 | 17 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 18 | loss = 0 19 | r_losses = [] 20 | g_losses = [] 21 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 22 | dr = dr.float() 23 | dg = dg.float() 24 | r_loss = torch.mean((1 - dr) ** 2) 25 | g_loss = torch.mean(dg**2) 26 | loss += r_loss + g_loss 27 | r_losses.append(r_loss.item()) 28 | g_losses.append(g_loss.item()) 29 | 30 | return loss, r_losses, g_losses 31 | 32 | 33 | def generator_loss(disc_outputs): 34 | loss = 0 35 | gen_losses = [] 36 | for dg in disc_outputs: 37 | dg = dg.float() 38 | l = torch.mean((1 - dg) ** 2) 39 | gen_losses.append(l) 40 | loss += l 41 | 42 | return loss, gen_losses 43 | 44 | 45 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 46 | """ 47 | z_p, logs_q: [b, h, t_t] 48 | m_p, logs_p: [b, h, t_t] 49 | """ 50 | z_p = z_p.float() 51 | logs_q = logs_q.float() 52 | m_p = m_p.float() 53 | logs_p = logs_p.float() 54 | z_mask = z_mask.float() 55 | 56 | kl = logs_p - logs_q - 0.5 57 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 58 | kl = torch.sum(kl * z_mask) 59 | l = kl / torch.sum(z_mask) 60 | return l 61 | 62 | 63 | def mle_loss(z, m, logs, logdet, mask): 64 | l = torch.sum(logs) + 0.5 * torch.sum( 65 | torch.exp(-2 * logs) * ((z - m) ** 2) 66 | ) # neg normal likelihood w/o the constant term 67 | l = l - torch.sum(logdet) # log jacobian determinant 68 | l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes 69 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term 70 | return l 71 | 
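These loss helpers are shape-sensitive, so a quick smoke test with dummy tensors is useful before wiring them into a trainer. A minimal sketch (illustrative sizes only; it assumes `GPT_SoVITS/` is on `sys.path` so that `module.losses` resolves):

```python
# Not part of the repo: a hypothetical shape check for the losses defined above.
import torch

from module.losses import discriminator_loss, feature_loss, generator_loss, kl_loss

b, h, t = 2, 192, 50  # batch, latent channels, frames (made-up sizes)
z_p, logs_q, m_p, logs_p = (torch.randn(b, h, t) for _ in range(4))
z_mask = torch.ones(b, 1, t)  # broadcasts over the channel axis
print(kl_loss(z_p, logs_q, m_p, logs_p, z_mask))  # scalar tensor

# The GAN losses take one output tensor per sub-discriminator.
d_real = [torch.rand(b, 100), torch.rand(b, 80)]
d_fake = [torch.rand(b, 100), torch.rand(b, 80)]
loss_disc, r_losses, g_losses = discriminator_loss(d_real, d_fake)
loss_gen, gen_losses = generator_loss(d_fake)

# feature_loss compares lists of per-layer feature maps (real vs. generated).
fmap_r = [[torch.randn(b, 16, 200), torch.randn(b, 8, 100)]]
fmap_g = [[torch.randn(b, 16, 200), torch.randn(b, 8, 100)]]
print(feature_loss(fmap_r, fmap_g))
```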
-------------------------------------------------------------------------------- /GPT_SoVITS/pretrained_models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /GPT_SoVITS/process_ckpt.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from collections import OrderedDict 3 | from time import time as ttime 4 | import shutil 5 | import os 6 | import torch 7 | from tools.i18n.i18n import I18nAuto 8 | 9 | i18n = I18nAuto() 10 | 11 | 12 | def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path 13 | dir = os.path.dirname(path) 14 | name = os.path.basename(path) 15 | tmp_path = "%s.pth" % (ttime()) 16 | torch.save(fea, tmp_path) 17 | shutil.move(tmp_path, "%s/%s" % (dir, name)) 18 | 19 | 20 | """ 21 | 00:v1 22 | 01:v2 23 | 02:v3 24 | 03:v3lora 25 | 04:v4lora 26 | 27 | """ 28 | from io import BytesIO 29 | 30 | 31 | def my_save2(fea, path, cfm_version): 32 | bio = BytesIO() 33 | torch.save(fea, bio) 34 | bio.seek(0) 35 | data = bio.getvalue() 36 | byte = b"03" if cfm_version == "v3" else b"04" 37 | data = byte + data[2:] 38 | with open(path, "wb") as f: 39 | f.write(data) 40 | 41 | 42 | def savee(ckpt, name, epoch, steps, hps, cfm_version=None, lora_rank=None): 43 | try: 44 | opt = OrderedDict() 45 | opt["weight"] = {} 46 | for key in ckpt.keys(): 47 | if "enc_q" in key: 48 | continue 49 | opt["weight"][key] = ckpt[key].half() 50 | opt["config"] = hps 51 | opt["info"] = "%sepoch_%siteration" % (epoch, steps) 52 | if lora_rank: 53 | opt["lora_rank"] = lora_rank 54 | my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), cfm_version) 55 | else: 56 | my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) 57 | return "Success." 
58 | except: 59 | return traceback.format_exc() 60 | 61 | 62 | head2version = { 63 | b"00": ["v1", "v1", False], 64 | b"01": ["v2", "v2", False], 65 | b"02": ["v2", "v3", False], 66 | b"03": ["v2", "v3", True], 67 | b"04": ["v2", "v4", True], 68 | } 69 | hash_pretrained_dict = { 70 | "dc3c97e17592963677a4a1681f30c653": ["v2", "v2", False], # s2G488k.pth#sovits_v1_pretrained 71 | "43797be674a37c1c83ee81081941ed0f": ["v2", "v3", False], # s2Gv3.pth#sovits_v3_pretrained 72 | "6642b37f3dbb1f76882b69937c95a5f3": ["v2", "v2", False], # s2G2333K.pth#sovits_v2_pretrained 73 | "4f26b9476d0c5033e04162c486074374": ["v2", "v4", False], # s2Gv4.pth#sovits_v4_pretrained 74 | } 75 | import hashlib 76 | 77 | 78 | def get_hash_from_file(sovits_path): 79 | with open(sovits_path, "rb") as f: 80 | data = f.read(8192) 81 | hash_md5 = hashlib.md5() 82 | hash_md5.update(data) 83 | return hash_md5.hexdigest() 84 | 85 | 86 | def get_sovits_version_from_path_fast(sovits_path): 87 | ###1-if it is pretrained sovits models, by hash 88 | hash = get_hash_from_file(sovits_path) 89 | if hash in hash_pretrained_dict: 90 | return hash_pretrained_dict[hash] 91 | ###2-new weights, by head 92 | with open(sovits_path, "rb") as f: 93 | version = f.read(2) 94 | if version != b"PK": 95 | return head2version[version] 96 | ###3-old weights, by file size 97 | if_lora_v3 = False 98 | size = os.path.getsize(sovits_path) 99 | """ 100 | v1weights:about 82942KB 101 | half thr:82978KB 102 | v2weights:about 83014KB 103 | v3weights:about 750MB 104 | """ 105 | if size < 82978 * 1024: 106 | model_version = version = "v1" 107 | elif size < 700 * 1024 * 1024: 108 | model_version = version = "v2" 109 | else: 110 | version = "v2" 111 | model_version = "v3" 112 | return version, model_version, if_lora_v3 113 | 114 | 115 | def load_sovits_new(sovits_path): 116 | f = open(sovits_path, "rb") 117 | meta = f.read(2) 118 | if meta != "PK": 119 | data = b"PK" + f.read() 120 | bio = BytesIO() 121 | bio.write(data) 122 | bio.seek(0) 123 | return torch.load(bio, map_location="cpu", weights_only=False) 124 | return torch.load(sovits_path, map_location="cpu", weights_only=False) 125 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/.gitignore: -------------------------------------------------------------------------------- 1 | G2PWModel 2 | __pycache__ 3 | *.zip -------------------------------------------------------------------------------- /GPT_SoVITS/text/LangSegmenter/__init__.py: -------------------------------------------------------------------------------- 1 | from .langsegmenter import LangSegmenter 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | # if os.environ.get("version","v1")=="v1": 3 | # from text.symbols import symbols 4 | # else: 5 | # from text.symbols2 import symbols 6 | 7 | from text import symbols as symbols_v1 8 | from text import symbols2 as symbols_v2 9 | 10 | _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)} 11 | _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)} 12 | 13 | 14 | def cleaned_text_to_sequence(cleaned_text, version=None): 15 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
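    Here cleaned_text is an iterable of phoneme symbols (e.g. the phone list produced by the cleaner), and version selects the v1 or v2 symbol table.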
16 | Args: 17 | text: string to convert to a sequence 18 | Returns: 19 | List of integers corresponding to the symbols in the text 20 | """ 21 | if version is None: 22 | version = os.environ.get("version", "v2") 23 | if version == "v1": 24 | phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text] 25 | else: 26 | phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text] 27 | 28 | return phones 29 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from text import cleaned_text_to_sequence 2 | import os 3 | # if os.environ.get("version","v1")=="v1": 4 | # from text import chinese 5 | # from text.symbols import symbols 6 | # else: 7 | # from text import chinese2 as chinese 8 | # from text.symbols2 import symbols 9 | 10 | from text import symbols as symbols_v1 11 | from text import symbols2 as symbols_v2 12 | 13 | special = [ 14 | # ("%", "zh", "SP"), 15 | ("¥", "zh", "SP2"), 16 | ("^", "zh", "SP3"), 17 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 18 | ] 19 | 20 | 21 | def clean_text(text, language, version=None): 22 | if version is None: 23 | version = os.environ.get("version", "v2") 24 | if version == "v1": 25 | symbols = symbols_v1.symbols 26 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} 27 | else: 28 | symbols = symbols_v2.symbols 29 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"} 30 | 31 | if language not in language_module_map: 32 | language = "en" 33 | text = " " 34 | for special_s, special_l, target_symbol in special: 35 | if special_s in text and language == special_l: 36 | return clean_special(text, language, special_s, target_symbol, version) 37 | language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]]) 38 | if hasattr(language_module, "text_normalize"): 39 | norm_text = language_module.text_normalize(text) 40 | else: 41 | norm_text = text 42 | if language == "zh" or language == "yue": ########## 43 | phones, word2ph = language_module.g2p(norm_text) 44 | assert len(phones) == sum(word2ph) 45 | assert len(norm_text) == len(word2ph) 46 | elif language == "en": 47 | phones = language_module.g2p(norm_text) 48 | if len(phones) < 4: 49 | phones = [","] + phones 50 | word2ph = None 51 | else: 52 | phones = language_module.g2p(norm_text) 53 | word2ph = None 54 | phones = ["UNK" if ph not in symbols else ph for ph in phones] 55 | return phones, word2ph, norm_text 56 | 57 | 58 | def clean_special(text, language, special_s, target_symbol, version=None): 59 | if version is None: 60 | version = os.environ.get("version", "v2") 61 | if version == "v1": 62 | symbols = symbols_v1.symbols 63 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} 64 | else: 65 | symbols = symbols_v2.symbols 66 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"} 67 | 68 | """ 69 | 特殊静音段sp符号处理 70 | """ 71 | text = text.replace(special_s, ",") 72 | language_module = __import__("text." 
+ language_module_map[language], fromlist=[language_module_map[language]]) 73 | norm_text = language_module.text_normalize(text) 74 | phones = language_module.g2p(norm_text) 75 | new_ph = [] 76 | for ph in phones[0]: 77 | assert ph in symbols 78 | if ph == ",": 79 | new_ph.append(target_symbol) 80 | else: 81 | new_ph.append(ph) 82 | return new_ph, phones[1], norm_text 83 | 84 | 85 | def text_to_sequence(text, language, version=None): 86 | version = os.environ.get("version", version) 87 | if version is None: 88 | version = "v2" 89 | phones = clean_text(text) 90 | return cleaned_text_to_sequence(phones, version) 91 | 92 | 93 | if __name__ == "__main__": 94 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) 95 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/engdict-hot.rep: -------------------------------------------------------------------------------- 1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1 2 | JSON JH EY1 S AH0 N 3 | CONDA K AA1 N D AH0 -------------------------------------------------------------------------------- /GPT_SoVITS/text/engdict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/text/engdict_cache.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/g2pw/__init__.py: -------------------------------------------------------------------------------- 1 | from text.g2pw.g2pw import * 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/g2pw/polyphonic.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/text/g2pw/polyphonic.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/g2pw/polyphonic.rep: -------------------------------------------------------------------------------- 1 | 湖泊: ['hu2','po1'] 2 | 地壳: ['di4','qiao4'] 3 | 柏树: ['bai3','shu4'] 4 | 曝光: ['bao4','guang1'] 5 | 弹力: ['tan2','li4'] 6 | 字帖: ['zi4','tie4'] 7 | 口吃: ['kou3','chi1'] 8 | 包扎: ['bao1','za1'] 9 | 哪吒: ['ne2','zha1'] 10 | 说服: ['shuo1','fu2'] 11 | 识字: ['shi2','zi4'] 12 | 骨头: ['gu3','tou5'] 13 | 对称: ['dui4','chen4'] 14 | 口供: ['kou3','gong4'] 15 | 抹布: ['ma1','bu4'] 16 | 露背: ['lu4','bei4'] 17 | 圈养: ['juan4', 'yang3'] 18 | 眼眶: ['yan3', 'kuang4'] 19 | 品行: ['pin3','xing2'] 20 | 颤抖: ['chan4','dou3'] 21 | 差不多: ['cha4','bu5','duo1'] 22 | 鸭绿江: ['ya1','lu4','jiang1'] 23 | 撒切尔: ['sa4','qie4','er3'] 24 | 比比皆是: ['bi3','bi3','jie1','shi4'] 25 | 身无长物: ['shen1','wu2','chang2','wu4'] 26 | 手里: ['shou2','li3'] 27 | 关卡: ['guan1','qia3'] 28 | 怀揣: ['huai2','chuai1'] 29 | 挑剔: ['tiao1','ti4'] 30 | 供称: ['gong4','cheng1'] 31 | 作坊: ['zuo1', 'fang5'] 32 | 中医: ['zhong1','yi1'] 33 | 嚷嚷: ['rang1','rang5'] 34 | 商厦: ['shang1','sha4'] 35 | 大厦: ['da4','sha4'] 36 | 刹车: ['sha1','che1'] 37 | 嘚瑟: ['de4','se5'] 38 | 朝鲜: ['chao2','xian3'] 39 | 阿房宫: ['e1','pang2','gong1'] 40 | 阿胶: ['e1','jiao1'] 41 | 咖喱: ['ga1','li5'] 42 | 时分: ['shi2','fen1'] 43 | 蚌埠: ['beng4','bu4'] 44 | 驯服: ['xun4','fu2'] 45 | 幸免于难: ['xing4','mian3','yu2','nan4'] 46 | 恶行: ['e4','xing2'] 47 | 唉: ['ai4'] 48 | 扎实: ['zha1','shi2'] 49 | 干将: ['gan4','jiang4'] 50 | 陈威行: ['chen2', 'wei1', 'hang2'] 51 | 郭晟: ['guo1', 'sheng4'] 52 | 中标: ['zhong4', 'biao1'] 53 | 抗住: ['kang2', 'zhu4'] 
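Each polyphonic.rep line above is a hard pinyin override in the form `词语: ['pin1','yin1', ...]`; a polyphonic.pickle cache sits alongside it. A hypothetical reader for this format (only to illustrate the layout, not the project's own loader):

```python
# Illustrative only: parse the "word: ['py1','py2']" lines of polyphonic.rep.
import ast


def load_polyphonic_rep(path: str = "GPT_SoVITS/text/g2pw/polyphonic.rep") -> dict:
    overrides = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            word, _, pinyin = line.partition(":")
            overrides[word.strip()] = ast.literal_eval(pinyin.strip())
    return overrides


# e.g. load_polyphonic_rep()["湖泊"] == ["hu2", "po1"]
```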
-------------------------------------------------------------------------------- /GPT_SoVITS/text/namedict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/text/namedict_cache.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from text.zh_normalization.text_normlization import * 15 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip("0")) 25 | if num_string.startswith("0"): 26 | result = DIGITS["0"] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile( 32 | r"([0-1]?[0-9]|2[0-3])" 33 | r":([0-5][0-9])" 34 | r"(:([0-5][0-9]))?" 35 | ) 36 | 37 | # 时间范围,如8:30-12:30 38 | RE_TIME_RANGE = re.compile( 39 | r"([0-1]?[0-9]|2[0-3])" 40 | r":([0-5][0-9])" 41 | r"(:([0-5][0-9]))?" 42 | r"(~|-)" 43 | r"([0-1]?[0-9]|2[0-3])" 44 | r":([0-5][0-9])" 45 | r"(:([0-5][0-9]))?" 
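    # e.g. 8:30-12:30 or 8:30~12:30; seconds are optional on both endpoints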
46 | ) 47 | 48 | 49 | def replace_time(match) -> str: 50 | """ 51 | Args: 52 | match (re.Match) 53 | Returns: 54 | str 55 | """ 56 | 57 | is_range = len(match.groups()) > 5 58 | 59 | hour = match.group(1) 60 | minute = match.group(2) 61 | second = match.group(4) 62 | 63 | if is_range: 64 | hour_2 = match.group(6) 65 | minute_2 = match.group(7) 66 | second_2 = match.group(9) 67 | 68 | result = f"{num2str(hour)}点" 69 | if minute.lstrip("0"): 70 | if int(minute) == 30: 71 | result += "半" 72 | else: 73 | result += f"{_time_num2str(minute)}分" 74 | if second and second.lstrip("0"): 75 | result += f"{_time_num2str(second)}秒" 76 | 77 | if is_range: 78 | result += "至" 79 | result += f"{num2str(hour_2)}点" 80 | if minute_2.lstrip("0"): 81 | if int(minute) == 30: 82 | result += "半" 83 | else: 84 | result += f"{_time_num2str(minute_2)}分" 85 | if second_2 and second_2.lstrip("0"): 86 | result += f"{_time_num2str(second_2)}秒" 87 | 88 | return result 89 | 90 | 91 | RE_DATE = re.compile( 92 | r"(\d{4}|\d{2})年" 93 | r"((0?[1-9]|1[0-2])月)?" 94 | r"(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?" 95 | ) 96 | 97 | 98 | def replace_date(match) -> str: 99 | """ 100 | Args: 101 | match (re.Match) 102 | Returns: 103 | str 104 | """ 105 | year = match.group(1) 106 | month = match.group(3) 107 | day = match.group(5) 108 | result = "" 109 | if year: 110 | result += f"{verbalize_digit(year)}年" 111 | if month: 112 | result += f"{verbalize_cardinal(month)}月" 113 | if day: 114 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 115 | return result 116 | 117 | 118 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 119 | RE_DATE2 = re.compile(r"(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])") 120 | 121 | 122 | def replace_date2(match) -> str: 123 | """ 124 | Args: 125 | match (re.Match) 126 | Returns: 127 | str 128 | """ 129 | year = match.group(1) 130 | month = match.group(3) 131 | day = match.group(4) 132 | result = "" 133 | if year: 134 | result += f"{verbalize_digit(year)}年" 135 | if month: 136 | result += f"{verbalize_cardinal(month)}月" 137 | if day: 138 | result += f"{verbalize_cardinal(day)}日" 139 | return result 140 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
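# Full-width/half-width ASCII mappings plus RE_NSW, which matches runs of characters without pinyin readings (candidate non-standard words).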
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = {ord(char) + 65248: ord(char) for char in string.ascii_letters} 22 | 23 | # 英文字符半角 -> 全角映射表 24 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 25 | 26 | # 数字字符全角 -> 半角映射表 (num: 10) 27 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 28 | # 数字字符半角 -> 全角映射表 29 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 30 | 31 | # 标点符号全角 -> 半角映射表 (num: 32) 32 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 33 | # 标点符号半角 -> 全角映射表 34 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 35 | 36 | # 空格 (num: 1) 37 | F2H_SPACE = {"\u3000": " "} 38 | H2F_SPACE = {" ": "\u3000"} 39 | 40 | # 非"有拼音的汉字"的字符串,可用于NSW提取 41 | if SUPPORT_UCS4: 42 | RE_NSW = re.compile( 43 | r"(?:[^" 44 | r"\u3007" # 〇 45 | r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF] 46 | r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF] 47 | r"\uf900-\ufaff" # CJK兼容:[F900-FAFF] 48 | r"\U00020000-\U0002A6DF" # CJK扩展B:[20000-2A6DF] 49 | r"\U0002A703-\U0002B73F" # CJK扩展C:[2A700-2B73F] 50 | r"\U0002B740-\U0002B81D" # CJK扩展D:[2B740-2B81D] 51 | r"\U0002F80A-\U0002FA1F" # CJK兼容扩展:[2F800-2FA1F] 52 | r"])+" 53 | ) 54 | else: 55 | RE_NSW = re.compile( # pragma: no cover 56 | r"(?:[^" 57 | r"\u3007" # 〇 58 | r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF] 59 | r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF] 60 | r"\uf900-\ufaff" # CJK兼容:[F900-FAFF] 61 | r"])+" 62 | ) 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile(r"(? 
str: 32 | if mobile: 33 | sp_parts = phone_string.strip("+").split() 34 | result = ",".join([verbalize_digit(part, alt_one=True) for part in sp_parts]) 35 | return result 36 | else: 37 | sil_parts = phone_string.split("-") 38 | result = ",".join([verbalize_digit(part, alt_one=True) for part in sil_parts]) 39 | return result 40 | 41 | 42 | def replace_phone(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | return phone2str(match.group(0), mobile=False) 50 | 51 | 52 | def replace_mobile(match) -> str: 53 | """ 54 | Args: 55 | match (re.Match) 56 | Returns: 57 | str 58 | """ 59 | return phone2str(match.group(0)) 60 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r"(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)") 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒", 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 RVC-Boss 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | import torch 5 | 6 | # 推理用的指定模型 7 | sovits_path = "" 8 | gpt_path = "" 9 | is_half_str = os.environ.get("is_half", "True") 10 | is_half = True if is_half_str.lower() == "true" else False 11 | is_share_str = os.environ.get("is_share", "False") 12 | is_share = True if is_share_str.lower() == "true" else False 13 | 14 | cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" 15 | bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" 16 | pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth" 17 | pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" 18 | 19 | exp_root = "logs" 20 | python_exec = sys.executable or "python" 21 | if torch.cuda.is_available(): 22 | infer_device = "cuda" 23 | else: 24 | infer_device = "cpu" 25 | 26 | webui_port_main = 9874 27 | webui_port_uvr5 = 9873 28 | webui_port_infer_tts = 9872 29 | webui_port_subfix = 9871 30 | 31 | api_port = 9880 32 | 33 | if infer_device == "cuda": 34 | gpu_name = torch.cuda.get_device_name(0) 35 | if ( 36 | ("16" in gpu_name and "V100" not in gpu_name.upper()) 37 | or "P40" in gpu_name.upper() 38 | or "P10" in gpu_name.upper() 39 | or "1060" in gpu_name 40 | or "1070" in gpu_name 41 | or "1080" in gpu_name 42 | ): 43 | is_half = False 44 | 45 | if infer_device == "cpu": 46 | is_half = False 47 | 48 | 49 | class Config: 50 | def __init__(self): 51 | self.sovits_path = sovits_path 52 | self.gpt_path = gpt_path 53 | self.is_half = is_half 54 | 55 | self.cnhubert_path = cnhubert_path 56 | self.bert_path = bert_path 57 | self.pretrained_sovits_path = pretrained_sovits_path 58 | self.pretrained_gpt_path = pretrained_gpt_path 59 | 60 | self.exp_root = exp_root 61 | self.python_exec = python_exec 62 | self.infer_device = infer_device 63 | 64 | self.webui_port_main = webui_port_main 65 | self.webui_port_uvr5 = webui_port_uvr5 66 | self.webui_port_infer_tts = webui_port_infer_tts 67 | self.webui_port_subfix = webui_port_subfix 68 | 69 | self.api_port = api_port 70 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | GPT-SoVITS-CU126: 5 | image: xxxxrt666/gpt-sovits:latest-cu126 6 | container_name: GPT-SoVITS-CU126 7 | ports: 8 | - "9871:9871" 9 | - "9872:9872" 10 | - "9873:9873" 11 | - "9874:9874" 12 | - "9880:9880" 13 | volumes: 14 | - .:/workspace/GPT-SoVITS 15 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models 16 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel 17 | - /dev/null:/workspace/GPT-SoVITS/tools/asr/models 18 | - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights 19 | environment: 20 | - is_half=true 21 | tty: true 22 | stdin_open: true 23 | 
shm_size: "16g" 24 | restart: unless-stopped 25 | runtime: nvidia 26 | GPT-SoVITS-CU126-Lite: 27 | image: xxxxrt666/gpt-sovits:latest-cu126-lite 28 | container_name: GPT-SoVITS-CU126-Lite 29 | ports: 30 | - "9871:9871" 31 | - "9872:9872" 32 | - "9873:9873" 33 | - "9874:9874" 34 | - "9880:9880" 35 | volumes: 36 | - .:/workspace/GPT-SoVITS 37 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models 38 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel 39 | - /dev/null:/workspace/GPT-SoVITS/tools/asr/models 40 | - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights 41 | - tools/asr/models:/workspace/models/asr_models 42 | - tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights 43 | environment: 44 | - is_half=true 45 | tty: true 46 | stdin_open: true 47 | shm_size: "16g" 48 | restart: unless-stopped 49 | runtime: nvidia 50 | GPT-SoVITS-CU128: 51 | image: xxxxrt666/gpt-sovits:latest-cu128 52 | container_name: GPT-SoVITS-CU128 53 | ports: 54 | - "9871:9871" 55 | - "9872:9872" 56 | - "9873:9873" 57 | - "9874:9874" 58 | - "9880:9880" 59 | volumes: 60 | - .:/workspace/GPT-SoVITS 61 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models 62 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel 63 | - /dev/null:/workspace/GPT-SoVITS/tools/asr/models 64 | - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights 65 | environment: 66 | - is_half=true 67 | tty: true 68 | stdin_open: true 69 | shm_size: "16g" 70 | restart: unless-stopped 71 | runtime: nvidia 72 | GPT-SoVITS-CU128-Lite: 73 | image: xxxxrt666/gpt-sovits:latest-cu128-lite 74 | container_name: GPT-SoVITS-CU128-Lite 75 | ports: 76 | - "9871:9871" 77 | - "9872:9872" 78 | - "9873:9873" 79 | - "9874:9874" 80 | - "9880:9880" 81 | volumes: 82 | - .:/workspace/GPT-SoVITS 83 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models 84 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel 85 | - /dev/null:/workspace/GPT-SoVITS/tools/asr/models 86 | - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights 87 | - tools/asr/models:/workspace/models/asr_models 88 | - tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights 89 | environment: 90 | - is_half=true 91 | tty: true 92 | stdin_open: true 93 | shm_size: "16g" 94 | restart: unless-stopped 95 | runtime: nvidia -------------------------------------------------------------------------------- /docker_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" 4 | 5 | cd "$SCRIPT_DIR" || exit 1 6 | 7 | set -e 8 | 9 | if ! 
command -v docker &>/dev/null; then 10 | echo "Docker Not Found" 11 | exit 1 12 | fi 13 | 14 | trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR 15 | 16 | LITE=false 17 | CUDA_VERSION=12.6 18 | 19 | print_help() { 20 | echo "Usage: bash docker_build.sh [OPTIONS]" 21 | echo "" 22 | echo "Options:" 23 | echo " --cuda 12.6|12.8 Specify the CUDA VERSION (REQUIRED)" 24 | echo " --lite Build a Lite Image" 25 | echo " -h, --help Show this help message and exit" 26 | echo "" 27 | echo "Examples:" 28 | echo " bash docker_build.sh --cuda 12.6 --funasr --faster-whisper" 29 | } 30 | 31 | # Show help if no arguments provided 32 | if [[ $# -eq 0 ]]; then 33 | print_help 34 | exit 0 35 | fi 36 | 37 | # Parse arguments 38 | while [[ $# -gt 0 ]]; do 39 | case "$1" in 40 | --cuda) 41 | case "$2" in 42 | 12.6) 43 | CUDA_VERSION=12.6 44 | ;; 45 | 12.8) 46 | CUDA_VERSION=12.8 47 | ;; 48 | *) 49 | echo "Error: Invalid CUDA_VERSION: $2" 50 | echo "Choose From: [12.6, 12.8]" 51 | exit 1 52 | ;; 53 | esac 54 | shift 2 55 | ;; 56 | --lite) 57 | LITE=true 58 | shift 59 | ;; 60 | *) 61 | echo "Unknown Argument: $1" 62 | echo "Use -h or --help to see available options." 63 | exit 1 64 | ;; 65 | esac 66 | done 67 | 68 | TARGETPLATFORM=$(uname -m | grep -q 'x86' && echo "linux/amd64" || echo "linux/arm64") 69 | 70 | if [ $LITE = true ]; then 71 | TORCH_BASE="lite" 72 | else 73 | TORCH_BASE="full" 74 | fi 75 | 76 | docker build \ 77 | --build-arg CUDA_VERSION=$CUDA_VERSION \ 78 | --build-arg LITE=$LITE \ 79 | --build-arg TARGETPLATFORM="$TARGETPLATFORM" \ 80 | --build-arg TORCH_BASE=$TORCH_BASE \ 81 | -t "${USER}/gpt-sovits:local" \ 82 | . 83 | -------------------------------------------------------------------------------- /extra-req.txt: -------------------------------------------------------------------------------- 1 | faster-whisper 2 | -------------------------------------------------------------------------------- /go-webui.bat: -------------------------------------------------------------------------------- 1 | set "SCRIPT_DIR=%~dp0" 2 | set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" 3 | cd /d "%SCRIPT_DIR%" 4 | set "PATH=%SCRIPT_DIR%\runtime;%PATH%" 5 | runtime\python.exe -I webui.py zh_CN 6 | pause 7 | -------------------------------------------------------------------------------- /go-webui.ps1: -------------------------------------------------------------------------------- 1 | $ErrorActionPreference = "SilentlyContinue" 2 | chcp 65001 3 | Set-Location $PSScriptRoot 4 | $runtimePath = Join-Path $PSScriptRoot "runtime" 5 | $env:PATH = "$runtimePath;$env:PATH" 6 | & "$runtimePath\python.exe" -I "$PSScriptRoot\webui.py" zh_CN 7 | pause 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --no-binary=opencc 2 | numpy<2.0 3 | scipy 4 | tensorboard 5 | librosa==0.10.2 6 | numba 7 | pytorch-lightning>=2.4 8 | gradio<5 9 | ffmpeg-python 10 | onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64" 11 | onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64" 12 | tqdm 13 | funasr==1.0.27 14 | cn2an 15 | pypinyin 16 | pyopenjtalk>=0.4.1 17 | g2p_en 18 | torchaudio 19 | modelscope==1.10.0 20 | sentencepiece 21 | transformers>=4.43,<=4.50 22 | peft 23 | chardet 24 | PyYAML 25 | psutil 26 | jieba_fast 27 | jieba 28 | split-lang 29 | fast_langdetect>=0.3.1 30 | wordsegment 31 | rotary_embedding_torch 32 | ToJyutping 33 | g2pk2 34 | 
ko_pron 35 | opencc 36 | python_mecab_ko; sys_platform != 'win32' 37 | fastapi[standard]>=0.115.2 38 | x_transformers 39 | torchmetrics<=1.5 40 | pydantic<=2.10.6 41 | ctranslate2>=4.0,<5 42 | huggingface_hub>=0.13 43 | tokenizers>=0.13,<1 44 | av>=11 45 | tqdm 46 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/24kto48k/readme.txt: -------------------------------------------------------------------------------- 1 | For the inference of the v3 model, if you find that the generated audio sounds somewhat muffled, you can try using this audio super-resolution model. 2 | 对于v3模型的推理,如果你发现生成的音频比较闷,可以尝试这个音频超分模型。 3 | 4 | put g_24kto48k.zip and config.json in this folder 5 | 把g_24kto48k.zip and config.json下到这个文件夹 6 | 7 | download link 下载链接: 8 | https://drive.google.com/drive/folders/1IIYTf2zbJWzelu4IftKD6ooHloJ8mnZF?usp=share_link 9 | 10 | audio sr project page 音频超分项目主页: 11 | https://github.com/yxlu-0102/AP-BWE 12 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Ye-Xin Lu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/datasets1/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/tools/__init__.py -------------------------------------------------------------------------------- /tools/asr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def check_fw_local_models(): 5 | """ 6 | 启动时检查本地是否有 Faster Whisper 模型. 
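    Check at startup whether Faster Whisper models are available locally.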
7 | """ 8 | model_size_list = [ 9 | "tiny", 10 | "tiny.en", 11 | "base", 12 | "base.en", 13 | "small", 14 | "small.en", 15 | "medium", 16 | "medium.en", 17 | "large", 18 | "large-v1", 19 | "large-v2", 20 | "large-v3", 21 | ] 22 | for i, size in enumerate(model_size_list): 23 | if os.path.exists(f"tools/asr/models/faster-whisper-{size}"): 24 | model_size_list[i] = size + "-local" 25 | return model_size_list 26 | 27 | 28 | asr_dict = { 29 | "达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]}, 30 | "Faster Whisper (多语种)": { 31 | "lang": ["auto", "zh", "en", "ja", "ko", "yue"], 32 | "size": check_fw_local_models(), 33 | "path": "fasterwhisper_asr.py", 34 | "precision": ["float32", "float16", "int8"], 35 | }, 36 | } 37 | -------------------------------------------------------------------------------- /tools/asr/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /tools/audio_sr.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | import sys 3 | import os 4 | 5 | AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main") 6 | sys.path.append(AP_BWE_main_dir_path) 7 | import json 8 | import torch 9 | import torchaudio.functional as aF 10 | # from attrdict import AttrDict####will be bug in py3.10 11 | 12 | from datasets1.dataset import amp_pha_stft, amp_pha_istft 13 | from models.model import APNet_BWE_Model 14 | 15 | 16 | class AP_BWE: 17 | def __init__(self, device, DictToAttrRecursive, checkpoint_file=None): 18 | if checkpoint_file == None: 19 | checkpoint_file = "%s/24kto48k/g_24kto48k.zip" % (AP_BWE_main_dir_path) 20 | if os.path.exists(checkpoint_file) == False: 21 | raise FileNotFoundError 22 | config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json") 23 | with open(config_file) as f: 24 | data = f.read() 25 | json_config = json.loads(data) 26 | # h = AttrDict(json_config) 27 | h = DictToAttrRecursive(json_config) 28 | model = APNet_BWE_Model(h).to(device) 29 | state_dict = torch.load(checkpoint_file, map_location="cpu", weights_only=False) 30 | model.load_state_dict(state_dict["generator"]) 31 | model.eval() 32 | self.device = device 33 | self.model = model 34 | self.h = h 35 | 36 | def to(self, *arg, **kwargs): 37 | self.model.to(*arg, **kwargs) 38 | self.device = self.model.conv_pre_mag.weight.device 39 | return self 40 | 41 | def __call__(self, audio, orig_sampling_rate): 42 | with torch.no_grad(): 43 | # audio, orig_sampling_rate = torchaudio.load(inp_path) 44 | # audio = audio.to(self.device) 45 | audio = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.h.hr_sampling_rate) 46 | amp_nb, pha_nb, com_nb = amp_pha_stft(audio, self.h.n_fft, self.h.hop_size, self.h.win_size) 47 | amp_wb_g, pha_wb_g, com_wb_g = self.model(amp_nb, pha_nb) 48 | audio_hr_g = amp_pha_istft(amp_wb_g, pha_wb_g, self.h.n_fft, self.h.hop_size, self.h.win_size) 49 | # sf.write(opt_path, audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate, 'PCM_16') 50 | return audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate 51 | -------------------------------------------------------------------------------- /tools/cmd-denoise.py: -------------------------------------------------------------------------------- 1 | import 
os 2 | import argparse 3 | import traceback 4 | 5 | from modelscope.pipelines import pipeline 6 | from modelscope.utils.constant import Tasks 7 | from tqdm import tqdm 8 | 9 | path_denoise = "tools/denoise-model/speech_frcrn_ans_cirm_16k" 10 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" 11 | ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise) 12 | 13 | 14 | def execute_denoise(input_folder, output_folder): 15 | os.makedirs(output_folder, exist_ok=True) 16 | # print(input_folder) 17 | # print(list(os.listdir(input_folder).sort())) 18 | for name in tqdm(os.listdir(input_folder)): 19 | try: 20 | ans("%s/%s" % (input_folder, name), output_path="%s/%s" % (output_folder, name)) 21 | except: 22 | traceback.print_exc() 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument( 28 | "-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files." 29 | ) 30 | parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.") 31 | parser.add_argument( 32 | "-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32" 33 | ) # 还没接入 34 | cmd = parser.parse_args() 35 | execute_denoise( 36 | input_folder=cmd.input_folder, 37 | output_folder=cmd.output_folder, 38 | ) 39 | -------------------------------------------------------------------------------- /tools/denoise-model/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /tools/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale") 6 | 7 | 8 | def load_language_list(language): 9 | with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f: 10 | language_list = json.load(f) 11 | return language_list 12 | 13 | 14 | def scan_language_list(): 15 | language_list = [] 16 | for name in os.listdir(I18N_JSON_DIR): 17 | if name.endswith(".json"): 18 | language_list.append(name.split(".")[0]) 19 | return language_list 20 | 21 | 22 | class I18nAuto: 23 | def __init__(self, language=None): 24 | if language in ["Auto", None]: 25 | language = locale.getdefaultlocale()[0] 26 | # getlocale can't identify the system's language ((None, None)) 27 | if not os.path.exists(os.path.join(I18N_JSON_DIR, f"{language}.json")): 28 | language = "en_US" 29 | self.language = language 30 | self.language_map = load_language_list(language) 31 | 32 | def __call__(self, key): 33 | return self.language_map.get(key, key) 34 | 35 | def __repr__(self): 36 | return "Use Language: " + self.language 37 | 38 | 39 | if __name__ == "__main__": 40 | i18n = I18nAuto(language="en_US") 41 | print(i18n) 42 | -------------------------------------------------------------------------------- /tools/slice_audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import traceback 5 | from scipy.io import wavfile 6 | 7 | # parent_directory = os.path.dirname(os.path.abspath(__file__)) 8 | # sys.path.append(parent_directory) 9 | from tools.my_utils import load_audio 10 | from slicer2 import Slicer 11 | 12 | 13 | def slice(inp, 
opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part): 14 | os.makedirs(opt_root, exist_ok=True) 15 | if os.path.isfile(inp): 16 | input = [inp] 17 | elif os.path.isdir(inp): 18 | input = [os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] 19 | else: 20 | return "输入路径存在但既不是文件也不是文件夹" 21 | slicer = Slicer( 22 | sr=32000, # 长音频采样率 23 | threshold=int(threshold), # 音量小于这个值视作静音的备选切割点 24 | min_length=int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 25 | min_interval=int(min_interval), # 最短切割间隔 26 | hop_size=int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) 27 | max_sil_kept=int(max_sil_kept), # 切完后静音最多留多长 28 | ) 29 | _max = float(_max) 30 | alpha = float(alpha) 31 | for inp_path in input[int(i_part) :: int(all_part)]: 32 | # print(inp_path) 33 | try: 34 | name = os.path.basename(inp_path) 35 | audio = load_audio(inp_path, 32000) 36 | # print(audio.shape) 37 | for chunk, start, end in slicer.slice(audio): # start和end是帧数 38 | tmp_max = np.abs(chunk).max() 39 | if tmp_max > 1: 40 | chunk /= tmp_max 41 | chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk 42 | wavfile.write( 43 | "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), 44 | 32000, 45 | # chunk.astype(np.float32), 46 | (chunk * 32767).astype(np.int16), 47 | ) 48 | except: 49 | print(inp_path, "->fail->", traceback.format_exc()) 50 | return "执行完毕,请检查输出文件" 51 | 52 | 53 | print(slice(*sys.argv[1:])) 54 | -------------------------------------------------------------------------------- /tools/uvr5/bs_roformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/tools/uvr5/bs_roformer/__init__.py -------------------------------------------------------------------------------- /tools/uvr5/bs_roformer/attend.py: -------------------------------------------------------------------------------- 1 | from packaging import version 2 | import torch 3 | from torch import nn, einsum 4 | import torch.nn.functional as F 5 | 6 | 7 | def exists(val): 8 | return val is not None 9 | 10 | 11 | def default(v, d): 12 | return v if exists(v) else d 13 | 14 | 15 | class Attend(nn.Module): 16 | def __init__(self, dropout=0.0, flash=False, scale=None): 17 | super().__init__() 18 | self.scale = scale 19 | self.dropout = dropout 20 | self.attn_dropout = nn.Dropout(dropout) 21 | 22 | self.flash = flash 23 | assert not (flash and version.parse(torch.__version__) < version.parse("2.0.0")), ( 24 | "in order to use flash attention, you must be using pytorch 2.0 or above" 25 | ) 26 | 27 | def flash_attn(self, q, k, v): 28 | # _, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device 29 | 30 | if exists(self.scale): 31 | default_scale = q.shape[-1] ** -0.5 32 | q = q * (self.scale / default_scale) 33 | 34 | # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale 35 | # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): 36 | return F.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout if self.training else 0.0) 37 | 38 | def forward(self, q, k, v): 39 | """ 40 | einstein notation 41 | b - batch 42 | h - heads 43 | n, i, j - sequence length (base sequence length, source, target) 44 | d - feature dimension 45 | """ 46 | 47 | # q_len, k_len, device = q.shape[-2], k.shape[-2], q.device 48 | 49 | scale = default(self.scale, q.shape[-1] ** -0.5) 50 | 51 | if self.flash: 52 | 
return self.flash_attn(q, k, v) 53 | 54 | # similarity 55 | 56 | sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale 57 | 58 | # attention 59 | 60 | attn = sim.softmax(dim=-1) 61 | attn = self.attn_dropout(attn) 62 | 63 | # aggregate values 64 | 65 | out = einsum("b h i j, b h j d -> b h i d", attn, v) 66 | 67 | return out 68 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): 67 | super(Decoder, self).__init__() 68 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 69 | self.dropout = nn.Dropout2d(0.1) if dropout else None 70 | 71 | def __call__(self, x, skip=None): 72 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 73 | if skip is not None: 74 | skip = spec_utils.crop_center(skip, x) 75 | x = torch.cat([x, skip], dim=1) 76 | h = self.conv(x) 77 | 78 | if self.dropout is not None: 79 | h = self.dropout(h) 80 | 81 | return h 82 | 83 | 84 | class ASPPModule(nn.Module): 85 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 86 | super(ASPPModule, self).__init__() 87 | self.conv1 = nn.Sequential( 88 | nn.AdaptiveAvgPool2d((1, None)), 89 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 90 | ) 91 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 92 | self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) 93 | self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) 94 | self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 95 | self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) 96 | 97 | def forward(self, x): 98 | _, _, h, w = x.size() 99 | feat1 = 
F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) 100 | feat2 = self.conv2(x) 101 | feat3 = self.conv3(x) 102 | feat4 = self.conv4(x) 103 | feat5 = self.conv5(x) 104 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 105 | bottle = self.bottleneck(out) 106 | return bottle 107 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): 67 | super(Decoder, self).__init__() 68 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 69 | self.dropout = nn.Dropout2d(0.1) if dropout else None 70 | 71 | def __call__(self, x, skip=None): 72 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 73 | if skip is not None: 74 | skip = spec_utils.crop_center(skip, x) 75 | x = torch.cat([x, skip], dim=1) 76 | h = self.conv(x) 77 | 78 | if self.dropout is not None: 79 | h = self.dropout(h) 80 | 81 | return h 82 | 83 | 84 | class ASPPModule(nn.Module): 85 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 86 | super(ASPPModule, self).__init__() 87 | self.conv1 = nn.Sequential( 88 | nn.AdaptiveAvgPool2d((1, None)), 89 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 90 | ) 91 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 92 | self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) 93 | self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) 94 | self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 95 | self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) 96 | 97 | def forward(self, x): 98 | _, _, h, w = x.size() 99 | feat1 = F.interpolate(self.conv1(x), 
size=(h, w), mode="bilinear", align_corners=True) 100 | feat2 = self.conv2(x) 101 | feat3 = self.conv3(x) 102 | feat4 = self.conv4(x) 103 | feat5 = self.conv5(x) 104 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 105 | bottle = self.bottleneck(out) 106 | return bottle 107 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): 67 | super(Decoder, self).__init__() 68 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 69 | self.dropout = nn.Dropout2d(0.1) if dropout else None 70 | 71 | def __call__(self, x, skip=None): 72 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 73 | if skip is not None: 74 | skip = spec_utils.crop_center(skip, x) 75 | x = torch.cat([x, skip], dim=1) 76 | h = self.conv(x) 77 | 78 | if self.dropout is not None: 79 | h = self.dropout(h) 80 | 81 | return h 82 | 83 | 84 | class ASPPModule(nn.Module): 85 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 86 | super(ASPPModule, self).__init__() 87 | self.conv1 = nn.Sequential( 88 | nn.AdaptiveAvgPool2d((1, None)), 89 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 90 | ) 91 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 92 | self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) 93 | self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) 94 | self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 95 | self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) 96 | 97 | def forward(self, x): 98 | _, _, h, w = x.size() 99 | feat1 = F.interpolate(self.conv1(x), size=(h, w), 
mode="bilinear", align_corners=True) 100 | feat2 = self.conv2(x) 101 | feat3 = self.conv3(x) 102 | feat4 = self.conv4(x) 103 | feat5 = self.conv5(x) 104 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 105 | bottle = self.bottleneck(out) 106 | return bottle 107 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): 67 | super(Decoder, self).__init__() 68 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 69 | self.dropout = nn.Dropout2d(0.1) if dropout else None 70 | 71 | def __call__(self, x, skip=None): 72 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 73 | if skip is not None: 74 | skip = spec_utils.crop_center(skip, x) 75 | x = torch.cat([x, skip], dim=1) 76 | h = self.conv(x) 77 | 78 | if self.dropout is not None: 79 | h = self.dropout(h) 80 | 81 | return h 82 | 83 | 84 | class ASPPModule(nn.Module): 85 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 86 | super(ASPPModule, self).__init__() 87 | self.conv1 = nn.Sequential( 88 | nn.AdaptiveAvgPool2d((1, None)), 89 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 90 | ) 91 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 92 | self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) 93 | self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) 94 | self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 95 | self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 96 | self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 97 | self.bottleneck = 
nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) 98 | 99 | def forward(self, x): 100 | _, _, h, w = x.size() 101 | feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) 102 | feat2 = self.conv2(x) 103 | feat3 = self.conv3(x) 104 | feat4 = self.conv4(x) 105 | feat5 = self.conv5(x) 106 | feat6 = self.conv6(x) 107 | feat7 = self.conv7(x) 108 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 109 | bottle = self.bottleneck(out) 110 | return bottle 111 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class Encoder(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 31 | super(Encoder, self).__init__() 32 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) 33 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 34 | 35 | def __call__(self, x): 36 | h = self.conv1(x) 37 | h = self.conv2(h) 38 | 39 | return h 40 | 41 | 42 | class Decoder(nn.Module): 43 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): 44 | super(Decoder, self).__init__() 45 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 46 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 47 | self.dropout = nn.Dropout2d(0.1) if dropout else None 48 | 49 | def __call__(self, x, skip=None): 50 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 51 | 52 | if skip is not None: 53 | skip = spec_utils.crop_center(skip, x) 54 | x = torch.cat([x, skip], dim=1) 55 | 56 | h = self.conv1(x) 57 | # h = self.conv2(h) 58 | 59 | if self.dropout is not None: 60 | h = self.dropout(h) 61 | 62 | return h 63 | 64 | 65 | class ASPPModule(nn.Module): 66 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): 67 | super(ASPPModule, self).__init__() 68 | self.conv1 = nn.Sequential( 69 | nn.AdaptiveAvgPool2d((1, None)), 70 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), 71 | ) 72 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) 73 | self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ) 74 | self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ) 75 | self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ) 76 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) 77 | self.dropout = nn.Dropout2d(0.1) if dropout else None 78 | 79 | def forward(self, x): 80 | _, _, h, w = x.size() 81 | feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) 82 | feat2 = self.conv2(x) 83 | feat3 = self.conv3(x) 84 | feat4 = self.conv4(x) 85 | feat5 = self.conv5(x) 86 | out = torch.cat((feat1, feat2, feat3, feat4, 
feat5), dim=1) 87 | out = self.bottleneck(out) 88 | 89 | if self.dropout is not None: 90 | out = self.dropout(out) 91 | 92 | return out 93 | 94 | 95 | class LSTMModule(nn.Module): 96 | def __init__(self, nin_conv, nin_lstm, nout_lstm): 97 | super(LSTMModule, self).__init__() 98 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) 99 | self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True) 100 | self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()) 101 | 102 | def forward(self, x): 103 | N, _, nbins, nframes = x.size() 104 | h = self.conv(x)[:, 0] # N, nbins, nframes 105 | h = h.permute(2, 0, 1) # nframes, N, nbins 106 | h, _ = self.lstm(h) 107 | h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins 108 | h = h.reshape(nframes, N, 1, nbins) 109 | h = h.permute(1, 2, 3, 0) 110 | 111 | return h 112 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | 4 | default_param = {} 5 | default_param["bins"] = 768 6 | default_param["unstable_bins"] = 9 # training only 7 | default_param["reduction_bins"] = 762 # training only 8 | default_param["sr"] = 44100 9 | default_param["pre_filter_start"] = 757 10 | default_param["pre_filter_stop"] = 768 11 | default_param["band"] = {} 12 | 13 | 14 | default_param["band"][1] = { 15 | "sr": 11025, 16 | "hl": 128, 17 | "n_fft": 960, 18 | "crop_start": 0, 19 | "crop_stop": 245, 20 | "lpf_start": 61, # inference only 21 | "res_type": "polyphase", 22 | } 23 | 24 | default_param["band"][2] = { 25 | "sr": 44100, 26 | "hl": 512, 27 | "n_fft": 1536, 28 | "crop_start": 24, 29 | "crop_stop": 547, 30 | "hpf_start": 81, # inference only 31 | "res_type": "sinc_best", 32 | } 33 | 34 | 35 | def int_keys(d): 36 | r = {} 37 | for k, v in d: 38 | if k.isdigit(): 39 | k = int(k) 40 | r[k] = v 41 | return r 42 | 43 | 44 | class ModelParameters(object): 45 | def __init__(self, config_path=""): 46 | if ".pth" == pathlib.Path(config_path).suffix: 47 | import zipfile 48 | 49 | with zipfile.ZipFile(config_path, "r") as zip: 50 | self.param = json.loads(zip.read("param.json"), object_pairs_hook=int_keys) 51 | elif ".json" == pathlib.Path(config_path).suffix: 52 | with open(config_path, "r") as f: 53 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 54 | else: 55 | self.param = default_param 56 | 57 | for k in [ 58 | "mid_side", 59 | "mid_side_b", 60 | "mid_side_b2", 61 | "stereo_w", 62 | "stereo_n", 63 | "reverse", 64 | ]: 65 | if k not in self.param: 66 | self.param[k] = False 67 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 
0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 
12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 
14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 
33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | 
"sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | 
"1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def load_data(file_name: str = 
"./lib/name_params.json") -> dict: 9 | with open(file_name, "r") as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def make_padding(width, cropsize, offset): 16 | left = offset 17 | roi_size = cropsize - left * 2 18 | if roi_size == 0: 19 | roi_size = cropsize 20 | right = roi_size - (width % roi_size) + left 21 | 22 | return left, right, roi_size 23 | 24 | 25 | def inference(X_spec, device, model, aggressiveness, data): 26 | """ 27 | data : dic configs 28 | """ 29 | 30 | def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True): 31 | model.eval() 32 | with torch.no_grad(): 33 | preds = [] 34 | 35 | iterations = [n_window] 36 | 37 | total_iterations = sum(iterations) 38 | for i in tqdm(range(n_window)): 39 | start = i * roi_size 40 | X_mag_window = X_mag_pad[None, :, :, start : start + data["window_size"]] 41 | X_mag_window = torch.from_numpy(X_mag_window) 42 | if is_half: 43 | X_mag_window = X_mag_window.half() 44 | X_mag_window = X_mag_window.to(device) 45 | 46 | pred = model.predict(X_mag_window, aggressiveness) 47 | 48 | pred = pred.detach().cpu().numpy() 49 | preds.append(pred[0]) 50 | 51 | pred = np.concatenate(preds, axis=2) 52 | return pred 53 | 54 | def preprocess(X_spec): 55 | X_mag = np.abs(X_spec) 56 | X_phase = np.angle(X_spec) 57 | 58 | return X_mag, X_phase 59 | 60 | X_mag, X_phase = preprocess(X_spec) 61 | 62 | coef = X_mag.max() 63 | X_mag_pre = X_mag / coef 64 | 65 | n_frame = X_mag_pre.shape[2] 66 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) 67 | n_window = int(np.ceil(n_frame / roi_size)) 68 | 69 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 70 | 71 | if list(model.state_dict().values())[0].dtype == torch.float16: 72 | is_half = True 73 | else: 74 | is_half = False 75 | pred = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half) 76 | pred = pred[:, :, :n_frame] 77 | 78 | if data["tta"]: 79 | pad_l += roi_size // 2 80 | pad_r += roi_size // 2 81 | n_window += 1 82 | 83 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 84 | 85 | pred_tta = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half) 86 | pred_tta = pred_tta[:, :, roi_size // 2 :] 87 | pred_tta = pred_tta[:, :, :n_frame] 88 | 89 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) 90 | else: 91 | return pred * coef, X_mag, np.exp(1.0j * X_phase) 92 | 93 | 94 | def _get_name_params(model_path, model_hash): 95 | data = load_data() 96 | flag = False 97 | ModelName = model_path 98 | for type in list(data): 99 | for model in list(data[type][0]): 100 | for i in range(len(data[type][0][model])): 101 | if str(data[type][0][model][i]["hash_name"]) == model_hash: 102 | flag = True 103 | elif str(data[type][0][model][i]["hash_name"]) in ModelName: 104 | flag = True 105 | 106 | if flag: 107 | model_params_auto = data[type][0][model][i]["model_params"] 108 | param_name_auto = data[type][0][model][i]["param_name"] 109 | if type == "equivalent": 110 | return param_name_auto, model_params_auto 111 | else: 112 | flag = False 113 | return param_name_auto, model_params_auto 114 | -------------------------------------------------------------------------------- /tools/uvr5/uvr5_weights/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | --------------------------------------------------------------------------------