├── .dockerignore ├── .github ├── build_windows_packages.ps1 └── workflows │ ├── build_windows_packages.yaml │ └── docker-publish.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── Colab-Inference.ipynb ├── Colab-WebUI.ipynb ├── Docker ├── install_wrapper.sh └── miniconda_install.sh ├── Dockerfile ├── GPT_SoVITS ├── AR │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── bucket_sampler.py │ │ ├── data_module.py │ │ └── dataset.py │ ├── models │ │ ├── __init__.py │ │ ├── t2s_lightning_module.py │ │ ├── t2s_lightning_module_onnx.py │ │ ├── t2s_model.py │ │ ├── t2s_model_onnx.py │ │ └── utils.py │ ├── modules │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── activation_onnx.py │ │ ├── embedding.py │ │ ├── embedding_onnx.py │ │ ├── lr_schedulers.py │ │ ├── optim.py │ │ ├── patched_mha_with_cache.py │ │ ├── patched_mha_with_cache_onnx.py │ │ ├── scaling.py │ │ ├── transformer.py │ │ └── transformer_onnx.py │ ├── text_processing │ │ ├── __init__.py │ │ ├── phonemizer.py │ │ └── symbols.py │ └── utils │ │ ├── __init__.py │ │ ├── initialize.py │ │ └── io.py ├── BigVGAN │ ├── LICENSE │ ├── README.md │ ├── activations.py │ ├── alias_free_activation │ │ ├── cuda │ │ │ ├── __init__.py │ │ │ ├── activation1d.py │ │ │ ├── anti_alias_activation.cpp │ │ │ ├── anti_alias_activation_cuda.cu │ │ │ ├── build │ │ │ │ └── _ │ │ │ ├── compat.h │ │ │ ├── load.py │ │ │ └── type_shim.h │ │ └── torch │ │ │ ├── __init__.py │ │ │ ├── act.py │ │ │ ├── filter.py │ │ │ └── resample.py │ ├── bigvgan.py │ ├── configs │ │ ├── bigvgan_22khz_80band.json │ │ ├── bigvgan_24khz_100band.json │ │ ├── bigvgan_base_22khz_80band.json │ │ ├── bigvgan_base_24khz_100band.json │ │ ├── bigvgan_v2_22khz_80band_256x.json │ │ ├── bigvgan_v2_22khz_80band_fmax8k_256x.json │ │ ├── bigvgan_v2_24khz_100band_256x.json │ │ ├── bigvgan_v2_44khz_128band_256x.json │ │ └── bigvgan_v2_44khz_128band_512x.json │ ├── discriminators.py │ ├── env.py │ ├── incl_licenses │ │ ├── LICENSE_1 │ │ ├── LICENSE_2 │ │ ├── LICENSE_3 │ │ ├── LICENSE_4 │ │ ├── LICENSE_5 │ │ ├── LICENSE_6 │ │ ├── LICENSE_7 │ │ └── LICENSE_8 │ ├── inference.py │ ├── inference_e2e.py │ ├── loss.py │ ├── meldataset.py │ ├── nv-modelcard++ │ │ ├── .gitkeep │ │ ├── bias.md │ │ ├── explainability.md │ │ ├── overview.md │ │ ├── privacy.md │ │ └── safety.md │ ├── requirements.txt │ ├── tests │ │ ├── test_activation.py │ │ ├── test_activation_snake_beta.py │ │ └── test_cuda_vs_torch_model.py │ ├── train.py │ └── utils0.py ├── TTS_infer_pack │ ├── TTS.py │ ├── TextPreprocessor.py │ ├── __init__.py │ └── text_segmentation_method.py ├── configs │ ├── .gitignore │ ├── s1.yaml │ ├── s1big.yaml │ ├── s1big2.yaml │ ├── s1longer-v2.yaml │ ├── s1longer.yaml │ ├── s1mq.yaml │ ├── s2.json │ ├── train.yaml │ └── tts_infer.yaml ├── download.py ├── export_torch_script.py ├── export_torch_script_v3.py ├── f5_tts │ └── model │ │ ├── __init__.py │ │ ├── backbones │ │ ├── README.md │ │ ├── dit.py │ │ ├── mmdit.py │ │ └── unett.py │ │ └── modules.py ├── feature_extractor │ ├── __init__.py │ ├── cnhubert.py │ └── whisper_enc.py ├── inference_cli.py ├── inference_gui.py ├── inference_webui.py ├── inference_webui_fast.py ├── module │ ├── __init__.py │ ├── attentions.py │ ├── attentions_onnx.py │ ├── commons.py │ ├── core_vq.py │ ├── data_utils.py │ ├── losses.py │ ├── mel_processing.py │ ├── models.py │ ├── models_onnx.py │ ├── modules.py │ ├── mrte_model.py │ ├── quantize.py │ └── transforms.py ├── onnx_export.py ├── prepare_datasets │ ├── 1-get-text.py │ ├── 2-get-hubert-wav32k.py │ └── 3-get-semantic.py ├── 
pretrained_models │ └── .gitignore ├── process_ckpt.py ├── s1_train.py ├── s2_train.py ├── s2_train_v3.py ├── s2_train_v3_lora.py ├── text │ ├── .gitignore │ ├── LangSegmenter │ │ ├── __init__.py │ │ └── langsegmenter.py │ ├── __init__.py │ ├── cantonese.py │ ├── chinese.py │ ├── chinese2.py │ ├── cleaner.py │ ├── cmudict-fast.rep │ ├── cmudict.rep │ ├── en_normalization │ │ └── expend.py │ ├── engdict-hot.rep │ ├── engdict_cache.pickle │ ├── english.py │ ├── g2pw │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── g2pw.py │ │ ├── onnx_api.py │ │ ├── polyphonic-fix.rep │ │ ├── polyphonic.pickle │ │ ├── polyphonic.rep │ │ └── utils.py │ ├── ja_userdic │ │ └── userdict.csv │ ├── japanese.py │ ├── korean.py │ ├── namedict_cache.pickle │ ├── opencpop-strict.txt │ ├── symbols.py │ ├── symbols2.py │ ├── tone_sandhi.py │ └── zh_normalization │ │ ├── README.md │ │ ├── __init__.py │ │ ├── char_convert.py │ │ ├── chronology.py │ │ ├── constants.py │ │ ├── num.py │ │ ├── phonecode.py │ │ ├── quantifier.py │ │ └── text_normlization.py └── utils.py ├── LICENSE ├── README.md ├── api.py ├── api_v2.py ├── config.py ├── docker-compose.yaml ├── docker_build.sh ├── docs ├── cn │ ├── Changelog_CN.md │ └── README.md ├── en │ └── Changelog_EN.md ├── ja │ ├── Changelog_JA.md │ └── README.md ├── ko │ ├── Changelog_KO.md │ └── README.md └── tr │ ├── Changelog_TR.md │ └── README.md ├── extra-req.txt ├── go-webui.bat ├── go-webui.ps1 ├── gpt-sovits_kaggle.ipynb ├── install.sh ├── requirements.txt ├── tools ├── AP_BWE_main │ ├── 24kto48k │ │ └── readme.txt │ ├── LICENSE │ ├── README.md │ ├── datasets1 │ │ ├── __init__.py │ │ └── dataset.py │ └── models │ │ ├── __init__.py │ │ └── model.py ├── __init__.py ├── asr │ ├── config.py │ ├── fasterwhisper_asr.py │ ├── funasr_asr.py │ └── models │ │ └── .gitignore ├── audio_sr.py ├── cmd-denoise.py ├── denoise-model │ └── .gitignore ├── i18n │ ├── i18n.py │ ├── locale │ │ ├── en_US.json │ │ ├── es_ES.json │ │ ├── fr_FR.json │ │ ├── it_IT.json │ │ ├── ja_JP.json │ │ ├── ko_KR.json │ │ ├── pt_BR.json │ │ ├── ru_RU.json │ │ ├── tr_TR.json │ │ ├── zh_CN.json │ │ ├── zh_HK.json │ │ ├── zh_SG.json │ │ └── zh_TW.json │ └── scan_i18n.py ├── my_utils.py ├── slice_audio.py ├── slicer2.py ├── subfix_webui.py └── uvr5 │ ├── bs_roformer │ ├── __init__.py │ ├── attend.py │ ├── bs_roformer.py │ └── mel_band_roformer.py │ ├── bsroformer.py │ ├── lib │ ├── lib_v5 │ │ ├── dataset.py │ │ ├── layers.py │ │ ├── layers_123812KB.py │ │ ├── layers_123821KB.py │ │ ├── layers_33966KB.py │ │ ├── layers_537227KB.py │ │ ├── layers_537238KB.py │ │ ├── layers_new.py │ │ ├── model_param_init.py │ │ ├── modelparams │ │ │ ├── 1band_sr16000_hl512.json │ │ │ ├── 1band_sr32000_hl512.json │ │ │ ├── 1band_sr33075_hl384.json │ │ │ ├── 1band_sr44100_hl1024.json │ │ │ ├── 1band_sr44100_hl256.json │ │ │ ├── 1band_sr44100_hl512.json │ │ │ ├── 1band_sr44100_hl512_cut.json │ │ │ ├── 2band_32000.json │ │ │ ├── 2band_44100_lofi.json │ │ │ ├── 2band_48000.json │ │ │ ├── 3band_44100.json │ │ │ ├── 3band_44100_mid.json │ │ │ ├── 3band_44100_msb2.json │ │ │ ├── 4band_44100.json │ │ │ ├── 4band_44100_mid.json │ │ │ ├── 4band_44100_msb.json │ │ │ ├── 4band_44100_msb2.json │ │ │ ├── 4band_44100_reverse.json │ │ │ ├── 4band_44100_sw.json │ │ │ ├── 4band_v2.json │ │ │ ├── 4band_v2_sn.json │ │ │ ├── 4band_v3.json │ │ │ └── ensemble.json │ │ ├── nets.py │ │ ├── nets_123812KB.py │ │ ├── nets_123821KB.py │ │ ├── nets_33966KB.py │ │ ├── nets_537227KB.py │ │ ├── nets_537238KB.py │ │ ├── nets_61968KB.py │ │ ├── nets_new.py │ │ └── 
spec_utils.py │ ├── name_params.json │ └── utils.py │ ├── mdxnet.py │ ├── uvr5_weights │ └── .gitignore │ ├── vr.py │ └── webui.py └── webui.py /.github/workflows/build_windows_packages.yaml: -------------------------------------------------------------------------------- 1 | name: Build and Upload Windows Package 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | date: 7 | description: "Date suffix (optional)" 8 | required: false 9 | default: "" 10 | suffix: 11 | description: "Package name suffix (optional)" 12 | required: false 13 | default: "" 14 | 15 | jobs: 16 | build: 17 | runs-on: windows-latest 18 | strategy: 19 | matrix: 20 | torch_cuda: [cu124, cu128] 21 | env: 22 | TORCH_CUDA: ${{ matrix.torch_cuda }} 23 | MODELSCOPE_USERNAME: ${{ secrets.MODELSCOPE_USERNAME }} 24 | MODELSCOPE_TOKEN: ${{ secrets.MODELSCOPE_TOKEN }} 25 | HUGGINGFACE_USERNAME: ${{ secrets.HUGGINGFACE_USERNAME }} 26 | HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} 27 | DATE_SUFFIX: ${{ github.event.inputs.date }} 28 | PKG_SUFFIX: ${{ github.event.inputs.suffix }} 29 | 30 | steps: 31 | - name: Checkout 32 | uses: actions/checkout@v4 33 | 34 | - name: Run Build and Upload Script 35 | shell: pwsh 36 | run: | 37 | Move-Item .github/build_windows_packages.ps1 ../build_windows_packages.ps1 38 | ../build_windows_packages.ps1 -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autoupdate_schedule: monthly 3 | 4 | repos: 5 | - repo: https://github.com/astral-sh/ruff-pre-commit 6 | rev: v0.11.7 7 | hooks: 8 | # Run the linter. 9 | - id: ruff 10 | types_or: [ python, pyi ] 11 | args: [ --fix ] 12 | # Run the formatter. 13 | - id: ruff-format 14 | types_or: [ python, pyi ] 15 | args: [ --line-length, "120", --target-version, "py310" ] 16 | -------------------------------------------------------------------------------- /Colab-WebUI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "view-in-github" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# GPT-SoVITS WebUI" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "_o6a8GS2lWQM" 24 | }, 25 | "source": [ 26 | "## Env Setup (Run Once Only)\n", 27 | "## 环境配置, 只需运行一次" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "### 1." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "%%writefile /content/setup.sh\n", 44 | "set -e\n", 45 | "\n", 46 | "cd /content\n", 47 | "\n", 48 | "git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", 49 | "\n", 50 | "cd GPT-SoVITS\n", 51 | "\n", 52 | "if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n", 53 | " :\n", 54 | "else\n", 55 | " conda create -n GPTSoVITS python=3.10 -y\n", 56 | "fi\n", 57 | "\n", 58 | "source activate GPTSoVITS\n", 59 | "\n", 60 | "pip install ipykernel\n", 61 | "\n", 62 | "bash install.sh --device CU126 --source HF --download-uvr5" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "### 2." 
70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "%pip install -q condacolab\n", 79 | "import condacolab\n", 80 | "condacolab.install_from_url(\"https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh\")\n", 81 | "!cd /content && bash setup.sh" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Launch WebUI\n", 89 | "## 启动 WebUI" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "id": "4oRGUzkrk8C7" 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "!cd /content/GPT-SoVITS && source activate GPTSoVITS && export is_share=True && python webui.py" 101 | ] 102 | } 103 | ], 104 | "metadata": { 105 | "accelerator": "GPU", 106 | "colab": { 107 | "include_colab_link": true, 108 | "provenance": [] 109 | }, 110 | "kernelspec": { 111 | "display_name": "Python 3", 112 | "name": "python3" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 0 117 | } 118 | -------------------------------------------------------------------------------- /Docker/install_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" 4 | 5 | cd "$SCRIPT_DIR" || exit 1 6 | 7 | cd .. || exit 1 8 | 9 | set -e 10 | 11 | source "$HOME/miniconda3/etc/profile.d/conda.sh" 12 | 13 | mkdir -p GPT_SoVITS 14 | 15 | mkdir -p GPT_SoVITS/text 16 | 17 | ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models 18 | 19 | ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel 20 | 21 | bash install.sh --device "CU${CUDA_VERSION//./}" --source HF 22 | 23 | pip cache purge 24 | 25 | pip show torch 26 | 27 | rm -rf /tmp/* /var/tmp/* 28 | 29 | rm -rf "$HOME/miniconda3/pkgs" 30 | 31 | mkdir -p "$HOME/miniconda3/pkgs" 32 | 33 | rm -rf /root/.conda /root/.cache 34 | -------------------------------------------------------------------------------- /Docker/miniconda_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" 6 | 7 | cd "$SCRIPT_DIR" || exit 1 8 | 9 | cd .. || exit 1 10 | 11 | if [ -d "$HOME/miniconda3" ]; then 12 | exit 0 13 | fi 14 | 15 | WORKFLOW=${WORKFLOW:-"false"} 16 | TARGETPLATFORM=${TARGETPLATFORM:-"linux/amd64"} 17 | 18 | if [ "$WORKFLOW" = "true" ]; then 19 | WGET_CMD=(wget -nv --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404) 20 | else 21 | WGET_CMD=(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404) 22 | fi 23 | 24 | if [ "$TARGETPLATFORM" = "linux/amd64" ]; then 25 | "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-x86_64.sh 26 | elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then 27 | "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-aarch64.sh 28 | else 29 | exit 1 30 | fi 31 | 32 | LOG_PATH="/tmp/miniconda-install.log" 33 | 34 | bash miniconda.sh -b -p "$HOME/miniconda3" >"$LOG_PATH" 2>&1 35 | 36 | if [ $? 
-eq 0 ]; then 37 | echo "== Miniconda Installed ==" 38 | else 39 | echo "Failed to Install miniconda" 40 | tail -n 50 "$LOG_PATH" 41 | exit 1 42 | fi 43 | 44 | rm miniconda.sh 45 | 46 | source "$HOME/miniconda3/etc/profile.d/conda.sh" 47 | 48 | "$HOME/miniconda3/bin/conda" config --add channels conda-forge 49 | 50 | "$HOME/miniconda3/bin/conda" update -q --all -y 1>/dev/null 51 | 52 | "$HOME/miniconda3/bin/conda" install python=3.11 -q -y 53 | 54 | "$HOME/miniconda3/bin/conda" install gcc=14 gxx ffmpeg cmake make unzip -q -y 55 | 56 | if [ "$CUDA_VERSION" = "12.8" ]; then 57 | "$HOME/miniconda3/bin/pip" install torch torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu128 58 | elif [ "$CUDA_VERSION" = "12.6" ]; then 59 | "$HOME/miniconda3/bin/pip" install torch==2.6 torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu126 60 | fi 61 | 62 | "$HOME/miniconda3/bin/pip" cache purge 63 | 64 | rm $LOG_PATH 65 | 66 | rm -rf "$HOME/miniconda3/pkgs" 67 | 68 | mkdir -p "$HOME/miniconda3/pkgs" 69 | 70 | rm -rf "$HOME/.conda" "$HOME/.cache" 71 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION=12.6 2 | ARG TORCH_BASE=full 3 | 4 | FROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE} 5 | 6 | LABEL maintainer="XXXXRT" 7 | LABEL version="V4" 8 | LABEL description="Docker image for GPT-SoVITS" 9 | 10 | ARG CUDA_VERSION=12.6 11 | 12 | ENV CUDA_VERSION=${CUDA_VERSION} 13 | 14 | SHELL ["/bin/bash", "-c"] 15 | 16 | WORKDIR /workspace/GPT-SoVITS 17 | 18 | COPY Docker /workspace/GPT-SoVITS/Docker/ 19 | 20 | ARG LITE=false 21 | ENV LITE=${LITE} 22 | 23 | ARG WORKFLOW=false 24 | ENV WORKFLOW=${WORKFLOW} 25 | 26 | ARG TARGETPLATFORM 27 | ENV TARGETPLATFORM=${TARGETPLATFORM} 28 | 29 | RUN bash Docker/miniconda_install.sh 30 | 31 | COPY extra-req.txt /workspace/GPT-SoVITS/ 32 | 33 | COPY requirements.txt /workspace/GPT-SoVITS/ 34 | 35 | COPY install.sh /workspace/GPT-SoVITS/ 36 | 37 | RUN bash Docker/install_wrapper.sh 38 | 39 | EXPOSE 9871 9872 9873 9874 9880 40 | 41 | ENV PYTHONPATH="/workspace/GPT-SoVITS" 42 | 43 | RUN conda init bash && echo "conda activate base" >> ~/.bashrc 44 | 45 | WORKDIR /workspace 46 | 47 | RUN rm -rf /workspace/GPT-SoVITS 48 | 49 | WORKDIR /workspace/GPT-SoVITS 50 | 51 | COPY . 
/workspace/GPT-SoVITS 52 | 53 | CMD ["/bin/bash", "-c", "\ 54 | rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \ 55 | rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \ 56 | rm -rf /workspace/GPT-SoVITS/tools/asr/models && \ 57 | rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \ 58 | ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \ 59 | ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \ 60 | ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \ 61 | ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \ 62 | exec bash"] -------------------------------------------------------------------------------- /GPT_SoVITS/AR/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/data/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/data/data_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | from pytorch_lightning import LightningDataModule 4 | from torch.utils.data import DataLoader 5 | 6 | from AR.data.bucket_sampler import DistributedBucketSampler 7 | from AR.data.dataset import Text2SemanticDataset 8 | 9 | 10 | class Text2SemanticDataModule(LightningDataModule): 11 | def __init__( 12 | self, 13 | config, 14 | train_semantic_path, 15 | train_phoneme_path, 16 | dev_semantic_path=None, 17 | dev_phoneme_path=None, 18 | ): 19 | super().__init__() 20 | self.config = config 21 | self.train_semantic_path = train_semantic_path 22 | self.train_phoneme_path = train_phoneme_path 23 | self.dev_semantic_path = dev_semantic_path 24 | self.dev_phoneme_path = dev_phoneme_path 25 | self.num_workers = self.config["data"]["num_workers"] 26 | 27 | def prepare_data(self): 28 | pass 29 | 30 | def setup(self, stage=None, output_logs=False): 31 | self._train_dataset = Text2SemanticDataset( 32 | phoneme_path=self.train_phoneme_path, 33 | semantic_path=self.train_semantic_path, 34 | max_sec=self.config["data"]["max_sec"], 35 | pad_val=self.config["data"]["pad_val"], 36 | ) 37 | self._dev_dataset = self._train_dataset 38 | # self._dev_dataset = Text2SemanticDataset( 39 | # phoneme_path=self.dev_phoneme_path, 40 | # semantic_path=self.dev_semantic_path, 41 | # max_sample=self.config['data']['max_eval_sample'], 42 | # max_sec=self.config['data']['max_sec'], 43 | # pad_val=self.config['data']['pad_val']) 44 | 45 | def train_dataloader(self): 46 | batch_size = ( 47 | self.config["train"]["batch_size"] // 2 48 | if self.config["train"].get("if_dpo", False) is True 49 | else self.config["train"]["batch_size"] 50 | ) 51 | batch_size = max(min(batch_size, len(self._train_dataset) // 4), 1) # 防止不保存 52 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) 53 | return DataLoader( 54 | self._train_dataset, 55 | 
batch_size=batch_size, 56 | sampler=sampler, 57 | collate_fn=self._train_dataset.collate, 58 | num_workers=self.num_workers, 59 | persistent_workers=True, 60 | prefetch_factor=16, 61 | ) 62 | 63 | def val_dataloader(self): 64 | return DataLoader( 65 | self._dev_dataset, 66 | batch_size=1, 67 | shuffle=False, 68 | collate_fn=self._train_dataset.collate, 69 | num_workers=max(self.num_workers, 12), 70 | persistent_workers=True, 71 | prefetch_factor=16, 72 | ) 73 | 74 | # 这个会使用到嘛? 75 | def test_dataloader(self): 76 | return DataLoader( 77 | self._dev_dataset, 78 | batch_size=1, 79 | shuffle=False, 80 | collate_fn=self._train_dataset.collate, 81 | ) 82 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/models/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import os 4 | import sys 5 | 6 | now_dir = os.getcwd() 7 | sys.path.append(now_dir) 8 | from typing import Dict 9 | 10 | import torch 11 | from pytorch_lightning import LightningModule 12 | 13 | from AR.models.t2s_model_onnx import Text2SemanticDecoder 14 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule 15 | from AR.modules.optim import ScaledAdam 16 | 17 | 18 | class Text2SemanticLightningModule(LightningModule): 19 | def __init__(self, config, output_dir, is_train=True): 20 | super().__init__() 21 | self.config = config 22 | self.top_k = 3 23 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) 24 | pretrained_s1 = config.get("pretrained_s1") 25 | if pretrained_s1 and is_train: 26 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) 27 | print( 28 | self.load_state_dict( 29 | torch.load( 30 | pretrained_s1, 31 | map_location="cpu", 32 | )["weight"], 33 | ), 34 | ) 35 | if is_train: 36 | self.automatic_optimization = False 37 | self.save_hyperparameters() 38 | self.eval_dir = output_dir / "eval" 39 | self.eval_dir.mkdir(parents=True, exist_ok=True) 40 | 41 | def training_step(self, batch: Dict, batch_idx: int): 42 | opt = self.optimizers() 43 | scheduler = self.lr_schedulers() 44 | loss, acc = self.model.forward( 45 | batch["phoneme_ids"], 46 | batch["phoneme_ids_len"], 47 | batch["semantic_ids"], 48 | batch["semantic_ids_len"], 49 | batch["bert_feature"], 50 | ) 51 | self.manual_backward(loss) 52 | if batch_idx > 0 and batch_idx % 4 == 0: 53 | opt.step() 54 | opt.zero_grad() 55 | scheduler.step() 56 | 57 | self.log( 58 | "total_loss", 59 | loss, 60 | on_step=True, 61 | on_epoch=True, 62 | prog_bar=True, 63 | sync_dist=True, 64 | ) 65 | self.log( 66 | "lr", 67 | scheduler.get_last_lr()[0], 68 | on_epoch=True, 69 | prog_bar=True, 70 | sync_dist=True, 71 | ) 72 | self.log( 73 | f"top_{self.top_k}_acc", 74 | acc, 75 | on_step=True, 76 | on_epoch=True, 77 | prog_bar=True, 78 | sync_dist=True, 79 | ) 80 | 81 | def validation_step(self, batch: Dict, batch_idx: int): 82 | return 83 | 84 | def configure_optimizers(self): 85 | model_parameters = self.model.parameters() 86 | 
parameters_names = [] 87 | parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()]) 88 | lm_opt = ScaledAdam( 89 | model_parameters, 90 | lr=0.01, 91 | betas=(0.9, 0.95), 92 | clipping_scale=2.0, 93 | parameters_names=parameters_names, 94 | show_dominant_parameters=False, 95 | clipping_update_period=1000, 96 | ) 97 | 98 | return { 99 | "optimizer": lm_opt, 100 | "lr_scheduler": { 101 | "scheduler": WarmupCosineLRSchedule( 102 | lm_opt, 103 | init_lr=self.config["optimizer"]["lr_init"], 104 | peak_lr=self.config["optimizer"]["lr"], 105 | end_lr=self.config["optimizer"]["lr_end"], 106 | warmup_steps=self.config["optimizer"]["warmup_steps"], 107 | total_steps=self.config["optimizer"]["decay_steps"], 108 | ) 109 | }, 110 | } 111 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/modules/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/embedding.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | 50 | self.reverse = False 51 | self.pe = None 52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000)) 53 | 54 | def extend_pe(self, x): 55 | """Reset the positional encodings.""" 56 | if self.pe is not None: 57 | if self.pe.size(1) >= x.size(1): 58 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 60 | return 61 | pe = torch.zeros(x.size(1), self.embedding_dim) 62 | if self.reverse: 63 | position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1) 64 | else: 65 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) 66 | div_term = torch.exp( 67 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embedding_dim) 68 | ) 69 | pe[:, 0::2] = torch.sin(position * div_term) 70 | pe[:, 1::2] = torch.cos(position * div_term) 71 | pe = pe.unsqueeze(0) 72 | self.pe = 
pe.to(device=x.device, dtype=x.dtype).detach() 73 | 74 | def forward(self, x: torch.Tensor) -> torch.Tensor: 75 | self.extend_pe(x) 76 | output = x.unsqueeze(-1) if x.ndim == 2 else x 77 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] 78 | return self.dropout(output) 79 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/embedding_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | self.reverse = False 50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) 51 | 52 | def extend_pe(self, x): 53 | position = torch.cumsum(torch.ones_like(x[:, :, 0]), dim=1).transpose(0, 1) 54 | scpe = (position * self.div_term).unsqueeze(0) 55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) 56 | pe = pe.contiguous().view(1, -1, self.embedding_dim) 57 | return pe 58 | 59 | def forward(self, x: torch.Tensor) -> torch.Tensor: 60 | pe = self.extend_pe(x) 61 | output = x.unsqueeze(-1) if x.ndim == 2 else x 62 | output = output * self.x_scale + self.alpha * pe 63 | return self.dropout(output) 64 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import math 4 | 5 | import torch 6 | from matplotlib import pyplot as plt 7 | from torch import nn 8 | from torch.optim import Adam 9 | 10 | 11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): 12 | """ 13 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. 
14 | """ 15 | 16 | def __init__( 17 | self, 18 | optimizer, 19 | init_lr, 20 | peak_lr, 21 | end_lr, 22 | warmup_steps=10000, 23 | total_steps=400000, 24 | current_step=0, 25 | ): 26 | self.init_lr = init_lr 27 | self.peak_lr = peak_lr 28 | self.end_lr = end_lr 29 | self.optimizer = optimizer 30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps 31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) 32 | self._current_step = current_step 33 | self.lr = init_lr 34 | self.warmup_steps = warmup_steps 35 | self.total_steps = total_steps 36 | self._last_lr = [self.lr] 37 | 38 | def set_lr(self, lr): 39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups] 40 | for g in self.optimizer.param_groups: 41 | # g['lr'] = lr 42 | g["lr"] = self.end_lr ###锁定用线性 43 | 44 | def step(self): 45 | if self._current_step < self.warmup_steps: 46 | lr = self.init_lr + self._warmup_rate * self._current_step 47 | 48 | elif self._current_step > self.total_steps: 49 | lr = self.end_lr 50 | 51 | else: 52 | decay_ratio = (self._current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps) 53 | if decay_ratio < 0.0 or decay_ratio > 1.0: 54 | raise RuntimeError("Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings.") 55 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) 56 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) 57 | 58 | self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! 59 | self.set_lr(lr) 60 | self.lr = lr 61 | self._current_step += 1 62 | return self.lr 63 | 64 | 65 | if __name__ == "__main__": 66 | m = nn.Linear(10, 10) 67 | opt = Adam(m.parameters(), lr=1e-4) 68 | s = WarmupCosineLRSchedule( 69 | opt, 70 | 1e-6, 71 | 2e-4, 72 | 1e-6, 73 | warmup_steps=2000, 74 | total_steps=20000, 75 | current_step=0, 76 | ) 77 | lrs = [] 78 | for i in range(25000): 79 | s.step() 80 | lrs.append(s.lr) 81 | print(s.lr) 82 | 83 | plt.plot(lrs) 84 | plt.plot(range(0, 25000), lrs) 85 | plt.show() 86 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py: -------------------------------------------------------------------------------- 1 | from torch.nn.functional import * 2 | from torch.nn.functional import ( 3 | _canonical_mask, 4 | ) 5 | 6 | 7 | def multi_head_attention_forward_patched( 8 | query, 9 | key, 10 | value, 11 | embed_dim_to_check: int, 12 | num_heads: int, 13 | in_proj_weight, 14 | in_proj_bias: Optional[Tensor], 15 | bias_k: Optional[Tensor], 16 | bias_v: Optional[Tensor], 17 | add_zero_attn: bool, 18 | dropout_p: float, 19 | out_proj_weight: Tensor, 20 | out_proj_bias: Optional[Tensor], 21 | training: bool = True, 22 | key_padding_mask: Optional[Tensor] = None, 23 | need_weights: bool = True, 24 | attn_mask: Optional[Tensor] = None, 25 | use_separate_proj_weight: bool = False, 26 | q_proj_weight: Optional[Tensor] = None, 27 | k_proj_weight: Optional[Tensor] = None, 28 | v_proj_weight: Optional[Tensor] = None, 29 | static_k: Optional[Tensor] = None, 30 | static_v: Optional[Tensor] = None, 31 | average_attn_weights: bool = True, 32 | is_causal: bool = False, 33 | cache=None, 34 | ) -> Tuple[Tensor, Optional[Tensor]]: 35 | # set up shape vars 36 | _, _, embed_dim = query.shape 37 | attn_mask = _canonical_mask( 38 | mask=attn_mask, 39 | mask_name="attn_mask", 40 | other_type=None, 41 | other_name="", 42 | target_type=query.dtype, 43 | check_other=False, 44 | ) 45 | head_dim = embed_dim // num_heads 46 | 47 | proj_qkv = linear(query, in_proj_weight, 
in_proj_bias) 48 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() 49 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] 50 | 51 | if cache["first_infer"] == 1: 52 | cache["k"][cache["stage"]] = k 53 | cache["v"][cache["stage"]] = v 54 | else: 55 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) 56 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) 57 | k = cache["k"][cache["stage"]] 58 | v = cache["v"][cache["stage"]] 59 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] 60 | 61 | attn_mask = _canonical_mask( 62 | mask=attn_mask, 63 | mask_name="attn_mask", 64 | other_type=None, 65 | other_name="", 66 | target_type=q.dtype, 67 | check_other=False, 68 | ) 69 | attn_mask = attn_mask.unsqueeze(0) 70 | 71 | q = q.view(-1, num_heads, head_dim).transpose(0, 1) 72 | k = k.view(-1, num_heads, head_dim).transpose(0, 1) 73 | v = v.view(-1, num_heads, head_dim).transpose(0, 1) 74 | 75 | dropout_p = 0.0 76 | attn_mask = attn_mask.unsqueeze(0) 77 | q = q.view(num_heads, -1, head_dim).unsqueeze(0) 78 | k = k.view(num_heads, -1, head_dim).unsqueeze(0) 79 | v = v.view(num_heads, -1, head_dim).unsqueeze(0) 80 | attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal) 81 | attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) 82 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias) 83 | attn_output = attn_output.view(-1, 1, attn_output.size(1)) 84 | 85 | return attn_output 86 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/text_processing/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | import itertools 4 | import re 5 | from typing import Dict 6 | from typing import List 7 | 8 | import regex 9 | from gruut import sentences 10 | from gruut.const import Sentence 11 | from gruut.const import Word 12 | from AR.text_processing.symbols import SYMBOL_TO_ID 13 | 14 | 15 | class GruutPhonemizer: 16 | def __init__(self, language: str): 17 | self._phonemizer = sentences 18 | self.lang = language 19 | self.symbol_to_id = SYMBOL_TO_ID 20 | self._special_cases_dict: Dict[str] = { 21 | r"\.\.\.": "... ", 22 | ";": "; ", 23 | ":": ": ", 24 | ",": ", ", 25 | r"\.": ". ", 26 | "!": "! ", 27 | r"\?": "? 
", 28 | "—": "—", 29 | "…": "… ", 30 | "«": "«", 31 | "»": "»", 32 | } 33 | self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict.keys())}])" 34 | 35 | def _normalize_punctuation(self, text: str) -> str: 36 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) 37 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) 38 | text = regex.sub(r"\pZ+", r" ", text) 39 | return text.strip() 40 | 41 | def _convert_punctuation(self, word: Word) -> str: 42 | if not word.phonemes: 43 | return "" 44 | if word.phonemes[0] in ["‖", "|"]: 45 | return word.text.strip() 46 | 47 | phonemes = "".join(word.phonemes) 48 | # remove modifier characters ˈˌː with regex 49 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) 50 | return phonemes.strip() 51 | 52 | def phonemize(self, text: str, espeak: bool = False) -> str: 53 | text_to_phonemize: str = self._normalize_punctuation(text) 54 | sents: List[Sentence] = [sent for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)] 55 | words: List[str] = [self._convert_punctuation(word) for word in itertools.chain(*sents)] 56 | return " ".join(words) 57 | 58 | def transform(self, phonemes): 59 | # convert phonemes to ids 60 | # dictionary is in symbols.py 61 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] 62 | 63 | 64 | if __name__ == "__main__": 65 | phonemizer = GruutPhonemizer("en-us") 66 | # text -> IPA 67 | phonemes = phonemizer.phonemize("Hello, wor-ld ?") 68 | print("phonemes:", phonemes) 69 | print("len(phonemes):", len(phonemes)) 70 | phoneme_ids = phonemizer.transform(phonemes) 71 | print("phoneme_ids:", phoneme_ids) 72 | print("len(phoneme_ids):", len(phoneme_ids)) 73 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/text_processing/symbols.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py 2 | # reference: https://github.com/lifeiteng/vall-e 3 | PAD = "_" 4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” ' 5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 6 | IPA_LETTERS = ( 7 | "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 8 | ) 9 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) 10 | SPACE_ID = SYMBOLS.index(" ") 11 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} 12 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} 13 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def str2bool(str): 5 | return True if str.lower() == "true" else False 6 | 7 | 8 | def get_newest_ckpt(string_list): 9 | # 定义一个正则表达式模式,用于匹配字符串中的数字 10 | pattern = r"epoch=(\d+)-step=(\d+)\.ckpt" 11 | 12 | # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 13 | extracted_info = [] 14 | for string in string_list: 15 | match = re.match(pattern, string) 16 | if match: 17 | epoch = int(match.group(1)) 18 | step = int(match.group(2)) 19 | extracted_info.append((epoch, step, string)) 20 | # 按照 epoch 后面的数字和 step 后面的数字进行排序 21 | sorted_info = sorted(extracted_info, key=lambda x: (x[0], x[1]), reverse=True) 22 | # 获取最新的 ckpt 文件名 23 | newest_ckpt = sorted_info[0][2] 24 | return newest_ckpt 25 | 26 | 27 | # 文本存在且不为空时 return True 28 | def 
check_txt_file(file_path): 29 | try: 30 | with open(file_path, "r") as file: 31 | text = file.readline().strip() 32 | assert text.strip() != "" 33 | return text 34 | except Exception: 35 | return False 36 | return False 37 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/initialize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Initialize modules for espnet2 neural networks.""" 3 | 4 | import torch 5 | from typeguard import check_argument_types 6 | 7 | 8 | def initialize(model: torch.nn.Module, init: str): 9 | """Initialize weights of a neural network module. 10 | 11 | Parameters are initialized using the given method or distribution. 12 | 13 | Custom initialization routines can be implemented into submodules 14 | as function `espnet_initialization_fn` within the custom module. 15 | 16 | Args: 17 | model: Target. 18 | init: Method of initialization. 19 | """ 20 | assert check_argument_types() 21 | print("init with", init) 22 | 23 | # weight init 24 | for p in model.parameters(): 25 | if p.dim() > 1: 26 | if init == "xavier_uniform": 27 | torch.nn.init.xavier_uniform_(p.data) 28 | elif init == "xavier_normal": 29 | torch.nn.init.xavier_normal_(p.data) 30 | elif init == "kaiming_uniform": 31 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") 32 | elif init == "kaiming_normal": 33 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") 34 | else: 35 | raise ValueError("Unknown initialization: " + init) 36 | # bias init 37 | for name, p in model.named_parameters(): 38 | if ".bias" in name and p.dim() == 1: 39 | p.data.zero_() 40 | -------------------------------------------------------------------------------- /GPT_SoVITS/AR/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | import yaml 5 | 6 | 7 | def load_yaml_config(path): 8 | with open(path) as f: 9 | config = yaml.full_load(f) 10 | return config 11 | 12 | 13 | def save_config_to_yaml(config, path): 14 | assert path.endswith(".yaml") 15 | with open(path, "w") as f: 16 | f.write(yaml.dump(config)) 17 | f.close() 18 | 19 | 20 | def write_args(args, path): 21 | args_dict = dict((name, getattr(args, name)) for name in dir(args) if not name.startswith("_")) 22 | with open(path, "a") as args_file: 23 | args_file.write("==> torch version: {}\n".format(torch.__version__)) 24 | args_file.write("==> cudnn version: {}\n".format(torch.backends.cudnn.version())) 25 | args_file.write("==> Cmd:\n") 26 | args_file.write(str(sys.argv)) 27 | args_file.write("\n==> args:\n") 28 | for k, v in sorted(args_dict.items()): 29 | args_file.write(" %s: %s\n" % (str(k), str(v))) 30 | args_file.close() 31 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 NVIDIA CORPORATION. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import torch 5 | import torch.nn as nn 6 | from alias_free_activation.torch.resample import UpSample1d, DownSample1d 7 | 8 | # load fused CUDA kernel: this enables importing anti_alias_activation_cuda 9 | from alias_free_activation.cuda import load 10 | 11 | anti_alias_activation_cuda = load.load() 12 | 13 | 14 | class FusedAntiAliasActivation(torch.autograd.Function): 15 | """ 16 | Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs. 17 | The hyperparameters are hard-coded in the kernel to maximize speed. 18 | NOTE: The fused kernel is incorrect for Activation1d with different hyperparameters.
19 | """ 20 | 21 | @staticmethod 22 | def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta): 23 | activation_results = anti_alias_activation_cuda.forward(inputs, up_ftr, down_ftr, alpha, beta) 24 | 25 | return activation_results 26 | 27 | @staticmethod 28 | def backward(ctx, output_grads): 29 | raise NotImplementedError 30 | return output_grads, None, None 31 | 32 | 33 | class Activation1d(nn.Module): 34 | def __init__( 35 | self, 36 | activation, 37 | up_ratio: int = 2, 38 | down_ratio: int = 2, 39 | up_kernel_size: int = 12, 40 | down_kernel_size: int = 12, 41 | fused: bool = True, 42 | ): 43 | super().__init__() 44 | self.up_ratio = up_ratio 45 | self.down_ratio = down_ratio 46 | self.act = activation 47 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 48 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 49 | 50 | self.fused = fused # Whether to use fused CUDA kernel or not 51 | 52 | def forward(self, x): 53 | if not self.fused: 54 | x = self.upsample(x) 55 | x = self.act(x) 56 | x = self.downsample(x) 57 | return x 58 | else: 59 | if self.act.__class__.__name__ == "Snake": 60 | beta = self.act.alpha.data # Snake uses same params for alpha and beta 61 | else: 62 | beta = self.act.beta.data # Snakebeta uses different params for alpha and beta 63 | alpha = self.act.alpha.data 64 | if not self.act.alpha_logscale: # Exp baked into cuda kernel, cancel it out with a log 65 | alpha = torch.log(alpha) 66 | beta = torch.log(beta) 67 | 68 | x = FusedAntiAliasActivation.apply(x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta) 69 | return x 70 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <torch/extension.h> 18 | 19 | extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta); 20 | 21 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 22 | m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)"); 23 | } -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/build/_: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import pathlib 6 | import subprocess 7 | 8 | from torch.utils import cpp_extension 9 | 10 | """ 11 | Setting this param to a list has a problem of generating different compilation commands (with a different order of architectures) and leading to recompilation of fused kernels. 12 | Set it to an empty string to avoid recompilation and assign arch flags explicitly in extra_cuda_cflags below. 13 | """ 14 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 15 | 16 | 17 | def load(): 18 | # Check if CUDA 11 is installed for compute capability 8.0 19 | cc_flag = [] 20 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 21 | if int(bare_metal_major) >= 11: 22 | cc_flag.append("-gencode") 23 | cc_flag.append("arch=compute_80,code=sm_80") 24 | 25 | # Build path 26 | srcpath = pathlib.Path(__file__).parent.absolute() 27 | buildpath = srcpath / "build" 28 | _create_build_dir(buildpath) 29 | 30 | # Helper function to build the kernels.
31 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 32 | return cpp_extension.load( 33 | name=name, 34 | sources=sources, 35 | build_directory=buildpath, 36 | extra_cflags=[ 37 | "-O3", 38 | ], 39 | extra_cuda_cflags=[ 40 | "-O3", 41 | "-gencode", 42 | "arch=compute_70,code=sm_70", 43 | "--use_fast_math", 44 | ] 45 | + extra_cuda_flags 46 | + cc_flag, 47 | verbose=True, 48 | ) 49 | 50 | extra_cuda_flags = [ 51 | "-U__CUDA_NO_HALF_OPERATORS__", 52 | "-U__CUDA_NO_HALF_CONVERSIONS__", 53 | "--expt-relaxed-constexpr", 54 | "--expt-extended-lambda", 55 | ] 56 | 57 | sources = [ 58 | srcpath / "anti_alias_activation.cpp", 59 | srcpath / "anti_alias_activation_cuda.cu", 60 | ] 61 | anti_alias_activation_cuda = _cpp_extention_load_helper("anti_alias_activation_cuda", sources, extra_cuda_flags) 62 | 63 | return anti_alias_activation_cuda 64 | 65 | 66 | def _get_cuda_bare_metal_version(cuda_dir): 67 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) 68 | output = raw_output.split() 69 | release_idx = output.index("release") + 1 70 | release = output[release_idx].split(".") 71 | bare_metal_major = release[0] 72 | bare_metal_minor = release[1][0] 73 | 74 | return raw_output, bare_metal_major, bare_metal_minor 75 | 76 | 77 | def _create_build_dir(buildpath): 78 | try: 79 | os.mkdir(buildpath) 80 | except OSError: 81 | if not os.path.isdir(buildpath): 82 | print(f"Creation of the build directory {buildpath} failed") 83 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from .filter import * 5 | from .resample import * 6 | from .act import * 7 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from .resample import UpSample1d, DownSample1d 6 | 7 | 8 | class Activation1d(nn.Module): 9 | def __init__( 10 | self, 11 | activation, 12 | up_ratio: int = 2, 13 | down_ratio: int = 2, 14 | up_kernel_size: int = 12, 15 | down_kernel_size: int = 12, 16 | ): 17 | super().__init__() 18 | self.up_ratio = up_ratio 19 | self.down_ratio = down_ratio 20 | self.act = activation 21 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 22 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 23 | 24 | # x: [B,C,T] 25 | def forward(self, x): 26 | x = self.upsample(x) 27 | x = self.act(x) 28 | x = self.downsample(x) 29 | 30 | return x 31 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import math 8 | 9 | if "sinc" in dir(torch): 10 | sinc = torch.sinc 11 | else: 12 | # This code is adopted from adefossez's julius.core.sinc under the MIT License 13 | # https://adefossez.github.io/julius/julius/core.html 14 | # LICENSE is in incl_licenses directory. 15 | def sinc(x: torch.Tensor): 16 | """ 17 | Implementation of sinc, i.e. sin(pi * x) / (pi * x) 18 | __Warning__: Different to julius.sinc, the input is multiplied by `pi`! 19 | """ 20 | return torch.where( 21 | x == 0, 22 | torch.tensor(1.0, device=x.device, dtype=x.dtype), 23 | torch.sin(math.pi * x) / math.pi / x, 24 | ) 25 | 26 | 27 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License 28 | # https://adefossez.github.io/julius/julius/lowpass.html 29 | # LICENSE is in incl_licenses directory. 30 | def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] 31 | even = kernel_size % 2 == 0 32 | half_size = kernel_size // 2 33 | 34 | # For kaiser window 35 | delta_f = 4 * half_width 36 | A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 37 | if A > 50.0: 38 | beta = 0.1102 * (A - 8.7) 39 | elif A >= 21.0: 40 | beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0) 41 | else: 42 | beta = 0.0 43 | window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) 44 | 45 | # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio 46 | if even: 47 | time = torch.arange(-half_size, half_size) + 0.5 48 | else: 49 | time = torch.arange(kernel_size) - half_size 50 | if cutoff == 0: 51 | filter_ = torch.zeros_like(time) 52 | else: 53 | filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) 54 | """ 55 | Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal. 56 | """ 57 | filter_ /= filter_.sum() 58 | filter = filter_.view(1, 1, kernel_size) 59 | 60 | return filter 61 | 62 | 63 | class LowPassFilter1d(nn.Module): 64 | def __init__( 65 | self, 66 | cutoff=0.5, 67 | half_width=0.6, 68 | stride: int = 1, 69 | padding: bool = True, 70 | padding_mode: str = "replicate", 71 | kernel_size: int = 12, 72 | ): 73 | """ 74 | kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible. 
75 | """ 76 | super().__init__() 77 | if cutoff < -0.0: 78 | raise ValueError("Minimum cutoff must be larger than zero.") 79 | if cutoff > 0.5: 80 | raise ValueError("A cutoff above 0.5 does not make sense.") 81 | self.kernel_size = kernel_size 82 | self.even = kernel_size % 2 == 0 83 | self.pad_left = kernel_size // 2 - int(self.even) 84 | self.pad_right = kernel_size // 2 85 | self.stride = stride 86 | self.padding = padding 87 | self.padding_mode = padding_mode 88 | filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) 89 | self.register_buffer("filter", filter) 90 | 91 | # Input [B, C, T] 92 | def forward(self, x): 93 | _, C, _ = x.shape 94 | 95 | if self.padding: 96 | x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode) 97 | out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 98 | 99 | return out 100 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | from .filter import LowPassFilter1d 7 | from .filter import kaiser_sinc_filter1d 8 | 9 | 10 | class UpSample1d(nn.Module): 11 | def __init__(self, ratio=2, kernel_size=None): 12 | super().__init__() 13 | self.ratio = ratio 14 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 15 | self.stride = ratio 16 | self.pad = self.kernel_size // ratio - 1 17 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 18 | self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 19 | filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size) 20 | self.register_buffer("filter", filter) 21 | 22 | # x: [B, C, T] 23 | def forward(self, x): 24 | _, C, _ = x.shape 25 | 26 | x = F.pad(x, (self.pad, self.pad), mode="replicate") 27 | x = self.ratio * F.conv_transpose1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) 28 | x = x[..., self.pad_left : -self.pad_right] 29 | 30 | return x 31 | 32 | 33 | class DownSample1d(nn.Module): 34 | def __init__(self, ratio=2, kernel_size=None): 35 | super().__init__() 36 | self.ratio = ratio 37 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size 38 | self.lowpass = LowPassFilter1d( 39 | cutoff=0.5 / ratio, 40 | half_width=0.6 / ratio, 41 | stride=ratio, 42 | kernel_size=self.kernel_size, 43 | ) 44 | 45 | def forward(self, x): 46 | xx = self.lowpass(x) 47 | 48 | return xx 49 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_22khz_80band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], 
[512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 80, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 22050, 33 | 34 | "fmin": 0, 35 | "fmax": 8000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_24khz_100band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 100, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 24000, 33 | 34 | "fmin": 0, 35 | "fmax": 12000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_base_22khz_80band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 80, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 22050, 33 | 34 | "fmin": 0, 35 | "fmax": 8000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_base_24khz_100band.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | 
"upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "activation": "snakebeta", 18 | "snake_logscale": true, 19 | 20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], 21 | "mpd_reshapes": [2, 3, 5, 7, 11], 22 | "use_spectral_norm": false, 23 | "discriminator_channel_mult": 1, 24 | 25 | "segment_size": 8192, 26 | "num_mels": 100, 27 | "num_freq": 1025, 28 | "n_fft": 1024, 29 | "hop_size": 256, 30 | "win_size": 1024, 31 | 32 | "sampling_rate": 24000, 33 | 34 | "fmin": 0, 35 | "fmax": 12000, 36 | "fmax_for_loss": null, 37 | 38 | "num_workers": 4, 39 | 40 | "dist_config": { 41 | "dist_backend": "nccl", 42 | "dist_url": "tcp://localhost:54321", 43 | "world_size": 1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_22khz_80band_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 80, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 22050, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_22khz_80band_fmax8k_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | 
"cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 80, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 22050, 49 | 50 | "fmin": 0, 51 | "fmax": 8000, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_24khz_100band_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 100, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 24000, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_44khz_128band_256x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 
1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 128, 43 | "num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 44100, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/configs/bigvgan_v2_44khz_128band_512x.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 4, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,4,2,2,2,2], 12 | "upsample_kernel_sizes": [16,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 128, 43 | "num_freq": 2049, 44 | "n_fft": 2048, 45 | "hop_size": 512, 46 | "win_size": 2048, 47 | 48 | "sampling_rate": 44100, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "num_workers": 4, 55 | 56 | "dist_config": { 57 | "dist_backend": "nccl", 58 | "dist_url": "tcp://localhost:54321", 59 | "world_size": 1 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/env.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 
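# In short (summary comment, not part of the upstream file): AttrDict exposes the
# loaded JSON config through attribute access (h.sampling_rate instead of
# h["sampling_rate"]), which is how the inference scripts consume it, and
# build_env() copies the chosen config into the given output directory under
# config_name, so the settings are stored alongside the run's artifacts
# (inference.py and inference_e2e.py later read that config.json from the
# checkpoint's directory).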
3 | 4 | import os 5 | import shutil 6 | 7 | 8 | class AttrDict(dict): 9 | def __init__(self, *args, **kwargs): 10 | super(AttrDict, self).__init__(*args, **kwargs) 11 | self.__dict__ = self 12 | 13 | 14 | def build_env(config, config_name, path): 15 | t_path = os.path.join(path, config_name) 16 | if config != t_path: 17 | os.makedirs(path, exist_ok=True) 18 | shutil.copyfile(config, os.path.join(path, config_name)) 19 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_1: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_2: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Edward Dixon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_4: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Seungwon Park 박승원 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_5: -------------------------------------------------------------------------------- 1 | Copyright 2020 Alexandre Défossez 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 4 | associated documentation files (the "Software"), to deal in the Software without restriction, 5 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 6 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 7 | furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or 10 | substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 13 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 14 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 15 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_6: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023-present, Descript 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_7: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Charactr Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_8: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Amphion 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/inference.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | 6 | import os 7 | import argparse 8 | import json 9 | import torch 10 | import librosa 11 | from utils import load_checkpoint 12 | from meldataset import get_mel_spectrogram 13 | from scipy.io.wavfile import write 14 | from env import AttrDict 15 | from meldataset import MAX_WAV_VALUE 16 | from bigvgan import BigVGAN as Generator 17 | 18 | h = None 19 | device = None 20 | torch.backends.cudnn.benchmark = False 21 | 22 | 23 | def inference(a, h): 24 | generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device) 25 | 26 | state_dict_g = load_checkpoint(a.checkpoint_file, device) 27 | generator.load_state_dict(state_dict_g["generator"]) 28 | 29 | filelist = os.listdir(a.input_wavs_dir) 30 | 31 | os.makedirs(a.output_dir, exist_ok=True) 32 | 33 | generator.eval() 34 | generator.remove_weight_norm() 35 | with torch.no_grad(): 36 | for i, filname in enumerate(filelist): 37 | # Load the ground truth audio and resample if necessary 38 | wav, sr = librosa.load(os.path.join(a.input_wavs_dir, filname), sr=h.sampling_rate, mono=True) 39 | wav = torch.FloatTensor(wav).to(device) 40 | # Compute mel spectrogram from the ground truth audio 41 | x = get_mel_spectrogram(wav.unsqueeze(0), generator.h) 42 | 43 | y_g_hat = generator(x) 44 | 45 | audio = y_g_hat.squeeze() 46 | audio = audio * MAX_WAV_VALUE 47 | audio = audio.cpu().numpy().astype("int16") 48 | 49 | output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + "_generated.wav") 50 | write(output_file, h.sampling_rate, audio) 51 | print(output_file) 52 | 53 | 54 | def main(): 55 | print("Initializing Inference Process..") 56 | 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("--input_wavs_dir", default="test_files") 59 | parser.add_argument("--output_dir", 
default="generated_files") 60 | parser.add_argument("--checkpoint_file", required=True) 61 | parser.add_argument("--use_cuda_kernel", action="store_true", default=False) 62 | 63 | a = parser.parse_args() 64 | 65 | config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json") 66 | with open(config_file) as f: 67 | data = f.read() 68 | 69 | global h 70 | json_config = json.loads(data) 71 | h = AttrDict(json_config) 72 | 73 | torch.manual_seed(h.seed) 74 | global device 75 | if torch.cuda.is_available(): 76 | torch.cuda.manual_seed(h.seed) 77 | device = torch.device("cuda") 78 | else: 79 | device = torch.device("cpu") 80 | 81 | inference(a, h) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/inference_e2e.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | 6 | import glob 7 | import os 8 | import numpy as np 9 | import argparse 10 | import json 11 | import torch 12 | from scipy.io.wavfile import write 13 | from env import AttrDict 14 | from meldataset import MAX_WAV_VALUE 15 | from bigvgan import BigVGAN as Generator 16 | 17 | h = None 18 | device = None 19 | torch.backends.cudnn.benchmark = False 20 | 21 | 22 | def load_checkpoint(filepath, device): 23 | assert os.path.isfile(filepath) 24 | print(f"Loading '{filepath}'") 25 | checkpoint_dict = torch.load(filepath, map_location=device) 26 | print("Complete.") 27 | return checkpoint_dict 28 | 29 | 30 | def scan_checkpoint(cp_dir, prefix): 31 | pattern = os.path.join(cp_dir, prefix + "*") 32 | cp_list = glob.glob(pattern) 33 | if len(cp_list) == 0: 34 | return "" 35 | return sorted(cp_list)[-1] 36 | 37 | 38 | def inference(a, h): 39 | generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device) 40 | 41 | state_dict_g = load_checkpoint(a.checkpoint_file, device) 42 | generator.load_state_dict(state_dict_g["generator"]) 43 | 44 | filelist = os.listdir(a.input_mels_dir) 45 | 46 | os.makedirs(a.output_dir, exist_ok=True) 47 | 48 | generator.eval() 49 | generator.remove_weight_norm() 50 | with torch.no_grad(): 51 | for i, filname in enumerate(filelist): 52 | # Load the mel spectrogram in .npy format 53 | x = np.load(os.path.join(a.input_mels_dir, filname)) 54 | x = torch.FloatTensor(x).to(device) 55 | if len(x.shape) == 2: 56 | x = x.unsqueeze(0) 57 | 58 | y_g_hat = generator(x) 59 | 60 | audio = y_g_hat.squeeze() 61 | audio = audio * MAX_WAV_VALUE 62 | audio = audio.cpu().numpy().astype("int16") 63 | 64 | output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + "_generated_e2e.wav") 65 | write(output_file, h.sampling_rate, audio) 66 | print(output_file) 67 | 68 | 69 | def main(): 70 | print("Initializing Inference Process..") 71 | 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument("--input_mels_dir", default="test_mel_files") 74 | parser.add_argument("--output_dir", default="generated_files_from_mel") 75 | parser.add_argument("--checkpoint_file", required=True) 76 | parser.add_argument("--use_cuda_kernel", action="store_true", default=False) 77 | 78 | a = parser.parse_args() 79 | 80 | config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json") 81 | with open(config_file) as f: 82 | data = f.read() 83 | 84 | global h 85 | 
json_config = json.loads(data) 86 | h = AttrDict(json_config) 87 | 88 | torch.manual_seed(h.seed) 89 | global device 90 | if torch.cuda.is_available(): 91 | torch.cuda.manual_seed(h.seed) 92 | device = torch.device("cuda") 93 | else: 94 | device = torch.device("cpu") 95 | 96 | inference(a, h) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/bias.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :--------------------------------------------------------------------------------------------------------- | :--------------------------------------------------- | 3 | | Participation considerations from adversely impacted groups protected classes in model design and testing: | None | 4 | | Measures taken to mitigate against unwanted bias: | No measures taken to mitigate against unwanted bias. | 5 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/explainability.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :---------------------------------------------------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 3 | | Intended Application & Domain: | Generating waveform from mel spectrogram. | 4 | | Model Type: | Convolutional Neural Network (CNN) | 5 | | Intended Users: | This model is intended for developers to synthesize and generate waveforms from the AI-generated mel spectrograms. | 6 | | Output: | Audio Waveform | 7 | | Describe how the model works: | Model generates audio waveform corresponding to the input mel spectrogram. | 8 | | Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | Not Applicable | 9 | | Technical Limitations: | This may not perform well on synthetically-generated mel spectrograms that deviate significantly from the profile of mel spectrograms on which this was trained. | 10 | | Verified to have met prescribed NVIDIA quality standards: | Yes | 11 | | Performance Metrics: | Perceptual Evaluation of Speech Quality (PESQ), Virtual Speech Quality Objective Listener (VISQOL), Multi-resolution STFT (MRSTFT), Mel cepstral distortion (MCD), Periodicity RMSE, Voice/Unvoiced F1 Score (V/UV F1) | 12 | | Potential Known Risks: | This model may generate low-quality or distorted soundwaves. | 13 | | Licensing: | https://github.com/NVIDIA/BigVGAN/blob/main/LICENSE | 14 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/privacy.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :------------------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------- | 3 | | Generatable or reverse engineerable personal information? 
| None | 4 | | Protected class data used to create this model? | None | 5 | | Was consent obtained for any personal data used? | Not Applicable (No Personal Data) | 6 | | How often is dataset reviewed? | Before Release | 7 | | Is a mechanism in place to honor data subject right of access or deletion of personal data? | Not Applicable | 8 | | If personal collected for the development of the model, was it collected directly by NVIDIA? | Not Applicable | 9 | | If personal collected for the development of the model by NVIDIA, do you maintain or have access to disclosures made to data subjects? | Not Applicable | 10 | | If personal collected for the development of this AI model, was it minimized to only what was required? | Not Applicable | 11 | | Is data in dataset traceable? | Yes | 12 | | Is there provenance for all datasets used in training? | Yes | 13 | | Does data labeling (annotation, metadata) comply with privacy laws? | Yes | 14 | | Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. | 15 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/nv-modelcard++/safety.md: -------------------------------------------------------------------------------- 1 | | Field | Response | 2 | | :---------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 3 | | Model Application(s): | Synethic Audio Generation | 4 | | Describe the life critical impact (if present). | Not Applicable | 5 | | Use Case Restrictions: | None | 6 | | Model and dataset restrictions: | The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to. | 7 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | numpy 3 | librosa>=0.8.1 4 | scipy 5 | tensorboard 6 | soundfile 7 | matplotlib 8 | pesq 9 | auraloss 10 | tqdm 11 | nnAudio 12 | ninja 13 | huggingface_hub>=0.23.4 -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/tests/test_activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import sys 6 | 7 | # to import modules from parent_dir 8 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(parent_dir) 10 | 11 | import torch 12 | from alias_free_activation.cuda import activation1d 13 | from activations import Snake 14 | 15 | 16 | def test_load_fused_kernels(): 17 | try: 18 | print("[Success] load_fused_kernels") 19 | except ImportError as e: 20 | print("[Fail] load_fused_kernels") 21 | raise e 22 | 23 | 24 | def test_anti_alias_activation(): 25 | data = torch.rand((10, 10, 200), device="cuda") 26 | 27 | # Check activations.Snake cuda vs. 
torch 28 | fused_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=True).cuda() 29 | fused_activation_output = fused_anti_alias_activation(data) 30 | 31 | torch_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=False).cuda() 32 | torch_activation_output = torch_anti_alias_activation(data) 33 | 34 | test_result = (fused_activation_output - torch_activation_output).abs() 35 | 36 | while test_result.dim() != 1: 37 | test_result = test_result.mean(dim=-1) 38 | 39 | diff = test_result.mean(dim=-1) 40 | 41 | if diff <= 1e-3: 42 | print( 43 | f"\n[Success] test_fused_anti_alias_activation" 44 | f"\n > mean_difference={diff}" 45 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}" 46 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 47 | ) 48 | else: 49 | print( 50 | f"\n[Fail] test_fused_anti_alias_activation" 51 | f"\n > mean_difference={diff}, " 52 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, " 53 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 54 | ) 55 | 56 | 57 | if __name__ == "__main__": 58 | from alias_free_activation.cuda import load 59 | 60 | load.load() 61 | test_load_fused_kernels() 62 | test_anti_alias_activation() 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | import os 5 | import sys 6 | 7 | # to import modules from parent_dir 8 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(parent_dir) 10 | 11 | import torch 12 | from alias_free_activation.cuda import activation1d 13 | from activations import SnakeBeta 14 | 15 | 16 | def test_load_fused_kernels(): 17 | try: 18 | print("[Success] load_fused_kernels") 19 | except ImportError as e: 20 | print("[Fail] load_fused_kernels") 21 | raise e 22 | 23 | 24 | def test_anti_alias_activation(): 25 | data = torch.rand((10, 10, 200), device="cuda") 26 | 27 | # Check activations, Snake CUDA vs. 
Torch 28 | fused_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=True).cuda() 29 | fused_activation_output = fused_anti_alias_activation(data) 30 | 31 | torch_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=False).cuda() 32 | torch_activation_output = torch_anti_alias_activation(data) 33 | 34 | test_result = (fused_activation_output - torch_activation_output).abs() 35 | 36 | while test_result.dim() != 1: 37 | test_result = test_result.mean(dim=-1) 38 | 39 | diff = test_result.mean(dim=-1) 40 | 41 | if diff <= 1e-3: 42 | print( 43 | f"\n[Success] test_fused_anti_alias_activation" 44 | f"\n > mean_difference={diff}" 45 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}" 46 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 47 | ) 48 | else: 49 | print( 50 | f"\n[Fail] test_fused_anti_alias_activation" 51 | f"\n > mean_difference={diff}, " 52 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, " 53 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}" 54 | ) 55 | 56 | 57 | if __name__ == "__main__": 58 | from alias_free_activation.cuda import load 59 | 60 | load.load() 61 | test_load_fused_kernels() 62 | test_anti_alias_activation() 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/BigVGAN/utils0.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import glob 5 | import os 6 | import matplotlib 7 | import torch 8 | from torch.nn.utils import weight_norm 9 | 10 | matplotlib.use("Agg") 11 | import matplotlib.pylab as plt 12 | from .meldataset import MAX_WAV_VALUE 13 | from scipy.io.wavfile import write 14 | 15 | 16 | def plot_spectrogram(spectrogram): 17 | fig, ax = plt.subplots(figsize=(10, 2)) 18 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") 19 | plt.colorbar(im, ax=ax) 20 | 21 | fig.canvas.draw() 22 | plt.close() 23 | 24 | return fig 25 | 26 | 27 | def plot_spectrogram_clipped(spectrogram, clip_max=2.0): 28 | fig, ax = plt.subplots(figsize=(10, 2)) 29 | im = ax.imshow( 30 | spectrogram, 31 | aspect="auto", 32 | origin="lower", 33 | interpolation="none", 34 | vmin=1e-6, 35 | vmax=clip_max, 36 | ) 37 | plt.colorbar(im, ax=ax) 38 | 39 | fig.canvas.draw() 40 | plt.close() 41 | 42 | return fig 43 | 44 | 45 | def init_weights(m, mean=0.0, std=0.01): 46 | classname = m.__class__.__name__ 47 | if classname.find("Conv") != -1: 48 | m.weight.data.normal_(mean, std) 49 | 50 | 51 | def apply_weight_norm(m): 52 | classname = m.__class__.__name__ 53 | if classname.find("Conv") != -1: 54 | weight_norm(m) 55 | 56 | 57 | def get_padding(kernel_size, dilation=1): 58 | return int((kernel_size * dilation - dilation) / 2) 59 | 60 | 61 | def load_checkpoint(filepath, device): 62 | assert os.path.isfile(filepath) 63 | print(f"Loading '{filepath}'") 64 | checkpoint_dict = torch.load(filepath, map_location=device) 65 | print("Complete.") 66 | return checkpoint_dict 67 | 68 | 69 | def save_checkpoint(filepath, obj): 70 | print(f"Saving checkpoint to {filepath}") 71 | torch.save(obj, filepath) 72 | print("Complete.") 73 | 74 | 75 | def scan_checkpoint(cp_dir, prefix, renamed_file=None): 76 | # Fallback to original scanning logic first 77 | pattern = os.path.join(cp_dir, prefix + "????????") 78 | cp_list = glob.glob(pattern) 79 | 80 | if 
len(cp_list) > 0: 81 | last_checkpoint_path = sorted(cp_list)[-1] 82 | print(f"[INFO] Resuming from checkpoint: '{last_checkpoint_path}'") 83 | return last_checkpoint_path 84 | 85 | # If no pattern-based checkpoints are found, check for renamed file 86 | if renamed_file: 87 | renamed_path = os.path.join(cp_dir, renamed_file) 88 | if os.path.isfile(renamed_path): 89 | print(f"[INFO] Resuming from renamed checkpoint: '{renamed_file}'") 90 | return renamed_path 91 | 92 | return None 93 | 94 | 95 | def save_audio(audio, path, sr): 96 | # wav: torch with 1d shape 97 | audio = audio * MAX_WAV_VALUE 98 | audio = audio.cpu().numpy().astype("int16") 99 | write(path, sr, audio) 100 | -------------------------------------------------------------------------------- /GPT_SoVITS/TTS_infer_pack/__init__.py: -------------------------------------------------------------------------------- 1 | from . import TTS, text_segmentation_method 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/.gitignore: -------------------------------------------------------------------------------- 1 | *.yaml -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 12 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1big.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 16 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1big2.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 12 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 6 28 | dropout: 0 29 | 
EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1longer-v2.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 20 4 | batch_size: 8 5 | save_every_n_epoch: 1 6 | precision: 16-mixed 7 | gradient_clip: 1.0 8 | optimizer: 9 | lr: 0.01 10 | lr_init: 0.00001 11 | lr_end: 0.0001 12 | warmup_steps: 2000 13 | decay_steps: 40000 14 | data: 15 | max_eval_sample: 8 16 | max_sec: 54 17 | num_workers: 4 18 | pad_val: 1024 # same with EOS in model 19 | model: 20 | vocab_size: 1025 21 | phoneme_vocab_size: 732 22 | embedding_dim: 512 23 | hidden_dim: 512 24 | head: 16 25 | linear_units: 2048 26 | n_layer: 24 27 | dropout: 0 28 | EOS: 1024 29 | random_bert: 0 30 | inference: 31 | top_k: 15 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1longer.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 20 4 | batch_size: 8 5 | save_every_n_epoch: 1 6 | precision: 16-mixed 7 | gradient_clip: 1.0 8 | optimizer: 9 | lr: 0.01 10 | lr_init: 0.00001 11 | lr_end: 0.0001 12 | warmup_steps: 2000 13 | decay_steps: 40000 14 | data: 15 | max_eval_sample: 8 16 | max_sec: 54 17 | num_workers: 4 18 | pad_val: 1024 # same with EOS in model 19 | model: 20 | vocab_size: 1025 21 | phoneme_vocab_size: 512 22 | embedding_dim: 512 23 | hidden_dim: 512 24 | head: 16 25 | linear_units: 2048 26 | n_layer: 24 27 | dropout: 0 28 | EOS: 1024 29 | random_bert: 0 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s1mq.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 100 4 | batch_size: 6 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 32 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 40 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | saving_path: "ckpt/" 22 | resume_checkpoint: null 23 | vocoder_config_path: "quantizer/new_ckpt/config.json" 24 | vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000" 25 | datadir: "/home/liweiche/GigaSpeech/wavs" 26 | metapath: "/home/liweiche/GigaSpeech/train2.json" 27 | val_metapath: "/home/liweiche/GigaSpeech/dev2.json" 28 | sampledir: "logs/" 29 | pretrained_path: null 30 | lr: 0.0001 31 | batch_size: 200.0 32 | train_bucket_size: 8192 33 | training_step: 800000 34 | optim_flat_percent: 0.0 35 | warmup_step: 50 36 | adam_beta1: 0.9 37 | adam_beta2: 0.98 38 | ffd_size: 3072 39 | hidden_size: 768 40 | enc_nlayers: 6 41 | dec_nlayers: 6 42 | nheads: 12 43 | ar_layer: 4 44 | ar_ffd_size: 1024 45 | ar_hidden_size: 256 46 | ar_nheads: 4 47 | aligner_softmax_temp: 1.0 48 | layer_norm_eps: 0.00001 49 | speaker_embed_dropout: 0.05 50 | label_smoothing: 0.0 51 | val_check_interval: 5000 52 | check_val_every_n_epoch: 1 53 | precision: "fp16" 54 | nworkers: 16 55 | distributed: true 56 | accelerator: "ddp" 57 | version: null 58 | accumulate_grad_batches: 1 59 | use_repetition_token: true 60 | use_repetition_gating: false 61 | repetition_penalty: 1.0 62 | sampling_temperature: 1.0 63 | top_k: -1 64 | min_top_k: 3 65 | top_p: 0.8 66 | sample_num: 4 67 | 
length_penalty_max_length: 15000 68 | length_penalty_max_prob: 0.95 69 | max_input_length: 2048 70 | max_output_length: 2000 71 | sample_rate: 16000 72 | n_codes: 1024 73 | n_cluster_groups: 1 74 | phone_context_window: 4 75 | phoneset_size: 1000 76 | inference: 77 | top_k: 5 78 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/s2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 500, 5 | "seed": 1234, 6 | "epochs": 100, 7 | "learning_rate": 0.0001, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 32, 14 | "fp16_run": true, 15 | "lr_decay": 0.999875, 16 | "segment_size": 20480, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "text_low_lr_rate": 0.4, 22 | "grad_ckpt": false 23 | }, 24 | "data": { 25 | "max_wav_value": 32768.0, 26 | "sampling_rate": 32000, 27 | "filter_length": 2048, 28 | "hop_length": 640, 29 | "win_length": 2048, 30 | "n_mel_channels": 128, 31 | "mel_fmin": 0.0, 32 | "mel_fmax": null, 33 | "add_blank": true, 34 | "n_speakers": 300, 35 | "cleaned_text": true 36 | }, 37 | "model": { 38 | "inter_channels": 192, 39 | "hidden_channels": 192, 40 | "filter_channels": 768, 41 | "n_heads": 2, 42 | "n_layers": 6, 43 | "kernel_size": 3, 44 | "p_dropout": 0.1, 45 | "resblock": "1", 46 | "resblock_kernel_sizes": [ 47 | 3, 48 | 7, 49 | 11 50 | ], 51 | "resblock_dilation_sizes": [ 52 | [ 53 | 1, 54 | 3, 55 | 5 56 | ], 57 | [ 58 | 1, 59 | 3, 60 | 5 61 | ], 62 | [ 63 | 1, 64 | 3, 65 | 5 66 | ] 67 | ], 68 | "upsample_rates": [ 69 | 10, 70 | 8, 71 | 2, 72 | 2, 73 | 2 74 | ], 75 | "upsample_initial_channel": 512, 76 | "upsample_kernel_sizes": [ 77 | 16, 78 | 16, 79 | 8, 80 | 2, 81 | 2 82 | ], 83 | "n_layers_q": 3, 84 | "use_spectral_norm": false, 85 | "gin_channels": 512, 86 | "semantic_frame_rate": "25hz", 87 | "freeze_quantizer": true 88 | }, 89 | "s2_ckpt_dir": "logs/s2/big2k1", 90 | "content_module": "cnhubert" 91 | } -------------------------------------------------------------------------------- /GPT_SoVITS/configs/train.yaml: -------------------------------------------------------------------------------- 1 | gpu: 2 | n_card: 1 3 | n_process_per_card: 2 4 | io: 5 | text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 24 28 | dropout: 0 29 | EOS: 1024 30 | random_bert: 0 31 | inference: 32 | top_k: 5 33 | -------------------------------------------------------------------------------- /GPT_SoVITS/configs/tts_infer.yaml: -------------------------------------------------------------------------------- 1 | custom: 2 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 3 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base 4 | device: cuda 5 | is_half: true 6 | t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt 7 | version: v2 8 | vits_weights_path: 
GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth 9 | v1: 10 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 11 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base 12 | device: cpu 13 | is_half: false 14 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt 15 | version: v1 16 | vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth 17 | v2: 18 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 19 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base 20 | device: cpu 21 | is_half: false 22 | t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt 23 | version: v2 24 | vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth 25 | v3: 26 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 27 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base 28 | device: cpu 29 | is_half: false 30 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt 31 | version: v3 32 | vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth 33 | v4: 34 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large 35 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base 36 | device: cpu 37 | is_half: false 38 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt 39 | version: v4 40 | vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth 41 | -------------------------------------------------------------------------------- /GPT_SoVITS/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | now_dir = os.getcwd() 5 | sys.path.insert(0, now_dir) 6 | from text.g2pw import G2PWPinyin 7 | 8 | g2pw = G2PWPinyin( 9 | model_dir="GPT_SoVITS/text/G2PWModel", 10 | model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", 11 | v_to_u=False, 12 | neutral_tone_with_five=True, 13 | ) 14 | -------------------------------------------------------------------------------- /GPT_SoVITS/f5_tts/model/__init__.py: -------------------------------------------------------------------------------- 1 | # from f5_tts.model.cfm import CFM 2 | # 3 | # from f5_tts.model.backbones.unett import UNetT 4 | from GPT_SoVITS.f5_tts.model.backbones.dit import DiT 5 | # from f5_tts.model.backbones.dit import DiTNoCond 6 | # from f5_tts.model.backbones.dit import DiTNoCondNoT 7 | # from f5_tts.model.backbones.mmdit import MMDiT 8 | 9 | # from f5_tts.model.trainer import Trainer 10 | 11 | 12 | # __all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"] 13 | # __all__ = ["CFM", "UNetT", "DiTNoCond","DiT", "MMDiT"] 14 | -------------------------------------------------------------------------------- /GPT_SoVITS/f5_tts/model/backbones/README.md: -------------------------------------------------------------------------------- 1 | ## Backbones quick introduction 2 | 3 | 4 | ### unett.py 5 | - flat unet transformer 6 | - structure same as in e2-tts & voicebox paper except using rotary pos emb 7 | - update: allow possible abs pos emb & convnextv2 blocks for embedded text before concat 8 | 9 | ### dit.py 10 | - adaln-zero dit 11 | - embedded timestep as condition 12 | - concatted noised_input + masked_cond + embedded_text, linear proj in 13 | - possible abs pos emb & convnextv2 blocks for embedded text before concat 14 | - possible long 
skip connection (first layer to last layer) 15 | 16 | ### mmdit.py 17 | - sd3 structure 18 | - timestep as condition 19 | - left stream: text embedded and applied a abs pos emb 20 | - right stream: masked_cond & noised_input concatted and with same conv pos emb as unett 21 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cnhubert, whisper_enc 2 | 3 | content_module_map = {"cnhubert": cnhubert, "whisper": whisper_enc} 4 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/cnhubert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from transformers import logging as tf_logging 4 | 5 | tf_logging.set_verbosity_error() 6 | 7 | import logging 8 | 9 | logging.getLogger("numba").setLevel(logging.WARNING) 10 | 11 | from transformers import ( 12 | Wav2Vec2FeatureExtractor, 13 | HubertModel, 14 | ) 15 | 16 | import utils 17 | import torch.nn as nn 18 | 19 | cnhubert_base_path = None 20 | 21 | 22 | class CNHubert(nn.Module): 23 | def __init__(self, base_path: str = None): 24 | super().__init__() 25 | if base_path is None: 26 | base_path = cnhubert_base_path 27 | if os.path.exists(base_path): 28 | ... 29 | else: 30 | raise FileNotFoundError(base_path) 31 | self.model = HubertModel.from_pretrained(base_path, local_files_only=True) 32 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(base_path, local_files_only=True) 33 | 34 | def forward(self, x): 35 | input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 36 | feats = self.model(input_values)["last_hidden_state"] 37 | return feats 38 | 39 | 40 | # class CNHubertLarge(nn.Module): 41 | # def __init__(self): 42 | # super().__init__() 43 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 44 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 45 | # def forward(self, x): 46 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 47 | # feats = self.model(input_values)["last_hidden_state"] 48 | # return feats 49 | # 50 | # class CVec(nn.Module): 51 | # def __init__(self): 52 | # super().__init__() 53 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 54 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 55 | # def forward(self, x): 56 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 57 | # feats = self.model(input_values)["last_hidden_state"] 58 | # return feats 59 | # 60 | # class cnw2v2base(nn.Module): 61 | # def __init__(self): 62 | # super().__init__() 63 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 64 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 65 | # def forward(self, x): 66 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 67 | # feats = self.model(input_values)["last_hidden_state"] 68 | # return feats 69 | 70 | 71 | def 
get_model(): 72 | model = CNHubert() 73 | model.eval() 74 | return model 75 | 76 | 77 | # def get_large_model(): 78 | # model = CNHubertLarge() 79 | # model.eval() 80 | # return model 81 | # 82 | # def get_model_cvec(): 83 | # model = CVec() 84 | # model.eval() 85 | # return model 86 | # 87 | # def get_model_cnw2v2base(): 88 | # model = cnw2v2base() 89 | # model.eval() 90 | # return model 91 | 92 | 93 | def get_content(hmodel, wav_16k_tensor): 94 | with torch.no_grad(): 95 | feats = hmodel(wav_16k_tensor) 96 | return feats.transpose(1, 2) 97 | 98 | 99 | if __name__ == "__main__": 100 | model = get_model() 101 | src_path = "/Users/Shared/原音频2.wav" 102 | wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000) 103 | model = model 104 | wav_16k_tensor = wav_16k_tensor 105 | feats = get_content(model, wav_16k_tensor) 106 | print(feats.shape) 107 | -------------------------------------------------------------------------------- /GPT_SoVITS/feature_extractor/whisper_enc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_model(): 5 | import whisper 6 | 7 | model = whisper.load_model("small", device="cpu") 8 | 9 | return model.encoder 10 | 11 | 12 | def get_content(model=None, wav_16k_tensor=None): 13 | from whisper import log_mel_spectrogram, pad_or_trim 14 | 15 | dev = next(model.parameters()).device 16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] 17 | # if torch.cuda.is_available(): 18 | # mel = mel.to(torch.float16) 19 | feature_len = mel.shape[-1] // 2 20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" 21 | with torch.no_grad(): 22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[:1, :feature_len, :].transpose(1, 2) 23 | return feature 24 | -------------------------------------------------------------------------------- /GPT_SoVITS/inference_cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import soundfile as sf 4 | 5 | from tools.i18n.i18n import I18nAuto 6 | from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav 7 | 8 | i18n = I18nAuto() 9 | 10 | 11 | def synthesize( 12 | GPT_model_path, 13 | SoVITS_model_path, 14 | ref_audio_path, 15 | ref_text_path, 16 | ref_language, 17 | target_text_path, 18 | target_language, 19 | output_path, 20 | ): 21 | # Read reference text 22 | with open(ref_text_path, "r", encoding="utf-8") as file: 23 | ref_text = file.read() 24 | 25 | # Read target text 26 | with open(target_text_path, "r", encoding="utf-8") as file: 27 | target_text = file.read() 28 | 29 | # Change model weights 30 | change_gpt_weights(gpt_path=GPT_model_path) 31 | change_sovits_weights(sovits_path=SoVITS_model_path) 32 | 33 | # Synthesize audio 34 | synthesis_result = get_tts_wav( 35 | ref_wav_path=ref_audio_path, 36 | prompt_text=ref_text, 37 | prompt_language=i18n(ref_language), 38 | text=target_text, 39 | text_language=i18n(target_language), 40 | top_p=1, 41 | temperature=1, 42 | ) 43 | 44 | result_list = list(synthesis_result) 45 | 46 | if result_list: 47 | last_sampling_rate, last_audio_data = result_list[-1] 48 | output_wav_path = os.path.join(output_path, "output.wav") 49 | sf.write(output_wav_path, last_audio_data, last_sampling_rate) 50 | print(f"Audio saved to {output_wav_path}") 51 | 52 | 53 | def main(): 54 | parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") 55 | parser.add_argument("--gpt_model", required=True, help="Path to the 
GPT model file") 56 | parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file") 57 | parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file") 58 | parser.add_argument("--ref_text", required=True, help="Path to the reference text file") 59 | parser.add_argument( 60 | "--ref_language", required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio" 61 | ) 62 | parser.add_argument("--target_text", required=True, help="Path to the target text file") 63 | parser.add_argument( 64 | "--target_language", 65 | required=True, 66 | choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], 67 | help="Language of the target text", 68 | ) 69 | parser.add_argument("--output_path", required=True, help="Path to the output directory") 70 | 71 | args = parser.parse_args() 72 | 73 | synthesize( 74 | args.gpt_model, 75 | args.sovits_model, 76 | args.ref_audio, 77 | args.ref_text, 78 | args.ref_language, 79 | args.target_text, 80 | args.target_language, 81 | args.output_path, 82 | ) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /GPT_SoVITS/module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/module/__init__.py -------------------------------------------------------------------------------- /GPT_SoVITS/module/losses.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | def feature_loss(fmap_r, fmap_g): 7 | loss = 0 8 | for dr, dg in zip(fmap_r, fmap_g): 9 | for rl, gl in zip(dr, dg): 10 | rl = rl.float().detach() 11 | gl = gl.float() 12 | loss += torch.mean(torch.abs(rl - gl)) 13 | 14 | return loss * 2 15 | 16 | 17 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 18 | loss = 0 19 | r_losses = [] 20 | g_losses = [] 21 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 22 | dr = dr.float() 23 | dg = dg.float() 24 | r_loss = torch.mean((1 - dr) ** 2) 25 | g_loss = torch.mean(dg**2) 26 | loss += r_loss + g_loss 27 | r_losses.append(r_loss.item()) 28 | g_losses.append(g_loss.item()) 29 | 30 | return loss, r_losses, g_losses 31 | 32 | 33 | def generator_loss(disc_outputs): 34 | loss = 0 35 | gen_losses = [] 36 | for dg in disc_outputs: 37 | dg = dg.float() 38 | l = torch.mean((1 - dg) ** 2) 39 | gen_losses.append(l) 40 | loss += l 41 | 42 | return loss, gen_losses 43 | 44 | 45 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 46 | """ 47 | z_p, logs_q: [b, h, t_t] 48 | m_p, logs_p: [b, h, t_t] 49 | """ 50 | z_p = z_p.float() 51 | logs_q = logs_q.float() 52 | m_p = m_p.float() 53 | logs_p = logs_p.float() 54 | z_mask = z_mask.float() 55 | 56 | kl = logs_p - logs_q - 0.5 57 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 58 | kl = torch.sum(kl * z_mask) 59 | l = kl / torch.sum(z_mask) 60 | return l 61 | 62 | 63 | def mle_loss(z, m, logs, logdet, mask): 64 | l = torch.sum(logs) + 0.5 * torch.sum( 65 | torch.exp(-2 * logs) * ((z - m) ** 2) 66 | ) # neg normal likelihood w/o the constant term 67 | l = l - torch.sum(logdet) # log jacobian determinant 68 | l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes 69 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term 70 | return l 71 | 
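These loss helpers are shape-sensitive, so a quick smoke test with dummy tensors is useful before wiring them into a trainer. A minimal sketch (illustrative sizes only; it assumes `GPT_SoVITS/` is on `sys.path` so that `module.losses` resolves):

```python
# Not part of the repo: a hypothetical shape check for the losses defined above.
import torch

from module.losses import discriminator_loss, feature_loss, generator_loss, kl_loss

b, h, t = 2, 192, 50  # batch, latent channels, frames (made-up sizes)
z_p, logs_q, m_p, logs_p = (torch.randn(b, h, t) for _ in range(4))
z_mask = torch.ones(b, 1, t)  # broadcasts over the channel axis
print(kl_loss(z_p, logs_q, m_p, logs_p, z_mask))  # scalar tensor

# The GAN losses take one output tensor per sub-discriminator.
d_real = [torch.rand(b, 100), torch.rand(b, 80)]
d_fake = [torch.rand(b, 100), torch.rand(b, 80)]
loss_disc, r_losses, g_losses = discriminator_loss(d_real, d_fake)
loss_gen, gen_losses = generator_loss(d_fake)

# feature_loss compares lists of per-layer feature maps (real vs. generated).
fmap_r = [[torch.randn(b, 16, 200), torch.randn(b, 8, 100)]]
fmap_g = [[torch.randn(b, 16, 200), torch.randn(b, 8, 100)]]
print(feature_loss(fmap_r, fmap_g))
```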
-------------------------------------------------------------------------------- /GPT_SoVITS/pretrained_models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /GPT_SoVITS/process_ckpt.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from collections import OrderedDict 3 | from time import time as ttime 4 | import shutil 5 | import os 6 | import torch 7 | from tools.i18n.i18n import I18nAuto 8 | 9 | i18n = I18nAuto() 10 | 11 | 12 | def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path 13 | dir = os.path.dirname(path) 14 | name = os.path.basename(path) 15 | tmp_path = "%s.pth" % (ttime()) 16 | torch.save(fea, tmp_path) 17 | shutil.move(tmp_path, "%s/%s" % (dir, name)) 18 | 19 | 20 | """ 21 | 00:v1 22 | 01:v2 23 | 02:v3 24 | 03:v3lora 25 | 04:v4lora 26 | 27 | """ 28 | from io import BytesIO 29 | 30 | 31 | def my_save2(fea, path, cfm_version): 32 | bio = BytesIO() 33 | torch.save(fea, bio) 34 | bio.seek(0) 35 | data = bio.getvalue() 36 | byte = b"03" if cfm_version == "v3" else b"04" 37 | data = byte + data[2:] 38 | with open(path, "wb") as f: 39 | f.write(data) 40 | 41 | 42 | def savee(ckpt, name, epoch, steps, hps, cfm_version=None, lora_rank=None): 43 | try: 44 | opt = OrderedDict() 45 | opt["weight"] = {} 46 | for key in ckpt.keys(): 47 | if "enc_q" in key: 48 | continue 49 | opt["weight"][key] = ckpt[key].half() 50 | opt["config"] = hps 51 | opt["info"] = "%sepoch_%siteration" % (epoch, steps) 52 | if lora_rank: 53 | opt["lora_rank"] = lora_rank 54 | my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), cfm_version) 55 | else: 56 | my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) 57 | return "Success." 
58 | except: 59 | return traceback.format_exc() 60 | 61 | 62 | head2version = { 63 | b"00": ["v1", "v1", False], 64 | b"01": ["v2", "v2", False], 65 | b"02": ["v2", "v3", False], 66 | b"03": ["v2", "v3", True], 67 | b"04": ["v2", "v4", True], 68 | } 69 | hash_pretrained_dict = { 70 | "dc3c97e17592963677a4a1681f30c653": ["v2", "v2", False], # s2G488k.pth#sovits_v1_pretrained 71 | "43797be674a37c1c83ee81081941ed0f": ["v2", "v3", False], # s2Gv3.pth#sovits_v3_pretrained 72 | "6642b37f3dbb1f76882b69937c95a5f3": ["v2", "v2", False], # s2G2333K.pth#sovits_v2_pretrained 73 | "4f26b9476d0c5033e04162c486074374": ["v2", "v4", False], # s2Gv4.pth#sovits_v4_pretrained 74 | } 75 | import hashlib 76 | 77 | 78 | def get_hash_from_file(sovits_path): 79 | with open(sovits_path, "rb") as f: 80 | data = f.read(8192) 81 | hash_md5 = hashlib.md5() 82 | hash_md5.update(data) 83 | return hash_md5.hexdigest() 84 | 85 | 86 | def get_sovits_version_from_path_fast(sovits_path): 87 | ###1-if it is pretrained sovits models, by hash 88 | hash = get_hash_from_file(sovits_path) 89 | if hash in hash_pretrained_dict: 90 | return hash_pretrained_dict[hash] 91 | ###2-new weights, by head 92 | with open(sovits_path, "rb") as f: 93 | version = f.read(2) 94 | if version != b"PK": 95 | return head2version[version] 96 | ###3-old weights, by file size 97 | if_lora_v3 = False 98 | size = os.path.getsize(sovits_path) 99 | """ 100 | v1weights:about 82942KB 101 | half thr:82978KB 102 | v2weights:about 83014KB 103 | v3weights:about 750MB 104 | """ 105 | if size < 82978 * 1024: 106 | model_version = version = "v1" 107 | elif size < 700 * 1024 * 1024: 108 | model_version = version = "v2" 109 | else: 110 | version = "v2" 111 | model_version = "v3" 112 | return version, model_version, if_lora_v3 113 | 114 | 115 | def load_sovits_new(sovits_path): 116 | f = open(sovits_path, "rb") 117 | meta = f.read(2) 118 | if meta != "PK": 119 | data = b"PK" + f.read() 120 | bio = BytesIO() 121 | bio.write(data) 122 | bio.seek(0) 123 | return torch.load(bio, map_location="cpu", weights_only=False) 124 | return torch.load(sovits_path, map_location="cpu", weights_only=False) 125 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/.gitignore: -------------------------------------------------------------------------------- 1 | G2PWModel 2 | __pycache__ 3 | *.zip -------------------------------------------------------------------------------- /GPT_SoVITS/text/LangSegmenter/__init__.py: -------------------------------------------------------------------------------- 1 | from .langsegmenter import LangSegmenter 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | # if os.environ.get("version","v1")=="v1": 3 | # from text.symbols import symbols 4 | # else: 5 | # from text.symbols2 import symbols 6 | 7 | from text import symbols as symbols_v1 8 | from text import symbols2 as symbols_v2 9 | 10 | _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)} 11 | _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)} 12 | 13 | 14 | def cleaned_text_to_sequence(cleaned_text, version=None): 15 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
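    Here cleaned_text is an iterable of phoneme symbols (e.g. the phone list produced by the cleaner), and version selects the v1 or v2 symbol table.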
16 | Args: 17 | text: string to convert to a sequence 18 | Returns: 19 | List of integers corresponding to the symbols in the text 20 | """ 21 | if version is None: 22 | version = os.environ.get("version", "v2") 23 | if version == "v1": 24 | phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text] 25 | else: 26 | phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text] 27 | 28 | return phones 29 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from text import cleaned_text_to_sequence 2 | import os 3 | # if os.environ.get("version","v1")=="v1": 4 | # from text import chinese 5 | # from text.symbols import symbols 6 | # else: 7 | # from text import chinese2 as chinese 8 | # from text.symbols2 import symbols 9 | 10 | from text import symbols as symbols_v1 11 | from text import symbols2 as symbols_v2 12 | 13 | special = [ 14 | # ("%", "zh", "SP"), 15 | ("¥", "zh", "SP2"), 16 | ("^", "zh", "SP3"), 17 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 18 | ] 19 | 20 | 21 | def clean_text(text, language, version=None): 22 | if version is None: 23 | version = os.environ.get("version", "v2") 24 | if version == "v1": 25 | symbols = symbols_v1.symbols 26 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} 27 | else: 28 | symbols = symbols_v2.symbols 29 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"} 30 | 31 | if language not in language_module_map: 32 | language = "en" 33 | text = " " 34 | for special_s, special_l, target_symbol in special: 35 | if special_s in text and language == special_l: 36 | return clean_special(text, language, special_s, target_symbol, version) 37 | language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]]) 38 | if hasattr(language_module, "text_normalize"): 39 | norm_text = language_module.text_normalize(text) 40 | else: 41 | norm_text = text 42 | if language == "zh" or language == "yue": ########## 43 | phones, word2ph = language_module.g2p(norm_text) 44 | assert len(phones) == sum(word2ph) 45 | assert len(norm_text) == len(word2ph) 46 | elif language == "en": 47 | phones = language_module.g2p(norm_text) 48 | if len(phones) < 4: 49 | phones = [","] + phones 50 | word2ph = None 51 | else: 52 | phones = language_module.g2p(norm_text) 53 | word2ph = None 54 | phones = ["UNK" if ph not in symbols else ph for ph in phones] 55 | return phones, word2ph, norm_text 56 | 57 | 58 | def clean_special(text, language, special_s, target_symbol, version=None): 59 | if version is None: 60 | version = os.environ.get("version", "v2") 61 | if version == "v1": 62 | symbols = symbols_v1.symbols 63 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"} 64 | else: 65 | symbols = symbols_v2.symbols 66 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"} 67 | 68 | """ 69 | 特殊静音段sp符号处理 70 | """ 71 | text = text.replace(special_s, ",") 72 | language_module = __import__("text." 
+ language_module_map[language], fromlist=[language_module_map[language]]) 73 | norm_text = language_module.text_normalize(text) 74 | phones = language_module.g2p(norm_text) 75 | new_ph = [] 76 | for ph in phones[0]: 77 | assert ph in symbols 78 | if ph == ",": 79 | new_ph.append(target_symbol) 80 | else: 81 | new_ph.append(ph) 82 | return new_ph, phones[1], norm_text 83 | 84 | 85 | def text_to_sequence(text, language, version=None): 86 | version = os.environ.get("version", version) 87 | if version is None: 88 | version = "v2" 89 | phones = clean_text(text) 90 | return cleaned_text_to_sequence(phones, version) 91 | 92 | 93 | if __name__ == "__main__": 94 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) 95 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/engdict-hot.rep: -------------------------------------------------------------------------------- 1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1 2 | JSON JH EY1 S AH0 N 3 | CONDA K AA1 N D AH0 -------------------------------------------------------------------------------- /GPT_SoVITS/text/engdict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/text/engdict_cache.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/g2pw/__init__.py: -------------------------------------------------------------------------------- 1 | from text.g2pw.g2pw import * 2 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/g2pw/polyphonic.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/text/g2pw/polyphonic.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/g2pw/polyphonic.rep: -------------------------------------------------------------------------------- 1 | 湖泊: ['hu2','po1'] 2 | 地壳: ['di4','qiao4'] 3 | 柏树: ['bai3','shu4'] 4 | 曝光: ['bao4','guang1'] 5 | 弹力: ['tan2','li4'] 6 | 字帖: ['zi4','tie4'] 7 | 口吃: ['kou3','chi1'] 8 | 包扎: ['bao1','za1'] 9 | 哪吒: ['ne2','zha1'] 10 | 说服: ['shuo1','fu2'] 11 | 识字: ['shi2','zi4'] 12 | 骨头: ['gu3','tou5'] 13 | 对称: ['dui4','chen4'] 14 | 口供: ['kou3','gong4'] 15 | 抹布: ['ma1','bu4'] 16 | 露背: ['lu4','bei4'] 17 | 圈养: ['juan4', 'yang3'] 18 | 眼眶: ['yan3', 'kuang4'] 19 | 品行: ['pin3','xing2'] 20 | 颤抖: ['chan4','dou3'] 21 | 差不多: ['cha4','bu5','duo1'] 22 | 鸭绿江: ['ya1','lu4','jiang1'] 23 | 撒切尔: ['sa4','qie4','er3'] 24 | 比比皆是: ['bi3','bi3','jie1','shi4'] 25 | 身无长物: ['shen1','wu2','chang2','wu4'] 26 | 手里: ['shou2','li3'] 27 | 关卡: ['guan1','qia3'] 28 | 怀揣: ['huai2','chuai1'] 29 | 挑剔: ['tiao1','ti4'] 30 | 供称: ['gong4','cheng1'] 31 | 作坊: ['zuo1', 'fang5'] 32 | 中医: ['zhong1','yi1'] 33 | 嚷嚷: ['rang1','rang5'] 34 | 商厦: ['shang1','sha4'] 35 | 大厦: ['da4','sha4'] 36 | 刹车: ['sha1','che1'] 37 | 嘚瑟: ['de4','se5'] 38 | 朝鲜: ['chao2','xian3'] 39 | 阿房宫: ['e1','pang2','gong1'] 40 | 阿胶: ['e1','jiao1'] 41 | 咖喱: ['ga1','li5'] 42 | 时分: ['shi2','fen1'] 43 | 蚌埠: ['beng4','bu4'] 44 | 驯服: ['xun4','fu2'] 45 | 幸免于难: ['xing4','mian3','yu2','nan4'] 46 | 恶行: ['e4','xing2'] 47 | 唉: ['ai4'] 48 | 扎实: ['zha1','shi2'] 49 | 干将: ['gan4','jiang4'] 50 | 陈威行: ['chen2', 'wei1', 'hang2'] 51 | 郭晟: ['guo1', 'sheng4'] 52 | 中标: ['zhong4', 'biao1'] 53 | 抗住: ['kang2', 'zhu4'] 
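Each polyphonic.rep line above is a hard pinyin override in the form `词语: ['pin1','yin1', ...]`; a polyphonic.pickle cache sits alongside it. A hypothetical reader for this format (only to illustrate the layout, not the project's own loader):

```python
# Illustrative only: parse the "word: ['py1','py2']" lines of polyphonic.rep.
import ast


def load_polyphonic_rep(path: str = "GPT_SoVITS/text/g2pw/polyphonic.rep") -> dict:
    overrides = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            word, _, pinyin = line.partition(":")
            overrides[word.strip()] = ast.literal_eval(pinyin.strip())
    return overrides


# e.g. load_polyphonic_rep()["湖泊"] == ["hu2", "po1"]
```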
-------------------------------------------------------------------------------- /GPT_SoVITS/text/namedict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/text/namedict_cache.pickle -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from text.zh_normalization.text_normlization import * 15 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip("0")) 25 | if num_string.startswith("0"): 26 | result = DIGITS["0"] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile( 32 | r"([0-1]?[0-9]|2[0-3])" 33 | r":([0-5][0-9])" 34 | r"(:([0-5][0-9]))?" 35 | ) 36 | 37 | # 时间范围,如8:30-12:30 38 | RE_TIME_RANGE = re.compile( 39 | r"([0-1]?[0-9]|2[0-3])" 40 | r":([0-5][0-9])" 41 | r"(:([0-5][0-9]))?" 42 | r"(~|-)" 43 | r"([0-1]?[0-9]|2[0-3])" 44 | r":([0-5][0-9])" 45 | r"(:([0-5][0-9]))?" 
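    # e.g. 8:30-12:30 or 8:30~12:30; seconds are optional on both endpoints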
46 | ) 47 | 48 | 49 | def replace_time(match) -> str: 50 | """ 51 | Args: 52 | match (re.Match) 53 | Returns: 54 | str 55 | """ 56 | 57 | is_range = len(match.groups()) > 5 58 | 59 | hour = match.group(1) 60 | minute = match.group(2) 61 | second = match.group(4) 62 | 63 | if is_range: 64 | hour_2 = match.group(6) 65 | minute_2 = match.group(7) 66 | second_2 = match.group(9) 67 | 68 | result = f"{num2str(hour)}点" 69 | if minute.lstrip("0"): 70 | if int(minute) == 30: 71 | result += "半" 72 | else: 73 | result += f"{_time_num2str(minute)}分" 74 | if second and second.lstrip("0"): 75 | result += f"{_time_num2str(second)}秒" 76 | 77 | if is_range: 78 | result += "至" 79 | result += f"{num2str(hour_2)}点" 80 | if minute_2.lstrip("0"): 81 | if int(minute) == 30: 82 | result += "半" 83 | else: 84 | result += f"{_time_num2str(minute_2)}分" 85 | if second_2 and second_2.lstrip("0"): 86 | result += f"{_time_num2str(second_2)}秒" 87 | 88 | return result 89 | 90 | 91 | RE_DATE = re.compile( 92 | r"(\d{4}|\d{2})年" 93 | r"((0?[1-9]|1[0-2])月)?" 94 | r"(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?" 95 | ) 96 | 97 | 98 | def replace_date(match) -> str: 99 | """ 100 | Args: 101 | match (re.Match) 102 | Returns: 103 | str 104 | """ 105 | year = match.group(1) 106 | month = match.group(3) 107 | day = match.group(5) 108 | result = "" 109 | if year: 110 | result += f"{verbalize_digit(year)}年" 111 | if month: 112 | result += f"{verbalize_cardinal(month)}月" 113 | if day: 114 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 115 | return result 116 | 117 | 118 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 119 | RE_DATE2 = re.compile(r"(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])") 120 | 121 | 122 | def replace_date2(match) -> str: 123 | """ 124 | Args: 125 | match (re.Match) 126 | Returns: 127 | str 128 | """ 129 | year = match.group(1) 130 | month = match.group(3) 131 | day = match.group(4) 132 | result = "" 133 | if year: 134 | result += f"{verbalize_digit(year)}年" 135 | if month: 136 | result += f"{verbalize_cardinal(month)}月" 137 | if day: 138 | result += f"{verbalize_cardinal(day)}日" 139 | return result 140 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
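# Full-width/half-width ASCII mappings plus RE_NSW, which matches runs of characters without pinyin readings (candidate non-standard words).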
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = {ord(char) + 65248: ord(char) for char in string.ascii_letters} 22 | 23 | # 英文字符半角 -> 全角映射表 24 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 25 | 26 | # 数字字符全角 -> 半角映射表 (num: 10) 27 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 28 | # 数字字符半角 -> 全角映射表 29 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 30 | 31 | # 标点符号全角 -> 半角映射表 (num: 32) 32 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 33 | # 标点符号半角 -> 全角映射表 34 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 35 | 36 | # 空格 (num: 1) 37 | F2H_SPACE = {"\u3000": " "} 38 | H2F_SPACE = {" ": "\u3000"} 39 | 40 | # 非"有拼音的汉字"的字符串,可用于NSW提取 41 | if SUPPORT_UCS4: 42 | RE_NSW = re.compile( 43 | r"(?:[^" 44 | r"\u3007" # 〇 45 | r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF] 46 | r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF] 47 | r"\uf900-\ufaff" # CJK兼容:[F900-FAFF] 48 | r"\U00020000-\U0002A6DF" # CJK扩展B:[20000-2A6DF] 49 | r"\U0002A703-\U0002B73F" # CJK扩展C:[2A700-2B73F] 50 | r"\U0002B740-\U0002B81D" # CJK扩展D:[2B740-2B81D] 51 | r"\U0002F80A-\U0002FA1F" # CJK兼容扩展:[2F800-2FA1F] 52 | r"])+" 53 | ) 54 | else: 55 | RE_NSW = re.compile( # pragma: no cover 56 | r"(?:[^" 57 | r"\u3007" # 〇 58 | r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF] 59 | r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF] 60 | r"\uf900-\ufaff" # CJK兼容:[F900-FAFF] 61 | r"])+" 62 | ) 63 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile(r"(? 
str: 32 | if mobile: 33 | sp_parts = phone_string.strip("+").split() 34 | result = ",".join([verbalize_digit(part, alt_one=True) for part in sp_parts]) 35 | return result 36 | else: 37 | sil_parts = phone_string.split("-") 38 | result = ",".join([verbalize_digit(part, alt_one=True) for part in sil_parts]) 39 | return result 40 | 41 | 42 | def replace_phone(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | return phone2str(match.group(0), mobile=False) 50 | 51 | 52 | def replace_mobile(match) -> str: 53 | """ 54 | Args: 55 | match (re.Match) 56 | Returns: 57 | str 58 | """ 59 | return phone2str(match.group(0)) 60 | -------------------------------------------------------------------------------- /GPT_SoVITS/text/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r"(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)") 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒", 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 RVC-Boss 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | import torch 5 | 6 | # 推理用的指定模型 7 | sovits_path = "" 8 | gpt_path = "" 9 | is_half_str = os.environ.get("is_half", "True") 10 | is_half = True if is_half_str.lower() == "true" else False 11 | is_share_str = os.environ.get("is_share", "False") 12 | is_share = True if is_share_str.lower() == "true" else False 13 | 14 | cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" 15 | bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" 16 | pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth" 17 | pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" 18 | 19 | exp_root = "logs" 20 | python_exec = sys.executable or "python" 21 | if torch.cuda.is_available(): 22 | infer_device = "cuda" 23 | else: 24 | infer_device = "cpu" 25 | 26 | webui_port_main = 9874 27 | webui_port_uvr5 = 9873 28 | webui_port_infer_tts = 9872 29 | webui_port_subfix = 9871 30 | 31 | api_port = 9880 32 | 33 | if infer_device == "cuda": 34 | gpu_name = torch.cuda.get_device_name(0) 35 | if ( 36 | ("16" in gpu_name and "V100" not in gpu_name.upper()) 37 | or "P40" in gpu_name.upper() 38 | or "P10" in gpu_name.upper() 39 | or "1060" in gpu_name 40 | or "1070" in gpu_name 41 | or "1080" in gpu_name 42 | ): 43 | is_half = False 44 | 45 | if infer_device == "cpu": 46 | is_half = False 47 | 48 | 49 | class Config: 50 | def __init__(self): 51 | self.sovits_path = sovits_path 52 | self.gpt_path = gpt_path 53 | self.is_half = is_half 54 | 55 | self.cnhubert_path = cnhubert_path 56 | self.bert_path = bert_path 57 | self.pretrained_sovits_path = pretrained_sovits_path 58 | self.pretrained_gpt_path = pretrained_gpt_path 59 | 60 | self.exp_root = exp_root 61 | self.python_exec = python_exec 62 | self.infer_device = infer_device 63 | 64 | self.webui_port_main = webui_port_main 65 | self.webui_port_uvr5 = webui_port_uvr5 66 | self.webui_port_infer_tts = webui_port_infer_tts 67 | self.webui_port_subfix = webui_port_subfix 68 | 69 | self.api_port = api_port 70 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | GPT-SoVITS-CU126: 5 | image: xxxxrt666/gpt-sovits:latest-cu126 6 | container_name: GPT-SoVITS-CU126 7 | ports: 8 | - "9871:9871" 9 | - "9872:9872" 10 | - "9873:9873" 11 | - "9874:9874" 12 | - "9880:9880" 13 | volumes: 14 | - .:/workspace/GPT-SoVITS 15 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models 16 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel 17 | - /dev/null:/workspace/GPT-SoVITS/tools/asr/models 18 | - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights 19 | environment: 20 | - is_half=true 21 | tty: true 22 | stdin_open: true 23 | 
shm_size: "16g" 24 | restart: unless-stopped 25 | runtime: nvidia 26 | GPT-SoVITS-CU126-Lite: 27 | image: xxxxrt666/gpt-sovits:latest-cu126-lite 28 | container_name: GPT-SoVITS-CU126-Lite 29 | ports: 30 | - "9871:9871" 31 | - "9872:9872" 32 | - "9873:9873" 33 | - "9874:9874" 34 | - "9880:9880" 35 | volumes: 36 | - .:/workspace/GPT-SoVITS 37 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models 38 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel 39 | - /dev/null:/workspace/GPT-SoVITS/tools/asr/models 40 | - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights 41 | - tools/asr/models:/workspace/models/asr_models 42 | - tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights 43 | environment: 44 | - is_half=true 45 | tty: true 46 | stdin_open: true 47 | shm_size: "16g" 48 | restart: unless-stopped 49 | runtime: nvidia 50 | GPT-SoVITS-CU128: 51 | image: xxxxrt666/gpt-sovits:latest-cu128 52 | container_name: GPT-SoVITS-CU128 53 | ports: 54 | - "9871:9871" 55 | - "9872:9872" 56 | - "9873:9873" 57 | - "9874:9874" 58 | - "9880:9880" 59 | volumes: 60 | - .:/workspace/GPT-SoVITS 61 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models 62 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel 63 | - /dev/null:/workspace/GPT-SoVITS/tools/asr/models 64 | - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights 65 | environment: 66 | - is_half=true 67 | tty: true 68 | stdin_open: true 69 | shm_size: "16g" 70 | restart: unless-stopped 71 | runtime: nvidia 72 | GPT-SoVITS-CU128-Lite: 73 | image: xxxxrt666/gpt-sovits:latest-cu128-lite 74 | container_name: GPT-SoVITS-CU128-Lite 75 | ports: 76 | - "9871:9871" 77 | - "9872:9872" 78 | - "9873:9873" 79 | - "9874:9874" 80 | - "9880:9880" 81 | volumes: 82 | - .:/workspace/GPT-SoVITS 83 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models 84 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel 85 | - /dev/null:/workspace/GPT-SoVITS/tools/asr/models 86 | - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights 87 | - tools/asr/models:/workspace/models/asr_models 88 | - tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights 89 | environment: 90 | - is_half=true 91 | tty: true 92 | stdin_open: true 93 | shm_size: "16g" 94 | restart: unless-stopped 95 | runtime: nvidia -------------------------------------------------------------------------------- /docker_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" 4 | 5 | cd "$SCRIPT_DIR" || exit 1 6 | 7 | set -e 8 | 9 | if ! 
command -v docker &>/dev/null; then 10 | echo "Docker Not Found" 11 | exit 1 12 | fi 13 | 14 | trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR 15 | 16 | LITE=false 17 | CUDA_VERSION=12.6 18 | 19 | print_help() { 20 | echo "Usage: bash docker_build.sh [OPTIONS]" 21 | echo "" 22 | echo "Options:" 23 | echo " --cuda 12.6|12.8 Specify the CUDA VERSION (REQUIRED)" 24 | echo " --lite Build a Lite Image" 25 | echo " -h, --help Show this help message and exit" 26 | echo "" 27 | echo "Examples:" 28 | echo " bash docker_build.sh --cuda 12.6 --funasr --faster-whisper" 29 | } 30 | 31 | # Show help if no arguments provided 32 | if [[ $# -eq 0 ]]; then 33 | print_help 34 | exit 0 35 | fi 36 | 37 | # Parse arguments 38 | while [[ $# -gt 0 ]]; do 39 | case "$1" in 40 | --cuda) 41 | case "$2" in 42 | 12.6) 43 | CUDA_VERSION=12.6 44 | ;; 45 | 12.8) 46 | CUDA_VERSION=12.8 47 | ;; 48 | *) 49 | echo "Error: Invalid CUDA_VERSION: $2" 50 | echo "Choose From: [12.6, 12.8]" 51 | exit 1 52 | ;; 53 | esac 54 | shift 2 55 | ;; 56 | --lite) 57 | LITE=true 58 | shift 59 | ;; 60 | *) 61 | echo "Unknown Argument: $1" 62 | echo "Use -h or --help to see available options." 63 | exit 1 64 | ;; 65 | esac 66 | done 67 | 68 | TARGETPLATFORM=$(uname -m | grep -q 'x86' && echo "linux/amd64" || echo "linux/arm64") 69 | 70 | if [ $LITE = true ]; then 71 | TORCH_BASE="lite" 72 | else 73 | TORCH_BASE="full" 74 | fi 75 | 76 | docker build \ 77 | --build-arg CUDA_VERSION=$CUDA_VERSION \ 78 | --build-arg LITE=$LITE \ 79 | --build-arg TARGETPLATFORM="$TARGETPLATFORM" \ 80 | --build-arg TORCH_BASE=$TORCH_BASE \ 81 | -t "${USER}/gpt-sovits:local" \ 82 | . 83 | -------------------------------------------------------------------------------- /extra-req.txt: -------------------------------------------------------------------------------- 1 | faster-whisper 2 | -------------------------------------------------------------------------------- /go-webui.bat: -------------------------------------------------------------------------------- 1 | set "SCRIPT_DIR=%~dp0" 2 | set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" 3 | cd /d "%SCRIPT_DIR%" 4 | set "PATH=%SCRIPT_DIR%\runtime;%PATH%" 5 | runtime\python.exe -I webui.py zh_CN 6 | pause 7 | -------------------------------------------------------------------------------- /go-webui.ps1: -------------------------------------------------------------------------------- 1 | $ErrorActionPreference = "SilentlyContinue" 2 | chcp 65001 3 | Set-Location $PSScriptRoot 4 | $runtimePath = Join-Path $PSScriptRoot "runtime" 5 | $env:PATH = "$runtimePath;$env:PATH" 6 | & "$runtimePath\python.exe" -I "$PSScriptRoot\webui.py" zh_CN 7 | pause 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --no-binary=opencc 2 | numpy<2.0 3 | scipy 4 | tensorboard 5 | librosa==0.10.2 6 | numba 7 | pytorch-lightning>=2.4 8 | gradio<5 9 | ffmpeg-python 10 | onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64" 11 | onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64" 12 | tqdm 13 | funasr==1.0.27 14 | cn2an 15 | pypinyin 16 | pyopenjtalk>=0.4.1 17 | g2p_en 18 | torchaudio 19 | modelscope==1.10.0 20 | sentencepiece 21 | transformers>=4.43,<=4.50 22 | peft 23 | chardet 24 | PyYAML 25 | psutil 26 | jieba_fast 27 | jieba 28 | split-lang 29 | fast_langdetect>=0.3.1 30 | wordsegment 31 | rotary_embedding_torch 32 | ToJyutping 33 | g2pk2 34 | 
ko_pron 35 | opencc 36 | python_mecab_ko; sys_platform != 'win32' 37 | fastapi[standard]>=0.115.2 38 | x_transformers 39 | torchmetrics<=1.5 40 | pydantic<=2.10.6 41 | ctranslate2>=4.0,<5 42 | huggingface_hub>=0.13 43 | tokenizers>=0.13,<1 44 | av>=11 45 | tqdm 46 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/24kto48k/readme.txt: -------------------------------------------------------------------------------- 1 | For the inference of the v3 model, if you find that the generated audio sounds somewhat muffled, you can try using this audio super-resolution model. 2 | 对于v3模型的推理,如果你发现生成的音频比较闷,可以尝试这个音频超分模型。 3 | 4 | put g_24kto48k.zip and config.json in this folder 5 | 把g_24kto48k.zip and config.json下到这个文件夹 6 | 7 | download link 下载链接: 8 | https://drive.google.com/drive/folders/1IIYTf2zbJWzelu4IftKD6ooHloJ8mnZF?usp=share_link 9 | 10 | audio sr project page 音频超分项目主页: 11 | https://github.com/yxlu-0102/AP-BWE 12 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Ye-Xin Lu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/datasets1/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tools/AP_BWE_main/models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/tools/__init__.py -------------------------------------------------------------------------------- /tools/asr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def check_fw_local_models(): 5 | """ 6 | 启动时检查本地是否有 Faster Whisper 模型. 
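    Check at startup whether Faster Whisper models are available locally.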
7 | """ 8 | model_size_list = [ 9 | "tiny", 10 | "tiny.en", 11 | "base", 12 | "base.en", 13 | "small", 14 | "small.en", 15 | "medium", 16 | "medium.en", 17 | "large", 18 | "large-v1", 19 | "large-v2", 20 | "large-v3", 21 | ] 22 | for i, size in enumerate(model_size_list): 23 | if os.path.exists(f"tools/asr/models/faster-whisper-{size}"): 24 | model_size_list[i] = size + "-local" 25 | return model_size_list 26 | 27 | 28 | asr_dict = { 29 | "达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]}, 30 | "Faster Whisper (多语种)": { 31 | "lang": ["auto", "zh", "en", "ja", "ko", "yue"], 32 | "size": check_fw_local_models(), 33 | "path": "fasterwhisper_asr.py", 34 | "precision": ["float32", "float16", "int8"], 35 | }, 36 | } 37 | -------------------------------------------------------------------------------- /tools/asr/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /tools/audio_sr.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | import sys 3 | import os 4 | 5 | AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main") 6 | sys.path.append(AP_BWE_main_dir_path) 7 | import json 8 | import torch 9 | import torchaudio.functional as aF 10 | # from attrdict import AttrDict####will be bug in py3.10 11 | 12 | from datasets1.dataset import amp_pha_stft, amp_pha_istft 13 | from models.model import APNet_BWE_Model 14 | 15 | 16 | class AP_BWE: 17 | def __init__(self, device, DictToAttrRecursive, checkpoint_file=None): 18 | if checkpoint_file == None: 19 | checkpoint_file = "%s/24kto48k/g_24kto48k.zip" % (AP_BWE_main_dir_path) 20 | if os.path.exists(checkpoint_file) == False: 21 | raise FileNotFoundError 22 | config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json") 23 | with open(config_file) as f: 24 | data = f.read() 25 | json_config = json.loads(data) 26 | # h = AttrDict(json_config) 27 | h = DictToAttrRecursive(json_config) 28 | model = APNet_BWE_Model(h).to(device) 29 | state_dict = torch.load(checkpoint_file, map_location="cpu", weights_only=False) 30 | model.load_state_dict(state_dict["generator"]) 31 | model.eval() 32 | self.device = device 33 | self.model = model 34 | self.h = h 35 | 36 | def to(self, *arg, **kwargs): 37 | self.model.to(*arg, **kwargs) 38 | self.device = self.model.conv_pre_mag.weight.device 39 | return self 40 | 41 | def __call__(self, audio, orig_sampling_rate): 42 | with torch.no_grad(): 43 | # audio, orig_sampling_rate = torchaudio.load(inp_path) 44 | # audio = audio.to(self.device) 45 | audio = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.h.hr_sampling_rate) 46 | amp_nb, pha_nb, com_nb = amp_pha_stft(audio, self.h.n_fft, self.h.hop_size, self.h.win_size) 47 | amp_wb_g, pha_wb_g, com_wb_g = self.model(amp_nb, pha_nb) 48 | audio_hr_g = amp_pha_istft(amp_wb_g, pha_wb_g, self.h.n_fft, self.h.hop_size, self.h.win_size) 49 | # sf.write(opt_path, audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate, 'PCM_16') 50 | return audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate 51 | -------------------------------------------------------------------------------- /tools/cmd-denoise.py: -------------------------------------------------------------------------------- 1 | import 
os 2 | import argparse 3 | import traceback 4 | 5 | from modelscope.pipelines import pipeline 6 | from modelscope.utils.constant import Tasks 7 | from tqdm import tqdm 8 | 9 | path_denoise = "tools/denoise-model/speech_frcrn_ans_cirm_16k" 10 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" 11 | ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise) 12 | 13 | 14 | def execute_denoise(input_folder, output_folder): 15 | os.makedirs(output_folder, exist_ok=True) 16 | # print(input_folder) 17 | # print(list(os.listdir(input_folder).sort())) 18 | for name in tqdm(os.listdir(input_folder)): 19 | try: 20 | ans("%s/%s" % (input_folder, name), output_path="%s/%s" % (output_folder, name)) 21 | except: 22 | traceback.print_exc() 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument( 28 | "-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files." 29 | ) 30 | parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.") 31 | parser.add_argument( 32 | "-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32" 33 | ) # 还没接入 34 | cmd = parser.parse_args() 35 | execute_denoise( 36 | input_folder=cmd.input_folder, 37 | output_folder=cmd.output_folder, 38 | ) 39 | -------------------------------------------------------------------------------- /tools/denoise-model/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /tools/i18n/i18n.py: -------------------------------------------------------------------------------- 1 | import json 2 | import locale 3 | import os 4 | 5 | I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale") 6 | 7 | 8 | def load_language_list(language): 9 | with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f: 10 | language_list = json.load(f) 11 | return language_list 12 | 13 | 14 | def scan_language_list(): 15 | language_list = [] 16 | for name in os.listdir(I18N_JSON_DIR): 17 | if name.endswith(".json"): 18 | language_list.append(name.split(".")[0]) 19 | return language_list 20 | 21 | 22 | class I18nAuto: 23 | def __init__(self, language=None): 24 | if language in ["Auto", None]: 25 | language = locale.getdefaultlocale()[0] 26 | # getlocale can't identify the system's language ((None, None)) 27 | if not os.path.exists(os.path.join(I18N_JSON_DIR, f"{language}.json")): 28 | language = "en_US" 29 | self.language = language 30 | self.language_map = load_language_list(language) 31 | 32 | def __call__(self, key): 33 | return self.language_map.get(key, key) 34 | 35 | def __repr__(self): 36 | return "Use Language: " + self.language 37 | 38 | 39 | if __name__ == "__main__": 40 | i18n = I18nAuto(language="en_US") 41 | print(i18n) 42 | -------------------------------------------------------------------------------- /tools/slice_audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import traceback 5 | from scipy.io import wavfile 6 | 7 | # parent_directory = os.path.dirname(os.path.abspath(__file__)) 8 | # sys.path.append(parent_directory) 9 | from tools.my_utils import load_audio 10 | from slicer2 import Slicer 11 | 12 | 13 | def slice(inp, 
opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part): 14 | os.makedirs(opt_root, exist_ok=True) 15 | if os.path.isfile(inp): 16 | input = [inp] 17 | elif os.path.isdir(inp): 18 | input = [os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] 19 | else: 20 | return "输入路径存在但既不是文件也不是文件夹" 21 | slicer = Slicer( 22 | sr=32000, # 长音频采样率 23 | threshold=int(threshold), # 音量小于这个值视作静音的备选切割点 24 | min_length=int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 25 | min_interval=int(min_interval), # 最短切割间隔 26 | hop_size=int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) 27 | max_sil_kept=int(max_sil_kept), # 切完后静音最多留多长 28 | ) 29 | _max = float(_max) 30 | alpha = float(alpha) 31 | for inp_path in input[int(i_part) :: int(all_part)]: 32 | # print(inp_path) 33 | try: 34 | name = os.path.basename(inp_path) 35 | audio = load_audio(inp_path, 32000) 36 | # print(audio.shape) 37 | for chunk, start, end in slicer.slice(audio): # start和end是帧数 38 | tmp_max = np.abs(chunk).max() 39 | if tmp_max > 1: 40 | chunk /= tmp_max 41 | chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk 42 | wavfile.write( 43 | "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), 44 | 32000, 45 | # chunk.astype(np.float32), 46 | (chunk * 32767).astype(np.int16), 47 | ) 48 | except: 49 | print(inp_path, "->fail->", traceback.format_exc()) 50 | return "执行完毕,请检查输出文件" 51 | 52 | 53 | print(slice(*sys.argv[1:])) 54 | -------------------------------------------------------------------------------- /tools/uvr5/bs_roformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/tools/uvr5/bs_roformer/__init__.py -------------------------------------------------------------------------------- /tools/uvr5/bs_roformer/attend.py: -------------------------------------------------------------------------------- 1 | from packaging import version 2 | import torch 3 | from torch import nn, einsum 4 | import torch.nn.functional as F 5 | 6 | 7 | def exists(val): 8 | return val is not None 9 | 10 | 11 | def default(v, d): 12 | return v if exists(v) else d 13 | 14 | 15 | class Attend(nn.Module): 16 | def __init__(self, dropout=0.0, flash=False, scale=None): 17 | super().__init__() 18 | self.scale = scale 19 | self.dropout = dropout 20 | self.attn_dropout = nn.Dropout(dropout) 21 | 22 | self.flash = flash 23 | assert not (flash and version.parse(torch.__version__) < version.parse("2.0.0")), ( 24 | "in order to use flash attention, you must be using pytorch 2.0 or above" 25 | ) 26 | 27 | def flash_attn(self, q, k, v): 28 | # _, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device 29 | 30 | if exists(self.scale): 31 | default_scale = q.shape[-1] ** -0.5 32 | q = q * (self.scale / default_scale) 33 | 34 | # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale 35 | # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): 36 | return F.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout if self.training else 0.0) 37 | 38 | def forward(self, q, k, v): 39 | """ 40 | einstein notation 41 | b - batch 42 | h - heads 43 | n, i, j - sequence length (base sequence length, source, target) 44 | d - feature dimension 45 | """ 46 | 47 | # q_len, k_len, device = q.shape[-2], k.shape[-2], q.device 48 | 49 | scale = default(self.scale, q.shape[-1] ** -0.5) 50 | 51 | if self.flash: 52 | 
return self.flash_attn(q, k, v) 53 | 54 | # similarity 55 | 56 | sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale 57 | 58 | # attention 59 | 60 | attn = sim.softmax(dim=-1) 61 | attn = self.attn_dropout(attn) 62 | 63 | # aggregate values 64 | 65 | out = einsum("b h i j, b h j d -> b h i d", attn, v) 66 | 67 | return out 68 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): 67 | super(Decoder, self).__init__() 68 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 69 | self.dropout = nn.Dropout2d(0.1) if dropout else None 70 | 71 | def __call__(self, x, skip=None): 72 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 73 | if skip is not None: 74 | skip = spec_utils.crop_center(skip, x) 75 | x = torch.cat([x, skip], dim=1) 76 | h = self.conv(x) 77 | 78 | if self.dropout is not None: 79 | h = self.dropout(h) 80 | 81 | return h 82 | 83 | 84 | class ASPPModule(nn.Module): 85 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 86 | super(ASPPModule, self).__init__() 87 | self.conv1 = nn.Sequential( 88 | nn.AdaptiveAvgPool2d((1, None)), 89 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 90 | ) 91 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 92 | self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) 93 | self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) 94 | self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 95 | self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) 96 | 97 | def forward(self, x): 98 | _, _, h, w = x.size() 99 | feat1 = 
F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) 100 | feat2 = self.conv2(x) 101 | feat3 = self.conv3(x) 102 | feat4 = self.conv4(x) 103 | feat5 = self.conv5(x) 104 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 105 | bottle = self.bottleneck(out) 106 | return bottle 107 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): 67 | super(Decoder, self).__init__() 68 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 69 | self.dropout = nn.Dropout2d(0.1) if dropout else None 70 | 71 | def __call__(self, x, skip=None): 72 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 73 | if skip is not None: 74 | skip = spec_utils.crop_center(skip, x) 75 | x = torch.cat([x, skip], dim=1) 76 | h = self.conv(x) 77 | 78 | if self.dropout is not None: 79 | h = self.dropout(h) 80 | 81 | return h 82 | 83 | 84 | class ASPPModule(nn.Module): 85 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 86 | super(ASPPModule, self).__init__() 87 | self.conv1 = nn.Sequential( 88 | nn.AdaptiveAvgPool2d((1, None)), 89 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 90 | ) 91 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 92 | self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) 93 | self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) 94 | self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 95 | self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) 96 | 97 | def forward(self, x): 98 | _, _, h, w = x.size() 99 | feat1 = F.interpolate(self.conv1(x), 
size=(h, w), mode="bilinear", align_corners=True) 100 | feat2 = self.conv2(x) 101 | feat3 = self.conv3(x) 102 | feat4 = self.conv4(x) 103 | feat5 = self.conv5(x) 104 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 105 | bottle = self.bottleneck(out) 106 | return bottle 107 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): 67 | super(Decoder, self).__init__() 68 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 69 | self.dropout = nn.Dropout2d(0.1) if dropout else None 70 | 71 | def __call__(self, x, skip=None): 72 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 73 | if skip is not None: 74 | skip = spec_utils.crop_center(skip, x) 75 | x = torch.cat([x, skip], dim=1) 76 | h = self.conv(x) 77 | 78 | if self.dropout is not None: 79 | h = self.dropout(h) 80 | 81 | return h 82 | 83 | 84 | class ASPPModule(nn.Module): 85 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 86 | super(ASPPModule, self).__init__() 87 | self.conv1 = nn.Sequential( 88 | nn.AdaptiveAvgPool2d((1, None)), 89 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 90 | ) 91 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 92 | self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) 93 | self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) 94 | self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 95 | self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) 96 | 97 | def forward(self, x): 98 | _, _, h, w = x.size() 99 | feat1 = F.interpolate(self.conv1(x), size=(h, w), 
mode="bilinear", align_corners=True) 100 | feat2 = self.conv2(x) 101 | feat3 = self.conv3(x) 102 | feat4 = self.conv4(x) 103 | feat5 = self.conv5(x) 104 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 105 | bottle = self.bottleneck(out) 106 | return bottle 107 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): 67 | super(Decoder, self).__init__() 68 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 69 | self.dropout = nn.Dropout2d(0.1) if dropout else None 70 | 71 | def __call__(self, x, skip=None): 72 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 73 | if skip is not None: 74 | skip = spec_utils.crop_center(skip, x) 75 | x = torch.cat([x, skip], dim=1) 76 | h = self.conv(x) 77 | 78 | if self.dropout is not None: 79 | h = self.dropout(h) 80 | 81 | return h 82 | 83 | 84 | class ASPPModule(nn.Module): 85 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 86 | super(ASPPModule, self).__init__() 87 | self.conv1 = nn.Sequential( 88 | nn.AdaptiveAvgPool2d((1, None)), 89 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 90 | ) 91 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 92 | self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) 93 | self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) 94 | self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 95 | self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 96 | self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) 97 | self.bottleneck = 
nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) 98 | 99 | def forward(self, x): 100 | _, _, h, w = x.size() 101 | feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) 102 | feat2 = self.conv2(x) 103 | feat3 = self.conv3(x) 104 | feat4 = self.conv4(x) 105 | feat5 = self.conv5(x) 106 | feat6 = self.conv6(x) 107 | feat7 = self.conv7(x) 108 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 109 | bottle = self.bottleneck(out) 110 | return bottle 111 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class Encoder(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 31 | super(Encoder, self).__init__() 32 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) 33 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 34 | 35 | def __call__(self, x): 36 | h = self.conv1(x) 37 | h = self.conv2(h) 38 | 39 | return h 40 | 41 | 42 | class Decoder(nn.Module): 43 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): 44 | super(Decoder, self).__init__() 45 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 46 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 47 | self.dropout = nn.Dropout2d(0.1) if dropout else None 48 | 49 | def __call__(self, x, skip=None): 50 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 51 | 52 | if skip is not None: 53 | skip = spec_utils.crop_center(skip, x) 54 | x = torch.cat([x, skip], dim=1) 55 | 56 | h = self.conv1(x) 57 | # h = self.conv2(h) 58 | 59 | if self.dropout is not None: 60 | h = self.dropout(h) 61 | 62 | return h 63 | 64 | 65 | class ASPPModule(nn.Module): 66 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): 67 | super(ASPPModule, self).__init__() 68 | self.conv1 = nn.Sequential( 69 | nn.AdaptiveAvgPool2d((1, None)), 70 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), 71 | ) 72 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) 73 | self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ) 74 | self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ) 75 | self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ) 76 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) 77 | self.dropout = nn.Dropout2d(0.1) if dropout else None 78 | 79 | def forward(self, x): 80 | _, _, h, w = x.size() 81 | feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) 82 | feat2 = self.conv2(x) 83 | feat3 = self.conv3(x) 84 | feat4 = self.conv4(x) 85 | feat5 = self.conv5(x) 86 | out = torch.cat((feat1, feat2, feat3, feat4, 
feat5), dim=1) 87 | out = self.bottleneck(out) 88 | 89 | if self.dropout is not None: 90 | out = self.dropout(out) 91 | 92 | return out 93 | 94 | 95 | class LSTMModule(nn.Module): 96 | def __init__(self, nin_conv, nin_lstm, nout_lstm): 97 | super(LSTMModule, self).__init__() 98 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) 99 | self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True) 100 | self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()) 101 | 102 | def forward(self, x): 103 | N, _, nbins, nframes = x.size() 104 | h = self.conv(x)[:, 0] # N, nbins, nframes 105 | h = h.permute(2, 0, 1) # nframes, N, nbins 106 | h, _ = self.lstm(h) 107 | h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins 108 | h = h.reshape(nframes, N, 1, nbins) 109 | h = h.permute(1, 2, 3, 0) 110 | 111 | return h 112 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | 4 | default_param = {} 5 | default_param["bins"] = 768 6 | default_param["unstable_bins"] = 9 # training only 7 | default_param["reduction_bins"] = 762 # training only 8 | default_param["sr"] = 44100 9 | default_param["pre_filter_start"] = 757 10 | default_param["pre_filter_stop"] = 768 11 | default_param["band"] = {} 12 | 13 | 14 | default_param["band"][1] = { 15 | "sr": 11025, 16 | "hl": 128, 17 | "n_fft": 960, 18 | "crop_start": 0, 19 | "crop_stop": 245, 20 | "lpf_start": 61, # inference only 21 | "res_type": "polyphase", 22 | } 23 | 24 | default_param["band"][2] = { 25 | "sr": 44100, 26 | "hl": 512, 27 | "n_fft": 1536, 28 | "crop_start": 24, 29 | "crop_stop": 547, 30 | "hpf_start": 81, # inference only 31 | "res_type": "sinc_best", 32 | } 33 | 34 | 35 | def int_keys(d): 36 | r = {} 37 | for k, v in d: 38 | if k.isdigit(): 39 | k = int(k) 40 | r[k] = v 41 | return r 42 | 43 | 44 | class ModelParameters(object): 45 | def __init__(self, config_path=""): 46 | if ".pth" == pathlib.Path(config_path).suffix: 47 | import zipfile 48 | 49 | with zipfile.ZipFile(config_path, "r") as zip: 50 | self.param = json.loads(zip.read("param.json"), object_pairs_hook=int_keys) 51 | elif ".json" == pathlib.Path(config_path).suffix: 52 | with open(config_path, "r") as f: 53 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 54 | else: 55 | self.param = default_param 56 | 57 | for k in [ 58 | "mid_side", 59 | "mid_side_b", 60 | "mid_side_b2", 61 | "stereo_w", 62 | "stereo_n", 63 | "reverse", 64 | ]: 65 | if k not in self.param: 66 | self.param[k] = False 67 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 
0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 
12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 
14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 
33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | 
"sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | 
"1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def load_data(file_name: str = 
"./lib/name_params.json") -> dict: 9 | with open(file_name, "r") as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def make_padding(width, cropsize, offset): 16 | left = offset 17 | roi_size = cropsize - left * 2 18 | if roi_size == 0: 19 | roi_size = cropsize 20 | right = roi_size - (width % roi_size) + left 21 | 22 | return left, right, roi_size 23 | 24 | 25 | def inference(X_spec, device, model, aggressiveness, data): 26 | """ 27 | data : dic configs 28 | """ 29 | 30 | def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True): 31 | model.eval() 32 | with torch.no_grad(): 33 | preds = [] 34 | 35 | iterations = [n_window] 36 | 37 | total_iterations = sum(iterations) 38 | for i in tqdm(range(n_window)): 39 | start = i * roi_size 40 | X_mag_window = X_mag_pad[None, :, :, start : start + data["window_size"]] 41 | X_mag_window = torch.from_numpy(X_mag_window) 42 | if is_half: 43 | X_mag_window = X_mag_window.half() 44 | X_mag_window = X_mag_window.to(device) 45 | 46 | pred = model.predict(X_mag_window, aggressiveness) 47 | 48 | pred = pred.detach().cpu().numpy() 49 | preds.append(pred[0]) 50 | 51 | pred = np.concatenate(preds, axis=2) 52 | return pred 53 | 54 | def preprocess(X_spec): 55 | X_mag = np.abs(X_spec) 56 | X_phase = np.angle(X_spec) 57 | 58 | return X_mag, X_phase 59 | 60 | X_mag, X_phase = preprocess(X_spec) 61 | 62 | coef = X_mag.max() 63 | X_mag_pre = X_mag / coef 64 | 65 | n_frame = X_mag_pre.shape[2] 66 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) 67 | n_window = int(np.ceil(n_frame / roi_size)) 68 | 69 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 70 | 71 | if list(model.state_dict().values())[0].dtype == torch.float16: 72 | is_half = True 73 | else: 74 | is_half = False 75 | pred = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half) 76 | pred = pred[:, :, :n_frame] 77 | 78 | if data["tta"]: 79 | pad_l += roi_size // 2 80 | pad_r += roi_size // 2 81 | n_window += 1 82 | 83 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 84 | 85 | pred_tta = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half) 86 | pred_tta = pred_tta[:, :, roi_size // 2 :] 87 | pred_tta = pred_tta[:, :, :n_frame] 88 | 89 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) 90 | else: 91 | return pred * coef, X_mag, np.exp(1.0j * X_phase) 92 | 93 | 94 | def _get_name_params(model_path, model_hash): 95 | data = load_data() 96 | flag = False 97 | ModelName = model_path 98 | for type in list(data): 99 | for model in list(data[type][0]): 100 | for i in range(len(data[type][0][model])): 101 | if str(data[type][0][model][i]["hash_name"]) == model_hash: 102 | flag = True 103 | elif str(data[type][0][model][i]["hash_name"]) in ModelName: 104 | flag = True 105 | 106 | if flag: 107 | model_params_auto = data[type][0][model][i]["model_params"] 108 | param_name_auto = data[type][0][model][i]["param_name"] 109 | if type == "equivalent": 110 | return param_name_auto, model_params_auto 111 | else: 112 | flag = False 113 | return param_name_auto, model_params_auto 114 | -------------------------------------------------------------------------------- /tools/uvr5/uvr5_weights/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | --------------------------------------------------------------------------------