├── .dockerignore
├── .github
│   ├── build_windows_packages.ps1
│   └── workflows
│       ├── build_windows_packages.yaml
│       └── docker-publish.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── Colab-Inference.ipynb
├── Colab-WebUI.ipynb
├── Docker
│   ├── install_wrapper.sh
│   └── miniconda_install.sh
├── Dockerfile
├── GPT_SoVITS
│   ├── AR
│   │   ├── __init__.py
│   │   ├── data
│   │   │   ├── __init__.py
│   │   │   ├── bucket_sampler.py
│   │   │   ├── data_module.py
│   │   │   └── dataset.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── t2s_lightning_module.py
│   │   │   ├── t2s_lightning_module_onnx.py
│   │   │   ├── t2s_model.py
│   │   │   ├── t2s_model_onnx.py
│   │   │   └── utils.py
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   ├── activation.py
│   │   │   ├── activation_onnx.py
│   │   │   ├── embedding.py
│   │   │   ├── embedding_onnx.py
│   │   │   ├── lr_schedulers.py
│   │   │   ├── optim.py
│   │   │   ├── patched_mha_with_cache.py
│   │   │   ├── patched_mha_with_cache_onnx.py
│   │   │   ├── scaling.py
│   │   │   ├── transformer.py
│   │   │   └── transformer_onnx.py
│   │   ├── text_processing
│   │   │   ├── __init__.py
│   │   │   ├── phonemizer.py
│   │   │   └── symbols.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── initialize.py
│   │       └── io.py
│   ├── BigVGAN
│   │   ├── LICENSE
│   │   ├── README.md
│   │   ├── activations.py
│   │   ├── alias_free_activation
│   │   │   ├── cuda
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activation1d.py
│   │   │   │   ├── anti_alias_activation.cpp
│   │   │   │   ├── anti_alias_activation_cuda.cu
│   │   │   │   ├── build
│   │   │   │   │   └── _
│   │   │   │   ├── compat.h
│   │   │   │   ├── load.py
│   │   │   │   └── type_shim.h
│   │   │   └── torch
│   │   │       ├── __init__.py
│   │   │       ├── act.py
│   │   │       ├── filter.py
│   │   │       └── resample.py
│   │   ├── bigvgan.py
│   │   ├── configs
│   │   │   ├── bigvgan_22khz_80band.json
│   │   │   ├── bigvgan_24khz_100band.json
│   │   │   ├── bigvgan_base_22khz_80band.json
│   │   │   ├── bigvgan_base_24khz_100band.json
│   │   │   ├── bigvgan_v2_22khz_80band_256x.json
│   │   │   ├── bigvgan_v2_22khz_80band_fmax8k_256x.json
│   │   │   ├── bigvgan_v2_24khz_100band_256x.json
│   │   │   ├── bigvgan_v2_44khz_128band_256x.json
│   │   │   └── bigvgan_v2_44khz_128band_512x.json
│   │   ├── discriminators.py
│   │   ├── env.py
│   │   ├── incl_licenses
│   │   │   ├── LICENSE_1
│   │   │   ├── LICENSE_2
│   │   │   ├── LICENSE_3
│   │   │   ├── LICENSE_4
│   │   │   ├── LICENSE_5
│   │   │   ├── LICENSE_6
│   │   │   ├── LICENSE_7
│   │   │   └── LICENSE_8
│   │   ├── inference.py
│   │   ├── inference_e2e.py
│   │   ├── loss.py
│   │   ├── meldataset.py
│   │   ├── nv-modelcard++
│   │   │   ├── .gitkeep
│   │   │   ├── bias.md
│   │   │   ├── explainability.md
│   │   │   ├── overview.md
│   │   │   ├── privacy.md
│   │   │   └── safety.md
│   │   ├── requirements.txt
│   │   ├── tests
│   │   │   ├── test_activation.py
│   │   │   ├── test_activation_snake_beta.py
│   │   │   └── test_cuda_vs_torch_model.py
│   │   ├── train.py
│   │   └── utils0.py
│   ├── TTS_infer_pack
│   │   ├── TTS.py
│   │   ├── TextPreprocessor.py
│   │   ├── __init__.py
│   │   └── text_segmentation_method.py
│   ├── configs
│   │   ├── .gitignore
│   │   ├── s1.yaml
│   │   ├── s1big.yaml
│   │   ├── s1big2.yaml
│   │   ├── s1longer-v2.yaml
│   │   ├── s1longer.yaml
│   │   ├── s1mq.yaml
│   │   ├── s2.json
│   │   ├── train.yaml
│   │   └── tts_infer.yaml
│   ├── download.py
│   ├── export_torch_script.py
│   ├── export_torch_script_v3.py
│   ├── f5_tts
│   │   └── model
│   │       ├── __init__.py
│   │       ├── backbones
│   │       │   ├── README.md
│   │       │   ├── dit.py
│   │       │   ├── mmdit.py
│   │       │   └── unett.py
│   │       └── modules.py
│   ├── feature_extractor
│   │   ├── __init__.py
│   │   ├── cnhubert.py
│   │   └── whisper_enc.py
│   ├── inference_cli.py
│   ├── inference_gui.py
│   ├── inference_webui.py
│   ├── inference_webui_fast.py
│   ├── module
│   │   ├── __init__.py
│   │   ├── attentions.py
│   │   ├── attentions_onnx.py
│   │   ├── commons.py
│   │   ├── core_vq.py
│   │   ├── data_utils.py
│   │   ├── losses.py
│   │   ├── mel_processing.py
│   │   ├── models.py
│   │   ├── models_onnx.py
│   │   ├── modules.py
│   │   ├── mrte_model.py
│   │   ├── quantize.py
│   │   └── transforms.py
│   ├── onnx_export.py
│   ├── prepare_datasets
│   │   ├── 1-get-text.py
│   │   ├── 2-get-hubert-wav32k.py
│   │   └── 3-get-semantic.py
│   ├── pretrained_models
│   │   └── .gitignore
│   ├── process_ckpt.py
│   ├── s1_train.py
│   ├── s2_train.py
│   ├── s2_train_v3.py
│   ├── s2_train_v3_lora.py
│   ├── text
│   │   ├── .gitignore
│   │   ├── LangSegmenter
│   │   │   ├── __init__.py
│   │   │   └── langsegmenter.py
│   │   ├── __init__.py
│   │   ├── cantonese.py
│   │   ├── chinese.py
│   │   ├── chinese2.py
│   │   ├── cleaner.py
│   │   ├── cmudict-fast.rep
│   │   ├── cmudict.rep
│   │   ├── en_normalization
│   │   │   └── expend.py
│   │   ├── engdict-hot.rep
│   │   ├── engdict_cache.pickle
│   │   ├── english.py
│   │   ├── g2pw
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   ├── g2pw.py
│   │   │   ├── onnx_api.py
│   │   │   ├── polyphonic-fix.rep
│   │   │   ├── polyphonic.pickle
│   │   │   ├── polyphonic.rep
│   │   │   └── utils.py
│   │   ├── ja_userdic
│   │   │   └── userdict.csv
│   │   ├── japanese.py
│   │   ├── korean.py
│   │   ├── namedict_cache.pickle
│   │   ├── opencpop-strict.txt
│   │   ├── symbols.py
│   │   ├── symbols2.py
│   │   ├── tone_sandhi.py
│   │   └── zh_normalization
│   │       ├── README.md
│   │       ├── __init__.py
│   │       ├── char_convert.py
│   │       ├── chronology.py
│   │       ├── constants.py
│   │       ├── num.py
│   │       ├── phonecode.py
│   │       ├── quantifier.py
│   │       └── text_normlization.py
│   └── utils.py
├── LICENSE
├── README.md
├── api.py
├── api_v2.py
├── config.py
├── docker-compose.yaml
├── docker_build.sh
├── docs
│   ├── cn
│   │   ├── Changelog_CN.md
│   │   └── README.md
│   ├── en
│   │   └── Changelog_EN.md
│   ├── ja
│   │   ├── Changelog_JA.md
│   │   └── README.md
│   ├── ko
│   │   ├── Changelog_KO.md
│   │   └── README.md
│   └── tr
│       ├── Changelog_TR.md
│       └── README.md
├── extra-req.txt
├── go-webui.bat
├── go-webui.ps1
├── gpt-sovits_kaggle.ipynb
├── install.sh
├── requirements.txt
├── tools
│   ├── AP_BWE_main
│   │   ├── 24kto48k
│   │   │   └── readme.txt
│   │   ├── LICENSE
│   │   ├── README.md
│   │   ├── datasets1
│   │   │   ├── __init__.py
│   │   │   └── dataset.py
│   │   └── models
│   │       ├── __init__.py
│   │       └── model.py
│   ├── __init__.py
│   ├── asr
│   │   ├── config.py
│   │   ├── fasterwhisper_asr.py
│   │   ├── funasr_asr.py
│   │   └── models
│   │       └── .gitignore
│   ├── audio_sr.py
│   ├── cmd-denoise.py
│   ├── denoise-model
│   │   └── .gitignore
│   ├── i18n
│   │   ├── i18n.py
│   │   ├── locale
│   │   │   ├── en_US.json
│   │   │   ├── es_ES.json
│   │   │   ├── fr_FR.json
│   │   │   ├── it_IT.json
│   │   │   ├── ja_JP.json
│   │   │   ├── ko_KR.json
│   │   │   ├── pt_BR.json
│   │   │   ├── ru_RU.json
│   │   │   ├── tr_TR.json
│   │   │   ├── zh_CN.json
│   │   │   ├── zh_HK.json
│   │   │   ├── zh_SG.json
│   │   │   └── zh_TW.json
│   │   └── scan_i18n.py
│   ├── my_utils.py
│   ├── slice_audio.py
│   ├── slicer2.py
│   ├── subfix_webui.py
│   └── uvr5
│       ├── bs_roformer
│       │   ├── __init__.py
│       │   ├── attend.py
│       │   ├── bs_roformer.py
│       │   └── mel_band_roformer.py
│       ├── bsroformer.py
│       ├── lib
│       │   ├── lib_v5
│       │   │   ├── dataset.py
│       │   │   ├── layers.py
│       │   │   ├── layers_123812KB.py
│       │   │   ├── layers_123821KB.py
│       │   │   ├── layers_33966KB.py
│       │   │   ├── layers_537227KB.py
│       │   │   ├── layers_537238KB.py
│       │   │   ├── layers_new.py
│       │   │   ├── model_param_init.py
│       │   │   ├── modelparams
│       │   │   │   ├── 1band_sr16000_hl512.json
│       │   │   │   ├── 1band_sr32000_hl512.json
│       │   │   │   ├── 1band_sr33075_hl384.json
│       │   │   │   ├── 1band_sr44100_hl1024.json
│       │   │   │   ├── 1band_sr44100_hl256.json
│       │   │   │   ├── 1band_sr44100_hl512.json
│       │   │   │   ├── 1band_sr44100_hl512_cut.json
│       │   │   │   ├── 2band_32000.json
│       │   │   │   ├── 2band_44100_lofi.json
│       │   │   │   ├── 2band_48000.json
│       │   │   │   ├── 3band_44100.json
│       │   │   │   ├── 3band_44100_mid.json
│       │   │   │   ├── 3band_44100_msb2.json
│       │   │   │   ├── 4band_44100.json
│       │   │   │   ├── 4band_44100_mid.json
│       │   │   │   ├── 4band_44100_msb.json
│       │   │   │   ├── 4band_44100_msb2.json
│       │   │   │   ├── 4band_44100_reverse.json
│       │   │   │   ├── 4band_44100_sw.json
│       │   │   │   ├── 4band_v2.json
│       │   │   │   ├── 4band_v2_sn.json
│       │   │   │   ├── 4band_v3.json
│       │   │   │   └── ensemble.json
│       │   │   ├── nets.py
│       │   │   ├── nets_123812KB.py
│       │   │   ├── nets_123821KB.py
│       │   │   ├── nets_33966KB.py
│       │   │   ├── nets_537227KB.py
│       │   │   ├── nets_537238KB.py
│       │   │   ├── nets_61968KB.py
│       │   │   ├── nets_new.py
│       │   │   └── spec_utils.py
│       │   ├── name_params.json
│       │   └── utils.py
│       ├── mdxnet.py
│       ├── uvr5_weights
│       │   └── .gitignore
│       ├── vr.py
│       └── webui.py
└── webui.py
/.github/workflows/build_windows_packages.yaml:
--------------------------------------------------------------------------------
1 | name: Build and Upload Windows Package
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | date:
7 | description: "Date suffix (optional)"
8 | required: false
9 | default: ""
10 | suffix:
11 | description: "Package name suffix (optional)"
12 | required: false
13 | default: ""
14 |
15 | jobs:
16 | build:
17 | runs-on: windows-latest
18 | strategy:
19 | matrix:
20 | torch_cuda: [cu124, cu128]
21 | env:
22 | TORCH_CUDA: ${{ matrix.torch_cuda }}
23 | MODELSCOPE_USERNAME: ${{ secrets.MODELSCOPE_USERNAME }}
24 | MODELSCOPE_TOKEN: ${{ secrets.MODELSCOPE_TOKEN }}
25 | HUGGINGFACE_USERNAME: ${{ secrets.HUGGINGFACE_USERNAME }}
26 | HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
27 | DATE_SUFFIX: ${{ github.event.inputs.date }}
28 | PKG_SUFFIX: ${{ github.event.inputs.suffix }}
29 |
30 | steps:
31 | - name: Checkout
32 | uses: actions/checkout@v4
33 |
34 | - name: Run Build and Upload Script
35 | shell: pwsh
36 | run: |
37 | Move-Item .github/build_windows_packages.ps1 ../build_windows_packages.ps1
38 | ../build_windows_packages.ps1
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | ci:
2 | autoupdate_schedule: monthly
3 |
4 | repos:
5 | - repo: https://github.com/astral-sh/ruff-pre-commit
6 | rev: v0.11.7
7 | hooks:
8 | # Run the linter.
9 | - id: ruff
10 | types_or: [ python, pyi ]
11 | args: [ --fix ]
12 | # Run the formatter.
13 | - id: ruff-format
14 | types_or: [ python, pyi ]
15 | args: [ --line-length, "120", --target-version, "py310" ]
16 |
--------------------------------------------------------------------------------
/Colab-WebUI.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "view-in-github"
8 | },
9 | "source": [
10 | "
"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "# GPT-SoVITS WebUI"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "_o6a8GS2lWQM"
24 | },
25 | "source": [
26 | "## Env Setup (Run Once Only)\n",
27 | "## 环境配置, 只需运行一次"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "### 1."
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "%%writefile /content/setup.sh\n",
44 | "set -e\n",
45 | "\n",
46 | "cd /content\n",
47 | "\n",
48 | "git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
49 | "\n",
50 | "cd GPT-SoVITS\n",
51 | "\n",
52 | "if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
53 | " :\n",
54 | "else\n",
55 | " conda create -n GPTSoVITS python=3.10 -y\n",
56 | "fi\n",
57 | "\n",
58 | "source activate GPTSoVITS\n",
59 | "\n",
60 | "pip install ipykernel\n",
61 | "\n",
62 | "bash install.sh --device CU126 --source HF --download-uvr5"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "### 2."
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "%pip install -q condacolab\n",
79 | "import condacolab\n",
80 | "condacolab.install_from_url(\"https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh\")\n",
81 | "!cd /content && bash setup.sh"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "## Launch WebUI\n",
89 | "## 启动 WebUI"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "id": "4oRGUzkrk8C7"
97 | },
98 | "outputs": [],
99 | "source": [
100 | "!cd /content/GPT-SoVITS && source activate GPTSoVITS && export is_share=True && python webui.py"
101 | ]
102 | }
103 | ],
104 | "metadata": {
105 | "accelerator": "GPU",
106 | "colab": {
107 | "include_colab_link": true,
108 | "provenance": []
109 | },
110 | "kernelspec": {
111 | "display_name": "Python 3",
112 | "name": "python3"
113 | }
114 | },
115 | "nbformat": 4,
116 | "nbformat_minor": 0
117 | }
118 |
--------------------------------------------------------------------------------
/Docker/install_wrapper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
4 |
5 | cd "$SCRIPT_DIR" || exit 1
6 |
7 | cd .. || exit 1
8 |
9 | set -e
10 |
11 | source "$HOME/miniconda3/etc/profile.d/conda.sh"
12 |
13 | mkdir -p GPT_SoVITS
14 |
15 | mkdir -p GPT_SoVITS/text
16 |
17 | ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
18 |
19 | ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
20 |
21 | bash install.sh --device "CU${CUDA_VERSION//./}" --source HF
22 |
23 | pip cache purge
24 |
25 | pip show torch
26 |
27 | rm -rf /tmp/* /var/tmp/*
28 |
29 | rm -rf "$HOME/miniconda3/pkgs"
30 |
31 | mkdir -p "$HOME/miniconda3/pkgs"
32 |
33 | rm -rf /root/.conda /root/.cache
34 |
--------------------------------------------------------------------------------
/Docker/miniconda_install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
6 |
7 | cd "$SCRIPT_DIR" || exit 1
8 |
9 | cd .. || exit 1
10 |
11 | if [ -d "$HOME/miniconda3" ]; then
12 | exit 0
13 | fi
14 |
15 | WORKFLOW=${WORKFLOW:-"false"}
16 | TARGETPLATFORM=${TARGETPLATFORM:-"linux/amd64"}
17 |
18 | if [ "$WORKFLOW" = "true" ]; then
19 | WGET_CMD=(wget -nv --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404)
20 | else
21 | WGET_CMD=(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404)
22 | fi
23 |
24 | if [ "$TARGETPLATFORM" = "linux/amd64" ]; then
25 | "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-x86_64.sh
26 | elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then
27 | "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-aarch64.sh
28 | else
29 | exit 1
30 | fi
31 |
32 | LOG_PATH="/tmp/miniconda-install.log"
33 |
34 | bash miniconda.sh -b -p "$HOME/miniconda3" >"$LOG_PATH" 2>&1
35 |
36 | if [ $? -eq 0 ]; then
37 | echo "== Miniconda Installed =="
38 | else
39 | echo "Failed to Install miniconda"
40 | tail -n 50 "$LOG_PATH"
41 | exit 1
42 | fi
43 |
44 | rm miniconda.sh
45 |
46 | source "$HOME/miniconda3/etc/profile.d/conda.sh"
47 |
48 | "$HOME/miniconda3/bin/conda" config --add channels conda-forge
49 |
50 | "$HOME/miniconda3/bin/conda" update -q --all -y 1>/dev/null
51 |
52 | "$HOME/miniconda3/bin/conda" install python=3.11 -q -y
53 |
54 | "$HOME/miniconda3/bin/conda" install gcc=14 gxx ffmpeg cmake make unzip -q -y
55 |
56 | if [ "$CUDA_VERSION" = "12.8" ]; then
57 | "$HOME/miniconda3/bin/pip" install torch torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu128
58 | elif [ "$CUDA_VERSION" = "12.6" ]; then
59 | "$HOME/miniconda3/bin/pip" install torch==2.6 torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu126
60 | fi
61 |
62 | "$HOME/miniconda3/bin/pip" cache purge
63 |
64 | rm $LOG_PATH
65 |
66 | rm -rf "$HOME/miniconda3/pkgs"
67 |
68 | mkdir -p "$HOME/miniconda3/pkgs"
69 |
70 | rm -rf "$HOME/.conda" "$HOME/.cache"
71 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | ARG CUDA_VERSION=12.6
2 | ARG TORCH_BASE=full
3 |
4 | FROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE}
5 |
6 | LABEL maintainer="XXXXRT"
7 | LABEL version="V4"
8 | LABEL description="Docker image for GPT-SoVITS"
9 |
10 | ARG CUDA_VERSION=12.6
11 |
12 | ENV CUDA_VERSION=${CUDA_VERSION}
13 |
14 | SHELL ["/bin/bash", "-c"]
15 |
16 | WORKDIR /workspace/GPT-SoVITS
17 |
18 | COPY Docker /workspace/GPT-SoVITS/Docker/
19 |
20 | ARG LITE=false
21 | ENV LITE=${LITE}
22 |
23 | ARG WORKFLOW=false
24 | ENV WORKFLOW=${WORKFLOW}
25 |
26 | ARG TARGETPLATFORM
27 | ENV TARGETPLATFORM=${TARGETPLATFORM}
28 |
29 | RUN bash Docker/miniconda_install.sh
30 |
31 | COPY extra-req.txt /workspace/GPT-SoVITS/
32 |
33 | COPY requirements.txt /workspace/GPT-SoVITS/
34 |
35 | COPY install.sh /workspace/GPT-SoVITS/
36 |
37 | RUN bash Docker/install_wrapper.sh
38 |
39 | EXPOSE 9871 9872 9873 9874 9880
40 |
41 | ENV PYTHONPATH="/workspace/GPT-SoVITS"
42 |
43 | RUN conda init bash && echo "conda activate base" >> ~/.bashrc
44 |
45 | WORKDIR /workspace
46 |
47 | RUN rm -rf /workspace/GPT-SoVITS
48 |
49 | WORKDIR /workspace/GPT-SoVITS
50 |
51 | COPY . /workspace/GPT-SoVITS
52 |
53 | CMD ["/bin/bash", "-c", "\
54 | rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
55 | rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
56 | rm -rf /workspace/GPT-SoVITS/tools/asr/models && \
57 | rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
58 | ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
59 | ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
60 | ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \
61 | ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
62 | exec bash"]
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/data/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/data/data_module.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | from pytorch_lightning import LightningDataModule
4 | from torch.utils.data import DataLoader
5 |
6 | from AR.data.bucket_sampler import DistributedBucketSampler
7 | from AR.data.dataset import Text2SemanticDataset
8 |
9 |
10 | class Text2SemanticDataModule(LightningDataModule):
11 | def __init__(
12 | self,
13 | config,
14 | train_semantic_path,
15 | train_phoneme_path,
16 | dev_semantic_path=None,
17 | dev_phoneme_path=None,
18 | ):
19 | super().__init__()
20 | self.config = config
21 | self.train_semantic_path = train_semantic_path
22 | self.train_phoneme_path = train_phoneme_path
23 | self.dev_semantic_path = dev_semantic_path
24 | self.dev_phoneme_path = dev_phoneme_path
25 | self.num_workers = self.config["data"]["num_workers"]
26 |
27 | def prepare_data(self):
28 | pass
29 |
30 | def setup(self, stage=None, output_logs=False):
31 | self._train_dataset = Text2SemanticDataset(
32 | phoneme_path=self.train_phoneme_path,
33 | semantic_path=self.train_semantic_path,
34 | max_sec=self.config["data"]["max_sec"],
35 | pad_val=self.config["data"]["pad_val"],
36 | )
37 | self._dev_dataset = self._train_dataset
38 | # self._dev_dataset = Text2SemanticDataset(
39 | # phoneme_path=self.dev_phoneme_path,
40 | # semantic_path=self.dev_semantic_path,
41 | # max_sample=self.config['data']['max_eval_sample'],
42 | # max_sec=self.config['data']['max_sec'],
43 | # pad_val=self.config['data']['pad_val'])
44 |
45 | def train_dataloader(self):
46 | batch_size = (
47 | self.config["train"]["batch_size"] // 2
48 | if self.config["train"].get("if_dpo", False) is True
49 | else self.config["train"]["batch_size"]
50 | )
 51 |         batch_size = max(min(batch_size, len(self._train_dataset) // 4), 1)  # prevent the run from never saving a checkpoint
52 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size)
53 | return DataLoader(
54 | self._train_dataset,
55 | batch_size=batch_size,
56 | sampler=sampler,
57 | collate_fn=self._train_dataset.collate,
58 | num_workers=self.num_workers,
59 | persistent_workers=True,
60 | prefetch_factor=16,
61 | )
62 |
63 | def val_dataloader(self):
64 | return DataLoader(
65 | self._dev_dataset,
66 | batch_size=1,
67 | shuffle=False,
68 | collate_fn=self._train_dataset.collate,
69 | num_workers=max(self.num_workers, 12),
70 | persistent_workers=True,
71 | prefetch_factor=16,
72 | )
73 |
 74 |     # Is this ever actually used?
75 | def test_dataloader(self):
76 | return DataLoader(
77 | self._dev_dataset,
78 | batch_size=1,
79 | shuffle=False,
80 | collate_fn=self._train_dataset.collate,
81 | )
82 |
--------------------------------------------------------------------------------
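The DataModule above wires `Text2SemanticDataset` and `DistributedBucketSampler` together from a nested config dict. A minimal construction sketch follows; the paths and config values are placeholders (not the repo's defaults), and the keys simply mirror the ones the class reads in `__init__`, `setup`, and `train_dataloader`. In the repo itself, `setup()` and the dataloaders are driven by the Lightning Trainer in `s1_train.py`.

```python
# Hypothetical usage sketch for Text2SemanticDataModule; paths and values are placeholders.
from AR.data.data_module import Text2SemanticDataModule

config = {
    "data": {"num_workers": 4, "max_sec": 54, "pad_val": 1024},  # keys read in __init__/setup()
    "train": {"batch_size": 8, "if_dpo": False},                 # if_dpo=True halves the batch size
}

dm = Text2SemanticDataModule(
    config,
    train_semantic_path="logs/exp/name2semantic.tsv",  # placeholder path
    train_phoneme_path="logs/exp/name2text.txt",       # placeholder path
)
# The Lightning Trainer then calls dm.setup() and dm.train_dataloader(),
# which pairs the dataset's collate function with the bucket sampler.
```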
/GPT_SoVITS/AR/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/models/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | import os
4 | import sys
5 |
6 | now_dir = os.getcwd()
7 | sys.path.append(now_dir)
8 | from typing import Dict
9 |
10 | import torch
11 | from pytorch_lightning import LightningModule
12 |
13 | from AR.models.t2s_model_onnx import Text2SemanticDecoder
14 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule
15 | from AR.modules.optim import ScaledAdam
16 |
17 |
18 | class Text2SemanticLightningModule(LightningModule):
19 | def __init__(self, config, output_dir, is_train=True):
20 | super().__init__()
21 | self.config = config
22 | self.top_k = 3
23 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
24 | pretrained_s1 = config.get("pretrained_s1")
25 | if pretrained_s1 and is_train:
26 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
27 | print(
28 | self.load_state_dict(
29 | torch.load(
30 | pretrained_s1,
31 | map_location="cpu",
32 | )["weight"],
33 | ),
34 | )
35 | if is_train:
36 | self.automatic_optimization = False
37 | self.save_hyperparameters()
38 | self.eval_dir = output_dir / "eval"
39 | self.eval_dir.mkdir(parents=True, exist_ok=True)
40 |
41 | def training_step(self, batch: Dict, batch_idx: int):
42 | opt = self.optimizers()
43 | scheduler = self.lr_schedulers()
44 | loss, acc = self.model.forward(
45 | batch["phoneme_ids"],
46 | batch["phoneme_ids_len"],
47 | batch["semantic_ids"],
48 | batch["semantic_ids_len"],
49 | batch["bert_feature"],
50 | )
51 | self.manual_backward(loss)
52 | if batch_idx > 0 and batch_idx % 4 == 0:
53 | opt.step()
54 | opt.zero_grad()
55 | scheduler.step()
56 |
57 | self.log(
58 | "total_loss",
59 | loss,
60 | on_step=True,
61 | on_epoch=True,
62 | prog_bar=True,
63 | sync_dist=True,
64 | )
65 | self.log(
66 | "lr",
67 | scheduler.get_last_lr()[0],
68 | on_epoch=True,
69 | prog_bar=True,
70 | sync_dist=True,
71 | )
72 | self.log(
73 | f"top_{self.top_k}_acc",
74 | acc,
75 | on_step=True,
76 | on_epoch=True,
77 | prog_bar=True,
78 | sync_dist=True,
79 | )
80 |
81 | def validation_step(self, batch: Dict, batch_idx: int):
82 | return
83 |
84 | def configure_optimizers(self):
85 | model_parameters = self.model.parameters()
86 | parameters_names = []
87 | parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()])
88 | lm_opt = ScaledAdam(
89 | model_parameters,
90 | lr=0.01,
91 | betas=(0.9, 0.95),
92 | clipping_scale=2.0,
93 | parameters_names=parameters_names,
94 | show_dominant_parameters=False,
95 | clipping_update_period=1000,
96 | )
97 |
98 | return {
99 | "optimizer": lm_opt,
100 | "lr_scheduler": {
101 | "scheduler": WarmupCosineLRSchedule(
102 | lm_opt,
103 | init_lr=self.config["optimizer"]["lr_init"],
104 | peak_lr=self.config["optimizer"]["lr"],
105 | end_lr=self.config["optimizer"]["lr_end"],
106 | warmup_steps=self.config["optimizer"]["warmup_steps"],
107 | total_steps=self.config["optimizer"]["decay_steps"],
108 | )
109 | },
110 | }
111 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/modules/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/embedding.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
2 | import math
3 |
4 | import torch
5 | from torch import nn
6 |
7 |
8 | class TokenEmbedding(nn.Module):
9 | def __init__(
10 | self,
11 | embedding_dim: int,
12 | vocab_size: int,
13 | dropout: float = 0.0,
14 | ):
15 | super().__init__()
16 |
17 | self.vocab_size = vocab_size
18 | self.embedding_dim = embedding_dim
19 |
20 | self.dropout = torch.nn.Dropout(p=dropout)
21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
22 |
23 | @property
24 | def weight(self) -> torch.Tensor:
25 | return self.word_embeddings.weight
26 |
27 | def embedding(self, index: int) -> torch.Tensor:
28 | return self.word_embeddings.weight[index : index + 1]
29 |
30 | def forward(self, x: torch.Tensor):
31 | x = self.word_embeddings(x)
32 | x = self.dropout(x)
33 | return x
34 |
35 |
36 | class SinePositionalEmbedding(nn.Module):
37 | def __init__(
38 | self,
39 | embedding_dim: int,
40 | dropout: float = 0.0,
41 | scale: bool = False,
42 | alpha: bool = False,
43 | ):
44 | super().__init__()
45 | self.embedding_dim = embedding_dim
46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
48 | self.dropout = torch.nn.Dropout(p=dropout)
49 |
50 | self.reverse = False
51 | self.pe = None
52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000))
53 |
54 | def extend_pe(self, x):
55 | """Reset the positional encodings."""
56 | if self.pe is not None:
57 | if self.pe.size(1) >= x.size(1):
58 | if self.pe.dtype != x.dtype or self.pe.device != x.device:
59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device)
60 | return
61 | pe = torch.zeros(x.size(1), self.embedding_dim)
62 | if self.reverse:
63 | position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1)
64 | else:
65 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
66 | div_term = torch.exp(
67 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embedding_dim)
68 | )
69 | pe[:, 0::2] = torch.sin(position * div_term)
70 | pe[:, 1::2] = torch.cos(position * div_term)
71 | pe = pe.unsqueeze(0)
72 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach()
73 |
74 | def forward(self, x: torch.Tensor) -> torch.Tensor:
75 | self.extend_pe(x)
76 | output = x.unsqueeze(-1) if x.ndim == 2 else x
77 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)]
78 | return self.dropout(output)
79 |
--------------------------------------------------------------------------------
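A quick shape sketch of the two modules above; the sizes are arbitrary and only illustrate the `[batch, time]` → `[batch, time, dim]` flow and the sinusoidal table that `SinePositionalEmbedding.forward` adds (scaled by the learnable `alpha`).

```python
# Illustrative shape check for TokenEmbedding + SinePositionalEmbedding (values are arbitrary).
import torch
from AR.modules.embedding import TokenEmbedding, SinePositionalEmbedding

tok = TokenEmbedding(embedding_dim=512, vocab_size=1025, dropout=0.0)
pos = SinePositionalEmbedding(embedding_dim=512, dropout=0.0, scale=False, alpha=True)

ids = torch.randint(0, 1025, (2, 100))  # [batch=2, time=100] token ids
x = tok(ids)                            # -> [2, 100, 512]
y = pos(x)                              # same shape, with alpha-scaled positional encodings added
print(x.shape, y.shape)                 # torch.Size([2, 100, 512]) twice
```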
/GPT_SoVITS/AR/modules/embedding_onnx.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
2 | import math
3 |
4 | import torch
5 | from torch import nn
6 |
7 |
8 | class TokenEmbedding(nn.Module):
9 | def __init__(
10 | self,
11 | embedding_dim: int,
12 | vocab_size: int,
13 | dropout: float = 0.0,
14 | ):
15 | super().__init__()
16 |
17 | self.vocab_size = vocab_size
18 | self.embedding_dim = embedding_dim
19 |
20 | self.dropout = torch.nn.Dropout(p=dropout)
21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
22 |
23 | @property
24 | def weight(self) -> torch.Tensor:
25 | return self.word_embeddings.weight
26 |
27 | def embedding(self, index: int) -> torch.Tensor:
28 | return self.word_embeddings.weight[index : index + 1]
29 |
30 | def forward(self, x: torch.Tensor):
31 | x = self.word_embeddings(x)
32 | x = self.dropout(x)
33 | return x
34 |
35 |
36 | class SinePositionalEmbedding(nn.Module):
37 | def __init__(
38 | self,
39 | embedding_dim: int,
40 | dropout: float = 0.0,
41 | scale: bool = False,
42 | alpha: bool = False,
43 | ):
44 | super().__init__()
45 | self.embedding_dim = embedding_dim
46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
48 | self.dropout = torch.nn.Dropout(p=dropout)
49 | self.reverse = False
50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim))
51 |
52 | def extend_pe(self, x):
53 | position = torch.cumsum(torch.ones_like(x[:, :, 0]), dim=1).transpose(0, 1)
54 | scpe = (position * self.div_term).unsqueeze(0)
55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0)
56 | pe = pe.contiguous().view(1, -1, self.embedding_dim)
57 | return pe
58 |
59 | def forward(self, x: torch.Tensor) -> torch.Tensor:
60 | pe = self.extend_pe(x)
61 | output = x.unsqueeze(-1) if x.ndim == 2 else x
62 | output = output * self.x_scale + self.alpha * pe
63 | return self.dropout(output)
64 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/lr_schedulers.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | import math
4 |
5 | import torch
6 | from matplotlib import pyplot as plt
7 | from torch import nn
8 | from torch.optim import Adam
9 |
10 |
11 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler):
12 | """
13 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers.
14 | """
15 |
16 | def __init__(
17 | self,
18 | optimizer,
19 | init_lr,
20 | peak_lr,
21 | end_lr,
22 | warmup_steps=10000,
23 | total_steps=400000,
24 | current_step=0,
25 | ):
26 | self.init_lr = init_lr
27 | self.peak_lr = peak_lr
28 | self.end_lr = end_lr
29 | self.optimizer = optimizer
30 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps
31 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps)
32 | self._current_step = current_step
33 | self.lr = init_lr
34 | self.warmup_steps = warmup_steps
35 | self.total_steps = total_steps
36 | self._last_lr = [self.lr]
37 |
38 | def set_lr(self, lr):
39 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups]
40 | for g in self.optimizer.param_groups:
41 | # g['lr'] = lr
42 | g["lr"] = self.end_lr ###锁定用线性
43 |
44 | def step(self):
45 | if self._current_step < self.warmup_steps:
46 | lr = self.init_lr + self._warmup_rate * self._current_step
47 |
48 | elif self._current_step > self.total_steps:
49 | lr = self.end_lr
50 |
51 | else:
52 | decay_ratio = (self._current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps)
53 | if decay_ratio < 0.0 or decay_ratio > 1.0:
54 | raise RuntimeError("Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings.")
55 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
56 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr)
57 |
 58 |         self.lr = lr = self.end_lr = 0.002  ### locked: use a linear (constant) LR ### the schedule misbehaves, so just hard-lock it!
59 | self.set_lr(lr)
60 | self.lr = lr
61 | self._current_step += 1
62 | return self.lr
63 |
64 |
65 | if __name__ == "__main__":
66 | m = nn.Linear(10, 10)
67 | opt = Adam(m.parameters(), lr=1e-4)
68 | s = WarmupCosineLRSchedule(
69 | opt,
70 | 1e-6,
71 | 2e-4,
72 | 1e-6,
73 | warmup_steps=2000,
74 | total_steps=20000,
75 | current_step=0,
76 | )
77 | lrs = []
78 | for i in range(25000):
79 | s.step()
80 | lrs.append(s.lr)
81 | print(s.lr)
82 |
83 | plt.plot(lrs)
84 | plt.plot(range(0, 25000), lrs)
85 | plt.show()
86 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py:
--------------------------------------------------------------------------------
1 | from torch.nn.functional import *
2 | from torch.nn.functional import (
3 | _canonical_mask,
4 | )
5 |
6 |
7 | def multi_head_attention_forward_patched(
8 | query,
9 | key,
10 | value,
11 | embed_dim_to_check: int,
12 | num_heads: int,
13 | in_proj_weight,
14 | in_proj_bias: Optional[Tensor],
15 | bias_k: Optional[Tensor],
16 | bias_v: Optional[Tensor],
17 | add_zero_attn: bool,
18 | dropout_p: float,
19 | out_proj_weight: Tensor,
20 | out_proj_bias: Optional[Tensor],
21 | training: bool = True,
22 | key_padding_mask: Optional[Tensor] = None,
23 | need_weights: bool = True,
24 | attn_mask: Optional[Tensor] = None,
25 | use_separate_proj_weight: bool = False,
26 | q_proj_weight: Optional[Tensor] = None,
27 | k_proj_weight: Optional[Tensor] = None,
28 | v_proj_weight: Optional[Tensor] = None,
29 | static_k: Optional[Tensor] = None,
30 | static_v: Optional[Tensor] = None,
31 | average_attn_weights: bool = True,
32 | is_causal: bool = False,
33 | cache=None,
34 | ) -> Tuple[Tensor, Optional[Tensor]]:
35 | # set up shape vars
36 | _, _, embed_dim = query.shape
37 | attn_mask = _canonical_mask(
38 | mask=attn_mask,
39 | mask_name="attn_mask",
40 | other_type=None,
41 | other_name="",
42 | target_type=query.dtype,
43 | check_other=False,
44 | )
45 | head_dim = embed_dim // num_heads
46 |
47 | proj_qkv = linear(query, in_proj_weight, in_proj_bias)
48 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
49 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2]
50 |
51 | if cache["first_infer"] == 1:
52 | cache["k"][cache["stage"]] = k
53 | cache["v"][cache["stage"]] = v
54 | else:
55 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0)
56 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0)
57 | k = cache["k"][cache["stage"]]
58 | v = cache["v"][cache["stage"]]
59 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"]
60 |
61 | attn_mask = _canonical_mask(
62 | mask=attn_mask,
63 | mask_name="attn_mask",
64 | other_type=None,
65 | other_name="",
66 | target_type=q.dtype,
67 | check_other=False,
68 | )
69 | attn_mask = attn_mask.unsqueeze(0)
70 |
71 | q = q.view(-1, num_heads, head_dim).transpose(0, 1)
72 | k = k.view(-1, num_heads, head_dim).transpose(0, 1)
73 | v = v.view(-1, num_heads, head_dim).transpose(0, 1)
74 |
75 | dropout_p = 0.0
76 | attn_mask = attn_mask.unsqueeze(0)
77 | q = q.view(num_heads, -1, head_dim).unsqueeze(0)
78 | k = k.view(num_heads, -1, head_dim).unsqueeze(0)
79 | v = v.view(num_heads, -1, head_dim).unsqueeze(0)
80 | attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
81 | attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim)
82 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
83 | attn_output = attn_output.view(-1, 1, attn_output.size(1))
84 |
85 | return attn_output
86 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/AR/text_processing/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/phonemizer.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | import itertools
4 | import re
5 | from typing import Dict
6 | from typing import List
7 |
8 | import regex
9 | from gruut import sentences
10 | from gruut.const import Sentence
11 | from gruut.const import Word
12 | from AR.text_processing.symbols import SYMBOL_TO_ID
13 |
14 |
15 | class GruutPhonemizer:
16 | def __init__(self, language: str):
17 | self._phonemizer = sentences
18 | self.lang = language
19 | self.symbol_to_id = SYMBOL_TO_ID
 20 |         self._special_cases_dict: Dict[str, str] = {
21 | r"\.\.\.": "... ",
22 | ";": "; ",
23 | ":": ": ",
24 | ",": ", ",
25 | r"\.": ". ",
26 | "!": "! ",
27 | r"\?": "? ",
28 | "—": "—",
29 | "…": "… ",
30 | "«": "«",
31 | "»": "»",
32 | }
33 | self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict.keys())}])"
34 |
35 | def _normalize_punctuation(self, text: str) -> str:
36 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text)
37 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text)
38 | text = regex.sub(r"\pZ+", r" ", text)
39 | return text.strip()
40 |
41 | def _convert_punctuation(self, word: Word) -> str:
42 | if not word.phonemes:
43 | return ""
44 | if word.phonemes[0] in ["‖", "|"]:
45 | return word.text.strip()
46 |
47 | phonemes = "".join(word.phonemes)
48 | # remove modifier characters ˈˌː with regex
49 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes)
50 | return phonemes.strip()
51 |
52 | def phonemize(self, text: str, espeak: bool = False) -> str:
53 | text_to_phonemize: str = self._normalize_punctuation(text)
54 | sents: List[Sentence] = [sent for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak)]
55 | words: List[str] = [self._convert_punctuation(word) for word in itertools.chain(*sents)]
56 | return " ".join(words)
57 |
58 | def transform(self, phonemes):
59 | # convert phonemes to ids
60 | # dictionary is in symbols.py
61 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()]
62 |
63 |
64 | if __name__ == "__main__":
65 | phonemizer = GruutPhonemizer("en-us")
66 | # text -> IPA
67 | phonemes = phonemizer.phonemize("Hello, wor-ld ?")
68 | print("phonemes:", phonemes)
69 | print("len(phonemes):", len(phonemes))
70 | phoneme_ids = phonemizer.transform(phonemes)
71 | print("phoneme_ids:", phoneme_ids)
72 | print("len(phoneme_ids):", len(phoneme_ids))
73 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/text_processing/symbols.py:
--------------------------------------------------------------------------------
1 | # modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py
2 | # reference: https://github.com/lifeiteng/vall-e
3 | PAD = "_"
4 | PUNCTUATION = ';:,.!?¡¿—…"«»“” '
5 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
6 | IPA_LETTERS = (
7 | "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
8 | )
9 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS)
10 | SPACE_ID = SYMBOLS.index(" ")
11 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)}
12 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)}
13 |
--------------------------------------------------------------------------------
/GPT_SoVITS/AR/utils/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def str2bool(str):
5 | return True if str.lower() == "true" else False
6 |
7 |
8 | def get_newest_ckpt(string_list):
 10 |     # Regex pattern that matches the numbers embedded in each checkpoint filename
10 | pattern = r"epoch=(\d+)-step=(\d+)\.ckpt"
11 |
 12 |     # Extract the numeric info from each string and build a list of (epoch, step, name) tuples
13 | extracted_info = []
14 | for string in string_list:
15 | match = re.match(pattern, string)
16 | if match:
17 | epoch = int(match.group(1))
18 | step = int(match.group(2))
19 | extracted_info.append((epoch, step, string))
 20 |     # Sort by the number after "epoch", then by the number after "step"
21 | sorted_info = sorted(extracted_info, key=lambda x: (x[0], x[1]), reverse=True)
 22 |     # Take the newest ckpt filename
23 | newest_ckpt = sorted_info[0][2]
24 | return newest_ckpt
25 |
26 |
 27 | # Returns the first line of the text file when it exists and is non-empty, otherwise False
28 | def check_txt_file(file_path):
29 | try:
30 | with open(file_path, "r") as file:
31 | text = file.readline().strip()
32 | assert text.strip() != ""
33 | return text
34 | except Exception:
35 | return False
36 | return False
37 |
--------------------------------------------------------------------------------
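`get_newest_ckpt` sorts by the epoch/step numbers embedded in Lightning-style checkpoint names. A small illustrative call (the filenames are made up):

```python
# Illustrative example for get_newest_ckpt; the filenames are made up.
from AR.utils import get_newest_ckpt

names = ["epoch=4-step=1200.ckpt", "epoch=10-step=300.ckpt", "epoch=10-step=950.ckpt"]
print(get_newest_ckpt(names))  # -> "epoch=10-step=950.ckpt" (highest epoch, then highest step)
```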
/GPT_SoVITS/AR/utils/initialize.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Initialize modules for espnet2 neural networks."""
3 |
4 | import torch
5 | from typeguard import check_argument_types
6 |
7 |
8 | def initialize(model: torch.nn.Module, init: str):
9 | """Initialize weights of a neural network module.
10 |
11 | Parameters are initialized using the given method or distribution.
12 |
13 | Custom initialization routines can be implemented into submodules
14 | as function `espnet_initialization_fn` within the custom module.
15 |
16 | Args:
17 | model: Target.
18 | init: Method of initialization.
19 | """
20 | assert check_argument_types()
21 | print("init with", init)
22 |
23 | # weight init
24 | for p in model.parameters():
25 | if p.dim() > 1:
26 | if init == "xavier_uniform":
27 | torch.nn.init.xavier_uniform_(p.data)
28 | elif init == "xavier_normal":
29 | torch.nn.init.xavier_normal_(p.data)
30 | elif init == "kaiming_uniform":
31 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu")
32 | elif init == "kaiming_normal":
33 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu")
34 | else:
35 | raise ValueError("Unknown initialization: " + init)
36 | # bias init
37 | for name, p in model.named_parameters():
38 | if ".bias" in name and p.dim() == 1:
39 | p.data.zero_()
40 |
--------------------------------------------------------------------------------
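A minimal call sketch for `initialize`: any `torch.nn.Module` works, and a `typeguard` version that still provides `check_argument_types` is assumed since the function asserts it. The toy model below is arbitrary.

```python
# Minimal usage sketch for initialize(); the toy model is arbitrary.
import torch
from AR.utils.initialize import initialize

model = torch.nn.Sequential(
    torch.nn.Linear(16, 32),
    torch.nn.ReLU(),
    torch.nn.Linear(32, 4),
)
initialize(model, "xavier_uniform")  # also: xavier_normal / kaiming_uniform / kaiming_normal
```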
/GPT_SoVITS/AR/utils/io.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 | import yaml
5 |
6 |
7 | def load_yaml_config(path):
8 | with open(path) as f:
9 | config = yaml.full_load(f)
10 | return config
11 |
12 |
13 | def save_config_to_yaml(config, path):
14 | assert path.endswith(".yaml")
15 | with open(path, "w") as f:
16 | f.write(yaml.dump(config))
17 | f.close()
18 |
19 |
20 | def write_args(args, path):
21 | args_dict = dict((name, getattr(args, name)) for name in dir(args) if not name.startswith("_"))
22 | with open(path, "a") as args_file:
23 | args_file.write("==> torch version: {}\n".format(torch.__version__))
24 | args_file.write("==> cudnn version: {}\n".format(torch.backends.cudnn.version()))
25 | args_file.write("==> Cmd:\n")
26 | args_file.write(str(sys.argv))
27 | args_file.write("\n==> args:\n")
28 | for k, v in sorted(args_dict.items()):
29 | args_file.write(" %s: %s\n" % (str(k), str(v)))
30 | args_file.close()
31 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 NVIDIA CORPORATION.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 NVIDIA CORPORATION.
2 | # Licensed under the MIT license.
3 |
4 | import torch
5 | import torch.nn as nn
6 | from alias_free_activation.torch.resample import UpSample1d, DownSample1d
7 |
8 | # load fused CUDA kernel: this enables importing anti_alias_activation_cuda
9 | from alias_free_activation.cuda import load
10 |
11 | anti_alias_activation_cuda = load.load()
12 |
13 |
14 | class FusedAntiAliasActivation(torch.autograd.Function):
15 | """
16 | Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs.
17 | The hyperparameters are hard-coded in the kernel to maximize speed.
 18 |     NOTE: The fused kernel is incorrect for Activation1d with different hyperparameters.
19 | """
20 |
21 | @staticmethod
22 | def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta):
23 | activation_results = anti_alias_activation_cuda.forward(inputs, up_ftr, down_ftr, alpha, beta)
24 |
25 | return activation_results
26 |
27 | @staticmethod
28 | def backward(ctx, output_grads):
29 | raise NotImplementedError
30 | return output_grads, None, None
31 |
32 |
33 | class Activation1d(nn.Module):
34 | def __init__(
35 | self,
36 | activation,
37 | up_ratio: int = 2,
38 | down_ratio: int = 2,
39 | up_kernel_size: int = 12,
40 | down_kernel_size: int = 12,
41 | fused: bool = True,
42 | ):
43 | super().__init__()
44 | self.up_ratio = up_ratio
45 | self.down_ratio = down_ratio
46 | self.act = activation
47 | self.upsample = UpSample1d(up_ratio, up_kernel_size)
48 | self.downsample = DownSample1d(down_ratio, down_kernel_size)
49 |
50 | self.fused = fused # Whether to use fused CUDA kernel or not
51 |
52 | def forward(self, x):
53 | if not self.fused:
54 | x = self.upsample(x)
55 | x = self.act(x)
56 | x = self.downsample(x)
57 | return x
58 | else:
59 | if self.act.__class__.__name__ == "Snake":
60 | beta = self.act.alpha.data # Snake uses same params for alpha and beta
61 | else:
62 | beta = self.act.beta.data # Snakebeta uses different params for alpha and beta
63 | alpha = self.act.alpha.data
64 | if not self.act.alpha_logscale: # Exp baked into cuda kernel, cancel it out with a log
65 | alpha = torch.log(alpha)
66 | beta = torch.log(beta)
67 |
68 | x = FusedAntiAliasActivation.apply(x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta)
69 | return x
70 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
 17 | #include <torch/extension.h>
18 |
19 | extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta);
20 |
21 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
22 | m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)");
23 | }
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/build/_:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/compat.h:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
 17 | /* This code is copied from NVIDIA apex:
18 | * https://github.com/NVIDIA/apex
19 | * with minor changes. */
20 |
21 | #ifndef TORCH_CHECK
22 | #define TORCH_CHECK AT_CHECK
23 | #endif
24 |
25 | #ifdef VERSION_GE_1_3
26 | #define DATA_PTR data_ptr
27 | #else
28 | #define DATA_PTR data
29 | #endif
30 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 NVIDIA CORPORATION.
2 | # Licensed under the MIT license.
3 |
4 | import os
5 | import pathlib
6 | import subprocess
7 |
8 | from torch.utils import cpp_extension
9 |
10 | """
 11 | Setting this param to a list has a problem of generating different compilation commands (with a different order of architectures) and leading to recompilation of fused kernels.
 12 | Set it to an empty string to avoid recompilation and assign arch flags explicitly in extra_cuda_cflags below.
13 | """
14 | os.environ["TORCH_CUDA_ARCH_LIST"] = ""
15 |
16 |
17 | def load():
18 | # Check if cuda 11 is installed for compute capability 8.0
19 | cc_flag = []
20 | _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
21 | if int(bare_metal_major) >= 11:
22 | cc_flag.append("-gencode")
23 | cc_flag.append("arch=compute_80,code=sm_80")
24 |
25 | # Build path
26 | srcpath = pathlib.Path(__file__).parent.absolute()
27 | buildpath = srcpath / "build"
28 | _create_build_dir(buildpath)
29 |
30 | # Helper function to build the kernels.
31 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
32 | return cpp_extension.load(
33 | name=name,
34 | sources=sources,
35 | build_directory=buildpath,
36 | extra_cflags=[
37 | "-O3",
38 | ],
39 | extra_cuda_cflags=[
40 | "-O3",
41 | "-gencode",
42 | "arch=compute_70,code=sm_70",
43 | "--use_fast_math",
44 | ]
45 | + extra_cuda_flags
46 | + cc_flag,
47 | verbose=True,
48 | )
49 |
50 | extra_cuda_flags = [
51 | "-U__CUDA_NO_HALF_OPERATORS__",
52 | "-U__CUDA_NO_HALF_CONVERSIONS__",
53 | "--expt-relaxed-constexpr",
54 | "--expt-extended-lambda",
55 | ]
56 |
57 | sources = [
58 | srcpath / "anti_alias_activation.cpp",
59 | srcpath / "anti_alias_activation_cuda.cu",
60 | ]
61 | anti_alias_activation_cuda = _cpp_extention_load_helper("anti_alias_activation_cuda", sources, extra_cuda_flags)
62 |
63 | return anti_alias_activation_cuda
64 |
65 |
66 | def _get_cuda_bare_metal_version(cuda_dir):
67 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
68 | output = raw_output.split()
69 | release_idx = output.index("release") + 1
70 | release = output[release_idx].split(".")
71 | bare_metal_major = release[0]
72 | bare_metal_minor = release[1][0]
73 |
74 | return raw_output, bare_metal_major, bare_metal_minor
75 |
76 |
77 | def _create_build_dir(buildpath):
78 | try:
79 | os.mkdir(buildpath)
80 | except OSError:
81 | if not os.path.isdir(buildpath):
82 | print(f"Creation of the build directory {buildpath} failed")
83 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/alias_free_activation/torch/__init__.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
2 | # LICENSE is in incl_licenses directory.
3 |
4 | from .filter import *
5 | from .resample import *
6 | from .act import *
7 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
2 | # LICENSE is in incl_licenses directory.
3 |
4 | import torch.nn as nn
5 | from .resample import UpSample1d, DownSample1d
6 |
7 |
8 | class Activation1d(nn.Module):
9 | def __init__(
10 | self,
11 | activation,
12 | up_ratio: int = 2,
13 | down_ratio: int = 2,
14 | up_kernel_size: int = 12,
15 | down_kernel_size: int = 12,
16 | ):
17 | super().__init__()
18 | self.up_ratio = up_ratio
19 | self.down_ratio = down_ratio
20 | self.act = activation
21 | self.upsample = UpSample1d(up_ratio, up_kernel_size)
22 | self.downsample = DownSample1d(down_ratio, down_kernel_size)
23 |
24 | # x: [B,C,T]
25 | def forward(self, x):
26 | x = self.upsample(x)
27 | x = self.act(x)
28 | x = self.downsample(x)
29 |
30 | return x
31 |
--------------------------------------------------------------------------------
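`Activation1d` simply upsamples, applies the wrapped activation, and low-pass downsamples back to the original rate. A toy sketch on a `[B, C, T]` tensor, assuming the BigVGAN directory is on `sys.path` (as in the repo's own imports); `nn.Tanh` is only a stand-in here, since BigVGAN itself wraps its Snake/SnakeBeta activations from `activations.py`.

```python
# Toy sketch of the torch-side Activation1d; nn.Tanh is a stand-in activation.
import torch
import torch.nn as nn
from alias_free_activation.torch.act import Activation1d

act = Activation1d(activation=nn.Tanh(), up_ratio=2, down_ratio=2)

x = torch.randn(1, 8, 256)  # [batch, channels, time]
y = act(x)                  # upsample x2 -> Tanh -> anti-aliased downsample x2
print(y.shape)              # torch.Size([1, 8, 256])
```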
/GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
2 | # LICENSE is in incl_licenses directory.
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | import math
8 |
9 | if "sinc" in dir(torch):
10 | sinc = torch.sinc
11 | else:
12 | # This code is adopted from adefossez's julius.core.sinc under the MIT License
13 | # https://adefossez.github.io/julius/julius/core.html
14 | # LICENSE is in incl_licenses directory.
15 | def sinc(x: torch.Tensor):
16 | """
17 | Implementation of sinc, i.e. sin(pi * x) / (pi * x)
18 | __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
19 | """
20 | return torch.where(
21 | x == 0,
22 | torch.tensor(1.0, device=x.device, dtype=x.dtype),
23 | torch.sin(math.pi * x) / math.pi / x,
24 | )
25 |
26 |
27 | # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
28 | # https://adefossez.github.io/julius/julius/lowpass.html
29 | # LICENSE is in incl_licenses directory.
30 | def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size]
31 | even = kernel_size % 2 == 0
32 | half_size = kernel_size // 2
33 |
34 | # For kaiser window
35 | delta_f = 4 * half_width
36 | A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
37 | if A > 50.0:
38 | beta = 0.1102 * (A - 8.7)
39 | elif A >= 21.0:
40 | beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
41 | else:
42 | beta = 0.0
43 | window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
44 |
45 | # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
46 | if even:
47 | time = torch.arange(-half_size, half_size) + 0.5
48 | else:
49 | time = torch.arange(kernel_size) - half_size
50 | if cutoff == 0:
51 | filter_ = torch.zeros_like(time)
52 | else:
53 | filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
54 | """
55 | Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal.
56 | """
57 | filter_ /= filter_.sum()
58 | filter = filter_.view(1, 1, kernel_size)
59 |
60 | return filter
61 |
62 |
63 | class LowPassFilter1d(nn.Module):
64 | def __init__(
65 | self,
66 | cutoff=0.5,
67 | half_width=0.6,
68 | stride: int = 1,
69 | padding: bool = True,
70 | padding_mode: str = "replicate",
71 | kernel_size: int = 12,
72 | ):
73 | """
74 | kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible.
75 | """
76 | super().__init__()
77 |         if cutoff < 0.0:
78 |             raise ValueError("Minimum cutoff must not be negative.")
79 | if cutoff > 0.5:
80 | raise ValueError("A cutoff above 0.5 does not make sense.")
81 | self.kernel_size = kernel_size
82 | self.even = kernel_size % 2 == 0
83 | self.pad_left = kernel_size // 2 - int(self.even)
84 | self.pad_right = kernel_size // 2
85 | self.stride = stride
86 | self.padding = padding
87 | self.padding_mode = padding_mode
88 | filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
89 | self.register_buffer("filter", filter)
90 |
91 | # Input [B, C, T]
92 | def forward(self, x):
93 | _, C, _ = x.shape
94 |
95 | if self.padding:
96 | x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
97 | out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
98 |
99 | return out
100 |
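For reference, a small sketch of how `kaiser_sinc_filter1d` behaves (illustrative only; the import path is an assumption): the returned kernel has shape `[1, 1, kernel_size]` and is normalized to sum to 1 so the DC component passes through unchanged.

```python
# Illustrative check, assuming filter.py is importable as below from the BigVGAN directory.
import torch

from alias_free_activation.torch.filter import kaiser_sinc_filter1d  # assumed import path

k = kaiser_sinc_filter1d(cutoff=0.25, half_width=0.3, kernel_size=12)
print(k.shape)         # torch.Size([1, 1, 12])
print(float(k.sum()))  # ~1.0 -- normalized so the DC gain is 1
```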
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
2 | # LICENSE is in incl_licenses directory.
3 |
4 | import torch.nn as nn
5 | from torch.nn import functional as F
6 | from .filter import LowPassFilter1d
7 | from .filter import kaiser_sinc_filter1d
8 |
9 |
10 | class UpSample1d(nn.Module):
11 | def __init__(self, ratio=2, kernel_size=None):
12 | super().__init__()
13 | self.ratio = ratio
14 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
15 | self.stride = ratio
16 | self.pad = self.kernel_size // ratio - 1
17 | self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
18 | self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
19 | filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size)
20 | self.register_buffer("filter", filter)
21 |
22 | # x: [B, C, T]
23 | def forward(self, x):
24 | _, C, _ = x.shape
25 |
26 | x = F.pad(x, (self.pad, self.pad), mode="replicate")
27 | x = self.ratio * F.conv_transpose1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
28 | x = x[..., self.pad_left : -self.pad_right]
29 |
30 | return x
31 |
32 |
33 | class DownSample1d(nn.Module):
34 | def __init__(self, ratio=2, kernel_size=None):
35 | super().__init__()
36 | self.ratio = ratio
37 | self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
38 | self.lowpass = LowPassFilter1d(
39 | cutoff=0.5 / ratio,
40 | half_width=0.6 / ratio,
41 | stride=ratio,
42 | kernel_size=self.kernel_size,
43 | )
44 |
45 | def forward(self, x):
46 | xx = self.lowpass(x)
47 |
48 | return xx
49 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/configs/bigvgan_22khz_80band.json:
--------------------------------------------------------------------------------
1 | {
2 | "resblock": "1",
3 | "num_gpus": 0,
4 | "batch_size": 32,
5 | "learning_rate": 0.0001,
6 | "adam_b1": 0.8,
7 | "adam_b2": 0.99,
8 | "lr_decay": 0.9999996,
9 | "seed": 1234,
10 |
11 | "upsample_rates": [4,4,2,2,2,2],
12 | "upsample_kernel_sizes": [8,8,4,4,4,4],
13 | "upsample_initial_channel": 1536,
14 | "resblock_kernel_sizes": [3,7,11],
15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16 |
17 | "activation": "snakebeta",
18 | "snake_logscale": true,
19 |
20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
21 | "mpd_reshapes": [2, 3, 5, 7, 11],
22 | "use_spectral_norm": false,
23 | "discriminator_channel_mult": 1,
24 |
25 | "segment_size": 8192,
26 | "num_mels": 80,
27 | "num_freq": 1025,
28 | "n_fft": 1024,
29 | "hop_size": 256,
30 | "win_size": 1024,
31 |
32 | "sampling_rate": 22050,
33 |
34 | "fmin": 0,
35 | "fmax": 8000,
36 | "fmax_for_loss": null,
37 |
38 | "num_workers": 4,
39 |
40 | "dist_config": {
41 | "dist_backend": "nccl",
42 | "dist_url": "tcp://localhost:54321",
43 | "world_size": 1
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/configs/bigvgan_24khz_100band.json:
--------------------------------------------------------------------------------
1 | {
2 | "resblock": "1",
3 | "num_gpus": 0,
4 | "batch_size": 32,
5 | "learning_rate": 0.0001,
6 | "adam_b1": 0.8,
7 | "adam_b2": 0.99,
8 | "lr_decay": 0.9999996,
9 | "seed": 1234,
10 |
11 | "upsample_rates": [4,4,2,2,2,2],
12 | "upsample_kernel_sizes": [8,8,4,4,4,4],
13 | "upsample_initial_channel": 1536,
14 | "resblock_kernel_sizes": [3,7,11],
15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16 |
17 | "activation": "snakebeta",
18 | "snake_logscale": true,
19 |
20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
21 | "mpd_reshapes": [2, 3, 5, 7, 11],
22 | "use_spectral_norm": false,
23 | "discriminator_channel_mult": 1,
24 |
25 | "segment_size": 8192,
26 | "num_mels": 100,
27 | "num_freq": 1025,
28 | "n_fft": 1024,
29 | "hop_size": 256,
30 | "win_size": 1024,
31 |
32 | "sampling_rate": 24000,
33 |
34 | "fmin": 0,
35 | "fmax": 12000,
36 | "fmax_for_loss": null,
37 |
38 | "num_workers": 4,
39 |
40 | "dist_config": {
41 | "dist_backend": "nccl",
42 | "dist_url": "tcp://localhost:54321",
43 | "world_size": 1
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/configs/bigvgan_base_22khz_80band.json:
--------------------------------------------------------------------------------
1 | {
2 | "resblock": "1",
3 | "num_gpus": 0,
4 | "batch_size": 32,
5 | "learning_rate": 0.0001,
6 | "adam_b1": 0.8,
7 | "adam_b2": 0.99,
8 | "lr_decay": 0.9999996,
9 | "seed": 1234,
10 |
11 | "upsample_rates": [8,8,2,2],
12 | "upsample_kernel_sizes": [16,16,4,4],
13 | "upsample_initial_channel": 512,
14 | "resblock_kernel_sizes": [3,7,11],
15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16 |
17 | "activation": "snakebeta",
18 | "snake_logscale": true,
19 |
20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
21 | "mpd_reshapes": [2, 3, 5, 7, 11],
22 | "use_spectral_norm": false,
23 | "discriminator_channel_mult": 1,
24 |
25 | "segment_size": 8192,
26 | "num_mels": 80,
27 | "num_freq": 1025,
28 | "n_fft": 1024,
29 | "hop_size": 256,
30 | "win_size": 1024,
31 |
32 | "sampling_rate": 22050,
33 |
34 | "fmin": 0,
35 | "fmax": 8000,
36 | "fmax_for_loss": null,
37 |
38 | "num_workers": 4,
39 |
40 | "dist_config": {
41 | "dist_backend": "nccl",
42 | "dist_url": "tcp://localhost:54321",
43 | "world_size": 1
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/configs/bigvgan_base_24khz_100band.json:
--------------------------------------------------------------------------------
1 | {
2 | "resblock": "1",
3 | "num_gpus": 0,
4 | "batch_size": 32,
5 | "learning_rate": 0.0001,
6 | "adam_b1": 0.8,
7 | "adam_b2": 0.99,
8 | "lr_decay": 0.9999996,
9 | "seed": 1234,
10 |
11 | "upsample_rates": [8,8,2,2],
12 | "upsample_kernel_sizes": [16,16,4,4],
13 | "upsample_initial_channel": 512,
14 | "resblock_kernel_sizes": [3,7,11],
15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16 |
17 | "activation": "snakebeta",
18 | "snake_logscale": true,
19 |
20 | "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
21 | "mpd_reshapes": [2, 3, 5, 7, 11],
22 | "use_spectral_norm": false,
23 | "discriminator_channel_mult": 1,
24 |
25 | "segment_size": 8192,
26 | "num_mels": 100,
27 | "num_freq": 1025,
28 | "n_fft": 1024,
29 | "hop_size": 256,
30 | "win_size": 1024,
31 |
32 | "sampling_rate": 24000,
33 |
34 | "fmin": 0,
35 | "fmax": 12000,
36 | "fmax_for_loss": null,
37 |
38 | "num_workers": 4,
39 |
40 | "dist_config": {
41 | "dist_backend": "nccl",
42 | "dist_url": "tcp://localhost:54321",
43 | "world_size": 1
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/configs/bigvgan_v2_22khz_80band_256x.json:
--------------------------------------------------------------------------------
1 | {
2 | "resblock": "1",
3 | "num_gpus": 0,
4 | "batch_size": 4,
5 | "learning_rate": 0.0001,
6 | "adam_b1": 0.8,
7 | "adam_b2": 0.99,
8 | "lr_decay": 0.9999996,
9 | "seed": 1234,
10 |
11 | "upsample_rates": [4,4,2,2,2,2],
12 | "upsample_kernel_sizes": [8,8,4,4,4,4],
13 | "upsample_initial_channel": 1536,
14 | "resblock_kernel_sizes": [3,7,11],
15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16 |
17 | "use_tanh_at_final": false,
18 | "use_bias_at_final": false,
19 |
20 | "activation": "snakebeta",
21 | "snake_logscale": true,
22 |
23 | "use_cqtd_instead_of_mrd": true,
24 | "cqtd_filters": 128,
25 | "cqtd_max_filters": 1024,
26 | "cqtd_filters_scale": 1,
27 | "cqtd_dilations": [1, 2, 4],
28 | "cqtd_hop_lengths": [512, 256, 256],
29 | "cqtd_n_octaves": [9, 9, 9],
30 | "cqtd_bins_per_octaves": [24, 36, 48],
31 |
32 | "mpd_reshapes": [2, 3, 5, 7, 11],
33 | "use_spectral_norm": false,
34 | "discriminator_channel_mult": 1,
35 |
36 | "use_multiscale_melloss": true,
37 | "lambda_melloss": 15,
38 |
39 | "clip_grad_norm": 500,
40 |
41 | "segment_size": 65536,
42 | "num_mels": 80,
43 | "num_freq": 1025,
44 | "n_fft": 1024,
45 | "hop_size": 256,
46 | "win_size": 1024,
47 |
48 | "sampling_rate": 22050,
49 |
50 | "fmin": 0,
51 | "fmax": null,
52 | "fmax_for_loss": null,
53 |
54 | "num_workers": 4,
55 |
56 | "dist_config": {
57 | "dist_backend": "nccl",
58 | "dist_url": "tcp://localhost:54321",
59 | "world_size": 1
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/configs/bigvgan_v2_22khz_80band_fmax8k_256x.json:
--------------------------------------------------------------------------------
1 | {
2 | "resblock": "1",
3 | "num_gpus": 0,
4 | "batch_size": 4,
5 | "learning_rate": 0.0001,
6 | "adam_b1": 0.8,
7 | "adam_b2": 0.99,
8 | "lr_decay": 0.9999996,
9 | "seed": 1234,
10 |
11 | "upsample_rates": [4,4,2,2,2,2],
12 | "upsample_kernel_sizes": [8,8,4,4,4,4],
13 | "upsample_initial_channel": 1536,
14 | "resblock_kernel_sizes": [3,7,11],
15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16 |
17 | "use_tanh_at_final": false,
18 | "use_bias_at_final": false,
19 |
20 | "activation": "snakebeta",
21 | "snake_logscale": true,
22 |
23 | "use_cqtd_instead_of_mrd": true,
24 | "cqtd_filters": 128,
25 | "cqtd_max_filters": 1024,
26 | "cqtd_filters_scale": 1,
27 | "cqtd_dilations": [1, 2, 4],
28 | "cqtd_hop_lengths": [512, 256, 256],
29 | "cqtd_n_octaves": [9, 9, 9],
30 | "cqtd_bins_per_octaves": [24, 36, 48],
31 |
32 | "mpd_reshapes": [2, 3, 5, 7, 11],
33 | "use_spectral_norm": false,
34 | "discriminator_channel_mult": 1,
35 |
36 | "use_multiscale_melloss": true,
37 | "lambda_melloss": 15,
38 |
39 | "clip_grad_norm": 500,
40 |
41 | "segment_size": 65536,
42 | "num_mels": 80,
43 | "num_freq": 1025,
44 | "n_fft": 1024,
45 | "hop_size": 256,
46 | "win_size": 1024,
47 |
48 | "sampling_rate": 22050,
49 |
50 | "fmin": 0,
51 | "fmax": 8000,
52 | "fmax_for_loss": null,
53 |
54 | "num_workers": 4,
55 |
56 | "dist_config": {
57 | "dist_backend": "nccl",
58 | "dist_url": "tcp://localhost:54321",
59 | "world_size": 1
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/configs/bigvgan_v2_24khz_100band_256x.json:
--------------------------------------------------------------------------------
1 | {
2 | "resblock": "1",
3 | "num_gpus": 0,
4 | "batch_size": 4,
5 | "learning_rate": 0.0001,
6 | "adam_b1": 0.8,
7 | "adam_b2": 0.99,
8 | "lr_decay": 0.9999996,
9 | "seed": 1234,
10 |
11 | "upsample_rates": [4,4,2,2,2,2],
12 | "upsample_kernel_sizes": [8,8,4,4,4,4],
13 | "upsample_initial_channel": 1536,
14 | "resblock_kernel_sizes": [3,7,11],
15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16 |
17 | "use_tanh_at_final": false,
18 | "use_bias_at_final": false,
19 |
20 | "activation": "snakebeta",
21 | "snake_logscale": true,
22 |
23 | "use_cqtd_instead_of_mrd": true,
24 | "cqtd_filters": 128,
25 | "cqtd_max_filters": 1024,
26 | "cqtd_filters_scale": 1,
27 | "cqtd_dilations": [1, 2, 4],
28 | "cqtd_hop_lengths": [512, 256, 256],
29 | "cqtd_n_octaves": [9, 9, 9],
30 | "cqtd_bins_per_octaves": [24, 36, 48],
31 |
32 | "mpd_reshapes": [2, 3, 5, 7, 11],
33 | "use_spectral_norm": false,
34 | "discriminator_channel_mult": 1,
35 |
36 | "use_multiscale_melloss": true,
37 | "lambda_melloss": 15,
38 |
39 | "clip_grad_norm": 500,
40 |
41 | "segment_size": 65536,
42 | "num_mels": 100,
43 | "num_freq": 1025,
44 | "n_fft": 1024,
45 | "hop_size": 256,
46 | "win_size": 1024,
47 |
48 | "sampling_rate": 24000,
49 |
50 | "fmin": 0,
51 | "fmax": null,
52 | "fmax_for_loss": null,
53 |
54 | "num_workers": 4,
55 |
56 | "dist_config": {
57 | "dist_backend": "nccl",
58 | "dist_url": "tcp://localhost:54321",
59 | "world_size": 1
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/configs/bigvgan_v2_44khz_128band_256x.json:
--------------------------------------------------------------------------------
1 | {
2 | "resblock": "1",
3 | "num_gpus": 0,
4 | "batch_size": 4,
5 | "learning_rate": 0.0001,
6 | "adam_b1": 0.8,
7 | "adam_b2": 0.99,
8 | "lr_decay": 0.9999996,
9 | "seed": 1234,
10 |
11 | "upsample_rates": [4,4,2,2,2,2],
12 | "upsample_kernel_sizes": [8,8,4,4,4,4],
13 | "upsample_initial_channel": 1536,
14 | "resblock_kernel_sizes": [3,7,11],
15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16 |
17 | "use_tanh_at_final": false,
18 | "use_bias_at_final": false,
19 |
20 | "activation": "snakebeta",
21 | "snake_logscale": true,
22 |
23 | "use_cqtd_instead_of_mrd": true,
24 | "cqtd_filters": 128,
25 | "cqtd_max_filters": 1024,
26 | "cqtd_filters_scale": 1,
27 | "cqtd_dilations": [1, 2, 4],
28 | "cqtd_hop_lengths": [512, 256, 256],
29 | "cqtd_n_octaves": [9, 9, 9],
30 | "cqtd_bins_per_octaves": [24, 36, 48],
31 |
32 | "mpd_reshapes": [2, 3, 5, 7, 11],
33 | "use_spectral_norm": false,
34 | "discriminator_channel_mult": 1,
35 |
36 | "use_multiscale_melloss": true,
37 | "lambda_melloss": 15,
38 |
39 | "clip_grad_norm": 500,
40 |
41 | "segment_size": 65536,
42 | "num_mels": 128,
43 | "num_freq": 1025,
44 | "n_fft": 1024,
45 | "hop_size": 256,
46 | "win_size": 1024,
47 |
48 | "sampling_rate": 44100,
49 |
50 | "fmin": 0,
51 | "fmax": null,
52 | "fmax_for_loss": null,
53 |
54 | "num_workers": 4,
55 |
56 | "dist_config": {
57 | "dist_backend": "nccl",
58 | "dist_url": "tcp://localhost:54321",
59 | "world_size": 1
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/configs/bigvgan_v2_44khz_128band_512x.json:
--------------------------------------------------------------------------------
1 | {
2 | "resblock": "1",
3 | "num_gpus": 0,
4 | "batch_size": 4,
5 | "learning_rate": 0.0001,
6 | "adam_b1": 0.8,
7 | "adam_b2": 0.99,
8 | "lr_decay": 0.9999996,
9 | "seed": 1234,
10 |
11 | "upsample_rates": [8,4,2,2,2,2],
12 | "upsample_kernel_sizes": [16,8,4,4,4,4],
13 | "upsample_initial_channel": 1536,
14 | "resblock_kernel_sizes": [3,7,11],
15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16 |
17 | "use_tanh_at_final": false,
18 | "use_bias_at_final": false,
19 |
20 | "activation": "snakebeta",
21 | "snake_logscale": true,
22 |
23 | "use_cqtd_instead_of_mrd": true,
24 | "cqtd_filters": 128,
25 | "cqtd_max_filters": 1024,
26 | "cqtd_filters_scale": 1,
27 | "cqtd_dilations": [1, 2, 4],
28 | "cqtd_hop_lengths": [512, 256, 256],
29 | "cqtd_n_octaves": [9, 9, 9],
30 | "cqtd_bins_per_octaves": [24, 36, 48],
31 |
32 | "mpd_reshapes": [2, 3, 5, 7, 11],
33 | "use_spectral_norm": false,
34 | "discriminator_channel_mult": 1,
35 |
36 | "use_multiscale_melloss": true,
37 | "lambda_melloss": 15,
38 |
39 | "clip_grad_norm": 500,
40 |
41 | "segment_size": 65536,
42 | "num_mels": 128,
43 | "num_freq": 2049,
44 | "n_fft": 2048,
45 | "hop_size": 512,
46 | "win_size": 2048,
47 |
48 | "sampling_rate": 44100,
49 |
50 | "fmin": 0,
51 | "fmax": null,
52 | "fmax_for_loss": null,
53 |
54 | "num_workers": 4,
55 |
56 | "dist_config": {
57 | "dist_backend": "nccl",
58 | "dist_url": "tcp://localhost:54321",
59 | "world_size": 1
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/env.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
2 | # LICENSE is in incl_licenses directory.
3 |
4 | import os
5 | import shutil
6 |
7 |
8 | class AttrDict(dict):
9 | def __init__(self, *args, **kwargs):
10 | super(AttrDict, self).__init__(*args, **kwargs)
11 | self.__dict__ = self
12 |
13 |
14 | def build_env(config, config_name, path):
15 | t_path = os.path.join(path, config_name)
16 | if config != t_path:
17 | os.makedirs(path, exist_ok=True)
18 | shutil.copyfile(config, os.path.join(path, config_name))
19 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_1:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Jungil Kong
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_2:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Edward Dixon
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_4:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, Seungwon Park 박승원
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_5:
--------------------------------------------------------------------------------
1 | Copyright 2020 Alexandre Défossez
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
4 | associated documentation files (the "Software"), to deal in the Software without restriction,
5 | including without limitation the rights to use, copy, modify, merge, publish, distribute,
6 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
7 | furnished to do so, subject to the following conditions:
8 |
9 | The above copyright notice and this permission notice shall be included in all copies or
10 | substantial portions of the Software.
11 |
12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
13 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
14 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
15 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_6:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023-present, Descript
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_7:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Charactr Inc.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_8:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Amphion
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/inference.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
2 | # LICENSE is in incl_licenses directory.
3 |
4 | from __future__ import absolute_import, division, print_function, unicode_literals
5 |
6 | import os
7 | import argparse
8 | import json
9 | import torch
10 | import librosa
11 | from utils import load_checkpoint
12 | from meldataset import get_mel_spectrogram
13 | from scipy.io.wavfile import write
14 | from env import AttrDict
15 | from meldataset import MAX_WAV_VALUE
16 | from bigvgan import BigVGAN as Generator
17 |
18 | h = None
19 | device = None
20 | torch.backends.cudnn.benchmark = False
21 |
22 |
23 | def inference(a, h):
24 | generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device)
25 |
26 | state_dict_g = load_checkpoint(a.checkpoint_file, device)
27 | generator.load_state_dict(state_dict_g["generator"])
28 |
29 | filelist = os.listdir(a.input_wavs_dir)
30 |
31 | os.makedirs(a.output_dir, exist_ok=True)
32 |
33 | generator.eval()
34 | generator.remove_weight_norm()
35 | with torch.no_grad():
36 |         for i, filename in enumerate(filelist):
37 | # Load the ground truth audio and resample if necessary
38 |             wav, sr = librosa.load(os.path.join(a.input_wavs_dir, filename), sr=h.sampling_rate, mono=True)
39 | wav = torch.FloatTensor(wav).to(device)
40 | # Compute mel spectrogram from the ground truth audio
41 | x = get_mel_spectrogram(wav.unsqueeze(0), generator.h)
42 |
43 | y_g_hat = generator(x)
44 |
45 | audio = y_g_hat.squeeze()
46 | audio = audio * MAX_WAV_VALUE
47 | audio = audio.cpu().numpy().astype("int16")
48 |
49 |             output_file = os.path.join(a.output_dir, os.path.splitext(filename)[0] + "_generated.wav")
50 | write(output_file, h.sampling_rate, audio)
51 | print(output_file)
52 |
53 |
54 | def main():
55 | print("Initializing Inference Process..")
56 |
57 | parser = argparse.ArgumentParser()
58 | parser.add_argument("--input_wavs_dir", default="test_files")
59 | parser.add_argument("--output_dir", default="generated_files")
60 | parser.add_argument("--checkpoint_file", required=True)
61 | parser.add_argument("--use_cuda_kernel", action="store_true", default=False)
62 |
63 | a = parser.parse_args()
64 |
65 | config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json")
66 | with open(config_file) as f:
67 | data = f.read()
68 |
69 | global h
70 | json_config = json.loads(data)
71 | h = AttrDict(json_config)
72 |
73 | torch.manual_seed(h.seed)
74 | global device
75 | if torch.cuda.is_available():
76 | torch.cuda.manual_seed(h.seed)
77 | device = torch.device("cuda")
78 | else:
79 | device = torch.device("cpu")
80 |
81 | inference(a, h)
82 |
83 |
84 | if __name__ == "__main__":
85 | main()
86 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/inference_e2e.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
2 | # LICENSE is in incl_licenses directory.
3 |
4 | from __future__ import absolute_import, division, print_function, unicode_literals
5 |
6 | import glob
7 | import os
8 | import numpy as np
9 | import argparse
10 | import json
11 | import torch
12 | from scipy.io.wavfile import write
13 | from env import AttrDict
14 | from meldataset import MAX_WAV_VALUE
15 | from bigvgan import BigVGAN as Generator
16 |
17 | h = None
18 | device = None
19 | torch.backends.cudnn.benchmark = False
20 |
21 |
22 | def load_checkpoint(filepath, device):
23 | assert os.path.isfile(filepath)
24 | print(f"Loading '{filepath}'")
25 | checkpoint_dict = torch.load(filepath, map_location=device)
26 | print("Complete.")
27 | return checkpoint_dict
28 |
29 |
30 | def scan_checkpoint(cp_dir, prefix):
31 | pattern = os.path.join(cp_dir, prefix + "*")
32 | cp_list = glob.glob(pattern)
33 | if len(cp_list) == 0:
34 | return ""
35 | return sorted(cp_list)[-1]
36 |
37 |
38 | def inference(a, h):
39 | generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device)
40 |
41 | state_dict_g = load_checkpoint(a.checkpoint_file, device)
42 | generator.load_state_dict(state_dict_g["generator"])
43 |
44 | filelist = os.listdir(a.input_mels_dir)
45 |
46 | os.makedirs(a.output_dir, exist_ok=True)
47 |
48 | generator.eval()
49 | generator.remove_weight_norm()
50 | with torch.no_grad():
51 |         for i, filename in enumerate(filelist):
52 | # Load the mel spectrogram in .npy format
53 |             x = np.load(os.path.join(a.input_mels_dir, filename))
54 | x = torch.FloatTensor(x).to(device)
55 | if len(x.shape) == 2:
56 | x = x.unsqueeze(0)
57 |
58 | y_g_hat = generator(x)
59 |
60 | audio = y_g_hat.squeeze()
61 | audio = audio * MAX_WAV_VALUE
62 | audio = audio.cpu().numpy().astype("int16")
63 |
64 |             output_file = os.path.join(a.output_dir, os.path.splitext(filename)[0] + "_generated_e2e.wav")
65 | write(output_file, h.sampling_rate, audio)
66 | print(output_file)
67 |
68 |
69 | def main():
70 | print("Initializing Inference Process..")
71 |
72 | parser = argparse.ArgumentParser()
73 | parser.add_argument("--input_mels_dir", default="test_mel_files")
74 | parser.add_argument("--output_dir", default="generated_files_from_mel")
75 | parser.add_argument("--checkpoint_file", required=True)
76 | parser.add_argument("--use_cuda_kernel", action="store_true", default=False)
77 |
78 | a = parser.parse_args()
79 |
80 | config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json")
81 | with open(config_file) as f:
82 | data = f.read()
83 |
84 | global h
85 | json_config = json.loads(data)
86 | h = AttrDict(json_config)
87 |
88 | torch.manual_seed(h.seed)
89 | global device
90 | if torch.cuda.is_available():
91 | torch.cuda.manual_seed(h.seed)
92 | device = torch.device("cuda")
93 | else:
94 | device = torch.device("cpu")
95 |
96 | inference(a, h)
97 |
98 |
99 | if __name__ == "__main__":
100 | main()
101 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/nv-modelcard++/.gitkeep:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/nv-modelcard++/bias.md:
--------------------------------------------------------------------------------
1 | | Field | Response |
2 | | :--------------------------------------------------------------------------------------------------------- | :--------------------------------------------------- |
3 | | Participation considerations from adversely impacted groups (protected classes) in model design and testing: | None |
4 | | Measures taken to mitigate against unwanted bias: | No measures taken to mitigate against unwanted bias. |
5 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/nv-modelcard++/explainability.md:
--------------------------------------------------------------------------------
1 | | Field | Response |
2 | | :---------------------------------------------------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
3 | | Intended Application & Domain: | Generating waveform from mel spectrogram. |
4 | | Model Type: | Convolutional Neural Network (CNN) |
5 | | Intended Users: | This model is intended for developers to synthesize and generate waveforms from the AI-generated mel spectrograms. |
6 | | Output: | Audio Waveform |
7 | | Describe how the model works: | Model generates audio waveform corresponding to the input mel spectrogram. |
8 | | Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | Not Applicable |
9 | | Technical Limitations: | This may not perform well on synthetically-generated mel spectrograms that deviate significantly from the profile of mel spectrograms on which this was trained. |
10 | | Verified to have met prescribed NVIDIA quality standards: | Yes |
11 | | Performance Metrics: | Perceptual Evaluation of Speech Quality (PESQ), Virtual Speech Quality Objective Listener (VISQOL), Multi-resolution STFT (MRSTFT), Mel cepstral distortion (MCD), Periodicity RMSE, Voiced/Unvoiced F1 Score (V/UV F1) |
12 | | Potential Known Risks: | This model may generate low-quality or distorted soundwaves. |
13 | | Licensing: | https://github.com/NVIDIA/BigVGAN/blob/main/LICENSE |
14 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/nv-modelcard++/privacy.md:
--------------------------------------------------------------------------------
1 | | Field | Response |
2 | | :------------------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------- |
3 | | Generatable or reverse engineerable personal information? | None |
4 | | Protected class data used to create this model? | None |
5 | | Was consent obtained for any personal data used? | Not Applicable (No Personal Data) |
6 | | How often is dataset reviewed? | Before Release |
7 | | Is a mechanism in place to honor data subject right of access or deletion of personal data? | Not Applicable |
8 | | If personal data was collected for the development of the model, was it collected directly by NVIDIA? | Not Applicable |
9 | | If personal data was collected for the development of the model by NVIDIA, do you maintain or have access to disclosures made to data subjects? | Not Applicable |
10 | | If personal data was collected for the development of this AI model, was it minimized to only what was required? | Not Applicable |
11 | | Is data in dataset traceable? | Yes |
12 | | Is there provenance for all datasets used in training? | Yes |
13 | | Does data labeling (annotation, metadata) comply with privacy laws? | Yes |
14 | | Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. |
15 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/nv-modelcard++/safety.md:
--------------------------------------------------------------------------------
1 | | Field | Response |
2 | | :---------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
3 | | Model Application(s): | Synthetic Audio Generation |
4 | | Describe the life critical impact (if present). | Not Applicable |
5 | | Use Case Restrictions: | None |
6 | | Model and dataset restrictions: | The principle of least privilege (PoLP) is applied, limiting access for dataset generation and model development. Access to the dataset is restricted during training, and dataset license constraints are adhered to. |
7 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | numpy
3 | librosa>=0.8.1
4 | scipy
5 | tensorboard
6 | soundfile
7 | matplotlib
8 | pesq
9 | auraloss
10 | tqdm
11 | nnAudio
12 | ninja
13 | huggingface_hub>=0.23.4
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/tests/test_activation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 NVIDIA CORPORATION.
2 | # Licensed under the MIT license.
3 |
4 | import os
5 | import sys
6 |
7 | # to import modules from parent_dir
8 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
9 | sys.path.append(parent_dir)
10 |
11 | import torch
12 | from alias_free_activation.cuda import activation1d
13 | from activations import Snake
14 |
15 |
16 | def test_load_fused_kernels():
17 | try:
18 | print("[Success] load_fused_kernels")
19 | except ImportError as e:
20 | print("[Fail] load_fused_kernels")
21 | raise e
22 |
23 |
24 | def test_anti_alias_activation():
25 | data = torch.rand((10, 10, 200), device="cuda")
26 |
27 | # Check activations.Snake cuda vs. torch
28 | fused_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=True).cuda()
29 | fused_activation_output = fused_anti_alias_activation(data)
30 |
31 | torch_anti_alias_activation = activation1d.Activation1d(activation=Snake(10), fused=False).cuda()
32 | torch_activation_output = torch_anti_alias_activation(data)
33 |
34 | test_result = (fused_activation_output - torch_activation_output).abs()
35 |
36 | while test_result.dim() != 1:
37 | test_result = test_result.mean(dim=-1)
38 |
39 | diff = test_result.mean(dim=-1)
40 |
41 | if diff <= 1e-3:
42 | print(
43 | f"\n[Success] test_fused_anti_alias_activation"
44 | f"\n > mean_difference={diff}"
45 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}"
46 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
47 | )
48 | else:
49 | print(
50 | f"\n[Fail] test_fused_anti_alias_activation"
51 | f"\n > mean_difference={diff}, "
52 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, "
53 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
54 | )
55 |
56 |
57 | if __name__ == "__main__":
58 | from alias_free_activation.cuda import load
59 |
60 | load.load()
61 | test_load_fused_kernels()
62 | test_anti_alias_activation()
63 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 NVIDIA CORPORATION.
2 | # Licensed under the MIT license.
3 |
4 | import os
5 | import sys
6 |
7 | # to import modules from parent_dir
8 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
9 | sys.path.append(parent_dir)
10 |
11 | import torch
12 | from alias_free_activation.cuda import activation1d
13 | from activations import SnakeBeta
14 |
15 |
16 | def test_load_fused_kernels():
17 | try:
18 | print("[Success] load_fused_kernels")
19 | except ImportError as e:
20 | print("[Fail] load_fused_kernels")
21 | raise e
22 |
23 |
24 | def test_anti_alias_activation():
25 | data = torch.rand((10, 10, 200), device="cuda")
26 |
27 |     # Check activations.SnakeBeta cuda vs. torch
28 | fused_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=True).cuda()
29 | fused_activation_output = fused_anti_alias_activation(data)
30 |
31 | torch_anti_alias_activation = activation1d.Activation1d(activation=SnakeBeta(10), fused=False).cuda()
32 | torch_activation_output = torch_anti_alias_activation(data)
33 |
34 | test_result = (fused_activation_output - torch_activation_output).abs()
35 |
36 | while test_result.dim() != 1:
37 | test_result = test_result.mean(dim=-1)
38 |
39 | diff = test_result.mean(dim=-1)
40 |
41 | if diff <= 1e-3:
42 | print(
43 | f"\n[Success] test_fused_anti_alias_activation"
44 | f"\n > mean_difference={diff}"
45 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}"
46 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
47 | )
48 | else:
49 | print(
50 | f"\n[Fail] test_fused_anti_alias_activation"
51 | f"\n > mean_difference={diff}, "
52 | f"\n > fused_values={fused_activation_output[-1][-1][:].tolist()}, "
53 | f"\n > torch_values={torch_activation_output[-1][-1][:].tolist()}"
54 | )
55 |
56 |
57 | if __name__ == "__main__":
58 | from alias_free_activation.cuda import load
59 |
60 | load.load()
61 | test_load_fused_kernels()
62 | test_anti_alias_activation()
63 |
--------------------------------------------------------------------------------
/GPT_SoVITS/BigVGAN/utils0.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
2 | # LICENSE is in incl_licenses directory.
3 |
4 | import glob
5 | import os
6 | import matplotlib
7 | import torch
8 | from torch.nn.utils import weight_norm
9 |
10 | matplotlib.use("Agg")
11 | import matplotlib.pylab as plt
12 | from .meldataset import MAX_WAV_VALUE
13 | from scipy.io.wavfile import write
14 |
15 |
16 | def plot_spectrogram(spectrogram):
17 | fig, ax = plt.subplots(figsize=(10, 2))
18 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
19 | plt.colorbar(im, ax=ax)
20 |
21 | fig.canvas.draw()
22 | plt.close()
23 |
24 | return fig
25 |
26 |
27 | def plot_spectrogram_clipped(spectrogram, clip_max=2.0):
28 | fig, ax = plt.subplots(figsize=(10, 2))
29 | im = ax.imshow(
30 | spectrogram,
31 | aspect="auto",
32 | origin="lower",
33 | interpolation="none",
34 | vmin=1e-6,
35 | vmax=clip_max,
36 | )
37 | plt.colorbar(im, ax=ax)
38 |
39 | fig.canvas.draw()
40 | plt.close()
41 |
42 | return fig
43 |
44 |
45 | def init_weights(m, mean=0.0, std=0.01):
46 | classname = m.__class__.__name__
47 | if classname.find("Conv") != -1:
48 | m.weight.data.normal_(mean, std)
49 |
50 |
51 | def apply_weight_norm(m):
52 | classname = m.__class__.__name__
53 | if classname.find("Conv") != -1:
54 | weight_norm(m)
55 |
56 |
57 | def get_padding(kernel_size, dilation=1):
58 | return int((kernel_size * dilation - dilation) / 2)
59 |
60 |
61 | def load_checkpoint(filepath, device):
62 | assert os.path.isfile(filepath)
63 | print(f"Loading '{filepath}'")
64 | checkpoint_dict = torch.load(filepath, map_location=device)
65 | print("Complete.")
66 | return checkpoint_dict
67 |
68 |
69 | def save_checkpoint(filepath, obj):
70 | print(f"Saving checkpoint to {filepath}")
71 | torch.save(obj, filepath)
72 | print("Complete.")
73 |
74 |
75 | def scan_checkpoint(cp_dir, prefix, renamed_file=None):
76 | # Fallback to original scanning logic first
77 | pattern = os.path.join(cp_dir, prefix + "????????")
78 | cp_list = glob.glob(pattern)
79 |
80 | if len(cp_list) > 0:
81 | last_checkpoint_path = sorted(cp_list)[-1]
82 | print(f"[INFO] Resuming from checkpoint: '{last_checkpoint_path}'")
83 | return last_checkpoint_path
84 |
85 | # If no pattern-based checkpoints are found, check for renamed file
86 | if renamed_file:
87 | renamed_path = os.path.join(cp_dir, renamed_file)
88 | if os.path.isfile(renamed_path):
89 | print(f"[INFO] Resuming from renamed checkpoint: '{renamed_file}'")
90 | return renamed_path
91 |
92 | return None
93 |
94 |
95 | def save_audio(audio, path, sr):
96 | # wav: torch with 1d shape
97 | audio = audio * MAX_WAV_VALUE
98 | audio = audio.cpu().numpy().astype("int16")
99 | write(path, sr, audio)
100 |
--------------------------------------------------------------------------------
/GPT_SoVITS/TTS_infer_pack/__init__.py:
--------------------------------------------------------------------------------
1 | from . import TTS, text_segmentation_method
2 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/.gitignore:
--------------------------------------------------------------------------------
1 | *.yaml
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 300
4 | batch_size: 8
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 16
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 |   pad_val: 1024 # same as EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 512
24 | hidden_dim: 512
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 12
28 | dropout: 0
29 | EOS: 1024
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1big.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 300
4 | batch_size: 8
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 16-mixed
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 |   pad_val: 1024 # same as EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 1024
24 | hidden_dim: 1024
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 16
28 | dropout: 0
29 | EOS: 1024
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1big2.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 300
4 | batch_size: 12
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 16-mixed
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 |   pad_val: 1024 # same as EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 1024
24 | hidden_dim: 1024
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 6
28 | dropout: 0
29 | EOS: 1024
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1longer-v2.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 20
4 | batch_size: 8
5 | save_every_n_epoch: 1
6 | precision: 16-mixed
7 | gradient_clip: 1.0
8 | optimizer:
9 | lr: 0.01
10 | lr_init: 0.00001
11 | lr_end: 0.0001
12 | warmup_steps: 2000
13 | decay_steps: 40000
14 | data:
15 | max_eval_sample: 8
16 | max_sec: 54
17 | num_workers: 4
18 |   pad_val: 1024 # same as EOS in model
19 | model:
20 | vocab_size: 1025
21 | phoneme_vocab_size: 732
22 | embedding_dim: 512
23 | hidden_dim: 512
24 | head: 16
25 | linear_units: 2048
26 | n_layer: 24
27 | dropout: 0
28 | EOS: 1024
29 | random_bert: 0
30 | inference:
31 | top_k: 15
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1longer.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 20
4 | batch_size: 8
5 | save_every_n_epoch: 1
6 | precision: 16-mixed
7 | gradient_clip: 1.0
8 | optimizer:
9 | lr: 0.01
10 | lr_init: 0.00001
11 | lr_end: 0.0001
12 | warmup_steps: 2000
13 | decay_steps: 40000
14 | data:
15 | max_eval_sample: 8
16 | max_sec: 54
17 | num_workers: 4
18 |   pad_val: 1024 # same as EOS in model
19 | model:
20 | vocab_size: 1025
21 | phoneme_vocab_size: 512
22 | embedding_dim: 512
23 | hidden_dim: 512
24 | head: 16
25 | linear_units: 2048
26 | n_layer: 24
27 | dropout: 0
28 | EOS: 1024
29 | random_bert: 0
30 | inference:
31 | top_k: 5
32 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s1mq.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | seed: 1234
3 | epochs: 100
4 | batch_size: 6
5 | gradient_accumulation: 4
6 | save_every_n_epoch: 1
7 | precision: 32
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 40
18 | num_workers: 1
19 |   pad_val: 1024 # same as EOS in model
20 | model:
21 | saving_path: "ckpt/"
22 | resume_checkpoint: null
23 | vocoder_config_path: "quantizer/new_ckpt/config.json"
24 | vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000"
25 | datadir: "/home/liweiche/GigaSpeech/wavs"
26 | metapath: "/home/liweiche/GigaSpeech/train2.json"
27 | val_metapath: "/home/liweiche/GigaSpeech/dev2.json"
28 | sampledir: "logs/"
29 | pretrained_path: null
30 | lr: 0.0001
31 | batch_size: 200.0
32 | train_bucket_size: 8192
33 | training_step: 800000
34 | optim_flat_percent: 0.0
35 | warmup_step: 50
36 | adam_beta1: 0.9
37 | adam_beta2: 0.98
38 | ffd_size: 3072
39 | hidden_size: 768
40 | enc_nlayers: 6
41 | dec_nlayers: 6
42 | nheads: 12
43 | ar_layer: 4
44 | ar_ffd_size: 1024
45 | ar_hidden_size: 256
46 | ar_nheads: 4
47 | aligner_softmax_temp: 1.0
48 | layer_norm_eps: 0.00001
49 | speaker_embed_dropout: 0.05
50 | label_smoothing: 0.0
51 | val_check_interval: 5000
52 | check_val_every_n_epoch: 1
53 | precision: "fp16"
54 | nworkers: 16
55 | distributed: true
56 | accelerator: "ddp"
57 | version: null
58 | accumulate_grad_batches: 1
59 | use_repetition_token: true
60 | use_repetition_gating: false
61 | repetition_penalty: 1.0
62 | sampling_temperature: 1.0
63 | top_k: -1
64 | min_top_k: 3
65 | top_p: 0.8
66 | sample_num: 4
67 | length_penalty_max_length: 15000
68 | length_penalty_max_prob: 0.95
69 | max_input_length: 2048
70 | max_output_length: 2000
71 | sample_rate: 16000
72 | n_codes: 1024
73 | n_cluster_groups: 1
74 | phone_context_window: 4
75 | phoneset_size: 1000
76 | inference:
77 | top_k: 5
78 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/s2.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 100,
4 | "eval_interval": 500,
5 | "seed": 1234,
6 | "epochs": 100,
7 | "learning_rate": 0.0001,
8 | "betas": [
9 | 0.8,
10 | 0.99
11 | ],
12 | "eps": 1e-09,
13 | "batch_size": 32,
14 | "fp16_run": true,
15 | "lr_decay": 0.999875,
16 | "segment_size": 20480,
17 | "init_lr_ratio": 1,
18 | "warmup_epochs": 0,
19 | "c_mel": 45,
20 | "c_kl": 1.0,
21 | "text_low_lr_rate": 0.4,
22 | "grad_ckpt": false
23 | },
24 | "data": {
25 | "max_wav_value": 32768.0,
26 | "sampling_rate": 32000,
27 | "filter_length": 2048,
28 | "hop_length": 640,
29 | "win_length": 2048,
30 | "n_mel_channels": 128,
31 | "mel_fmin": 0.0,
32 | "mel_fmax": null,
33 | "add_blank": true,
34 | "n_speakers": 300,
35 | "cleaned_text": true
36 | },
37 | "model": {
38 | "inter_channels": 192,
39 | "hidden_channels": 192,
40 | "filter_channels": 768,
41 | "n_heads": 2,
42 | "n_layers": 6,
43 | "kernel_size": 3,
44 | "p_dropout": 0.1,
45 | "resblock": "1",
46 | "resblock_kernel_sizes": [
47 | 3,
48 | 7,
49 | 11
50 | ],
51 | "resblock_dilation_sizes": [
52 | [
53 | 1,
54 | 3,
55 | 5
56 | ],
57 | [
58 | 1,
59 | 3,
60 | 5
61 | ],
62 | [
63 | 1,
64 | 3,
65 | 5
66 | ]
67 | ],
68 | "upsample_rates": [
69 | 10,
70 | 8,
71 | 2,
72 | 2,
73 | 2
74 | ],
75 | "upsample_initial_channel": 512,
76 | "upsample_kernel_sizes": [
77 | 16,
78 | 16,
79 | 8,
80 | 2,
81 | 2
82 | ],
83 | "n_layers_q": 3,
84 | "use_spectral_norm": false,
85 | "gin_channels": 512,
86 | "semantic_frame_rate": "25hz",
87 | "freeze_quantizer": true
88 | },
89 | "s2_ckpt_dir": "logs/s2/big2k1",
90 | "content_module": "cnhubert"
91 | }
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/train.yaml:
--------------------------------------------------------------------------------
1 | gpu:
2 | n_card: 1
3 | n_process_per_card: 2
4 | io:
5 | text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS
6 | save_every_n_epoch: 1
7 | precision: 16-mixed
8 | gradient_clip: 1.0
9 | optimizer:
10 | lr: 0.01
11 | lr_init: 0.00001
12 | lr_end: 0.0001
13 | warmup_steps: 2000
14 | decay_steps: 40000
15 | data:
16 | max_eval_sample: 8
17 | max_sec: 54
18 | num_workers: 1
19 |   pad_val: 1024 # same as EOS in model
20 | model:
21 | vocab_size: 1025
22 | phoneme_vocab_size: 512
23 | embedding_dim: 512
24 | hidden_dim: 512
25 | head: 16
26 | linear_units: 2048
27 | n_layer: 24
28 | dropout: 0
29 | EOS: 1024
30 | random_bert: 0
31 | inference:
32 | top_k: 5
33 |
--------------------------------------------------------------------------------
/GPT_SoVITS/configs/tts_infer.yaml:
--------------------------------------------------------------------------------
1 | custom:
2 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
3 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
4 | device: cuda
5 | is_half: true
6 | t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
7 | version: v2
8 | vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
9 | v1:
10 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
11 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
12 | device: cpu
13 | is_half: false
14 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
15 | version: v1
16 | vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
17 | v2:
18 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
19 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
20 | device: cpu
21 | is_half: false
22 | t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
23 | version: v2
24 | vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
25 | v3:
26 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
27 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
28 | device: cpu
29 | is_half: false
30 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
31 | version: v3
32 | vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth
33 | v4:
34 | bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
35 | cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
36 | device: cpu
37 | is_half: false
38 | t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
39 | version: v4
40 | vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth
41 |
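The inference stack reads this file through the TTS_Config class in GPT_SoVITS/TTS_infer_pack/TTS.py: the "custom" block is the profile used and updated at run time, while "v1"-"v4" hold the shipped defaults for each model generation. A minimal sketch of picking a profile by hand with plain PyYAML (the fallback choice here is illustrative):

    import yaml

    with open("GPT_SoVITS/configs/tts_infer.yaml") as f:
        profiles = yaml.safe_load(f)

    profile = profiles.get("custom") or profiles["v2"]   # fall back to the v2 defaults
    print(profile["version"], profile["device"], profile["t2s_weights_path"])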
--------------------------------------------------------------------------------
/GPT_SoVITS/download.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | now_dir = os.getcwd()
5 | sys.path.insert(0, now_dir)
6 | from text.g2pw import G2PWPinyin
7 |
8 | g2pw = G2PWPinyin(
9 | model_dir="GPT_SoVITS/text/G2PWModel",
10 | model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
11 | v_to_u=False,
12 | neutral_tone_with_five=True,
13 | )
14 |
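This script exists for its side effect: constructing G2PWPinyin makes the Chinese text frontend fetch and unpack the G2PW model into GPT_SoVITS/text/G2PWModel if it is not already present. A typical one-off invocation from the repository root, assuming the GPT_SoVITS directory is importable (e.g. via PYTHONPATH):

    PYTHONPATH=GPT_SoVITS python GPT_SoVITS/download.py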
--------------------------------------------------------------------------------
/GPT_SoVITS/f5_tts/model/__init__.py:
--------------------------------------------------------------------------------
1 | # from f5_tts.model.cfm import CFM
2 | #
3 | # from f5_tts.model.backbones.unett import UNetT
4 | from GPT_SoVITS.f5_tts.model.backbones.dit import DiT
5 | # from f5_tts.model.backbones.dit import DiTNoCond
6 | # from f5_tts.model.backbones.dit import DiTNoCondNoT
7 | # from f5_tts.model.backbones.mmdit import MMDiT
8 |
9 | # from f5_tts.model.trainer import Trainer
10 |
11 |
12 | # __all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"]
13 | # __all__ = ["CFM", "UNetT", "DiTNoCond","DiT", "MMDiT"]
14 |
--------------------------------------------------------------------------------
/GPT_SoVITS/f5_tts/model/backbones/README.md:
--------------------------------------------------------------------------------
1 | ## Backbones quick introduction
2 |
3 |
4 | ### unett.py
5 | - flat UNet-style transformer
6 | - same structure as in the E2-TTS & Voicebox papers, except rotary positional embeddings are used
7 | - update: optional absolute positional embedding & ConvNeXt V2 blocks for the embedded text before concatenation
8 | 
9 | ### dit.py
10 | - AdaLN-zero DiT
11 | - embedded timestep as the condition
12 | - noised_input + masked_cond + embedded_text are concatenated, then linearly projected in
13 | - optional absolute positional embedding & ConvNeXt V2 blocks for the embedded text before concatenation
14 | - optional long skip connection (first layer to last layer)
15 | 
16 | ### mmdit.py
17 | - SD3 structure
18 | - timestep as the condition
19 | - left stream: embedded text with an absolute positional embedding applied
20 | - right stream: masked_cond & noised_input concatenated, with the same convolutional positional embedding as unett

21 |
--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/__init__.py:
--------------------------------------------------------------------------------
1 | from . import cnhubert, whisper_enc
2 |
3 | content_module_map = {"cnhubert": cnhubert, "whisper": whisper_enc}
4 |
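Both modules expose the same pair of functions, get_model() and get_content(model, wav_16k_tensor), so this map lets callers switch the content encoder by the "content_module" name from the s2 config. A minimal sketch, assuming the GPT_SoVITS directory is on sys.path (as the project's scripts arrange) and the cnhubert weights sit in the default pretrained_models folder:

    import torch
    from feature_extractor import cnhubert, content_module_map

    # CNHubert falls back to this module-level path when no explicit base_path is given
    cnhubert.cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"

    encoder = content_module_map["cnhubert"]      # or "whisper"
    model = encoder.get_model()
    wav_16k = torch.randn(16000)                  # placeholder: 1 second of 16 kHz mono audio
    feats = encoder.get_content(model, wav_16k)   # -> [1, channels, frames]
    print(feats.shape)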
--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/cnhubert.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os
3 | from transformers import logging as tf_logging
4 |
5 | tf_logging.set_verbosity_error()
6 |
7 | import logging
8 |
9 | logging.getLogger("numba").setLevel(logging.WARNING)
10 |
11 | from transformers import (
12 | Wav2Vec2FeatureExtractor,
13 | HubertModel,
14 | )
15 |
16 | import utils
17 | import torch.nn as nn
18 |
19 | cnhubert_base_path = None
20 |
21 |
22 | class CNHubert(nn.Module):
23 | def __init__(self, base_path: str = None):
24 | super().__init__()
25 | if base_path is None:
26 | base_path = cnhubert_base_path
27 | if os.path.exists(base_path):
28 | ...
29 | else:
30 | raise FileNotFoundError(base_path)
31 | self.model = HubertModel.from_pretrained(base_path, local_files_only=True)
32 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(base_path, local_files_only=True)
33 |
34 | def forward(self, x):
35 | input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
36 | feats = self.model(input_values)["last_hidden_state"]
37 | return feats
38 |
39 |
40 | # class CNHubertLarge(nn.Module):
41 | # def __init__(self):
42 | # super().__init__()
43 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
44 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
45 | # def forward(self, x):
46 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
47 | # feats = self.model(input_values)["last_hidden_state"]
48 | # return feats
49 | #
50 | # class CVec(nn.Module):
51 | # def __init__(self):
52 | # super().__init__()
53 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
54 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
55 | # def forward(self, x):
56 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
57 | # feats = self.model(input_values)["last_hidden_state"]
58 | # return feats
59 | #
60 | # class cnw2v2base(nn.Module):
61 | # def __init__(self):
62 | # super().__init__()
63 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
64 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
65 | # def forward(self, x):
66 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
67 | # feats = self.model(input_values)["last_hidden_state"]
68 | # return feats
69 |
70 |
71 | def get_model():
72 | model = CNHubert()
73 | model.eval()
74 | return model
75 |
76 |
77 | # def get_large_model():
78 | # model = CNHubertLarge()
79 | # model.eval()
80 | # return model
81 | #
82 | # def get_model_cvec():
83 | # model = CVec()
84 | # model.eval()
85 | # return model
86 | #
87 | # def get_model_cnw2v2base():
88 | # model = cnw2v2base()
89 | # model.eval()
90 | # return model
91 |
92 |
93 | def get_content(hmodel, wav_16k_tensor):
94 | with torch.no_grad():
95 | feats = hmodel(wav_16k_tensor)
96 | return feats.transpose(1, 2)
97 |
98 |
99 | if __name__ == "__main__":
100 | model = get_model()
101 | src_path = "/Users/Shared/原音频2.wav"
102 | wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000)
103 | model = model
104 | wav_16k_tensor = wav_16k_tensor
105 | feats = get_content(model, wav_16k_tensor)
106 | print(feats.shape)
107 |
--------------------------------------------------------------------------------
/GPT_SoVITS/feature_extractor/whisper_enc.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_model():
5 | import whisper
6 |
7 | model = whisper.load_model("small", device="cpu")
8 |
9 | return model.encoder
10 |
11 |
12 | def get_content(model=None, wav_16k_tensor=None):
13 | from whisper import log_mel_spectrogram, pad_or_trim
14 |
15 | dev = next(model.parameters()).device
16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000]
17 | # if torch.cuda.is_available():
18 | # mel = mel.to(torch.float16)
19 | feature_len = mel.shape[-1] // 2
20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频"
21 | with torch.no_grad():
22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[:1, :feature_len, :].transpose(1, 2)
23 | return feature
24 |
--------------------------------------------------------------------------------
/GPT_SoVITS/inference_cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import soundfile as sf
4 |
5 | from tools.i18n.i18n import I18nAuto
6 | from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav
7 |
8 | i18n = I18nAuto()
9 |
10 |
11 | def synthesize(
12 | GPT_model_path,
13 | SoVITS_model_path,
14 | ref_audio_path,
15 | ref_text_path,
16 | ref_language,
17 | target_text_path,
18 | target_language,
19 | output_path,
20 | ):
21 | # Read reference text
22 | with open(ref_text_path, "r", encoding="utf-8") as file:
23 | ref_text = file.read()
24 |
25 | # Read target text
26 | with open(target_text_path, "r", encoding="utf-8") as file:
27 | target_text = file.read()
28 |
29 | # Change model weights
30 | change_gpt_weights(gpt_path=GPT_model_path)
31 | change_sovits_weights(sovits_path=SoVITS_model_path)
32 |
33 | # Synthesize audio
34 | synthesis_result = get_tts_wav(
35 | ref_wav_path=ref_audio_path,
36 | prompt_text=ref_text,
37 | prompt_language=i18n(ref_language),
38 | text=target_text,
39 | text_language=i18n(target_language),
40 | top_p=1,
41 | temperature=1,
42 | )
43 |
44 | result_list = list(synthesis_result)
45 |
46 | if result_list:
47 | last_sampling_rate, last_audio_data = result_list[-1]
48 | output_wav_path = os.path.join(output_path, "output.wav")
49 | sf.write(output_wav_path, last_audio_data, last_sampling_rate)
50 | print(f"Audio saved to {output_wav_path}")
51 |
52 |
53 | def main():
54 | parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool")
55 | parser.add_argument("--gpt_model", required=True, help="Path to the GPT model file")
56 | parser.add_argument("--sovits_model", required=True, help="Path to the SoVITS model file")
57 | parser.add_argument("--ref_audio", required=True, help="Path to the reference audio file")
58 | parser.add_argument("--ref_text", required=True, help="Path to the reference text file")
59 | parser.add_argument(
60 | "--ref_language", required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio"
61 | )
62 | parser.add_argument("--target_text", required=True, help="Path to the target text file")
63 | parser.add_argument(
64 | "--target_language",
65 | required=True,
66 | choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"],
67 | help="Language of the target text",
68 | )
69 | parser.add_argument("--output_path", required=True, help="Path to the output directory")
70 |
71 | args = parser.parse_args()
72 |
73 | synthesize(
74 | args.gpt_model,
75 | args.sovits_model,
76 | args.ref_audio,
77 | args.ref_text,
78 | args.ref_language,
79 | args.target_text,
80 | args.target_language,
81 | args.output_path,
82 | )
83 |
84 |
85 | if __name__ == "__main__":
86 | main()
87 |
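An example invocation; every path is a placeholder for locally prepared files, and the text files must be UTF-8:

    python GPT_SoVITS/inference_cli.py \
        --gpt_model path/to/your_gpt.ckpt \
        --sovits_model path/to/your_sovits.pth \
        --ref_audio ref.wav --ref_text ref.txt --ref_language 中文 \
        --target_text target.txt --target_language 中英混合 \
        --output_path output_dir

The last (sampling_rate, audio) chunk yielded by get_tts_wav is written to output_dir/output.wav.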
--------------------------------------------------------------------------------
/GPT_SoVITS/module/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/module/__init__.py
--------------------------------------------------------------------------------
/GPT_SoVITS/module/losses.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 |
5 |
6 | def feature_loss(fmap_r, fmap_g):
7 | loss = 0
8 | for dr, dg in zip(fmap_r, fmap_g):
9 | for rl, gl in zip(dr, dg):
10 | rl = rl.float().detach()
11 | gl = gl.float()
12 | loss += torch.mean(torch.abs(rl - gl))
13 |
14 | return loss * 2
15 |
16 |
17 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
18 | loss = 0
19 | r_losses = []
20 | g_losses = []
21 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
22 | dr = dr.float()
23 | dg = dg.float()
24 | r_loss = torch.mean((1 - dr) ** 2)
25 | g_loss = torch.mean(dg**2)
26 | loss += r_loss + g_loss
27 | r_losses.append(r_loss.item())
28 | g_losses.append(g_loss.item())
29 |
30 | return loss, r_losses, g_losses
31 |
32 |
33 | def generator_loss(disc_outputs):
34 | loss = 0
35 | gen_losses = []
36 | for dg in disc_outputs:
37 | dg = dg.float()
38 | l = torch.mean((1 - dg) ** 2)
39 | gen_losses.append(l)
40 | loss += l
41 |
42 | return loss, gen_losses
43 |
44 |
45 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
46 | """
47 | z_p, logs_q: [b, h, t_t]
48 | m_p, logs_p: [b, h, t_t]
49 | """
50 | z_p = z_p.float()
51 | logs_q = logs_q.float()
52 | m_p = m_p.float()
53 | logs_p = logs_p.float()
54 | z_mask = z_mask.float()
55 |
56 | kl = logs_p - logs_q - 0.5
57 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
58 | kl = torch.sum(kl * z_mask)
59 | l = kl / torch.sum(z_mask)
60 | return l
61 |
62 |
63 | def mle_loss(z, m, logs, logdet, mask):
64 | l = torch.sum(logs) + 0.5 * torch.sum(
65 | torch.exp(-2 * logs) * ((z - m) ** 2)
66 | ) # neg normal likelihood w/o the constant term
67 | l = l - torch.sum(logdet) # log jacobian determinant
68 | l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes
69 | l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term
70 | return l
71 |
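kl_loss is the usual VITS prior/posterior term: a single-sample estimate of the KL divergence between the diagonal Gaussian q (scale e^{logs_q}, whose sample is z_p) and the prior p = N(m_p, e^{2 logs_p}), masked by z_mask and averaged over valid positions. Writing sigma = e^{logs}, the per-element term is

    \mathrm{KL}(q\,\|\,p)
      = \log\sigma_p - \log\sigma_q + \frac{\sigma_q^2 + (\mu_q-\mu_p)^2}{2\sigma_p^2} - \tfrac12
      \approx \log\sigma_p - \log\sigma_q - \tfrac12 + \tfrac12\,(z_p-\mu_p)^2\,e^{-2\log\sigma_p},
      \qquad z_p \sim q,

which is exactly the expression the code accumulates before taking the masked mean.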
--------------------------------------------------------------------------------
/GPT_SoVITS/pretrained_models/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
--------------------------------------------------------------------------------
/GPT_SoVITS/process_ckpt.py:
--------------------------------------------------------------------------------
1 | import traceback
2 | from collections import OrderedDict
3 | from time import time as ttime
4 | import shutil
5 | import os
6 | import torch
7 | from tools.i18n.i18n import I18nAuto
8 |
9 | i18n = I18nAuto()
10 |
11 |
12 | def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path
13 | dir = os.path.dirname(path)
14 | name = os.path.basename(path)
15 | tmp_path = "%s.pth" % (ttime())
16 | torch.save(fea, tmp_path)
17 | shutil.move(tmp_path, "%s/%s" % (dir, name))
18 |
19 |
20 | """
21 | 00:v1
22 | 01:v2
23 | 02:v3
24 | 03:v3lora
25 | 04:v4lora
26 |
27 | """
28 | from io import BytesIO
29 |
30 |
31 | def my_save2(fea, path, cfm_version):
32 | bio = BytesIO()
33 | torch.save(fea, bio)
34 | bio.seek(0)
35 | data = bio.getvalue()
36 | byte = b"03" if cfm_version == "v3" else b"04"
37 | data = byte + data[2:]
38 | with open(path, "wb") as f:
39 | f.write(data)
40 |
41 |
42 | def savee(ckpt, name, epoch, steps, hps, cfm_version=None, lora_rank=None):
43 | try:
44 | opt = OrderedDict()
45 | opt["weight"] = {}
46 | for key in ckpt.keys():
47 | if "enc_q" in key:
48 | continue
49 | opt["weight"][key] = ckpt[key].half()
50 | opt["config"] = hps
51 | opt["info"] = "%sepoch_%siteration" % (epoch, steps)
52 | if lora_rank:
53 | opt["lora_rank"] = lora_rank
54 | my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), cfm_version)
55 | else:
56 | my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
57 | return "Success."
58 | except:
59 | return traceback.format_exc()
60 |
61 |
62 | head2version = {
63 | b"00": ["v1", "v1", False],
64 | b"01": ["v2", "v2", False],
65 | b"02": ["v2", "v3", False],
66 | b"03": ["v2", "v3", True],
67 | b"04": ["v2", "v4", True],
68 | }
69 | hash_pretrained_dict = {
70 | "dc3c97e17592963677a4a1681f30c653": ["v2", "v2", False], # s2G488k.pth#sovits_v1_pretrained
71 | "43797be674a37c1c83ee81081941ed0f": ["v2", "v3", False], # s2Gv3.pth#sovits_v3_pretrained
72 | "6642b37f3dbb1f76882b69937c95a5f3": ["v2", "v2", False], # s2G2333K.pth#sovits_v2_pretrained
73 | "4f26b9476d0c5033e04162c486074374": ["v2", "v4", False], # s2Gv4.pth#sovits_v4_pretrained
74 | }
75 | import hashlib
76 |
77 |
78 | def get_hash_from_file(sovits_path):
79 | with open(sovits_path, "rb") as f:
80 | data = f.read(8192)
81 | hash_md5 = hashlib.md5()
82 | hash_md5.update(data)
83 | return hash_md5.hexdigest()
84 |
85 |
86 | def get_sovits_version_from_path_fast(sovits_path):
87 | ###1-if it is pretrained sovits models, by hash
88 | hash = get_hash_from_file(sovits_path)
89 | if hash in hash_pretrained_dict:
90 | return hash_pretrained_dict[hash]
91 | ###2-new weights, by head
92 | with open(sovits_path, "rb") as f:
93 | version = f.read(2)
94 | if version != b"PK":
95 | return head2version[version]
96 | ###3-old weights, by file size
97 | if_lora_v3 = False
98 | size = os.path.getsize(sovits_path)
99 | """
100 | v1weights:about 82942KB
101 | half thr:82978KB
102 | v2weights:about 83014KB
103 | v3weights:about 750MB
104 | """
105 | if size < 82978 * 1024:
106 | model_version = version = "v1"
107 | elif size < 700 * 1024 * 1024:
108 | model_version = version = "v2"
109 | else:
110 | version = "v2"
111 | model_version = "v3"
112 | return version, model_version, if_lora_v3
113 |
114 |
115 | def load_sovits_new(sovits_path):
116 | f = open(sovits_path, "rb")
117 | meta = f.read(2)
118 | if meta != b"PK":
119 | data = b"PK" + f.read()
120 | bio = BytesIO()
121 | bio.write(data)
122 | bio.seek(0)
123 | return torch.load(bio, map_location="cpu", weights_only=False)
124 | return torch.load(sovits_path, map_location="cpu", weights_only=False)
125 |
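The trick in this file: a regular torch.save checkpoint is a zip archive whose first two bytes are the "PK" magic; my_save2 overwrites those two bytes with a version tag, and load_sovits_new swaps "PK" back in before handing the buffer to torch.load. A minimal sketch of peeking at the tag without loading any weights (the file name is a placeholder):

    def peek_sovits_version(path):
        # b"PK" -> an untagged checkpoint (fall back to the hash / file-size heuristics above);
        # anything else is the tag written by savee()/my_save2().
        with open(path, "rb") as f:
            head = f.read(2)
        if head == b"PK":
            return None
        return head2version[head]        # e.g. b"03" -> ["v2", "v3", True]

    # peek_sovits_version("SoVITS_weights_v3/my_model_e8_s800.pth")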
--------------------------------------------------------------------------------
/GPT_SoVITS/text/.gitignore:
--------------------------------------------------------------------------------
1 | G2PWModel
2 | __pycache__
3 | *.zip
--------------------------------------------------------------------------------
/GPT_SoVITS/text/LangSegmenter/__init__.py:
--------------------------------------------------------------------------------
1 | from .langsegmenter import LangSegmenter
2 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | # if os.environ.get("version","v1")=="v1":
3 | # from text.symbols import symbols
4 | # else:
5 | # from text.symbols2 import symbols
6 |
7 | from text import symbols as symbols_v1
8 | from text import symbols2 as symbols_v2
9 |
10 | _symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)}
11 | _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)}
12 |
13 |
14 | def cleaned_text_to_sequence(cleaned_text, version=None):
15 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
16 | Args:
17 | text: string to convert to a sequence
18 | Returns:
19 | List of integers corresponding to the symbols in the text
20 | """
21 | if version is None:
22 | version = os.environ.get("version", "v2")
23 | if version == "v1":
24 | phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text]
25 | else:
26 | phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text]
27 |
28 | return phones
29 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/cleaner.py:
--------------------------------------------------------------------------------
1 | from text import cleaned_text_to_sequence
2 | import os
3 | # if os.environ.get("version","v1")=="v1":
4 | # from text import chinese
5 | # from text.symbols import symbols
6 | # else:
7 | # from text import chinese2 as chinese
8 | # from text.symbols2 import symbols
9 |
10 | from text import symbols as symbols_v1
11 | from text import symbols2 as symbols_v2
12 |
13 | special = [
14 | # ("%", "zh", "SP"),
15 | ("¥", "zh", "SP2"),
16 | ("^", "zh", "SP3"),
17 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧
18 | ]
19 |
20 |
21 | def clean_text(text, language, version=None):
22 | if version is None:
23 | version = os.environ.get("version", "v2")
24 | if version == "v1":
25 | symbols = symbols_v1.symbols
26 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
27 | else:
28 | symbols = symbols_v2.symbols
29 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}
30 |
31 | if language not in language_module_map:
32 | language = "en"
33 | text = " "
34 | for special_s, special_l, target_symbol in special:
35 | if special_s in text and language == special_l:
36 | return clean_special(text, language, special_s, target_symbol, version)
37 | language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]])
38 | if hasattr(language_module, "text_normalize"):
39 | norm_text = language_module.text_normalize(text)
40 | else:
41 | norm_text = text
42 | if language == "zh" or language == "yue": ##########
43 | phones, word2ph = language_module.g2p(norm_text)
44 | assert len(phones) == sum(word2ph)
45 | assert len(norm_text) == len(word2ph)
46 | elif language == "en":
47 | phones = language_module.g2p(norm_text)
48 | if len(phones) < 4:
49 | phones = [","] + phones
50 | word2ph = None
51 | else:
52 | phones = language_module.g2p(norm_text)
53 | word2ph = None
54 | phones = ["UNK" if ph not in symbols else ph for ph in phones]
55 | return phones, word2ph, norm_text
56 |
57 |
58 | def clean_special(text, language, special_s, target_symbol, version=None):
59 | if version is None:
60 | version = os.environ.get("version", "v2")
61 | if version == "v1":
62 | symbols = symbols_v1.symbols
63 | language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
64 | else:
65 | symbols = symbols_v2.symbols
66 | language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}
67 |
68 | """
69 | 特殊静音段sp符号处理
70 | """
71 | text = text.replace(special_s, ",")
72 | language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]])
73 | norm_text = language_module.text_normalize(text)
74 | phones = language_module.g2p(norm_text)
75 | new_ph = []
76 | for ph in phones[0]:
77 | assert ph in symbols
78 | if ph == ",":
79 | new_ph.append(target_symbol)
80 | else:
81 | new_ph.append(ph)
82 | return new_ph, phones[1], norm_text
83 |
84 |
85 | def text_to_sequence(text, language, version=None):
86 | version = os.environ.get("version", version)
87 | if version is None:
88 | version = "v2"
89 | phones, _, _ = clean_text(text, language, version)
90 | return cleaned_text_to_sequence(phones, version)
91 |
92 |
93 | if __name__ == "__main__":
94 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh"))
95 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/engdict-hot.rep:
--------------------------------------------------------------------------------
1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1
2 | JSON JH EY1 S AH0 N
3 | CONDA K AA1 N D AH0
--------------------------------------------------------------------------------
/GPT_SoVITS/text/engdict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/text/engdict_cache.pickle
--------------------------------------------------------------------------------
/GPT_SoVITS/text/g2pw/__init__.py:
--------------------------------------------------------------------------------
1 | from text.g2pw.g2pw import *
2 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/g2pw/polyphonic.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/text/g2pw/polyphonic.pickle
--------------------------------------------------------------------------------
/GPT_SoVITS/text/g2pw/polyphonic.rep:
--------------------------------------------------------------------------------
1 | 湖泊: ['hu2','po1']
2 | 地壳: ['di4','qiao4']
3 | 柏树: ['bai3','shu4']
4 | 曝光: ['bao4','guang1']
5 | 弹力: ['tan2','li4']
6 | 字帖: ['zi4','tie4']
7 | 口吃: ['kou3','chi1']
8 | 包扎: ['bao1','za1']
9 | 哪吒: ['ne2','zha1']
10 | 说服: ['shuo1','fu2']
11 | 识字: ['shi2','zi4']
12 | 骨头: ['gu3','tou5']
13 | 对称: ['dui4','chen4']
14 | 口供: ['kou3','gong4']
15 | 抹布: ['ma1','bu4']
16 | 露背: ['lu4','bei4']
17 | 圈养: ['juan4', 'yang3']
18 | 眼眶: ['yan3', 'kuang4']
19 | 品行: ['pin3','xing2']
20 | 颤抖: ['chan4','dou3']
21 | 差不多: ['cha4','bu5','duo1']
22 | 鸭绿江: ['ya1','lu4','jiang1']
23 | 撒切尔: ['sa4','qie4','er3']
24 | 比比皆是: ['bi3','bi3','jie1','shi4']
25 | 身无长物: ['shen1','wu2','chang2','wu4']
26 | 手里: ['shou2','li3']
27 | 关卡: ['guan1','qia3']
28 | 怀揣: ['huai2','chuai1']
29 | 挑剔: ['tiao1','ti4']
30 | 供称: ['gong4','cheng1']
31 | 作坊: ['zuo1', 'fang5']
32 | 中医: ['zhong1','yi1']
33 | 嚷嚷: ['rang1','rang5']
34 | 商厦: ['shang1','sha4']
35 | 大厦: ['da4','sha4']
36 | 刹车: ['sha1','che1']
37 | 嘚瑟: ['de4','se5']
38 | 朝鲜: ['chao2','xian3']
39 | 阿房宫: ['e1','pang2','gong1']
40 | 阿胶: ['e1','jiao1']
41 | 咖喱: ['ga1','li5']
42 | 时分: ['shi2','fen1']
43 | 蚌埠: ['beng4','bu4']
44 | 驯服: ['xun4','fu2']
45 | 幸免于难: ['xing4','mian3','yu2','nan4']
46 | 恶行: ['e4','xing2']
47 | 唉: ['ai4']
48 | 扎实: ['zha1','shi2']
49 | 干将: ['gan4','jiang4']
50 | 陈威行: ['chen2', 'wei1', 'hang2']
51 | 郭晟: ['guo1', 'sheng4']
52 | 中标: ['zhong4', 'biao1']
53 | 抗住: ['kang2', 'zhu4']
--------------------------------------------------------------------------------
/GPT_SoVITS/text/namedict_cache.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/GPT_SoVITS/text/namedict_cache.pickle
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/README.md:
--------------------------------------------------------------------------------
1 | ## Supported NSW (Non-Standard-Word) Normalization
2 |
3 | |NSW type|raw|normalized|
4 | |:--|:-|:-|
5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九|
6 | |cardinal|这块黄金重达324.75克<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日|
9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我|
10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度|
11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨|
13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万|
14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
15 | ## References
16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files)
17 |
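A minimal usage sketch; it assumes the GPT_SoVITS directory is on sys.path (as the training/inference scripts arrange) and that text_normlization.py exposes the TextNormalizer class with a normalize() method, as in the upstream PaddlePaddle code this module was taken from:

    from text.zh_normalization import TextNormalizer

    tx = TextNormalizer()
    print(tx.normalize("今天的最低气温达到-10°C"))
    # expected, per the table above: ['今天的最低气温达到零下十度']
    # (normalize returns a list of normalized sentences)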
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from text.zh_normalization.text_normlization import *
15 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/chronology.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import DIGITS
17 | from .num import num2str
18 | from .num import verbalize_cardinal
19 | from .num import verbalize_digit
20 |
21 |
22 | def _time_num2str(num_string: str) -> str:
23 | """A special case for verbalizing number in time."""
24 | result = num2str(num_string.lstrip("0"))
25 | if num_string.startswith("0"):
26 | result = DIGITS["0"] + result
27 | return result
28 |
29 |
30 | # 时刻表达式
31 | RE_TIME = re.compile(
32 | r"([0-1]?[0-9]|2[0-3])"
33 | r":([0-5][0-9])"
34 | r"(:([0-5][0-9]))?"
35 | )
36 |
37 | # 时间范围,如8:30-12:30
38 | RE_TIME_RANGE = re.compile(
39 | r"([0-1]?[0-9]|2[0-3])"
40 | r":([0-5][0-9])"
41 | r"(:([0-5][0-9]))?"
42 | r"(~|-)"
43 | r"([0-1]?[0-9]|2[0-3])"
44 | r":([0-5][0-9])"
45 | r"(:([0-5][0-9]))?"
46 | )
47 |
48 |
49 | def replace_time(match) -> str:
50 | """
51 | Args:
52 | match (re.Match)
53 | Returns:
54 | str
55 | """
56 |
57 | is_range = len(match.groups()) > 5
58 |
59 | hour = match.group(1)
60 | minute = match.group(2)
61 | second = match.group(4)
62 |
63 | if is_range:
64 | hour_2 = match.group(6)
65 | minute_2 = match.group(7)
66 | second_2 = match.group(9)
67 |
68 | result = f"{num2str(hour)}点"
69 | if minute.lstrip("0"):
70 | if int(minute) == 30:
71 | result += "半"
72 | else:
73 | result += f"{_time_num2str(minute)}分"
74 | if second and second.lstrip("0"):
75 | result += f"{_time_num2str(second)}秒"
76 |
77 | if is_range:
78 | result += "至"
79 | result += f"{num2str(hour_2)}点"
80 | if minute_2.lstrip("0"):
81 | if int(minute_2) == 30:
82 | result += "半"
83 | else:
84 | result += f"{_time_num2str(minute_2)}分"
85 | if second_2 and second_2.lstrip("0"):
86 | result += f"{_time_num2str(second_2)}秒"
87 |
88 | return result
89 |
90 |
91 | RE_DATE = re.compile(
92 | r"(\d{4}|\d{2})年"
93 | r"((0?[1-9]|1[0-2])月)?"
94 | r"(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?"
95 | )
96 |
97 |
98 | def replace_date(match) -> str:
99 | """
100 | Args:
101 | match (re.Match)
102 | Returns:
103 | str
104 | """
105 | year = match.group(1)
106 | month = match.group(3)
107 | day = match.group(5)
108 | result = ""
109 | if year:
110 | result += f"{verbalize_digit(year)}年"
111 | if month:
112 | result += f"{verbalize_cardinal(month)}月"
113 | if day:
114 | result += f"{verbalize_cardinal(day)}{match.group(9)}"
115 | return result
116 |
117 |
118 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期
119 | RE_DATE2 = re.compile(r"(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])")
120 |
121 |
122 | def replace_date2(match) -> str:
123 | """
124 | Args:
125 | match (re.Match)
126 | Returns:
127 | str
128 | """
129 | year = match.group(1)
130 | month = match.group(3)
131 | day = match.group(4)
132 | result = ""
133 | if year:
134 | result += f"{verbalize_digit(year)}年"
135 | if month:
136 | result += f"{verbalize_cardinal(month)}月"
137 | if day:
138 | result += f"{verbalize_cardinal(day)}日"
139 | return result
140 |
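Each pattern in this module is applied with re.sub together with its matching replace_* callback. A short sketch using the two date patterns defined above:

    text = "她出生于1995年3月1日,证件有效期到2024-06-30"
    text = RE_DATE.sub(replace_date, text)
    text = RE_DATE2.sub(replace_date2, text)
    print(text)
    # 她出生于一九九五年三月一日,证件有效期到二零二四年六月三十日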
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 | import string
16 |
17 | from pypinyin.constants import SUPPORT_UCS4
18 |
19 | # 全角半角转换
20 | # 英文字符全角 -> 半角映射表 (num: 52)
21 | F2H_ASCII_LETTERS = {ord(char) + 65248: ord(char) for char in string.ascii_letters}
22 |
23 | # 英文字符半角 -> 全角映射表
24 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
25 |
26 | # 数字字符全角 -> 半角映射表 (num: 10)
27 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits}
28 | # 数字字符半角 -> 全角映射表
29 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}
30 |
31 | # 标点符号全角 -> 半角映射表 (num: 32)
32 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
33 | # 标点符号半角 -> 全角映射表
34 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
35 |
36 | # 空格 (num: 1)
37 | F2H_SPACE = {"\u3000": " "}
38 | H2F_SPACE = {" ": "\u3000"}
39 |
40 | # 非"有拼音的汉字"的字符串,可用于NSW提取
41 | if SUPPORT_UCS4:
42 | RE_NSW = re.compile(
43 | r"(?:[^"
44 | r"\u3007" # 〇
45 | r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF]
46 | r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF]
47 | r"\uf900-\ufaff" # CJK兼容:[F900-FAFF]
48 | r"\U00020000-\U0002A6DF" # CJK扩展B:[20000-2A6DF]
49 | r"\U0002A703-\U0002B73F" # CJK扩展C:[2A700-2B73F]
50 | r"\U0002B740-\U0002B81D" # CJK扩展D:[2B740-2B81D]
51 | r"\U0002F80A-\U0002FA1F" # CJK兼容扩展:[2F800-2FA1F]
52 | r"])+"
53 | )
54 | else:
55 | RE_NSW = re.compile( # pragma: no cover
56 | r"(?:[^"
57 | r"\u3007" # 〇
58 | r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF]
59 | r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF]
60 | r"\uf900-\ufaff" # CJK兼容:[F900-FAFF]
61 | r"])+"
62 | )
63 |
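The F2H_*/H2F_* dictionaries are str.translate tables (full-width code point -> half-width and back), and RE_NSW extracts the non-Chinese spans that the normalizer later verbalizes. For example:

    s = "ABC123"                                                 # full-width letters and digits
    print(s.translate(F2H_ASCII_LETTERS).translate(F2H_DIGITS))   # -> ABC123
    print(RE_NSW.findall("这块黄金重达324.75克"))                    # -> ['324.75']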
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/phonecode.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import verbalize_digit
17 |
18 | # 规范化固话/手机号码
19 | # 手机
20 | # http://www.jihaoba.com/news/show/13680
21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
22 | # 联通:130、131、132、156、155、186、185、176
23 | # 电信:133、153、189、180、181、177
24 | RE_MOBILE_PHONE = re.compile(r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
25 | RE_TELEPHONE = re.compile(r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
26 | 
27 | # 全国统一的号码400开头
28 | RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
29 | 
30 | 
31 | def phone2str(phone_string: str, mobile=True) -> str:
32 | if mobile:
33 | sp_parts = phone_string.strip("+").split()
34 | result = ",".join([verbalize_digit(part, alt_one=True) for part in sp_parts])
35 | return result
36 | else:
37 | sil_parts = phone_string.split("-")
38 | result = ",".join([verbalize_digit(part, alt_one=True) for part in sil_parts])
39 | return result
40 |
41 |
42 | def replace_phone(match) -> str:
43 | """
44 | Args:
45 | match (re.Match)
46 | Returns:
47 | str
48 | """
49 | return phone2str(match.group(0), mobile=False)
50 |
51 |
52 | def replace_mobile(match) -> str:
53 | """
54 | Args:
55 | match (re.Match)
56 | Returns:
57 | str
58 | """
59 | return phone2str(match.group(0))
60 |
--------------------------------------------------------------------------------
/GPT_SoVITS/text/zh_normalization/quantifier.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from .num import num2str
17 |
18 | # 温度表达式,温度会影响负号的读法
19 | # -3°C 零下三度
20 | RE_TEMPERATURE = re.compile(r"(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)")
21 | measure_dict = {
22 | "cm2": "平方厘米",
23 | "cm²": "平方厘米",
24 | "cm3": "立方厘米",
25 | "cm³": "立方厘米",
26 | "cm": "厘米",
27 | "db": "分贝",
28 | "ds": "毫秒",
29 | "kg": "千克",
30 | "km": "千米",
31 | "m2": "平方米",
32 | "m²": "平方米",
33 | "m³": "立方米",
34 | "m3": "立方米",
35 | "ml": "毫升",
36 | "m": "米",
37 | "mm": "毫米",
38 | "s": "秒",
39 | }
40 |
41 |
42 | def replace_temperature(match) -> str:
43 | """
44 | Args:
45 | match (re.Match)
46 | Returns:
47 | str
48 | """
49 | sign = match.group(1)
50 | temperature = match.group(2)
51 | unit = match.group(3)
52 | sign: str = "零下" if sign else ""
53 | temperature: str = num2str(temperature)
54 | unit: str = "摄氏度" if unit == "摄氏度" else "度"
55 | result = f"{sign}{temperature}{unit}"
56 | return result
57 |
58 |
59 | def replace_measure(sentence) -> str:
60 | for q_notation in measure_dict:
61 | if q_notation in sentence:
62 | sentence = sentence.replace(q_notation, measure_dict[q_notation])
63 | return sentence
64 |
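A short sketch of the two helpers above:

    print(RE_TEMPERATURE.sub(replace_temperature, "今天的最低气温达到-10°C"))
    # -> 今天的最低气温达到零下十度
    print(replace_measure("体重60kg,身高175cm"))
    # -> 体重60千克,身高175厘米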
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 RVC-Boss
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 |
4 | import torch
5 |
6 | # 推理用的指定模型
7 | sovits_path = ""
8 | gpt_path = ""
9 | is_half_str = os.environ.get("is_half", "True")
10 | is_half = True if is_half_str.lower() == "true" else False
11 | is_share_str = os.environ.get("is_share", "False")
12 | is_share = True if is_share_str.lower() == "true" else False
13 |
14 | cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
15 | bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
16 | pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
17 | pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
18 |
19 | exp_root = "logs"
20 | python_exec = sys.executable or "python"
21 | if torch.cuda.is_available():
22 | infer_device = "cuda"
23 | else:
24 | infer_device = "cpu"
25 |
26 | webui_port_main = 9874
27 | webui_port_uvr5 = 9873
28 | webui_port_infer_tts = 9872
29 | webui_port_subfix = 9871
30 |
31 | api_port = 9880
32 |
33 | if infer_device == "cuda":
34 | gpu_name = torch.cuda.get_device_name(0)
35 | if (
36 | ("16" in gpu_name and "V100" not in gpu_name.upper())
37 | or "P40" in gpu_name.upper()
38 | or "P10" in gpu_name.upper()
39 | or "1060" in gpu_name
40 | or "1070" in gpu_name
41 | or "1080" in gpu_name
42 | ):
43 | is_half = False
44 |
45 | if infer_device == "cpu":
46 | is_half = False
47 |
48 |
49 | class Config:
50 | def __init__(self):
51 | self.sovits_path = sovits_path
52 | self.gpt_path = gpt_path
53 | self.is_half = is_half
54 |
55 | self.cnhubert_path = cnhubert_path
56 | self.bert_path = bert_path
57 | self.pretrained_sovits_path = pretrained_sovits_path
58 | self.pretrained_gpt_path = pretrained_gpt_path
59 |
60 | self.exp_root = exp_root
61 | self.python_exec = python_exec
62 | self.infer_device = infer_device
63 |
64 | self.webui_port_main = webui_port_main
65 | self.webui_port_uvr5 = webui_port_uvr5
66 | self.webui_port_infer_tts = webui_port_infer_tts
67 | self.webui_port_subfix = webui_port_subfix
68 |
69 | self.api_port = api_port
70 |
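Everything above is resolved once at import time, so the is_half / is_share environment variables have to be set before the module is imported. A minimal sketch:

    import os
    os.environ["is_half"] = "False"      # must be set before importing config
    from config import Config

    cfg = Config()
    print(cfg.infer_device, cfg.is_half, cfg.webui_port_infer_tts, cfg.api_port)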
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3.8"
2 |
3 | services:
4 | GPT-SoVITS-CU126:
5 | image: xxxxrt666/gpt-sovits:latest-cu126
6 | container_name: GPT-SoVITS-CU126
7 | ports:
8 | - "9871:9871"
9 | - "9872:9872"
10 | - "9873:9873"
11 | - "9874:9874"
12 | - "9880:9880"
13 | volumes:
14 | - .:/workspace/GPT-SoVITS
15 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
16 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
17 | - /dev/null:/workspace/GPT-SoVITS/tools/asr/models
18 | - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights
19 | environment:
20 | - is_half=true
21 | tty: true
22 | stdin_open: true
23 | shm_size: "16g"
24 | restart: unless-stopped
25 | runtime: nvidia
26 | GPT-SoVITS-CU126-Lite:
27 | image: xxxxrt666/gpt-sovits:latest-cu126-lite
28 | container_name: GPT-SoVITS-CU126-Lite
29 | ports:
30 | - "9871:9871"
31 | - "9872:9872"
32 | - "9873:9873"
33 | - "9874:9874"
34 | - "9880:9880"
35 | volumes:
36 | - .:/workspace/GPT-SoVITS
37 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
38 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
39 | - /dev/null:/workspace/GPT-SoVITS/tools/asr/models
40 | - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights
41 | - tools/asr/models:/workspace/models/asr_models
42 | - tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
43 | environment:
44 | - is_half=true
45 | tty: true
46 | stdin_open: true
47 | shm_size: "16g"
48 | restart: unless-stopped
49 | runtime: nvidia
50 | GPT-SoVITS-CU128:
51 | image: xxxxrt666/gpt-sovits:latest-cu128
52 | container_name: GPT-SoVITS-CU128
53 | ports:
54 | - "9871:9871"
55 | - "9872:9872"
56 | - "9873:9873"
57 | - "9874:9874"
58 | - "9880:9880"
59 | volumes:
60 | - .:/workspace/GPT-SoVITS
61 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
62 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
63 | - /dev/null:/workspace/GPT-SoVITS/tools/asr/models
64 | - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights
65 | environment:
66 | - is_half=true
67 | tty: true
68 | stdin_open: true
69 | shm_size: "16g"
70 | restart: unless-stopped
71 | runtime: nvidia
72 | GPT-SoVITS-CU128-Lite:
73 | image: xxxxrt666/gpt-sovits:latest-cu128-lite
74 | container_name: GPT-SoVITS-CU128-Lite
75 | ports:
76 | - "9871:9871"
77 | - "9872:9872"
78 | - "9873:9873"
79 | - "9874:9874"
80 | - "9880:9880"
81 | volumes:
82 | - .:/workspace/GPT-SoVITS
83 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
84 | - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
85 | - /dev/null:/workspace/GPT-SoVITS/tools/asr/models
86 | - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights
87 | - tools/asr/models:/workspace/models/asr_models
88 | - tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
89 | environment:
90 | - is_half=true
91 | tty: true
92 | stdin_open: true
93 | shm_size: "16g"
94 | restart: unless-stopped
95 | runtime: nvidia
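All four services publish the same host ports (9871-9874 and 9880), so only one of them can be up at a time; the /dev/null mounts mask those sub-paths of the bind mount so that the models baked into the image are used instead of the (typically empty) folders in the checkout. Start a single service, for example:

    docker compose up -d GPT-SoVITS-CU126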
--------------------------------------------------------------------------------
/docker_build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
4 |
5 | cd "$SCRIPT_DIR" || exit 1
6 |
7 | set -e
8 |
9 | if ! command -v docker &>/dev/null; then
10 | echo "Docker Not Found"
11 | exit 1
12 | fi
13 |
14 | trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR
15 |
16 | LITE=false
17 | CUDA_VERSION=12.6
18 |
19 | print_help() {
20 | echo "Usage: bash docker_build.sh [OPTIONS]"
21 | echo ""
22 | echo "Options:"
23 | echo " --cuda 12.6|12.8 Specify the CUDA VERSION (REQUIRED)"
24 | echo " --lite Build a Lite Image"
25 | echo " -h, --help Show this help message and exit"
26 | echo ""
27 | echo "Examples:"
28 | echo " bash docker_build.sh --cuda 12.6 --lite"
29 | }
30 |
31 | # Show help if no arguments provided
32 | if [[ $# -eq 0 ]]; then
33 | print_help
34 | exit 0
35 | fi
36 |
37 | # Parse arguments
38 | while [[ $# -gt 0 ]]; do
39 | case "$1" in
40 | --cuda)
41 | case "$2" in
42 | 12.6)
43 | CUDA_VERSION=12.6
44 | ;;
45 | 12.8)
46 | CUDA_VERSION=12.8
47 | ;;
48 | *)
49 | echo "Error: Invalid CUDA_VERSION: $2"
50 | echo "Choose From: [12.6, 12.8]"
51 | exit 1
52 | ;;
53 | esac
54 | shift 2
55 | ;;
56 | --lite)
57 | LITE=true
58 | shift
59 | ;;
60 | *)
61 | echo "Unknown Argument: $1"
62 | echo "Use -h or --help to see available options."
63 | exit 1
64 | ;;
65 | esac
66 | done
67 |
68 | TARGETPLATFORM=$(uname -m | grep -q 'x86' && echo "linux/amd64" || echo "linux/arm64")
69 |
70 | if [ $LITE = true ]; then
71 | TORCH_BASE="lite"
72 | else
73 | TORCH_BASE="full"
74 | fi
75 |
76 | docker build \
77 | --build-arg CUDA_VERSION=$CUDA_VERSION \
78 | --build-arg LITE=$LITE \
79 | --build-arg TARGETPLATFORM="$TARGETPLATFORM" \
80 | --build-arg TORCH_BASE=$TORCH_BASE \
81 | -t "${USER}/gpt-sovits:local" \
82 | .
83 |
--------------------------------------------------------------------------------
/extra-req.txt:
--------------------------------------------------------------------------------
1 | faster-whisper
2 |
--------------------------------------------------------------------------------
/go-webui.bat:
--------------------------------------------------------------------------------
1 | set "SCRIPT_DIR=%~dp0"
2 | set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
3 | cd /d "%SCRIPT_DIR%"
4 | set "PATH=%SCRIPT_DIR%\runtime;%PATH%"
5 | runtime\python.exe -I webui.py zh_CN
6 | pause
7 |
--------------------------------------------------------------------------------
/go-webui.ps1:
--------------------------------------------------------------------------------
1 | $ErrorActionPreference = "SilentlyContinue"
2 | chcp 65001
3 | Set-Location $PSScriptRoot
4 | $runtimePath = Join-Path $PSScriptRoot "runtime"
5 | $env:PATH = "$runtimePath;$env:PATH"
6 | & "$runtimePath\python.exe" -I "$PSScriptRoot\webui.py" zh_CN
7 | pause
8 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | --no-binary=opencc
2 | numpy<2.0
3 | scipy
4 | tensorboard
5 | librosa==0.10.2
6 | numba
7 | pytorch-lightning>=2.4
8 | gradio<5
9 | ffmpeg-python
10 | onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64"
11 | onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64"
12 | tqdm
13 | funasr==1.0.27
14 | cn2an
15 | pypinyin
16 | pyopenjtalk>=0.4.1
17 | g2p_en
18 | torchaudio
19 | modelscope==1.10.0
20 | sentencepiece
21 | transformers>=4.43,<=4.50
22 | peft
23 | chardet
24 | PyYAML
25 | psutil
26 | jieba_fast
27 | jieba
28 | split-lang
29 | fast_langdetect>=0.3.1
30 | wordsegment
31 | rotary_embedding_torch
32 | ToJyutping
33 | g2pk2
34 | ko_pron
35 | opencc
36 | python_mecab_ko; sys_platform != 'win32'
37 | fastapi[standard]>=0.115.2
38 | x_transformers
39 | torchmetrics<=1.5
40 | pydantic<=2.10.6
41 | ctranslate2>=4.0,<5
42 | huggingface_hub>=0.13
43 | tokenizers>=0.13,<1
44 | av>=11
45 | tqdm
46 |
--------------------------------------------------------------------------------
/tools/AP_BWE_main/24kto48k/readme.txt:
--------------------------------------------------------------------------------
1 | For the inference of the v3 model, if you find that the generated audio sounds somewhat muffled, you can try using this audio super-resolution model.
2 | 对于v3模型的推理,如果你发现生成的音频比较闷,可以尝试这个音频超分模型。
3 |
4 | Put g_24kto48k.zip and config.json in this folder.
5 | 把 g_24kto48k.zip 和 config.json 下载到这个文件夹。
6 |
7 | download link 下载链接:
8 | https://drive.google.com/drive/folders/1IIYTf2zbJWzelu4IftKD6ooHloJ8mnZF?usp=share_link
9 |
10 | audio sr project page 音频超分项目主页:
11 | https://github.com/yxlu-0102/AP-BWE
12 |
--------------------------------------------------------------------------------
/tools/AP_BWE_main/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Ye-Xin Lu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tools/AP_BWE_main/datasets1/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tools/AP_BWE_main/models/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/tools/__init__.py
--------------------------------------------------------------------------------
/tools/asr/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | def check_fw_local_models():
5 | """
6 | 启动时检查本地是否有 Faster Whisper 模型.
7 | """
8 | model_size_list = [
9 | "tiny",
10 | "tiny.en",
11 | "base",
12 | "base.en",
13 | "small",
14 | "small.en",
15 | "medium",
16 | "medium.en",
17 | "large",
18 | "large-v1",
19 | "large-v2",
20 | "large-v3",
21 | ]
22 | for i, size in enumerate(model_size_list):
23 | if os.path.exists(f"tools/asr/models/faster-whisper-{size}"):
24 | model_size_list[i] = size + "-local"
25 | return model_size_list
26 |
27 |
28 | asr_dict = {
29 | "达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]},
30 | "Faster Whisper (多语种)": {
31 | "lang": ["auto", "zh", "en", "ja", "ko", "yue"],
32 | "size": check_fw_local_models(),
33 | "path": "fasterwhisper_asr.py",
34 | "precision": ["float32", "float16", "int8"],
35 | },
36 | }
37 |
--------------------------------------------------------------------------------
/tools/asr/models/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
--------------------------------------------------------------------------------
/tools/audio_sr.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 | import sys
3 | import os
4 |
5 | AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main")
6 | sys.path.append(AP_BWE_main_dir_path)
7 | import json
8 | import torch
9 | import torchaudio.functional as aF
10 | # from attrdict import AttrDict####will be bug in py3.10
11 |
12 | from datasets1.dataset import amp_pha_stft, amp_pha_istft
13 | from models.model import APNet_BWE_Model
14 |
15 |
16 | class AP_BWE:
17 | def __init__(self, device, DictToAttrRecursive, checkpoint_file=None):
18 | if checkpoint_file == None:
19 | checkpoint_file = "%s/24kto48k/g_24kto48k.zip" % (AP_BWE_main_dir_path)
20 | if os.path.exists(checkpoint_file) == False:
21 | raise FileNotFoundError
22 | config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json")
23 | with open(config_file) as f:
24 | data = f.read()
25 | json_config = json.loads(data)
26 | # h = AttrDict(json_config)
27 | h = DictToAttrRecursive(json_config)
28 | model = APNet_BWE_Model(h).to(device)
29 | state_dict = torch.load(checkpoint_file, map_location="cpu", weights_only=False)
30 | model.load_state_dict(state_dict["generator"])
31 | model.eval()
32 | self.device = device
33 | self.model = model
34 | self.h = h
35 |
36 | def to(self, *arg, **kwargs):
37 | self.model.to(*arg, **kwargs)
38 | self.device = self.model.conv_pre_mag.weight.device
39 | return self
40 |
41 | def __call__(self, audio, orig_sampling_rate):
42 | with torch.no_grad():
43 | # audio, orig_sampling_rate = torchaudio.load(inp_path)
44 | # audio = audio.to(self.device)
45 | audio = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.h.hr_sampling_rate)
46 | amp_nb, pha_nb, com_nb = amp_pha_stft(audio, self.h.n_fft, self.h.hop_size, self.h.win_size)
47 | amp_wb_g, pha_wb_g, com_wb_g = self.model(amp_nb, pha_nb)
48 | audio_hr_g = amp_pha_istft(amp_wb_g, pha_wb_g, self.h.n_fft, self.h.hop_size, self.h.win_size)
49 | # sf.write(opt_path, audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate, 'PCM_16')
50 | return audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate
51 |
--------------------------------------------------------------------------------
/tools/cmd-denoise.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import traceback
4 |
5 | from modelscope.pipelines import pipeline
6 | from modelscope.utils.constant import Tasks
7 | from tqdm import tqdm
8 |
9 | path_denoise = "tools/denoise-model/speech_frcrn_ans_cirm_16k"
10 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
11 | ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise)
12 |
13 |
14 | def execute_denoise(input_folder, output_folder):
15 | os.makedirs(output_folder, exist_ok=True)
16 | # print(input_folder)
17 | # print(list(os.listdir(input_folder).sort()))
18 | for name in tqdm(os.listdir(input_folder)):
19 | try:
20 | ans("%s/%s" % (input_folder, name), output_path="%s/%s" % (output_folder, name))
21 | except:
22 | traceback.print_exc()
23 |
24 |
25 | if __name__ == "__main__":
26 | parser = argparse.ArgumentParser()
27 | parser.add_argument(
28 | "-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files."
29 | )
30 | parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.")
31 | parser.add_argument(
32 | "-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32"
33 | ) # 还没接入
34 | cmd = parser.parse_args()
35 | execute_denoise(
36 | input_folder=cmd.input_folder,
37 | output_folder=cmd.output_folder,
38 | )
39 |
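An example invocation (folder names are placeholders; note that --precision is parsed but, per the comment above, not wired up yet):

    python tools/cmd-denoise.py -i input_wavs -o denoised_wavs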
--------------------------------------------------------------------------------
/tools/denoise-model/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/tools/i18n/i18n.py:
--------------------------------------------------------------------------------
1 | import json
2 | import locale
3 | import os
4 |
5 | I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale")
6 |
7 |
8 | def load_language_list(language):
9 | with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f:
10 | language_list = json.load(f)
11 | return language_list
12 |
13 |
14 | def scan_language_list():
15 | language_list = []
16 | for name in os.listdir(I18N_JSON_DIR):
17 | if name.endswith(".json"):
18 | language_list.append(name.split(".")[0])
19 | return language_list
20 |
21 |
22 | class I18nAuto:
23 | def __init__(self, language=None):
24 | if language in ["Auto", None]:
25 | language = locale.getdefaultlocale()[0]
26 | # getlocale can't identify the system's language ((None, None))
27 | if not os.path.exists(os.path.join(I18N_JSON_DIR, f"{language}.json")):
28 | language = "en_US"
29 | self.language = language
30 | self.language_map = load_language_list(language)
31 |
32 | def __call__(self, key):
33 | return self.language_map.get(key, key)
34 |
35 | def __repr__(self):
36 | return "Use Language: " + self.language
37 |
38 |
39 | if __name__ == "__main__":
40 | i18n = I18nAuto(language="en_US")
41 | print(i18n)
42 |
--------------------------------------------------------------------------------
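I18nAuto falls back to en_US whenever the detected locale has no JSON file under tools/i18n/locale, and unknown keys are passed through untranslated. A small usage sketch (the key string is illustrative):

from tools.i18n.i18n import I18nAuto, scan_language_list

print(scan_language_list())     # languages that have a <lang>.json file in tools/i18n/locale
i18n = I18nAuto()               # auto-detects the system locale, falls back to en_US
print(i18n("Some UI label"))    # returns the translation, or the key itself if unknown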
/tools/slice_audio.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import numpy as np
4 | import traceback
5 | from scipy.io import wavfile
6 |
7 | # parent_directory = os.path.dirname(os.path.abspath(__file__))
8 | # sys.path.append(parent_directory)
9 | from tools.my_utils import load_audio
10 | from slicer2 import Slicer
11 |
12 |
13 | def slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part):
14 | os.makedirs(opt_root, exist_ok=True)
15 | if os.path.isfile(inp):
16 | input = [inp]
17 | elif os.path.isdir(inp):
18 | input = [os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))]
19 | else:
20 | return "Input path exists but is neither a file nor a directory"
21 | slicer = Slicer(
22 | sr=32000,  # sampling rate the long audio is loaded at
23 | threshold=int(threshold),  # frames quieter than this value are treated as silence and become candidate cut points
24 | min_length=int(min_length),  # minimum length of each segment; a too-short segment keeps being merged with the following ones until it exceeds this value
25 | min_interval=int(min_interval),  # minimum interval between cuts
26 | hop_size=int(hop_size),  # hop size for the volume curve; smaller gives finer resolution but more computation (finer is not necessarily better)
27 | max_sil_kept=int(max_sil_kept),  # maximum silence length kept around each cut
28 | )
29 | _max = float(_max)
30 | alpha = float(alpha)
31 | for inp_path in input[int(i_part) :: int(all_part)]:
32 | # print(inp_path)
33 | try:
34 | name = os.path.basename(inp_path)
35 | audio = load_audio(inp_path, 32000)
36 | # print(audio.shape)
37 | for chunk, start, end in slicer.slice(audio):  # start and end are frame indices
38 | tmp_max = np.abs(chunk).max()
39 | if tmp_max > 1:
40 | chunk /= tmp_max
41 | chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk
42 | wavfile.write(
43 | "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end),
44 | 32000,
45 | # chunk.astype(np.float32),
46 | (chunk * 32767).astype(np.int16),
47 | )
48 | except Exception:
49 | print(inp_path, "->fail->", traceback.format_exc())
50 | return "Slicing finished; please check the output files"
51 |
52 |
53 | print(slice(*sys.argv[1:]))
54 |
--------------------------------------------------------------------------------
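slice() is driven purely by positional sys.argv values, and the module calls it at import time, so it is meant to be launched as a script (typically once per shard via i_part/all_part). An illustrative single-shard invocation; the numeric values are common defaults rather than anything taken from this file:

# args: inp opt_root threshold min_length min_interval hop_size max_sil_kept _max alpha i_part all_part
python tools/slice_audio.py input.wav output/slicer_opt -34 4000 300 10 500 0.9 0.25 0 1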
/tools/uvr5/bs_roformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RVC-Boss/GPT-SoVITS/968952fd2ab3db8ef228a79072f5456fe20bda52/tools/uvr5/bs_roformer/__init__.py
--------------------------------------------------------------------------------
/tools/uvr5/bs_roformer/attend.py:
--------------------------------------------------------------------------------
1 | from packaging import version
2 | import torch
3 | from torch import nn, einsum
4 | import torch.nn.functional as F
5 |
6 |
7 | def exists(val):
8 | return val is not None
9 |
10 |
11 | def default(v, d):
12 | return v if exists(v) else d
13 |
14 |
15 | class Attend(nn.Module):
16 | def __init__(self, dropout=0.0, flash=False, scale=None):
17 | super().__init__()
18 | self.scale = scale
19 | self.dropout = dropout
20 | self.attn_dropout = nn.Dropout(dropout)
21 |
22 | self.flash = flash
23 | assert not (flash and version.parse(torch.__version__) < version.parse("2.0.0")), (
24 | "in order to use flash attention, you must be using pytorch 2.0 or above"
25 | )
26 |
27 | def flash_attn(self, q, k, v):
28 | # _, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device
29 |
30 | if exists(self.scale):
31 | default_scale = q.shape[-1] ** -0.5
32 | q = q * (self.scale / default_scale)
33 |
34 | # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale
35 | # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True):
36 | return F.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout if self.training else 0.0)
37 |
38 | def forward(self, q, k, v):
39 | """
40 | einstein notation
41 | b - batch
42 | h - heads
43 | n, i, j - sequence length (base sequence length, source, target)
44 | d - feature dimension
45 | """
46 |
47 | # q_len, k_len, device = q.shape[-2], k.shape[-2], q.device
48 |
49 | scale = default(self.scale, q.shape[-1] ** -0.5)
50 |
51 | if self.flash:
52 | return self.flash_attn(q, k, v)
53 |
54 | # similarity
55 |
56 | sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale
57 |
58 | # attention
59 |
60 | attn = sim.softmax(dim=-1)
61 | attn = self.attn_dropout(attn)
62 |
63 | # aggregate values
64 |
65 | out = einsum("b h i j, b h j d -> b h i d", attn, v)
66 |
67 | return out
68 |
--------------------------------------------------------------------------------
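A quick shape check for the Attend module above; the assert in __init__ only allows flash=True on PyTorch 2.0+, where forward dispatches to F.scaled_dot_product_attention:

import torch
from tools.uvr5.bs_roformer.attend import Attend

attend = Attend(dropout=0.0, flash=False)
q = torch.randn(2, 8, 100, 64)      # (batch, heads, query_len, dim)
k = torch.randn(2, 8, 120, 64)      # key/value length may differ from the query length
v = torch.randn(2, 8, 120, 64)
out = attend(q, k, v)               # -> (2, 8, 100, 64)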
/tools/uvr5/lib/lib_v5/layers.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
67 | super(Decoder, self).__init__()
68 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
69 | self.dropout = nn.Dropout2d(0.1) if dropout else None
70 |
71 | def __call__(self, x, skip=None):
72 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
73 | if skip is not None:
74 | skip = spec_utils.crop_center(skip, x)
75 | x = torch.cat([x, skip], dim=1)
76 | h = self.conv(x)
77 |
78 | if self.dropout is not None:
79 | h = self.dropout(h)
80 |
81 | return h
82 |
83 |
84 | class ASPPModule(nn.Module):
85 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
86 | super(ASPPModule, self).__init__()
87 | self.conv1 = nn.Sequential(
88 | nn.AdaptiveAvgPool2d((1, None)),
89 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
90 | )
91 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
92 | self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
93 | self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
94 | self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
95 | self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))
96 |
97 | def forward(self, x):
98 | _, _, h, w = x.size()
99 | feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
100 | feat2 = self.conv2(x)
101 | feat3 = self.conv3(x)
102 | feat4 = self.conv4(x)
103 | feat5 = self.conv5(x)
104 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
105 | bottle = self.bottleneck(out)
106 | return bottle
107 |
--------------------------------------------------------------------------------
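Conv2DBNActiv, Encoder, Decoder and ASPPModule above are the generic U-Net building blocks shared by the UVR5 spectrogram models. A minimal shape walk-through with arbitrary channel counts:

import torch
from tools.uvr5.lib.lib_v5 import layers

x = torch.randn(1, 2, 512, 128)                # (N, C, freq_bins, frames)
enc = layers.Encoder(2, 32, ksize=3, stride=2, pad=1)
h, skip = enc(x)                               # h: (1, 32, 256, 64), skip: (1, 32, 512, 128)
aspp = layers.ASPPModule(32, 64)               # multi-dilation context block, keeps the spatial size
ctx = aspp(h)                                  # (1, 64, 256, 64)
dec = layers.Decoder(64 + 32, 32)              # nin = context channels + skip channels after the concat
out = dec(ctx, skip)                           # upsample x2, crop-and-concat the skip, conv -> (1, 32, 512, 128)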
/tools/uvr5/lib/lib_v5/layers_123812KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
67 | super(Decoder, self).__init__()
68 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
69 | self.dropout = nn.Dropout2d(0.1) if dropout else None
70 |
71 | def __call__(self, x, skip=None):
72 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
73 | if skip is not None:
74 | skip = spec_utils.crop_center(skip, x)
75 | x = torch.cat([x, skip], dim=1)
76 | h = self.conv(x)
77 |
78 | if self.dropout is not None:
79 | h = self.dropout(h)
80 |
81 | return h
82 |
83 |
84 | class ASPPModule(nn.Module):
85 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
86 | super(ASPPModule, self).__init__()
87 | self.conv1 = nn.Sequential(
88 | nn.AdaptiveAvgPool2d((1, None)),
89 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
90 | )
91 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
92 | self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
93 | self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
94 | self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
95 | self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))
96 |
97 | def forward(self, x):
98 | _, _, h, w = x.size()
99 | feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
100 | feat2 = self.conv2(x)
101 | feat3 = self.conv3(x)
102 | feat4 = self.conv4(x)
103 | feat5 = self.conv5(x)
104 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
105 | bottle = self.bottleneck(out)
106 | return bottle
107 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/layers_123821KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
67 | super(Decoder, self).__init__()
68 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
69 | self.dropout = nn.Dropout2d(0.1) if dropout else None
70 |
71 | def __call__(self, x, skip=None):
72 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
73 | if skip is not None:
74 | skip = spec_utils.crop_center(skip, x)
75 | x = torch.cat([x, skip], dim=1)
76 | h = self.conv(x)
77 |
78 | if self.dropout is not None:
79 | h = self.dropout(h)
80 |
81 | return h
82 |
83 |
84 | class ASPPModule(nn.Module):
85 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
86 | super(ASPPModule, self).__init__()
87 | self.conv1 = nn.Sequential(
88 | nn.AdaptiveAvgPool2d((1, None)),
89 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
90 | )
91 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
92 | self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
93 | self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
94 | self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
95 | self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))
96 |
97 | def forward(self, x):
98 | _, _, h, w = x.size()
99 | feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
100 | feat2 = self.conv2(x)
101 | feat3 = self.conv3(x)
102 | feat4 = self.conv4(x)
103 | feat5 = self.conv5(x)
104 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
105 | bottle = self.bottleneck(out)
106 | return bottle
107 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/layers_33966KB.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class SeperableConv2DBNActiv(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31 | super(SeperableConv2DBNActiv, self).__init__()
32 | self.conv = nn.Sequential(
33 | nn.Conv2d(
34 | nin,
35 | nin,
36 | kernel_size=ksize,
37 | stride=stride,
38 | padding=pad,
39 | dilation=dilation,
40 | groups=nin,
41 | bias=False,
42 | ),
43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44 | nn.BatchNorm2d(nout),
45 | activ(),
46 | )
47 |
48 | def __call__(self, x):
49 | return self.conv(x)
50 |
51 |
52 | class Encoder(nn.Module):
53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54 | super(Encoder, self).__init__()
55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57 |
58 | def __call__(self, x):
59 | skip = self.conv1(x)
60 | h = self.conv2(skip)
61 |
62 | return h, skip
63 |
64 |
65 | class Decoder(nn.Module):
66 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
67 | super(Decoder, self).__init__()
68 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
69 | self.dropout = nn.Dropout2d(0.1) if dropout else None
70 |
71 | def __call__(self, x, skip=None):
72 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
73 | if skip is not None:
74 | skip = spec_utils.crop_center(skip, x)
75 | x = torch.cat([x, skip], dim=1)
76 | h = self.conv(x)
77 |
78 | if self.dropout is not None:
79 | h = self.dropout(h)
80 |
81 | return h
82 |
83 |
84 | class ASPPModule(nn.Module):
85 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
86 | super(ASPPModule, self).__init__()
87 | self.conv1 = nn.Sequential(
88 | nn.AdaptiveAvgPool2d((1, None)),
89 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
90 | )
91 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
92 | self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
93 | self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
94 | self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
95 | self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
96 | self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
97 | self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))
98 |
99 | def forward(self, x):
100 | _, _, h, w = x.size()
101 | feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
102 | feat2 = self.conv2(x)
103 | feat3 = self.conv3(x)
104 | feat4 = self.conv4(x)
105 | feat5 = self.conv5(x)
106 | feat6 = self.conv6(x)
107 | feat7 = self.conv7(x)
108 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
109 | bottle = self.bottleneck(out)
110 | return bottle
111 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/layers_new.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 | from . import spec_utils
6 |
7 |
8 | class Conv2DBNActiv(nn.Module):
9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10 | super(Conv2DBNActiv, self).__init__()
11 | self.conv = nn.Sequential(
12 | nn.Conv2d(
13 | nin,
14 | nout,
15 | kernel_size=ksize,
16 | stride=stride,
17 | padding=pad,
18 | dilation=dilation,
19 | bias=False,
20 | ),
21 | nn.BatchNorm2d(nout),
22 | activ(),
23 | )
24 |
25 | def __call__(self, x):
26 | return self.conv(x)
27 |
28 |
29 | class Encoder(nn.Module):
30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
31 | super(Encoder, self).__init__()
32 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
33 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
34 |
35 | def __call__(self, x):
36 | h = self.conv1(x)
37 | h = self.conv2(h)
38 |
39 | return h
40 |
41 |
42 | class Decoder(nn.Module):
43 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
44 | super(Decoder, self).__init__()
45 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
46 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
47 | self.dropout = nn.Dropout2d(0.1) if dropout else None
48 |
49 | def __call__(self, x, skip=None):
50 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
51 |
52 | if skip is not None:
53 | skip = spec_utils.crop_center(skip, x)
54 | x = torch.cat([x, skip], dim=1)
55 |
56 | h = self.conv1(x)
57 | # h = self.conv2(h)
58 |
59 | if self.dropout is not None:
60 | h = self.dropout(h)
61 |
62 | return h
63 |
64 |
65 | class ASPPModule(nn.Module):
66 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
67 | super(ASPPModule, self).__init__()
68 | self.conv1 = nn.Sequential(
69 | nn.AdaptiveAvgPool2d((1, None)),
70 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
71 | )
72 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
73 | self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ)
74 | self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ)
75 | self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ)
76 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
77 | self.dropout = nn.Dropout2d(0.1) if dropout else None
78 |
79 | def forward(self, x):
80 | _, _, h, w = x.size()
81 | feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
82 | feat2 = self.conv2(x)
83 | feat3 = self.conv3(x)
84 | feat4 = self.conv4(x)
85 | feat5 = self.conv5(x)
86 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
87 | out = self.bottleneck(out)
88 |
89 | if self.dropout is not None:
90 | out = self.dropout(out)
91 |
92 | return out
93 |
94 |
95 | class LSTMModule(nn.Module):
96 | def __init__(self, nin_conv, nin_lstm, nout_lstm):
97 | super(LSTMModule, self).__init__()
98 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
99 | self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True)
100 | self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU())
101 |
102 | def forward(self, x):
103 | N, _, nbins, nframes = x.size()
104 | h = self.conv(x)[:, 0] # N, nbins, nframes
105 | h = h.permute(2, 0, 1) # nframes, N, nbins
106 | h, _ = self.lstm(h)
107 | h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins
108 | h = h.reshape(nframes, N, 1, nbins)
109 | h = h.permute(1, 2, 3, 0)
110 |
111 | return h
112 |
--------------------------------------------------------------------------------
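Compared with layers.py, layers_new.py drops the skip output from Encoder and adds an LSTMModule that models the frame axis with a bidirectional LSTM. A shape sketch for that branch:

import torch
from tools.uvr5.lib.lib_v5 import layers_new

x = torch.randn(4, 32, 128, 64)        # (N, C, nbins, nframes)
lstm = layers_new.LSTMModule(nin_conv=32, nin_lstm=128, nout_lstm=256)
out = lstm(x)                          # 1x1 conv -> BiLSTM over frames -> dense; returns (4, 1, 128, 64)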
/tools/uvr5/lib/lib_v5/model_param_init.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pathlib
3 |
4 | default_param = {}
5 | default_param["bins"] = 768
6 | default_param["unstable_bins"] = 9 # training only
7 | default_param["reduction_bins"] = 762 # training only
8 | default_param["sr"] = 44100
9 | default_param["pre_filter_start"] = 757
10 | default_param["pre_filter_stop"] = 768
11 | default_param["band"] = {}
12 |
13 |
14 | default_param["band"][1] = {
15 | "sr": 11025,
16 | "hl": 128,
17 | "n_fft": 960,
18 | "crop_start": 0,
19 | "crop_stop": 245,
20 | "lpf_start": 61, # inference only
21 | "res_type": "polyphase",
22 | }
23 |
24 | default_param["band"][2] = {
25 | "sr": 44100,
26 | "hl": 512,
27 | "n_fft": 1536,
28 | "crop_start": 24,
29 | "crop_stop": 547,
30 | "hpf_start": 81, # inference only
31 | "res_type": "sinc_best",
32 | }
33 |
34 |
35 | def int_keys(d):
36 | r = {}
37 | for k, v in d:
38 | if k.isdigit():
39 | k = int(k)
40 | r[k] = v
41 | return r
42 |
43 |
44 | class ModelParameters(object):
45 | def __init__(self, config_path=""):
46 | if ".pth" == pathlib.Path(config_path).suffix:
47 | import zipfile
48 |
49 | with zipfile.ZipFile(config_path, "r") as zip:
50 | self.param = json.loads(zip.read("param.json"), object_pairs_hook=int_keys)
51 | elif ".json" == pathlib.Path(config_path).suffix:
52 | with open(config_path, "r") as f:
53 | self.param = json.loads(f.read(), object_pairs_hook=int_keys)
54 | else:
55 | self.param = default_param
56 |
57 | for k in [
58 | "mid_side",
59 | "mid_side_b",
60 | "mid_side_b2",
61 | "stereo_w",
62 | "stereo_n",
63 | "reverse",
64 | ]:
65 | if k not in self.param:
66 | self.param[k] = False
67 |
--------------------------------------------------------------------------------
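ModelParameters loads one of the modelparams JSON files below (or reads param.json out of a .pth zip), converting the band keys to integers via int_keys and filling in any missing stereo/mid-side flags with False. For example:

from tools.uvr5.lib.lib_v5.model_param_init import ModelParameters

mp = ModelParameters("tools/uvr5/lib/lib_v5/modelparams/4band_v2.json")
print(mp.param["bins"])          # 672
print(sorted(mp.param["band"]))  # [1, 2, 3, 4]
print(mp.param["mid_side"])      # False (default for a flag the file does not set)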
/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 16000,
8 | "hl": 512,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 16000,
17 | "pre_filter_start": 1023,
18 | "pre_filter_stop": 1024
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 32000,
8 | "hl": 512,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "kaiser_fast"
14 | }
15 | },
16 | "sr": 32000,
17 | "pre_filter_start": 1000,
18 | "pre_filter_stop": 1021
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 33075,
8 | "hl": 384,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 33075,
17 | "pre_filter_start": 1000,
18 | "pre_filter_stop": 1021
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 44100,
8 | "hl": 1024,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 44100,
17 | "pre_filter_start": 1023,
18 | "pre_filter_stop": 1024
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 256,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 44100,
8 | "hl": 256,
9 | "n_fft": 512,
10 | "crop_start": 0,
11 | "crop_stop": 256,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 44100,
17 | "pre_filter_start": 256,
18 | "pre_filter_stop": 256
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 44100,
8 | "hl": 512,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 1024,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 44100,
17 | "pre_filter_start": 1023,
18 | "pre_filter_stop": 1024
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 1024,
3 | "unstable_bins": 0,
4 | "reduction_bins": 0,
5 | "band": {
6 | "1": {
7 | "sr": 44100,
8 | "hl": 512,
9 | "n_fft": 2048,
10 | "crop_start": 0,
11 | "crop_stop": 700,
12 | "hpf_start": -1,
13 | "res_type": "sinc_best"
14 | }
15 | },
16 | "sr": 44100,
17 | "pre_filter_start": 1023,
18 | "pre_filter_stop": 700
19 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 7,
4 | "reduction_bins": 705,
5 | "band": {
6 | "1": {
7 | "sr": 6000,
8 | "hl": 66,
9 | "n_fft": 512,
10 | "crop_start": 0,
11 | "crop_stop": 240,
12 | "lpf_start": 60,
13 | "lpf_stop": 118,
14 | "res_type": "sinc_fastest"
15 | },
16 | "2": {
17 | "sr": 32000,
18 | "hl": 352,
19 | "n_fft": 1024,
20 | "crop_start": 22,
21 | "crop_stop": 505,
22 | "hpf_start": 44,
23 | "hpf_stop": 23,
24 | "res_type": "sinc_medium"
25 | }
26 | },
27 | "sr": 32000,
28 | "pre_filter_start": 710,
29 | "pre_filter_stop": 731
30 | }
31 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 512,
3 | "unstable_bins": 7,
4 | "reduction_bins": 510,
5 | "band": {
6 | "1": {
7 | "sr": 11025,
8 | "hl": 160,
9 | "n_fft": 768,
10 | "crop_start": 0,
11 | "crop_stop": 192,
12 | "lpf_start": 41,
13 | "lpf_stop": 139,
14 | "res_type": "sinc_fastest"
15 | },
16 | "2": {
17 | "sr": 44100,
18 | "hl": 640,
19 | "n_fft": 1024,
20 | "crop_start": 10,
21 | "crop_stop": 320,
22 | "hpf_start": 47,
23 | "hpf_stop": 15,
24 | "res_type": "sinc_medium"
25 | }
26 | },
27 | "sr": 44100,
28 | "pre_filter_start": 510,
29 | "pre_filter_stop": 512
30 | }
31 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 7,
4 | "reduction_bins": 705,
5 | "band": {
6 | "1": {
7 | "sr": 6000,
8 | "hl": 66,
9 | "n_fft": 512,
10 | "crop_start": 0,
11 | "crop_stop": 240,
12 | "lpf_start": 60,
13 | "lpf_stop": 240,
14 | "res_type": "sinc_fastest"
15 | },
16 | "2": {
17 | "sr": 48000,
18 | "hl": 528,
19 | "n_fft": 1536,
20 | "crop_start": 22,
21 | "crop_stop": 505,
22 | "hpf_start": 82,
23 | "hpf_stop": 22,
24 | "res_type": "sinc_medium"
25 | }
26 | },
27 | "sr": 48000,
28 | "pre_filter_start": 710,
29 | "pre_filter_stop": 731
30 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 5,
4 | "reduction_bins": 733,
5 | "band": {
6 | "1": {
7 | "sr": 11025,
8 | "hl": 128,
9 | "n_fft": 768,
10 | "crop_start": 0,
11 | "crop_stop": 278,
12 | "lpf_start": 28,
13 | "lpf_stop": 140,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 22050,
18 | "hl": 256,
19 | "n_fft": 768,
20 | "crop_start": 14,
21 | "crop_stop": 322,
22 | "hpf_start": 70,
23 | "hpf_stop": 14,
24 | "lpf_start": 283,
25 | "lpf_stop": 314,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 44100,
30 | "hl": 512,
31 | "n_fft": 768,
32 | "crop_start": 131,
33 | "crop_stop": 313,
34 | "hpf_start": 154,
35 | "hpf_stop": 141,
36 | "res_type": "sinc_medium"
37 | }
38 | },
39 | "sr": 44100,
40 | "pre_filter_start": 757,
41 | "pre_filter_stop": 768
42 | }
43 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side": true,
3 | "bins": 768,
4 | "unstable_bins": 5,
5 | "reduction_bins": 733,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 768,
11 | "crop_start": 0,
12 | "crop_stop": 278,
13 | "lpf_start": 28,
14 | "lpf_stop": 140,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 22050,
19 | "hl": 256,
20 | "n_fft": 768,
21 | "crop_start": 14,
22 | "crop_stop": 322,
23 | "hpf_start": 70,
24 | "hpf_stop": 14,
25 | "lpf_start": 283,
26 | "lpf_stop": 314,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 44100,
31 | "hl": 512,
32 | "n_fft": 768,
33 | "crop_start": 131,
34 | "crop_stop": 313,
35 | "hpf_start": 154,
36 | "hpf_stop": 141,
37 | "res_type": "sinc_medium"
38 | }
39 | },
40 | "sr": 44100,
41 | "pre_filter_start": 757,
42 | "pre_filter_stop": 768
43 | }
44 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side_b2": true,
3 | "bins": 640,
4 | "unstable_bins": 7,
5 | "reduction_bins": 565,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 108,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 187,
13 | "lpf_start": 92,
14 | "lpf_stop": 186,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 22050,
19 | "hl": 216,
20 | "n_fft": 768,
21 | "crop_start": 0,
22 | "crop_stop": 212,
23 | "hpf_start": 68,
24 | "hpf_stop": 34,
25 | "lpf_start": 174,
26 | "lpf_stop": 209,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 44100,
31 | "hl": 432,
32 | "n_fft": 640,
33 | "crop_start": 66,
34 | "crop_stop": 307,
35 | "hpf_start": 86,
36 | "hpf_stop": 72,
37 | "res_type": "kaiser_fast"
38 | }
39 | },
40 | "sr": 44100,
41 | "pre_filter_start": 639,
42 | "pre_filter_stop": 640
43 | }
44 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 7,
4 | "reduction_bins": 668,
5 | "band": {
6 | "1": {
7 | "sr": 11025,
8 | "hl": 128,
9 | "n_fft": 1024,
10 | "crop_start": 0,
11 | "crop_stop": 186,
12 | "lpf_start": 37,
13 | "lpf_stop": 73,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 11025,
18 | "hl": 128,
19 | "n_fft": 512,
20 | "crop_start": 4,
21 | "crop_stop": 185,
22 | "hpf_start": 36,
23 | "hpf_stop": 18,
24 | "lpf_start": 93,
25 | "lpf_stop": 185,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 22050,
30 | "hl": 256,
31 | "n_fft": 512,
32 | "crop_start": 46,
33 | "crop_stop": 186,
34 | "hpf_start": 93,
35 | "hpf_stop": 46,
36 | "lpf_start": 164,
37 | "lpf_stop": 186,
38 | "res_type": "polyphase"
39 | },
40 | "4": {
41 | "sr": 44100,
42 | "hl": 512,
43 | "n_fft": 768,
44 | "crop_start": 121,
45 | "crop_stop": 382,
46 | "hpf_start": 138,
47 | "hpf_stop": 123,
48 | "res_type": "sinc_medium"
49 | }
50 | },
51 | "sr": 44100,
52 | "pre_filter_start": 740,
53 | "pre_filter_stop": 768
54 | }
55 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 768,
3 | "unstable_bins": 7,
4 | "mid_side": true,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
56 |
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side_b": true,
3 | "bins": 768,
4 | "unstable_bins": 7,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side_b": true,
3 | "bins": 768,
4 | "unstable_bins": 7,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json:
--------------------------------------------------------------------------------
1 | {
2 | "reverse": true,
3 | "bins": 768,
4 | "unstable_bins": 7,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json:
--------------------------------------------------------------------------------
1 | {
2 | "stereo_w": true,
3 | "bins": 768,
4 | "unstable_bins": 7,
5 | "reduction_bins": 668,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 128,
10 | "n_fft": 1024,
11 | "crop_start": 0,
12 | "crop_stop": 186,
13 | "lpf_start": 37,
14 | "lpf_stop": 73,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 11025,
19 | "hl": 128,
20 | "n_fft": 512,
21 | "crop_start": 4,
22 | "crop_stop": 185,
23 | "hpf_start": 36,
24 | "hpf_stop": 18,
25 | "lpf_start": 93,
26 | "lpf_stop": 185,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 22050,
31 | "hl": 256,
32 | "n_fft": 512,
33 | "crop_start": 46,
34 | "crop_stop": 186,
35 | "hpf_start": 93,
36 | "hpf_stop": 46,
37 | "lpf_start": 164,
38 | "lpf_stop": 186,
39 | "res_type": "polyphase"
40 | },
41 | "4": {
42 | "sr": 44100,
43 | "hl": 512,
44 | "n_fft": 768,
45 | "crop_start": 121,
46 | "crop_stop": 382,
47 | "hpf_start": 138,
48 | "hpf_stop": 123,
49 | "res_type": "sinc_medium"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 740,
54 | "pre_filter_stop": 768
55 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 672,
3 | "unstable_bins": 8,
4 | "reduction_bins": 637,
5 | "band": {
6 | "1": {
7 | "sr": 7350,
8 | "hl": 80,
9 | "n_fft": 640,
10 | "crop_start": 0,
11 | "crop_stop": 85,
12 | "lpf_start": 25,
13 | "lpf_stop": 53,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 7350,
18 | "hl": 80,
19 | "n_fft": 320,
20 | "crop_start": 4,
21 | "crop_stop": 87,
22 | "hpf_start": 25,
23 | "hpf_stop": 12,
24 | "lpf_start": 31,
25 | "lpf_stop": 62,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 14700,
30 | "hl": 160,
31 | "n_fft": 512,
32 | "crop_start": 17,
33 | "crop_stop": 216,
34 | "hpf_start": 48,
35 | "hpf_stop": 24,
36 | "lpf_start": 139,
37 | "lpf_stop": 210,
38 | "res_type": "polyphase"
39 | },
40 | "4": {
41 | "sr": 44100,
42 | "hl": 480,
43 | "n_fft": 960,
44 | "crop_start": 78,
45 | "crop_stop": 383,
46 | "hpf_start": 130,
47 | "hpf_stop": 86,
48 | "res_type": "kaiser_fast"
49 | }
50 | },
51 | "sr": 44100,
52 | "pre_filter_start": 668,
53 | "pre_filter_stop": 672
54 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 672,
3 | "unstable_bins": 8,
4 | "reduction_bins": 637,
5 | "band": {
6 | "1": {
7 | "sr": 7350,
8 | "hl": 80,
9 | "n_fft": 640,
10 | "crop_start": 0,
11 | "crop_stop": 85,
12 | "lpf_start": 25,
13 | "lpf_stop": 53,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 7350,
18 | "hl": 80,
19 | "n_fft": 320,
20 | "crop_start": 4,
21 | "crop_stop": 87,
22 | "hpf_start": 25,
23 | "hpf_stop": 12,
24 | "lpf_start": 31,
25 | "lpf_stop": 62,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 14700,
30 | "hl": 160,
31 | "n_fft": 512,
32 | "crop_start": 17,
33 | "crop_stop": 216,
34 | "hpf_start": 48,
35 | "hpf_stop": 24,
36 | "lpf_start": 139,
37 | "lpf_stop": 210,
38 | "res_type": "polyphase"
39 | },
40 | "4": {
41 | "sr": 44100,
42 | "hl": 480,
43 | "n_fft": 960,
44 | "crop_start": 78,
45 | "crop_stop": 383,
46 | "hpf_start": 130,
47 | "hpf_stop": 86,
48 | "convert_channels": "stereo_n",
49 | "res_type": "kaiser_fast"
50 | }
51 | },
52 | "sr": 44100,
53 | "pre_filter_start": 668,
54 | "pre_filter_stop": 672
55 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json:
--------------------------------------------------------------------------------
1 | {
2 | "bins": 672,
3 | "unstable_bins": 8,
4 | "reduction_bins": 530,
5 | "band": {
6 | "1": {
7 | "sr": 7350,
8 | "hl": 80,
9 | "n_fft": 640,
10 | "crop_start": 0,
11 | "crop_stop": 85,
12 | "lpf_start": 25,
13 | "lpf_stop": 53,
14 | "res_type": "polyphase"
15 | },
16 | "2": {
17 | "sr": 7350,
18 | "hl": 80,
19 | "n_fft": 320,
20 | "crop_start": 4,
21 | "crop_stop": 87,
22 | "hpf_start": 25,
23 | "hpf_stop": 12,
24 | "lpf_start": 31,
25 | "lpf_stop": 62,
26 | "res_type": "polyphase"
27 | },
28 | "3": {
29 | "sr": 14700,
30 | "hl": 160,
31 | "n_fft": 512,
32 | "crop_start": 17,
33 | "crop_stop": 216,
34 | "hpf_start": 48,
35 | "hpf_stop": 24,
36 | "lpf_start": 139,
37 | "lpf_stop": 210,
38 | "res_type": "polyphase"
39 | },
40 | "4": {
41 | "sr": 44100,
42 | "hl": 480,
43 | "n_fft": 960,
44 | "crop_start": 78,
45 | "crop_stop": 383,
46 | "hpf_start": 130,
47 | "hpf_stop": 86,
48 | "res_type": "kaiser_fast"
49 | }
50 | },
51 | "sr": 44100,
52 | "pre_filter_start": 668,
53 | "pre_filter_stop": 672
54 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/lib_v5/modelparams/ensemble.json:
--------------------------------------------------------------------------------
1 | {
2 | "mid_side_b2": true,
3 | "bins": 1280,
4 | "unstable_bins": 7,
5 | "reduction_bins": 565,
6 | "band": {
7 | "1": {
8 | "sr": 11025,
9 | "hl": 108,
10 | "n_fft": 2048,
11 | "crop_start": 0,
12 | "crop_stop": 374,
13 | "lpf_start": 92,
14 | "lpf_stop": 186,
15 | "res_type": "polyphase"
16 | },
17 | "2": {
18 | "sr": 22050,
19 | "hl": 216,
20 | "n_fft": 1536,
21 | "crop_start": 0,
22 | "crop_stop": 424,
23 | "hpf_start": 68,
24 | "hpf_stop": 34,
25 | "lpf_start": 348,
26 | "lpf_stop": 418,
27 | "res_type": "polyphase"
28 | },
29 | "3": {
30 | "sr": 44100,
31 | "hl": 432,
32 | "n_fft": 1280,
33 | "crop_start": 132,
34 | "crop_stop": 614,
35 | "hpf_start": 172,
36 | "hpf_stop": 144,
37 | "res_type": "polyphase"
38 | }
39 | },
40 | "sr": 44100,
41 | "pre_filter_start": 1280,
42 | "pre_filter_stop": 1280
43 | }
--------------------------------------------------------------------------------
/tools/uvr5/lib/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import numpy as np
4 | import torch
5 | from tqdm import tqdm
6 |
7 |
8 | def load_data(file_name: str = "./lib/name_params.json") -> dict:
9 | with open(file_name, "r") as f:
10 | data = json.load(f)
11 |
12 | return data
13 |
14 |
15 | def make_padding(width, cropsize, offset):
16 | left = offset
17 | roi_size = cropsize - left * 2
18 | if roi_size == 0:
19 | roi_size = cropsize
20 | right = roi_size - (width % roi_size) + left
21 |
22 | return left, right, roi_size
23 |
24 |
25 | def inference(X_spec, device, model, aggressiveness, data):
26 | """
27 | data : dic configs
28 | """
29 |
30 | def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True):
31 | model.eval()
32 | with torch.no_grad():
33 | preds = []
34 |
35 | iterations = [n_window]
36 |
37 | total_iterations = sum(iterations)
38 | for i in tqdm(range(n_window)):
39 | start = i * roi_size
40 | X_mag_window = X_mag_pad[None, :, :, start : start + data["window_size"]]
41 | X_mag_window = torch.from_numpy(X_mag_window)
42 | if is_half:
43 | X_mag_window = X_mag_window.half()
44 | X_mag_window = X_mag_window.to(device)
45 |
46 | pred = model.predict(X_mag_window, aggressiveness)
47 |
48 | pred = pred.detach().cpu().numpy()
49 | preds.append(pred[0])
50 |
51 | pred = np.concatenate(preds, axis=2)
52 | return pred
53 |
54 | def preprocess(X_spec):
55 | X_mag = np.abs(X_spec)
56 | X_phase = np.angle(X_spec)
57 |
58 | return X_mag, X_phase
59 |
60 | X_mag, X_phase = preprocess(X_spec)
61 |
62 | coef = X_mag.max()
63 | X_mag_pre = X_mag / coef
64 |
65 | n_frame = X_mag_pre.shape[2]
66 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
67 | n_window = int(np.ceil(n_frame / roi_size))
68 |
69 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
70 |
71 | if list(model.state_dict().values())[0].dtype == torch.float16:
72 | is_half = True
73 | else:
74 | is_half = False
75 | pred = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half)
76 | pred = pred[:, :, :n_frame]
77 |
78 | if data["tta"]:
79 | pad_l += roi_size // 2
80 | pad_r += roi_size // 2
81 | n_window += 1
82 |
83 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
84 |
85 | pred_tta = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half)
86 | pred_tta = pred_tta[:, :, roi_size // 2 :]
87 | pred_tta = pred_tta[:, :, :n_frame]
88 |
89 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
90 | else:
91 | return pred * coef, X_mag, np.exp(1.0j * X_phase)
92 |
93 |
94 | def _get_name_params(model_path, model_hash):
95 | data = load_data()
96 | flag = False
97 | ModelName = model_path
98 | for type in list(data):
99 | for model in list(data[type][0]):
100 | for i in range(len(data[type][0][model])):
101 | if str(data[type][0][model][i]["hash_name"]) == model_hash:
102 | flag = True
103 | elif str(data[type][0][model][i]["hash_name"]) in ModelName:
104 | flag = True
105 |
106 | if flag:
107 | model_params_auto = data[type][0][model][i]["model_params"]
108 | param_name_auto = data[type][0][model][i]["param_name"]
109 | if type == "equivalent":
110 | return param_name_auto, model_params_auto
111 | else:
112 | flag = False
113 | return param_name_auto, model_params_auto
114 |
--------------------------------------------------------------------------------
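make_padding computes the left/right context padding that inference() applies before cutting the magnitude spectrogram into windows of data["window_size"] frames, of which only the central roi_size frames are kept per step. For instance:

from tools.uvr5.lib.utils import make_padding

left, right, roi_size = make_padding(width=1000, cropsize=512, offset=128)
print(left, right, roi_size)   # 128 152 256 -> padded length 1280 = 5 * roi_size, covering ceil(1000 / 256) = 4 windows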
/tools/uvr5/uvr5_weights/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------