├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── config.py ├── docs └── README_zh.md ├── examples ├── dolly.wav ├── freddie.wav ├── hayami_saori.mp3 ├── kesidi.wav ├── kuidou_cn.wav ├── kuidou_en.wav ├── linbo_cn.wav ├── linbo_en.wav ├── maggie.wav ├── manman3_cn.wav ├── manman4_cn.wav ├── maychelle_cn.wav ├── maychelle_en.wav ├── weilun.wav └── xinbada_cn.wav ├── inference.sh ├── inference_parameters.json ├── inference_parameters.txt ├── install.sh ├── pretrained_models └── .gitignore ├── quick_start.sh ├── requirements.txt ├── run_training.sh ├── server ├── .ipynb_checkpoints │ └── webui-checkpoint.py ├── app.py ├── asr.py ├── example.json ├── modelhandler.py ├── train_webui.py └── webui.py ├── src ├── AR │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── bucket_sampler.py │ │ ├── data_module.py │ │ └── dataset.py │ ├── models │ │ ├── __init__.py │ │ ├── t2s_lightning_module.py │ │ ├── t2s_lightning_module_onnx.py │ │ ├── t2s_model.py │ │ ├── t2s_model_onnx.py │ │ └── utils.py │ ├── modules │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── activation_onnx.py │ │ ├── embedding.py │ │ ├── embedding_onnx.py │ │ ├── lr_schedulers.py │ │ ├── optim.py │ │ ├── patched_mha_with_cache.py │ │ ├── patched_mha_with_cache_onnx.py │ │ ├── scaling.py │ │ ├── transformer.py │ │ └── transformer_onnx.py │ ├── text_processing │ │ ├── __init__.py │ │ ├── phonemizer.py │ │ └── symbols.py │ └── utils │ │ ├── __init__.py │ │ ├── initialize.py │ │ └── io.py ├── configs │ ├── s1.yaml │ ├── s1big.yaml │ ├── s1big2.yaml │ ├── s1longer.yaml │ ├── s1mq.yaml │ ├── sovits.json │ └── train.yaml ├── feature_extractor │ ├── __init__.py │ ├── cnhubert.py │ └── whisper_enc.py ├── inference │ ├── .ipynb_checkpoints │ │ ├── infer_tool-checkpoint.py │ │ └── inference-checkpoint.py │ ├── __init__.py │ ├── infer_tool.py │ └── inference.py ├── module │ ├── __init__.py │ ├── attentions.py │ ├── attentions_onnx.py │ ├── commons.py │ ├── core_vq.py │ ├── data_utils.py │ ├── losses.py │ ├── mel_processing.py │ ├── models.py │ ├── models_onnx.py │ ├── modules.py │ ├── mrte_model.py │ ├── quantize.py │ └── transforms.py ├── preprocess │ ├── __init__.py │ ├── get_phonemes.py │ ├── get_semantic.py │ ├── get_ssl_features.py │ └── process.py ├── text │ ├── __init__.py │ ├── chinese.py │ ├── cleaner.py │ ├── cmudict-fast.rep │ ├── cmudict.rep │ ├── engdict-hot.rep │ ├── engdict_cache.pickle │ ├── english.py │ ├── japanese.py │ ├── namedict_cache.pickle │ ├── opencpop-strict.txt │ ├── symbols.py │ ├── tone_sandhi.py │ └── zh_normalization │ │ ├── README.md │ │ ├── __init__.py │ │ ├── char_convert.py │ │ ├── chronology.py │ │ ├── constants.py │ │ ├── num.py │ │ ├── phonecode.py │ │ ├── quantifier.py │ │ └── text_normlization.py ├── train │ ├── train_gpt.py │ └── train_sovits.py └── utils │ ├── __init__.py │ ├── config.py │ ├── cut.py │ ├── onnx_export.py │ ├── process_ckpt.py │ └── utils.py └── tools ├── asr ├── config.py ├── fasterwhisper_asr.py ├── funasr_asr.py └── models │ └── .gitignore ├── cmd-denoise.py ├── denoise-model └── .gitignore ├── my_utils.py ├── process_data.py ├── slice_audio.py ├── slicer2.py ├── subfix_webui.py └── uvr5 ├── lib ├── lib_v5 │ ├── dataset.py │ ├── layers.py │ ├── layers_123812KB.py │ ├── layers_123821KB.py │ ├── layers_33966KB.py │ ├── layers_537227KB.py │ ├── layers_537238KB.py │ ├── layers_new.py │ ├── model_param_init.py │ ├── modelparams │ │ ├── 1band_sr16000_hl512.json │ │ ├── 1band_sr32000_hl512.json │ │ ├── 1band_sr33075_hl384.json │ │ ├── 1band_sr44100_hl1024.json │ │ ├── 
1band_sr44100_hl256.json │ │ ├── 1band_sr44100_hl512.json │ │ ├── 1band_sr44100_hl512_cut.json │ │ ├── 2band_32000.json │ │ ├── 2band_44100_lofi.json │ │ ├── 2band_48000.json │ │ ├── 3band_44100.json │ │ ├── 3band_44100_mid.json │ │ ├── 3band_44100_msb2.json │ │ ├── 4band_44100.json │ │ ├── 4band_44100_mid.json │ │ ├── 4band_44100_msb.json │ │ ├── 4band_44100_msb2.json │ │ ├── 4band_44100_reverse.json │ │ ├── 4band_44100_sw.json │ │ ├── 4band_v2.json │ │ ├── 4band_v2_sn.json │ │ ├── 4band_v3.json │ │ └── ensemble.json │ ├── nets.py │ ├── nets_123812KB.py │ ├── nets_123821KB.py │ ├── nets_33966KB.py │ ├── nets_537227KB.py │ ├── nets_537238KB.py │ ├── nets_61968KB.py │ ├── nets_new.py │ └── spec_utils.py ├── name_params.json └── utils.py ├── mdxnet.py ├── vr.py └── webui.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | *.pyc 4 | env 5 | runtime 6 | .idea 7 | output 8 | logs 9 | reference 10 | GPT_weights 11 | SoVITS_weights 12 | TEMP 13 | *.ipynb 14 | input_audio/* 15 | output_audio/* 16 | .history 17 | app.log* 18 | Dockerfile 19 | .gitlab-ci.yml 20 | deploy.yaml 21 | Makefile 22 | .ipynb_checkpoints 23 | onnx/ 24 | data/ 25 | !data/check_audio.py 26 | example.xlsx 27 | 28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 使用Miniconda镜像作为基础镜像 2 | FROM continuumio/miniconda3 3 | 4 | # 安装依赖项 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | git \ 7 | sudo \ 8 | build-essential \ 9 | wget \ 10 | curl \ 11 | cmake \ 12 | vim \ 13 | tmux \ 14 | ffmpeg \ 15 | libglu1-mesa \ 16 | libxi-dev \ 17 | libxmu-dev \ 18 | libglu1-mesa-dev \ 19 | freeglut3-dev && \ 20 | apt-get clean && \ 21 | rm -rf /var/lib/apt/lists/* 22 | 23 | # # 下载CUDA工具包并安装 24 | # RUN wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run \ 25 | # && sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit > /dev/null \ 26 | # && rm cuda_11.8.0_520.61.05_linux.run 27 | 28 | # # 更新环境变量 29 | # ENV PATH=$PATH:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/nvidia/lib64 30 | # ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} 31 | # ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH 32 | 33 | # 创建并初始化conda环境 34 | RUN conda create -n aispeech python=3.10.13 && \ 35 | echo "conda activate aispeech" >> ~/.bashrc 36 | 37 | SHELL ["conda", "run", "-n", "aispeech", "/bin/bash", "-c"] 38 | 39 | WORKDIR /app 40 | 41 | # 安装依赖 42 | 43 | COPY install.sh /app/install.sh 44 | COPY requirements.txt /app/requirements.txt 45 | RUN chmod +x /app/install.sh && bash /app/install.sh -y 46 | 47 | # 将当前目录内容复制到容器的/app下 48 | COPY . 
/app 49 | 50 | # 在Dockerfile中设置环境变量 51 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility 52 | 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 RVC-Boss 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | import torch 4 | 5 | # 推理用的指定模型 6 | sovits_path = "" 7 | gpt_path = "" 8 | is_half_str = os.environ.get("is_half", "True") 9 | is_half = True if is_half_str.lower() == 'true' else False 10 | is_share_str = os.environ.get("is_share","False") 11 | is_share= True if is_share_str.lower() == 'true' else False 12 | 13 | cnhubert_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base" 14 | bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" 15 | pretrained_sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth" 16 | pretrained_gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" 17 | 18 | exp_root = "logs" 19 | python_exec = sys.executable or "python" 20 | if torch.cuda.is_available(): 21 | infer_device = "cuda" 22 | else: 23 | infer_device = "cpu" 24 | 25 | webui_port_main = 9874 26 | webui_port_uvr5 = 9873 27 | webui_port_infer_tts = 9872 28 | webui_port_subfix = 9871 29 | 30 | api_port = 9880 31 | 32 | if infer_device == "cuda": 33 | gpu_name = torch.cuda.get_device_name(0) 34 | if ( 35 | ("16" in gpu_name and "V100" not in gpu_name.upper()) 36 | or "P40" in gpu_name.upper() 37 | or "P10" in gpu_name.upper() 38 | or "1060" in gpu_name 39 | or "1070" in gpu_name 40 | or "1080" in gpu_name 41 | ): 42 | is_half=False 43 | 44 | if(infer_device=="cpu"):is_half=False 45 | 46 | class Config: 47 | def __init__(self): 48 | self.sovits_path = sovits_path 49 | self.gpt_path = gpt_path 50 | self.is_half = is_half 51 | 52 | self.cnhubert_path = cnhubert_path 53 | self.bert_path = bert_path 54 | self.pretrained_sovits_path = pretrained_sovits_path 55 | self.pretrained_gpt_path = pretrained_gpt_path 56 | 57 | self.exp_root = exp_root 58 | self.python_exec = python_exec 59 | self.infer_device = infer_device 60 | 61 | self.webui_port_main = webui_port_main 62 | self.webui_port_uvr5 = webui_port_uvr5 63 | self.webui_port_infer_tts = 
webui_port_infer_tts 64 | self.webui_port_subfix = webui_port_subfix 65 | 66 | self.api_port = api_port 67 | -------------------------------------------------------------------------------- /examples/dolly.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/dolly.wav -------------------------------------------------------------------------------- /examples/freddie.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/freddie.wav -------------------------------------------------------------------------------- /examples/hayami_saori.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/hayami_saori.mp3 -------------------------------------------------------------------------------- /examples/kesidi.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/kesidi.wav -------------------------------------------------------------------------------- /examples/kuidou_cn.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/kuidou_cn.wav -------------------------------------------------------------------------------- /examples/kuidou_en.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/kuidou_en.wav -------------------------------------------------------------------------------- /examples/linbo_cn.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/linbo_cn.wav -------------------------------------------------------------------------------- /examples/linbo_en.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/linbo_en.wav -------------------------------------------------------------------------------- /examples/maggie.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/maggie.wav -------------------------------------------------------------------------------- /examples/manman3_cn.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/manman3_cn.wav -------------------------------------------------------------------------------- /examples/manman4_cn.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/manman4_cn.wav -------------------------------------------------------------------------------- /examples/maychelle_cn.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/maychelle_cn.wav -------------------------------------------------------------------------------- /examples/maychelle_en.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/maychelle_en.wav -------------------------------------------------------------------------------- /examples/weilun.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/weilun.wav -------------------------------------------------------------------------------- /examples/xinbada_cn.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/examples/xinbada_cn.wav -------------------------------------------------------------------------------- /inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python src/inference/inference.py \ 4 | --sovits_weights pretrained_models/sovits_weights/kuidou_cn/kuidou_cn_e30_s540.pth \ 5 | --gpt_weights pretrained_models/gpt_weights/kuidou_cn/kuidou_cn-e30.ckpt \ 6 | --parameters_file inference_parameters.json \ 7 | --output_folder output_audio/kuidou_cn \ 8 | -------------------------------------------------------------------------------- /inference_parameters.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "ref_wav_path": "kuidou_cn.wav", 4 | "prompt_text": "超级空投就在我附近,有人要来抢抢看么?", 5 | "prompt_language": "zh", 6 | "text": "那时候我才意识到我已经迷路了。我想等到红灯时停下自行车看导航,谁知一路都是绿灯,由此一路往前,我冲着众人鞠躬,这便是我迟到十年的原因。", 7 | "text_language": "zh", 8 | "how_to_cut": null 9 | }, 10 | { 11 | "ref_wav_path": "kuidou_cn.wav", 12 | "prompt_text": "超级空投就在我附近,有人要来抢抢看么?", 13 | "prompt_language": "zh", 14 | "text": "总的来说对于游戏思维的要求非常高。而且因为规则很多很复杂,所以也不会出现重复游玩带来的枯燥感,非常适合有事没事就拿出来想一想。有时候突然的灵光乍现就可以想到解决问题的办法。", 15 | "text_language": "zh", 16 | "how_to_cut": null 17 | } 18 | ] -------------------------------------------------------------------------------- /inference_parameters.txt: -------------------------------------------------------------------------------- 1 | 0001.wav|我从来不留情面。|zh|日月之力,引我前行|zh|None 2 | 0002.wav|罪人的哭喊,必将响彻天际。|zh|日月并肩,战场由我主宰|zh|None 3 | 0003.wav|在安静的时候,我眼前浮现出暗裔和人类的回忆。|zh|遵循荣光的指引|zh|None 4 | 0015.wav|百般求饶也不会让我停手。|zh|追随暗影的轨迹|zh|None 5 | 0005.wav|飞升者,太阳给了你力量黑暗给了你归宿。|zh|以平衡之名,出击!|zh|None 6 | 0006.wav|没有仁慈,没有宽恕,人类和怪物毫无区别,都会倒下。|zh|光影之间,真战始于此刻|zh|None 7 | 0007.wav|剑士不该质疑自己的方向。|zh|你的光芒耗尽了!|zh|None 8 | 0017.wav|同情也不过是一声呻吟。|zh|巨兽也无法阻挡光与影的审判!|zh|None 9 | 0018.wav|你希望我们团结,战斗会实现这个愿望吗?|zh|万物湮灭,不过吹灰之力。|zh|None 10 | 0010.wav|没有哪颗心能拴住我。|zh|光暗未定,此非终结。|zh|None 11 | 0011.wav|就连希望也会消亡。|zh|我听见星辰。。。在召唤我?|zh|None 12 | 0012.wav|你期盼已久的终点。|zh|日月轮回,永无止境,但充满希望。|zh|None 13 | 0013.wav|世界埋葬了我,我也必将回报。|zh|光暗交织,现世审判!|zh|None 14 | 0014.wav|享受短暂的喘息吧,艾欧尼亚的孩子,很快就会结束的。|zh|命运相伴,我们终将再会。|zh|None -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | conda install -c conda-forge gcc 3 | conda install -c conda-forge gxx 4 | conda install 
ffmpeg cmake 5 | conda install pytorch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pytorch-cuda=12.1 -c pytorch -c nvidia 6 | pip install -r requirements.txt 7 | -------------------------------------------------------------------------------- /pretrained_models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /quick_start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get the name argument passed in 4 | NAME=$1 5 | 6 | # # Preprocess 7 | python src/preprocess/process.py \ 8 | --data_dir data \ 9 | --log_dir logs \ 10 | --name $NAME \ 11 | 12 | # train sovits model 13 | python src/train/train_sovits.py \ 14 | -c src/configs/sovits.json \ 15 | -n $NAME \ 16 | -t sovits \ 17 | -e 30 \ 18 | -lr 0.4 \ 19 | -bs 32 \ 20 | -nw 0 \ 21 | --save_every_epoch 10 \ 22 | 23 | # train gpt model 24 | python src/train/train_gpt.py \ 25 | -c src/configs/s1longer.yaml \ 26 | -n $NAME \ 27 | -e 30 \ 28 | -bs 32 \ 29 | -nw 0 \ 30 | --save_every_epoch 10 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.26.0 2 | scipy==1.12.0 3 | tensorboard 4 | librosa 5 | pydub 6 | numba 7 | pytorch-lightning 8 | gradio==4.41.0 9 | gradio_client==1.3.0 10 | fastapi==0.112.1 11 | ffmpeg-python 12 | onnxruntime 13 | tqdm 14 | funasr 15 | cn2an 16 | pypinyin 17 | pyopenjtalk 18 | g2p_en 19 | torchaudio 20 | sentencepiece 21 | transformers 22 | chardet 23 | PyYAML 24 | psutil 25 | jieba_fast 26 | jieba 27 | LangSegment>=0.2.0 28 | Faster_Whisper 29 | einops 30 | pydantic 31 | wordsegment 32 | openpyxl -------------------------------------------------------------------------------- /run_training.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Predefined list of name arguments 4 | NAME_LIST=("maychelle_en" "maychelle_cn") 5 | 6 | # Loop over each name 7 | for NAME in "${NAME_LIST[@]}"; do 8 | # Run quick_start.sh with the name argument 9 | ./quick_start.sh $NAME 10 | done -------------------------------------------------------------------------------- /server/asr.py: -------------------------------------------------------------------------------- 1 | from agent_kernel import get_llm_client, ASRRequest 2 | from pydub import AudioSegment 3 | import os 4 | import pandas as pd 5 | import re 6 | import asyncio 7 | 8 | 9 | async def get_asr_result(audio_path: str) -> str: 10 | # Load the audio file 11 | audio = AudioSegment.from_wav(audio_path) 12 | 13 | # Check whether the audio is 16-bit and mono 14 | if audio.sample_width != 2 or audio.channels != 1: 15 | # If not, convert it to 16-bit mono and save it to a temporary file 16 | temp_audio_path = audio_path.replace(".wav", "_tmp.wav") 17 | audio = audio.set_sample_width(2).set_channels(1) # 2 bytes = 16 bits, 1 channel = mono 18 | audio.export(temp_audio_path, format="wav") 19 | else: 20 | temp_audio_path = audio_path 21 | 22 | # Call the agent kernel ASR API and get the transcription 23 | client = get_llm_client() 24 | r = ASRRequest.from_file(temp_audio_path) 25 | r.lang_code = "cmn-Hans-CN" 26 | res = await client.asr_async(r) # Async API 27 | transcript = res.response.alternatives[0].transcript # type: ignore 28 | 29 | # Remove the temporary file 30 | if temp_audio_path != audio_path: 31 | os.remove(temp_audio_path) 32 | 33 | return transcript 34 | 35 | 36 | def calculate_edit_distance(hypothesis, reference, lang="zh"): 37 | """ 38 | Compute the edit distance between hypothesis and reference (a simple TER-style score). 39 | """ 40 | # Strip punctuation 41 | hypothesis = re.sub(r'[^\w\s]', '', hypothesis) 42 | reference = re.sub(r'[^\w\s]', '', reference) 43 | 44 | # Split the strings into token lists (words for English, characters otherwise) 45 | if lang == "en": 46 | hyp_words = hypothesis.split(" ") 47 | ref_words = reference.split(" ") 48 | else: 49 | hyp_words = list(hypothesis) 50 | ref_words = list(reference) 51 | 52 | # Initialize the edit-distance matrix 53 | d = [[0] * (len(ref_words) + 1) for _ in range(len(hyp_words) + 1)] 54 | 55 | # Initialize the boundary conditions 56 | for i in range(len(hyp_words) + 1): 57 | d[i][0] = i 58 | for j in range(len(ref_words) + 1): 59 | d[0][j] = j 60 | 61 | # Fill the matrix with dynamic programming 62 | for i in range(1, len(hyp_words) + 1): 63 | for j in range(1, len(ref_words) + 1): 64 | if hyp_words[i - 1] == ref_words[j - 1]: 65 | d[i][j] = d[i - 1][j - 1] 66 | else: 67 | d[i][j] = min(d[i - 1][j] + 1, # deletion 68 | d[i][j - 1] + 1, # insertion 69 | d[i - 1][j - 1] + 1) # substitution 70 | 71 | # Final edit distance 72 | edit_distance = d[len(hyp_words)][len(ref_words)] 73 | return edit_distance 74 | 75 | 76 | def match_result(asr_result, lines): 77 | ## Score the match with the manually computed edit distance 78 | edit_distance = calculate_edit_distance(asr_result, lines) 79 | 80 | if edit_distance == 0: 81 | return 0 82 | elif edit_distance <= 2: 83 | return 1 84 | else: 85 | return 2 86 | 87 | 88 | async def process_audio(resource_name, line, audio_folder): 89 | audio_path = os.path.join(audio_folder, resource_name + '.wav') # fixed path concatenation 90 | if os.path.exists(audio_path): 91 | try: 92 | # Call the get_asr_result helper defined above 93 | asr_result = await get_asr_result(audio_path) 94 | except Exception as e: 95 | print(f"Error processing {resource_name}: {e}") 96 | asr_result = "" 97 | else: 98 | print(f"Audio file {resource_name} not found in {audio_folder}") 99 | asr_result = "" 100 | 101 | print(f"ASR Result for {resource_name}: {asr_result}") 102 | match_result_value = match_result(asr_result, line) 103 | return asr_result, match_result_value 104 | 105 | 106 | async def audio_line_check(excel_path, audio_folder): 107 | # Read the Excel file 108 | df = pd.read_excel(excel_path, header=0) 109 | 110 | # Get the script lines and resource names 111 | lines = df.iloc[:, 0] 112 | resource_names = df.iloc[:, 1] 113 | 114 | # Process the audio files concurrently with asyncio.gather 115 | tasks = [process_audio(resource_name, line, audio_folder) for resource_name, line in zip(resource_names, lines)] 116 | results = await asyncio.gather(*tasks) 117 | 118 | # Separate the ASR results from the match results 119 | asr_results, match_results = zip(*results) 120 | 121 | ## Rebuild the Excel sheet: keep the first two columns, add the ASR result as the AI识别文本 column and the match result as the 匹配度 column 122 | df['AI识别文本'] = asr_results 123 | df['匹配度'] = match_results 124 | 125 | # Save the new Excel file 126 | new_excel_path = excel_path.replace("需求表.xlsx", "审查结果.xlsx") 127 | df.to_excel(new_excel_path, index=False) 128 | print(f"Results saved to {new_excel_path}") 129 | --------------------------------------------------------------------------------
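Usage sketch (not a file in the repository): the audio_line_check pipeline in server/asr.py above can be driven as below, assuming the repository root is on sys.path, the workbook keeps the script line in column 0 and the wav basename in column 1, and the workbook name ends in 需求表.xlsx so the result path can be derived; both paths are hypothetical.

import asyncio

from server.asr import audio_line_check

# Hypothetical inputs: an Excel sheet whose name ends in 需求表.xlsx and a folder of
# synthesized wavs named <resource_name>.wav (matching column 1 of the sheet).
asyncio.run(audio_line_check("data/kuidou_cn需求表.xlsx", "output_audio/kuidou_cn"))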
/server/modelhandler.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | import os 3 | import sys 4 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 5 | import json 6 | 7 | 8 | class ModelHandler: 9 | def __init__(self, models_folder, models_info_file='models_info.json'): 10 | self.models_folder = models_folder 11 | self.models_info_file = models_info_file 12 | self.models_info = {} 13 | self.load_models_info() 14 | 15 | 16 | def load_models_info(self): 17 | 
"""加载模型信息JSON文件""" 18 | self.update_models_info() 19 | with open(os.path.join(self.models_folder, self.models_info_file), 'r') as file: 20 | self.models_info = json.load(file) 21 | 22 | 23 | def update_models_info(self): 24 | """更新模型信息,并写入JSON文件""" 25 | for character in os.listdir(self.models_folder): 26 | character_path = os.path.join(self.models_folder, character) 27 | if os.path.isdir(character_path): 28 | self.models_info[character] = {} 29 | for model_file in os.listdir(character_path): 30 | model_path = os.path.join(character_path, model_file) 31 | if os.path.isfile(model_path): 32 | self.models_info[character][model_file] = model_path 33 | 34 | with open(os.path.join(self.models_folder, self.models_info_file), 'w') as file: 35 | json.dump(self.models_info, file, indent=4) 36 | 37 | -------------------------------------------------------------------------------- /src/AR/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/src/AR/__init__.py -------------------------------------------------------------------------------- /src/AR/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/src/AR/data/__init__.py -------------------------------------------------------------------------------- /src/AR/data/data_module.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/data_module.py 2 | from pytorch_lightning import LightningDataModule 3 | from AR.data.bucket_sampler import DistributedBucketSampler 4 | from AR.data.dataset import Text2SemanticDataset 5 | from torch.utils.data import DataLoader 6 | 7 | 8 | class Text2SemanticDataModule(LightningDataModule): 9 | def __init__( 10 | self, 11 | config, 12 | train_semantic_path, 13 | train_phoneme_path, 14 | dev_semantic_path=None, 15 | dev_phoneme_path=None, 16 | ): 17 | super().__init__() 18 | self.config = config 19 | self.train_semantic_path = train_semantic_path 20 | self.train_phoneme_path = train_phoneme_path 21 | self.dev_semantic_path = dev_semantic_path 22 | self.dev_phoneme_path = dev_phoneme_path 23 | self.num_workers = self.config["data"]["num_workers"] 24 | 25 | def prepare_data(self): 26 | pass 27 | 28 | def setup(self, stage=None, output_logs=False): 29 | self._train_dataset = Text2SemanticDataset( 30 | phoneme_path=self.train_phoneme_path, 31 | semantic_path=self.train_semantic_path, 32 | max_sec=self.config["data"]["max_sec"], 33 | pad_val=self.config["data"]["pad_val"], 34 | ) 35 | self._dev_dataset = self._train_dataset 36 | # self._dev_dataset = Text2SemanticDataset( 37 | # phoneme_path=self.dev_phoneme_path, 38 | # semantic_path=self.dev_semantic_path, 39 | # max_sample=self.config['data']['max_eval_sample'], 40 | # max_sec=self.config['data']['max_sec'], 41 | # pad_val=self.config['data']['pad_val']) 42 | 43 | def train_dataloader(self): 44 | batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"] 45 | batch_size = max(min(batch_size,len(self._train_dataset)//4),1)#防止不保存 46 | sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) 47 | return DataLoader( 48 | self._train_dataset, 49 | batch_size=batch_size, 50 | sampler=sampler, 51 | 
collate_fn=self._train_dataset.collate, 52 | num_workers=self.num_workers, 53 | persistent_workers=True if self.num_workers>0 else False, 54 | prefetch_factor=16 if self.num_workers>0 else None, 55 | ) 56 | 57 | def val_dataloader(self): 58 | return DataLoader( 59 | self._dev_dataset, 60 | batch_size=1, 61 | shuffle=False, 62 | collate_fn=self._train_dataset.collate, 63 | num_workers=max(self.num_workers, 12), 64 | persistent_workers=True, 65 | prefetch_factor=16, 66 | ) 67 | 68 | # 这个会使用到嘛? 69 | def test_dataloader(self): 70 | return DataLoader( 71 | self._dev_dataset, 72 | batch_size=1, 73 | shuffle=False, 74 | collate_fn=self._train_dataset.collate, 75 | ) 76 | -------------------------------------------------------------------------------- /src/AR/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/src/AR/models/__init__.py -------------------------------------------------------------------------------- /src/AR/models/t2s_lightning_module_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_lightning_module.py 2 | import os, sys 3 | 4 | now_dir = os.getcwd() 5 | sys.path.append(now_dir) 6 | from typing import Dict 7 | 8 | import torch 9 | from pytorch_lightning import LightningModule 10 | from AR.models.t2s_model_onnx import Text2SemanticDecoder 11 | from AR.modules.lr_schedulers import WarmupCosineLRSchedule 12 | from AR.modules.optim import ScaledAdam 13 | 14 | 15 | class Text2SemanticLightningModule(LightningModule): 16 | def __init__(self, config, output_dir, is_train=True): 17 | super().__init__() 18 | self.config = config 19 | self.top_k = 3 20 | self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) 21 | pretrained_s1 = config.get("pretrained_s1") 22 | if pretrained_s1 and is_train: 23 | # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) 24 | print( 25 | self.load_state_dict( 26 | torch.load(pretrained_s1, map_location="cpu")["weight"] 27 | ) 28 | ) 29 | if is_train: 30 | self.automatic_optimization = False 31 | self.save_hyperparameters() 32 | self.eval_dir = output_dir / "eval" 33 | self.eval_dir.mkdir(parents=True, exist_ok=True) 34 | 35 | def training_step(self, batch: Dict, batch_idx: int): 36 | opt = self.optimizers() 37 | scheduler = self.lr_schedulers() 38 | loss, acc = self.model.forward( 39 | batch["phoneme_ids"], 40 | batch["phoneme_ids_len"], 41 | batch["semantic_ids"], 42 | batch["semantic_ids_len"], 43 | batch["bert_feature"], 44 | ) 45 | self.manual_backward(loss) 46 | if batch_idx > 0 and batch_idx % 4 == 0: 47 | opt.step() 48 | opt.zero_grad() 49 | scheduler.step() 50 | 51 | self.log( 52 | "total_loss", 53 | loss, 54 | on_step=True, 55 | on_epoch=True, 56 | prog_bar=True, 57 | sync_dist=True, 58 | ) 59 | self.log( 60 | "lr", 61 | scheduler.get_last_lr()[0], 62 | on_epoch=True, 63 | prog_bar=True, 64 | sync_dist=True, 65 | ) 66 | self.log( 67 | f"top_{self.top_k}_acc", 68 | acc, 69 | on_step=True, 70 | on_epoch=True, 71 | prog_bar=True, 72 | sync_dist=True, 73 | ) 74 | 75 | def validation_step(self, batch: Dict, batch_idx: int): 76 | return 77 | 78 | def configure_optimizers(self): 79 | model_parameters = self.model.parameters() 80 | parameters_names = [] 81 | parameters_names.append( 82 | [name_param_pair[0] for name_param_pair in 
self.model.named_parameters()] 83 | ) 84 | lm_opt = ScaledAdam( 85 | model_parameters, 86 | lr=0.01, 87 | betas=(0.9, 0.95), 88 | clipping_scale=2.0, 89 | parameters_names=parameters_names, 90 | show_dominant_parameters=False, 91 | clipping_update_period=1000, 92 | ) 93 | 94 | return { 95 | "optimizer": lm_opt, 96 | "lr_scheduler": { 97 | "scheduler": WarmupCosineLRSchedule( 98 | lm_opt, 99 | init_lr=self.config["optimizer"]["lr_init"], 100 | peak_lr=self.config["optimizer"]["lr"], 101 | end_lr=self.config["optimizer"]["lr_end"], 102 | warmup_steps=self.config["optimizer"]["warmup_steps"], 103 | total_steps=self.config["optimizer"]["decay_steps"], 104 | ) 105 | }, 106 | } 107 | -------------------------------------------------------------------------------- /src/AR/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/src/AR/modules/__init__.py -------------------------------------------------------------------------------- /src/AR/modules/embedding.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | 50 | self.reverse = False 51 | self.pe = None 52 | self.extend_pe(torch.tensor(0.0).expand(1, 4000)) 53 | 54 | def extend_pe(self, x): 55 | """Reset the positional encodings.""" 56 | if self.pe is not None: 57 | if self.pe.size(1) >= x.size(1): 58 | if self.pe.dtype != x.dtype or self.pe.device != x.device: 59 | self.pe = self.pe.to(dtype=x.dtype, device=x.device) 60 | return 61 | pe = torch.zeros(x.size(1), self.embedding_dim) 62 | if self.reverse: 63 | position = torch.arange( 64 | x.size(1) - 1, -1, -1.0, dtype=torch.float32 65 | ).unsqueeze(1) 66 | else: 67 | position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) 68 | div_term = torch.exp( 69 | torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) 70 | * -(math.log(10000.0) / self.embedding_dim) 71 | ) 72 | pe[:, 0::2] = torch.sin(position * div_term) 73 | pe[:, 1::2] = torch.cos(position * div_term) 74 | pe = pe.unsqueeze(0) 75 | self.pe = pe.to(device=x.device, dtype=x.dtype).detach() 76 | 77 | def forward(self, x: torch.Tensor) -> 
torch.Tensor: 78 | self.extend_pe(x) 79 | output = x.unsqueeze(-1) if x.ndim == 2 else x 80 | output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] 81 | return self.dropout(output) 82 | -------------------------------------------------------------------------------- /src/AR/modules/embedding_onnx.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py 2 | import math 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class TokenEmbedding(nn.Module): 9 | def __init__( 10 | self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | dropout: float = 0.0, 14 | ): 15 | super().__init__() 16 | 17 | self.vocab_size = vocab_size 18 | self.embedding_dim = embedding_dim 19 | 20 | self.dropout = torch.nn.Dropout(p=dropout) 21 | self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) 22 | 23 | @property 24 | def weight(self) -> torch.Tensor: 25 | return self.word_embeddings.weight 26 | 27 | def embedding(self, index: int) -> torch.Tensor: 28 | return self.word_embeddings.weight[index : index + 1] 29 | 30 | def forward(self, x: torch.Tensor): 31 | x = self.word_embeddings(x) 32 | x = self.dropout(x) 33 | return x 34 | 35 | 36 | class SinePositionalEmbedding(nn.Module): 37 | def __init__( 38 | self, 39 | embedding_dim: int, 40 | dropout: float = 0.0, 41 | scale: bool = False, 42 | alpha: bool = False, 43 | ): 44 | super().__init__() 45 | self.embedding_dim = embedding_dim 46 | self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 47 | self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) 48 | self.dropout = torch.nn.Dropout(p=dropout) 49 | self.reverse = False 50 | self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) 51 | 52 | def extend_pe(self, x): 53 | position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1) 54 | scpe = (position * self.div_term).unsqueeze(0) 55 | pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) 56 | pe = pe.contiguous().view(1, -1, self.embedding_dim) 57 | return pe 58 | 59 | def forward(self, x: torch.Tensor) -> torch.Tensor: 60 | pe = self.extend_pe(x) 61 | output = x.unsqueeze(-1) if x.ndim == 2 else x 62 | output = output * self.x_scale + self.alpha * pe 63 | return self.dropout(output) 64 | -------------------------------------------------------------------------------- /src/AR/modules/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/lr_schedulers.py 2 | import math 3 | 4 | import torch 5 | from matplotlib import pyplot as plt 6 | from torch import nn 7 | from torch.optim import Adam 8 | 9 | 10 | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): 11 | """ 12 | Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. 
13 | """ 14 | 15 | def __init__( 16 | self, 17 | optimizer, 18 | init_lr, 19 | peak_lr, 20 | end_lr, 21 | warmup_steps=10000, 22 | total_steps=400000, 23 | current_step=0, 24 | ): 25 | self.init_lr = init_lr 26 | self.peak_lr = peak_lr 27 | self.end_lr = end_lr 28 | self.optimizer = optimizer 29 | self._warmup_rate = (peak_lr - init_lr) / warmup_steps 30 | self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) 31 | self._current_step = current_step 32 | self.lr = init_lr 33 | self.warmup_steps = warmup_steps 34 | self.total_steps = total_steps 35 | self._last_lr = [self.lr] 36 | 37 | def set_lr(self, lr): 38 | self._last_lr = [g["lr"] for g in self.optimizer.param_groups] 39 | for g in self.optimizer.param_groups: 40 | # g['lr'] = lr 41 | g["lr"] = self.end_lr ###锁定用线性 42 | 43 | def step(self): 44 | if self._current_step < self.warmup_steps: 45 | lr = self.init_lr + self._warmup_rate * self._current_step 46 | 47 | elif self._current_step > self.total_steps: 48 | lr = self.end_lr 49 | 50 | else: 51 | decay_ratio = (self._current_step - self.warmup_steps) / ( 52 | self.total_steps - self.warmup_steps 53 | ) 54 | if decay_ratio < 0.0 or decay_ratio > 1.0: 55 | raise RuntimeError( 56 | "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings." 57 | ) 58 | coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) 59 | lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) 60 | 61 | self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! 62 | self.set_lr(lr) 63 | self.lr = lr 64 | self._current_step += 1 65 | return self.lr 66 | 67 | 68 | if __name__ == "__main__": 69 | m = nn.Linear(10, 10) 70 | opt = Adam(m.parameters(), lr=1e-4) 71 | s = WarmupCosineLRSchedule( 72 | opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0 73 | ) 74 | lrs = [] 75 | for i in range(25000): 76 | s.step() 77 | lrs.append(s.lr) 78 | print(s.lr) 79 | 80 | plt.plot(lrs) 81 | plt.plot(range(0, 25000), lrs) 82 | plt.show() 83 | -------------------------------------------------------------------------------- /src/AR/modules/patched_mha_with_cache_onnx.py: -------------------------------------------------------------------------------- 1 | from torch.nn.functional import * 2 | from torch.nn.functional import ( 3 | _mha_shape_check, 4 | _canonical_mask, 5 | _none_or_dtype, 6 | _in_projection_packed, 7 | ) 8 | 9 | def multi_head_attention_forward_patched( 10 | query, 11 | key, 12 | value, 13 | embed_dim_to_check: int, 14 | num_heads: int, 15 | in_proj_weight, 16 | in_proj_bias: Optional[Tensor], 17 | bias_k: Optional[Tensor], 18 | bias_v: Optional[Tensor], 19 | add_zero_attn: bool, 20 | dropout_p: float, 21 | out_proj_weight: Tensor, 22 | out_proj_bias: Optional[Tensor], 23 | training: bool = True, 24 | key_padding_mask: Optional[Tensor] = None, 25 | need_weights: bool = True, 26 | attn_mask: Optional[Tensor] = None, 27 | use_separate_proj_weight: bool = False, 28 | q_proj_weight: Optional[Tensor] = None, 29 | k_proj_weight: Optional[Tensor] = None, 30 | v_proj_weight: Optional[Tensor] = None, 31 | static_k: Optional[Tensor] = None, 32 | static_v: Optional[Tensor] = None, 33 | average_attn_weights: bool = True, 34 | is_causal: bool = False, 35 | cache=None, 36 | ) -> Tuple[Tensor, Optional[Tensor]]: 37 | 38 | # set up shape vars 39 | _, _, embed_dim = query.shape 40 | attn_mask = _canonical_mask( 41 | mask=attn_mask, 42 | mask_name="attn_mask", 43 | other_type=None, 44 | other_name="", 45 | target_type=query.dtype, 46 | check_other=False, 47 | ) 48 | head_dim = embed_dim // num_heads 49 | 
50 | proj_qkv = linear(query, in_proj_weight, in_proj_bias) 51 | proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() 52 | q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] 53 | 54 | if cache["first_infer"] == 1: 55 | cache["k"][cache["stage"]] = k 56 | cache["v"][cache["stage"]] = v 57 | else: 58 | cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) 59 | cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) 60 | k = cache["k"][cache["stage"]] 61 | v = cache["v"][cache["stage"]] 62 | cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] 63 | 64 | attn_mask = _canonical_mask( 65 | mask=attn_mask, 66 | mask_name="attn_mask", 67 | other_type=None, 68 | other_name="", 69 | target_type=q.dtype, 70 | check_other=False, 71 | ) 72 | attn_mask = attn_mask.unsqueeze(0) 73 | 74 | q = q.view(-1, num_heads, head_dim).transpose(0, 1) 75 | k = k.view(-1, num_heads, head_dim).transpose(0, 1) 76 | v = v.view(-1, num_heads, head_dim).transpose(0, 1) 77 | 78 | dropout_p = 0.0 79 | attn_mask = attn_mask.unsqueeze(0) 80 | q = q.view(num_heads, -1, head_dim).unsqueeze(0) 81 | k = k.view(num_heads, -1, head_dim).unsqueeze(0) 82 | v = v.view(num_heads, -1, head_dim).unsqueeze(0) 83 | attn_output = scaled_dot_product_attention( 84 | q, k, v, attn_mask, dropout_p, is_causal 85 | ) 86 | attn_output = ( 87 | attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) 88 | ) 89 | attn_output = linear(attn_output, out_proj_weight, out_proj_bias) 90 | attn_output = attn_output.view(-1, 1, attn_output.size(1)) 91 | 92 | return attn_output 93 | -------------------------------------------------------------------------------- /src/AR/text_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/src/AR/text_processing/__init__.py -------------------------------------------------------------------------------- /src/AR/text_processing/phonemizer.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/text_processing/phonemizer.py 2 | import itertools 3 | import re 4 | from typing import Dict 5 | from typing import List 6 | 7 | import regex 8 | from gruut import sentences 9 | from gruut.const import Sentence 10 | from gruut.const import Word 11 | from AR.text_processing.symbols import SYMBOL_TO_ID 12 | 13 | 14 | class GruutPhonemizer: 15 | def __init__(self, language: str): 16 | self._phonemizer = sentences 17 | self.lang = language 18 | self.symbol_to_id = SYMBOL_TO_ID 19 | self._special_cases_dict: Dict[str] = { 20 | r"\.\.\.": "... ", 21 | ";": "; ", 22 | ":": ": ", 23 | ",": ", ", 24 | r"\.": ". ", 25 | "!": "! ", 26 | r"\?": "? 
", 27 | "—": "—", 28 | "…": "… ", 29 | "«": "«", 30 | "»": "»", 31 | } 32 | self._punctuation_regexp: str = ( 33 | rf"([{''.join(self._special_cases_dict.keys())}])" 34 | ) 35 | 36 | def _normalize_punctuation(self, text: str) -> str: 37 | text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) 38 | text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) 39 | text = regex.sub(r"\pZ+", r" ", text) 40 | return text.strip() 41 | 42 | def _convert_punctuation(self, word: Word) -> str: 43 | if not word.phonemes: 44 | return "" 45 | if word.phonemes[0] in ["‖", "|"]: 46 | return word.text.strip() 47 | 48 | phonemes = "".join(word.phonemes) 49 | # remove modifier characters ˈˌː with regex 50 | phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) 51 | return phonemes.strip() 52 | 53 | def phonemize(self, text: str, espeak: bool = False) -> str: 54 | text_to_phonemize: str = self._normalize_punctuation(text) 55 | sents: List[Sentence] = [ 56 | sent 57 | for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) 58 | ] 59 | words: List[str] = [ 60 | self._convert_punctuation(word) for word in itertools.chain(*sents) 61 | ] 62 | return " ".join(words) 63 | 64 | def transform(self, phonemes): 65 | # convert phonemes to ids 66 | # dictionary is in symbols.py 67 | return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] 68 | 69 | 70 | if __name__ == "__main__": 71 | phonemizer = GruutPhonemizer("en-us") 72 | # text -> IPA 73 | phonemes = phonemizer.phonemize("Hello, wor-ld ?") 74 | print("phonemes:", phonemes) 75 | print("len(phonemes):", len(phonemes)) 76 | phoneme_ids = phonemizer.transform(phonemes) 77 | print("phoneme_ids:", phoneme_ids) 78 | print("len(phoneme_ids):", len(phoneme_ids)) 79 | -------------------------------------------------------------------------------- /src/AR/text_processing/symbols.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/text_processing/symbols.py 2 | PAD = "_" 3 | PUNCTUATION = ';:,.!?¡¿—…"«»“” ' 4 | LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 5 | IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 6 | SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) 7 | SPACE_ID = SYMBOLS.index(" ") 8 | SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} 9 | ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} 10 | -------------------------------------------------------------------------------- /src/AR/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def str2bool(str): 5 | return True if str.lower() == 'true' else False 6 | 7 | 8 | def get_newest_ckpt(string_list): 9 | # 定义一个正则表达式模式,用于匹配字符串中的数字 10 | pattern = r'epoch=(\d+)-step=(\d+)\.ckpt' 11 | 12 | # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 13 | extracted_info = [] 14 | for string in string_list: 15 | match = re.match(pattern, string) 16 | if match: 17 | epoch = int(match.group(1)) 18 | step = int(match.group(2)) 19 | extracted_info.append((epoch, step, string)) 20 | # 按照 epoch 后面的数字和 step 后面的数字进行排序 21 | sorted_info = sorted( 22 | extracted_info, key=lambda x: (x[0], x[1]), reverse=True) 23 | # 获取最新的 ckpt 文件名 24 | newest_ckpt = sorted_info[0][2] 25 | return newest_ckpt 26 | 27 | 28 | # 文本存在且不为空时 return True 29 | def check_txt_file(file_path): 30 | try: 31 | with 
open(file_path, 'r') as file: 32 | text = file.readline().strip() 33 | assert text.strip() != '' 34 | return text 35 | except Exception: 36 | return False 37 | return False 38 | -------------------------------------------------------------------------------- /src/AR/utils/initialize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Initialize modules for espnet2 neural networks.""" 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | 7 | def initialize(model: torch.nn.Module, init: str): 8 | """Initialize weights of a neural network module. 9 | 10 | Parameters are initialized using the given method or distribution. 11 | 12 | Custom initialization routines can be implemented into submodules 13 | as function `espnet_initialization_fn` within the custom module. 14 | 15 | Args: 16 | model: Target. 17 | init: Method of initialization. 18 | """ 19 | assert check_argument_types() 20 | print("init with", init) 21 | 22 | # weight init 23 | for p in model.parameters(): 24 | if p.dim() > 1: 25 | if init == "xavier_uniform": 26 | torch.nn.init.xavier_uniform_(p.data) 27 | elif init == "xavier_normal": 28 | torch.nn.init.xavier_normal_(p.data) 29 | elif init == "kaiming_uniform": 30 | torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") 31 | elif init == "kaiming_normal": 32 | torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") 33 | else: 34 | raise ValueError("Unknown initialization: " + init) 35 | # bias init 36 | for name, p in model.named_parameters(): 37 | if ".bias" in name and p.dim() == 1: 38 | p.data.zero_() 39 | -------------------------------------------------------------------------------- /src/AR/utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | import yaml 5 | 6 | 7 | def load_yaml_config(path): 8 | with open(path) as f: 9 | config = yaml.full_load(f) 10 | return config 11 | 12 | 13 | def save_config_to_yaml(config, path): 14 | assert path.endswith(".yaml") 15 | with open(path, "w") as f: 16 | f.write(yaml.dump(config)) 17 | f.close() 18 | 19 | 20 | def write_args(args, path): 21 | args_dict = dict( 22 | (name, getattr(args, name)) for name in dir(args) if not name.startswith("_") 23 | ) 24 | with open(path, "a") as args_file: 25 | args_file.write("==> torch version: {}\n".format(torch.__version__)) 26 | args_file.write( 27 | "==> cudnn version: {}\n".format(torch.backends.cudnn.version()) 28 | ) 29 | args_file.write("==> Cmd:\n") 30 | args_file.write(str(sys.argv)) 31 | args_file.write("\n==> args:\n") 32 | for k, v in sorted(args_dict.items()): 33 | args_file.write(" %s: %s\n" % (str(k), str(v))) 34 | args_file.close() 35 | -------------------------------------------------------------------------------- /src/configs/s1.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 12 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | 
top_k: 5 32 | -------------------------------------------------------------------------------- /src/configs/s1big.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 8 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 16 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /src/configs/s1big2.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 300 4 | batch_size: 12 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 1024 24 | hidden_dim: 1024 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 6 28 | dropout: 0 29 | EOS: 1024 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /src/configs/s1longer.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 20 4 | batch_size: 8 5 | save_every_n_epoch: 1 6 | precision: 16-mixed 7 | gradient_clip: 1.0 8 | optimizer: 9 | lr: 0.01 10 | lr_init: 0.00001 11 | lr_end: 0.0001 12 | warmup_steps: 2000 13 | decay_steps: 40000 14 | data: 15 | max_eval_sample: 8 16 | max_sec: 54 17 | num_workers: 4 18 | pad_val: 1024 # same with EOS in model 19 | model: 20 | vocab_size: 1025 21 | phoneme_vocab_size: 512 22 | embedding_dim: 512 23 | hidden_dim: 512 24 | head: 16 25 | linear_units: 2048 26 | n_layer: 24 27 | dropout: 0 28 | EOS: 1024 29 | random_bert: 0 30 | inference: 31 | top_k: 5 32 | -------------------------------------------------------------------------------- /src/configs/s1mq.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | seed: 1234 3 | epochs: 100 4 | batch_size: 6 5 | gradient_accumulation: 4 6 | save_every_n_epoch: 1 7 | precision: 32 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 40 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | saving_path: "ckpt/" 22 | resume_checkpoint: null 23 | vocoder_config_path: "quantizer/new_ckpt/config.json" 24 | vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000" 25 | datadir: "/home/liweiche/GigaSpeech/wavs" 26 | metapath: "/home/liweiche/GigaSpeech/train2.json" 27 | val_metapath: "/home/liweiche/GigaSpeech/dev2.json" 28 | sampledir: "logs/" 29 | pretrained_path: null 30 | lr: 0.0001 31 | batch_size: 200.0 32 | train_bucket_size: 8192 33 | training_step: 800000 34 | optim_flat_percent: 0.0 35 | 
warmup_step: 50 36 | adam_beta1: 0.9 37 | adam_beta2: 0.98 38 | ffd_size: 3072 39 | hidden_size: 768 40 | enc_nlayers: 6 41 | dec_nlayers: 6 42 | nheads: 12 43 | ar_layer: 4 44 | ar_ffd_size: 1024 45 | ar_hidden_size: 256 46 | ar_nheads: 4 47 | aligner_softmax_temp: 1.0 48 | layer_norm_eps: 0.00001 49 | speaker_embed_dropout: 0.05 50 | label_smoothing: 0.0 51 | val_check_interval: 5000 52 | check_val_every_n_epoch: 1 53 | precision: "fp16" 54 | nworkers: 16 55 | distributed: true 56 | accelerator: "ddp" 57 | version: null 58 | accumulate_grad_batches: 1 59 | use_repetition_token: true 60 | use_repetition_gating: false 61 | repetition_penalty: 1.0 62 | sampling_temperature: 1.0 63 | top_k: -1 64 | min_top_k: 3 65 | top_p: 0.8 66 | sample_num: 4 67 | length_penalty_max_length: 15000 68 | length_penalty_max_prob: 0.95 69 | max_input_length: 2048 70 | max_output_length: 2000 71 | sample_rate: 16000 72 | n_codes: 1024 73 | n_cluster_groups: 1 74 | phone_context_window: 4 75 | phoneset_size: 1000 76 | inference: 77 | top_k: 5 78 | -------------------------------------------------------------------------------- /src/configs/sovits.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 100, 4 | "eval_interval": 500, 5 | "seed": 1234, 6 | "epochs": 100, 7 | "learning_rate": 0.0001, 8 | "betas": [ 9 | 0.8, 10 | 0.99 11 | ], 12 | "eps": 1e-09, 13 | "batch_size": 32, 14 | "fp16_run": true, 15 | "lr_decay": 0.999875, 16 | "segment_size": 20480, 17 | "init_lr_ratio": 1, 18 | "warmup_epochs": 0, 19 | "c_mel": 45, 20 | "c_kl": 1.0, 21 | "text_low_lr_rate": 0.4 22 | }, 23 | "data": { 24 | "max_wav_value": 32768.0, 25 | "sampling_rate": 32000, 26 | "filter_length": 2048, 27 | "hop_length": 640, 28 | "win_length": 2048, 29 | "n_mel_channels": 128, 30 | "mel_fmin": 0.0, 31 | "mel_fmax": null, 32 | "add_blank": true, 33 | "n_speakers": 300, 34 | "cleaned_text": true 35 | }, 36 | "model": { 37 | "inter_channels": 192, 38 | "hidden_channels": 192, 39 | "filter_channels": 768, 40 | "n_heads": 2, 41 | "n_layers": 6, 42 | "kernel_size": 3, 43 | "p_dropout": 0.1, 44 | "resblock": "1", 45 | "resblock_kernel_sizes": [ 46 | 3, 47 | 7, 48 | 11 49 | ], 50 | "resblock_dilation_sizes": [ 51 | [ 52 | 1, 53 | 3, 54 | 5 55 | ], 56 | [ 57 | 1, 58 | 3, 59 | 5 60 | ], 61 | [ 62 | 1, 63 | 3, 64 | 5 65 | ] 66 | ], 67 | "upsample_rates": [ 68 | 10, 69 | 8, 70 | 2, 71 | 2, 72 | 2 73 | ], 74 | "upsample_initial_channel": 512, 75 | "upsample_kernel_sizes": [ 76 | 16, 77 | 16, 78 | 8, 79 | 2, 80 | 2 81 | ], 82 | "n_layers_q": 3, 83 | "use_spectral_norm": false, 84 | "gin_channels": 512, 85 | "semantic_frame_rate": "25hz", 86 | "freeze_quantizer": true 87 | }, 88 | "content_module": "cnhubert" 89 | } -------------------------------------------------------------------------------- /src/configs/train.yaml: -------------------------------------------------------------------------------- 1 | gpu: 2 | n_card: 1 3 | n_process_per_card: 2 4 | io: 5 | text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS 6 | save_every_n_epoch: 1 7 | precision: 16-mixed 8 | gradient_clip: 1.0 9 | optimizer: 10 | lr: 0.01 11 | lr_init: 0.00001 12 | lr_end: 0.0001 13 | warmup_steps: 2000 14 | decay_steps: 40000 15 | data: 16 | max_eval_sample: 8 17 | max_sec: 54 18 | num_workers: 1 19 | pad_val: 1024 # same with EOS in model 20 | model: 21 | vocab_size: 1025 22 | phoneme_vocab_size: 512 23 | embedding_dim: 512 24 | hidden_dim: 512 25 | head: 16 26 | linear_units: 2048 27 | n_layer: 24 28 | dropout: 0 
29 | EOS: 1024 30 | random_bert: 0 31 | inference: 32 | top_k: 5 33 | -------------------------------------------------------------------------------- /src/feature_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cnhubert, whisper_enc 2 | 3 | content_module_map = { 4 | 'cnhubert': cnhubert, 5 | 'whisper': whisper_enc 6 | } -------------------------------------------------------------------------------- /src/feature_extractor/cnhubert.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import librosa 4 | import torch 5 | import torch.nn.functional as F 6 | import soundfile as sf 7 | import logging 8 | 9 | # logging.getLogger("numba").setLevel(logging.INFO) 10 | 11 | from transformers import ( 12 | Wav2Vec2FeatureExtractor, 13 | HubertModel, 14 | ) 15 | 16 | import utils 17 | import torch.nn as nn 18 | 19 | 20 | class CNHubert(nn.Module): 21 | def __init__(self, cnhubert_base_path='pretrained_models/chinese-hubert-base'): 22 | super().__init__() 23 | self.model = HubertModel.from_pretrained(cnhubert_base_path) 24 | self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( 25 | cnhubert_base_path 26 | ) 27 | 28 | def forward(self, x): 29 | input_values = self.feature_extractor( 30 | x, return_tensors="pt", sampling_rate=16000 31 | ).input_values.to(x.device) 32 | feats = self.model(input_values)["last_hidden_state"] 33 | return feats 34 | 35 | 36 | # class CNHubertLarge(nn.Module): 37 | # def __init__(self): 38 | # super().__init__() 39 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 40 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") 41 | # def forward(self, x): 42 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 43 | # feats = self.model(input_values)["last_hidden_state"] 44 | # return feats 45 | # 46 | # class CVec(nn.Module): 47 | # def __init__(self): 48 | # super().__init__() 49 | # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 50 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") 51 | # def forward(self, x): 52 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 53 | # feats = self.model(input_values)["last_hidden_state"] 54 | # return feats 55 | # 56 | # class cnw2v2base(nn.Module): 57 | # def __init__(self): 58 | # super().__init__() 59 | # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 60 | # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") 61 | # def forward(self, x): 62 | # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) 63 | # feats = self.model(input_values)["last_hidden_state"] 64 | # return feats 65 | 66 | 67 | def get_model(): 68 | model = CNHubert() 69 | model.eval() 70 | return model 71 | 72 | 73 | # def get_large_model(): 74 | # model = CNHubertLarge() 75 | # model.eval() 76 | # return model 77 | # 78 | # def get_model_cvec(): 79 | # model = CVec() 80 | # model.eval() 81 | # return model 82 | # 83 | # def get_model_cnw2v2base(): 84 | # model = cnw2v2base() 85 | # model.eval() 86 | # 
return model 87 | 88 | 89 | def get_content(hmodel, wav_16k_tensor): 90 | with torch.no_grad(): 91 | feats = hmodel(wav_16k_tensor) 92 | return feats.transpose(1, 2) 93 | 94 | 95 | if __name__ == "__main__": 96 | model = get_model() 97 | src_path = "/Users/Shared/原音频2.wav" 98 | wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000) 99 | model = model 100 | wav_16k_tensor = wav_16k_tensor 101 | feats = get_content(model, wav_16k_tensor) 102 | print(feats.shape) 103 | -------------------------------------------------------------------------------- /src/feature_extractor/whisper_enc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_model(): 5 | import whisper 6 | 7 | model = whisper.load_model("small", device="cpu") 8 | 9 | return model.encoder 10 | 11 | 12 | def get_content(model=None, wav_16k_tensor=None): 13 | from whisper import log_mel_spectrogram, pad_or_trim 14 | 15 | dev = next(model.parameters()).device 16 | mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] 17 | # if torch.cuda.is_available(): 18 | # mel = mel.to(torch.float16) 19 | feature_len = mel.shape[-1] // 2 20 | assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" 21 | with torch.no_grad(): 22 | feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[ 23 | :1, :feature_len, : 24 | ].transpose(1, 2) 25 | return feature 26 | -------------------------------------------------------------------------------- /src/inference/.ipynb_checkpoints/inference-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import soundfile as sf 4 | import json 5 | 6 | from infer_tool import TTSInference 7 | 8 | 9 | def generate_audio(sovits_weights, 10 | gpt_weights, 11 | input_folder, 12 | output_folder, 13 | ref_wav_path, 14 | prompt_text, 15 | prompt_language, 16 | text, 17 | text_language, 18 | how_to_cut=None, 19 | save=True 20 | ): 21 | tts = TTSInference(sovits_weights=sovits_weights, gpt_weights=gpt_weights) 22 | 23 | infer_dict = { 24 | 'ref_wav_path': os.path.join(input_folder, ref_wav_path), 25 | 'prompt_text': prompt_text, 26 | 'prompt_language': prompt_language, 27 | 'text': text, 28 | 'text_language': text_language, 29 | 'how_to_cut': how_to_cut 30 | } 31 | 32 | audio_generator = tts.infer(**infer_dict) 33 | 34 | sr, audio = next(audio_generator) 35 | 36 | if save: 37 | ref_wav_name = ''.join(os.path.basename(ref_wav_path).split('.')[:-1]) 38 | output_path = os.path.join(output_folder, f"{ref_wav_name}_{text[:6]}.wav") 39 | sf.write(output_path, audio, sr) 40 | print(f"Audio saved to {output_path}") 41 | else: 42 | return sr, audio 43 | 44 | 45 | def process_batch(sovits_weights, gpt_weights, input_folder, output_folder, parameters_file): 46 | _, file_extension = os.path.splitext(parameters_file) 47 | 48 | if file_extension.lower() == '.json': 49 | with open(parameters_file, 'r', encoding='utf-8') as file: 50 | parameters = json.load(file) 51 | for param in parameters: 52 | generate_audio(sovits_weights, 53 | gpt_weights, 54 | input_folder, 55 | output_folder, 56 | param['ref_wav_path'], 57 | param['prompt_text'], 58 | param['prompt_language'], 59 | param['text'], 60 | param['text_language'], 61 | param['how_to_cut'] 62 | ) 63 | elif file_extension.lower() == '.txt': 64 | with open(parameters_file, 'r', encoding='utf-8') as file: 65 | for line in file: 66 | ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut = line.strip().split('|') 67 | 
generate_audio(sovits_weights, 68 | gpt_weights, 69 | input_folder, 70 | output_folder, 71 | ref_wav_path, 72 | prompt_text, 73 | prompt_language, 74 | text, 75 | text_language, 76 | how_to_cut 77 | ) 78 | 79 | 80 | if __name__ == "__main__": 81 | parser = argparse.ArgumentParser(description="Batch Run TTS Inference") 82 | parser.add_argument("--sovits_weights", required=True, help="Path to sovits weights file") 83 | parser.add_argument("--gpt_weights", required=True, help="Path to gpt weights file") 84 | parser.add_argument("--input_folder", type=str, default='input_audio', help="Folder of the input audio files") 85 | parser.add_argument("--output_folder", type=str, default='output_audio', help="Folder to save the output audio files") 86 | parser.add_argument("--parameters_file", type=str, default='inference_parameters.txt', help="File containing parameters for batch processing") 87 | 88 | args = parser.parse_args() 89 | 90 | if not os.path.exists(args.output_folder): 91 | os.makedirs(args.output_folder) 92 | 93 | process_batch(args.sovits_weights, 94 | args.gpt_weights, 95 | args.input_folder, 96 | args.output_folder, 97 | args.parameters_file) 98 | -------------------------------------------------------------------------------- /src/inference/__init__.py: -------------------------------------------------------------------------------- 1 | from .infer_tool import InferenceModule, TTSInference -------------------------------------------------------------------------------- /src/inference/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import soundfile as sf 4 | import json 5 | 6 | from infer_tool import TTSInference 7 | 8 | 9 | def generate_audio(sovits_weights, 10 | gpt_weights, 11 | input_folder, 12 | output_folder, 13 | ref_wav_path, 14 | prompt_text, 15 | prompt_language, 16 | text, 17 | text_language, 18 | how_to_cut=None, 19 | save=True 20 | ): 21 | tts = TTSInference(sovits_weights=sovits_weights, gpt_weights=gpt_weights) 22 | 23 | infer_dict = { 24 | 'ref_wav_path': os.path.join(input_folder, ref_wav_path), 25 | 'prompt_text': prompt_text, 26 | 'prompt_language': prompt_language, 27 | 'text': text, 28 | 'text_language': text_language, 29 | 'how_to_cut': how_to_cut 30 | } 31 | 32 | audio_generator = tts.infer(**infer_dict) 33 | 34 | sr, audio = next(audio_generator) 35 | 36 | if save: 37 | ref_wav_name = ''.join(os.path.basename(ref_wav_path).split('.')[:-1]) 38 | output_path = os.path.join(output_folder, f"{ref_wav_name}_{text[:6]}.wav") 39 | sf.write(output_path, audio, sr) 40 | print(f"Audio saved to {output_path}") 41 | else: 42 | return sr, audio 43 | 44 | 45 | def process_batch(sovits_weights, gpt_weights, input_folder, output_folder, parameters_file): 46 | _, file_extension = os.path.splitext(parameters_file) 47 | 48 | if file_extension.lower() == '.json': 49 | with open(parameters_file, 'r', encoding='utf-8') as file: 50 | parameters = json.load(file) 51 | for param in parameters: 52 | generate_audio(sovits_weights, 53 | gpt_weights, 54 | input_folder, 55 | output_folder, 56 | param['ref_wav_path'], 57 | param['prompt_text'], 58 | param['prompt_language'], 59 | param['text'], 60 | param['text_language'], 61 | param['how_to_cut'] 62 | ) 63 | elif file_extension.lower() == '.txt': 64 | with open(parameters_file, 'r', encoding='utf-8') as file: 65 | for line in file: 66 | ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut = line.strip().split('|') 67 | 
generate_audio(sovits_weights, 68 | gpt_weights, 69 | input_folder, 70 | output_folder, 71 | ref_wav_path, 72 | prompt_text, 73 | prompt_language, 74 | text, 75 | text_language, 76 | how_to_cut 77 | ) 78 | 79 | 80 | if __name__ == "__main__": 81 | parser = argparse.ArgumentParser(description="Batch Run TTS Inference") 82 | parser.add_argument("--sovits_weights", required=True, help="Path to sovits weights file") 83 | parser.add_argument("--gpt_weights", required=True, help="Path to gpt weights file") 84 | parser.add_argument("--input_folder", type=str, default='input_audio', help="Folder of the input audio files") 85 | parser.add_argument("--output_folder", type=str, default='output_audio', help="Folder to save the output audio files") 86 | parser.add_argument("--parameters_file", type=str, default='inference_parameters.txt', help="File containing parameters for batch processing") 87 | 88 | args = parser.parse_args() 89 | 90 | if not os.path.exists(args.output_folder): 91 | os.makedirs(args.output_folder) 92 | 93 | process_batch(args.sovits_weights, 94 | args.gpt_weights, 95 | args.input_folder, 96 | args.output_folder, 97 | args.parameters_file) 98 | -------------------------------------------------------------------------------- /src/module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/src/module/__init__.py -------------------------------------------------------------------------------- /src/module/losses.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | 6 | 7 | def feature_loss(fmap_r, fmap_g): 8 | loss = 0 9 | for dr, dg in zip(fmap_r, fmap_g): 10 | for rl, gl in zip(dr, dg): 11 | rl = rl.float().detach() 12 | gl = gl.float() 13 | loss += torch.mean(torch.abs(rl - gl)) 14 | 15 | return loss * 2 16 | 17 | 18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 19 | loss = 0 20 | r_losses = [] 21 | g_losses = [] 22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 23 | dr = dr.float() 24 | dg = dg.float() 25 | r_loss = torch.mean((1 - dr) ** 2) 26 | g_loss = torch.mean(dg**2) 27 | loss += r_loss + g_loss 28 | r_losses.append(r_loss.item()) 29 | g_losses.append(g_loss.item()) 30 | 31 | return loss, r_losses, g_losses 32 | 33 | 34 | def generator_loss(disc_outputs): 35 | loss = 0 36 | gen_losses = [] 37 | for dg in disc_outputs: 38 | dg = dg.float() 39 | l = torch.mean((1 - dg) ** 2) 40 | gen_losses.append(l) 41 | loss += l 42 | 43 | return loss, gen_losses 44 | 45 | 46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 47 | """ 48 | z_p, logs_q: [b, h, t_t] 49 | m_p, logs_p: [b, h, t_t] 50 | """ 51 | z_p = z_p.float() 52 | logs_q = logs_q.float() 53 | m_p = m_p.float() 54 | logs_p = logs_p.float() 55 | z_mask = z_mask.float() 56 | 57 | kl = logs_p - logs_q - 0.5 58 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 59 | kl = torch.sum(kl * z_mask) 60 | l = kl / torch.sum(z_mask) 61 | return l 62 | 63 | 64 | def mle_loss(z, m, logs, logdet, mask): 65 | l = torch.sum(logs) + 0.5 * torch.sum( 66 | torch.exp(-2 * logs) * ((z - m) ** 2) 67 | ) # neg normal likelihood w/o the constant term 68 | l = l - torch.sum(logdet) # log jacobian determinant 69 | l = l / torch.sum( 70 | torch.ones_like(z) * mask 71 | ) # averaging across batch, channel and time axes 72 | l = l + 0.5 * math.log(2 * math.pi) # add the 
remaining constant term 73 | return l 74 | -------------------------------------------------------------------------------- /src/module/mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import numpy as np 9 | import librosa 10 | import librosa.util as librosa_util 11 | from librosa.util import normalize, pad_center, tiny 12 | from scipy.signal import get_window 13 | from scipy.io.wavfile import read 14 | from librosa.filters import mel as librosa_mel_fn 15 | 16 | MAX_WAV_VALUE = 32768.0 17 | 18 | 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 20 | """ 21 | PARAMS 22 | ------ 23 | C: compression factor 24 | """ 25 | return torch.log(torch.clamp(x, min=clip_val) * C) 26 | 27 | 28 | def dynamic_range_decompression_torch(x, C=1): 29 | """ 30 | PARAMS 31 | ------ 32 | C: compression factor used to compress 33 | """ 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 52 | if torch.min(y) < -1.0: 53 | print("min value is ", torch.min(y)) 54 | if torch.max(y) > 1.0: 55 | print("max value is ", torch.max(y)) 56 | 57 | global hann_window 58 | dtype_device = str(y.dtype) + "_" + str(y.device) 59 | wnsize_dtype_device = str(win_size) + "_" + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 62 | dtype=y.dtype, device=y.device 63 | ) 64 | 65 | y = torch.nn.functional.pad( 66 | y.unsqueeze(1), 67 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 68 | mode="reflect", 69 | ) 70 | y = y.squeeze(1) 71 | spec = torch.stft( 72 | y, 73 | n_fft, 74 | hop_length=hop_size, 75 | win_length=win_size, 76 | window=hann_window[wnsize_dtype_device], 77 | center=center, 78 | pad_mode="reflect", 79 | normalized=False, 80 | onesided=True, 81 | return_complex=False, 82 | ) 83 | 84 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 85 | return spec 86 | 87 | 88 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 89 | global mel_basis 90 | dtype_device = str(spec.dtype) + "_" + str(spec.device) 91 | fmax_dtype_device = str(fmax) + "_" + dtype_device 92 | if fmax_dtype_device not in mel_basis: 93 | mel = librosa_mel_fn( 94 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 95 | ) 96 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 97 | dtype=spec.dtype, device=spec.device 98 | ) 99 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 100 | spec = spectral_normalize_torch(spec) 101 | return spec 102 | 103 | 104 | def mel_spectrogram_torch( 105 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False 106 | ): 107 | if torch.min(y) < -1.0: 108 | print("min value is ", torch.min(y)) 109 | if torch.max(y) > 1.0: 110 | print("max value is ", torch.max(y)) 111 | 112 | global mel_basis, hann_window 113 | dtype_device = str(y.dtype) + "_" + str(y.device) 114 | fmax_dtype_device = str(fmax) + "_" + dtype_device 115 | wnsize_dtype_device = 
str(win_size) + "_" + dtype_device 116 | if fmax_dtype_device not in mel_basis: 117 | mel = librosa_mel_fn( 118 | sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax 119 | ) 120 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( 121 | dtype=y.dtype, device=y.device 122 | ) 123 | if wnsize_dtype_device not in hann_window: 124 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( 125 | dtype=y.dtype, device=y.device 126 | ) 127 | 128 | y = torch.nn.functional.pad( 129 | y.unsqueeze(1), 130 | (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), 131 | mode="reflect", 132 | ) 133 | y = y.squeeze(1) 134 | 135 | spec = torch.stft( 136 | y, 137 | n_fft, 138 | hop_length=hop_size, 139 | win_length=win_size, 140 | window=hann_window[wnsize_dtype_device], 141 | center=center, 142 | pad_mode="reflect", 143 | normalized=False, 144 | onesided=True, 145 | return_complex=False, 146 | ) 147 | 148 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 149 | 150 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 151 | spec = spectral_normalize_torch(spec) 152 | 153 | return spec 154 | -------------------------------------------------------------------------------- /src/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | from .get_phonemes import get_phonemes 2 | from .get_ssl_features import get_ssl_features 3 | from .get_semantic import get_semantic -------------------------------------------------------------------------------- /src/preprocess/get_phonemes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | 5 | import traceback 6 | import torch 7 | from transformers import AutoModelForMaskedLM, AutoTokenizer 8 | 9 | from text.cleaner import clean_text 10 | 11 | 12 | def get_bert_feature(text, word2ph, tokenizer, bert_model, device): 13 | """ 14 | 获取指定文本的BERT特征表示。 15 | 16 | Args: 17 | text (str): 输入文本 18 | word2ph (list): 单词到音素的映射列表 19 | tokenizer (BertTokenizer): BERT分词器对象 20 | bert_model (BertModel): BERT模型对象 21 | device (str): 设备类型('cuda'或'cpu') 22 | 23 | Returns: 24 | torch.Tensor: 音素级别的BERT特征表示,形状为(bert_hidden_size, 音素数量) 25 | """ 26 | with torch.no_grad(): 27 | inputs = tokenizer(text, return_tensors="pt") 28 | for i in inputs: 29 | inputs[i] = inputs[i].to(device) 30 | res = bert_model(**inputs, output_hidden_states=True) 31 | # 获取倒数第三和倒数第二层的隐藏状态,并在序列长度维度上进行拼接 32 | res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1] 33 | 34 | # 检查word2ph长度与输入文本长度是否相等 35 | assert len(word2ph) == len(text) 36 | 37 | phone_level_feature = [] 38 | for i in range(len(word2ph)): 39 | # 对每个单词的BERT特征进行重复,重复次数等于该单词对应的音素数量 40 | repeat_feature = res[i].repeat(word2ph[i], 1) 41 | phone_level_feature.append(repeat_feature) 42 | 43 | # 将所有音素的BERT特征拼接成一个张量 44 | phone_level_feature = torch.cat(phone_level_feature, dim=0) 45 | 46 | return phone_level_feature.T 47 | 48 | def process(data, save_dir, tokenizer, bert_model, device): 49 | """ 50 | 处理输入数据,获取音素序列及BERT特征。 51 | 52 | Args: 53 | data (list): 输入数据列表,每个元素为[wav_name, text, language] 54 | save_dir (str): BERT特征保存目录 55 | tokenizer (BertTokenizer): BERT分词器对象 56 | bert_model (BertModel): BERT模型对象 57 | device (str): 设备类型('cuda'或'cpu') 58 | 59 | Returns: 60 | list: 处理结果列表,每个元素为[wav_name, 音素序列, word2ph, norm_text] 61 | """ 62 | res = [] 63 | os.makedirs(save_dir, exist_ok=True) 64 | 65 | for name, text, lan in data: 66 | 
try: 67 | name = os.path.basename(name) 68 | # 清理文本并获取音素序列、单词到音素的映射以及规范化后的文本 69 | phones, word2ph, norm_text = clean_text( 70 | text.replace("%", "-").replace("¥", ","), lan 71 | ) 72 | path_bert = f"{save_dir}/{name}.pt" 73 | 74 | # 如果是中文文本且对应的BERT特征文件不存在,则计算并保存BERT特征 75 | if os.path.exists(path_bert) == False and lan == "zh": 76 | bert_feature = get_bert_feature(norm_text, word2ph, tokenizer, bert_model, device) 77 | assert bert_feature.shape[-1] == len(phones) 78 | torch.save(bert_feature, path_bert) 79 | phones = " ".join(phones) 80 | res.append([name, phones, word2ph, norm_text]) 81 | except: 82 | print(name, text, traceback.format_exc()) 83 | 84 | return res 85 | 86 | def get_phonemes(input_txt_path: str, 87 | save_path: str, 88 | bert_pretrained_dir: str='pretrained_models/chinese-roberta-wwm-ext-large', 89 | is_half: bool=False, 90 | **kwargs) -> None: 91 | """ 92 | 从输入文本文件中获取音素序列和BERT特征。 93 | 94 | Args: 95 | input_txt_path (str): 输入文本文件路径 96 | save_path (str): 保存结果的路径 97 | bert_pretrained_dir (str, optional): BERT预训练模型路径. Defaults to 'pretrained_models/chinese-roberta-wwm-ext-large'. 98 | is_half (bool, optional): 是否使用半精度(FP16)模式. Defaults to False. 99 | 100 | Returns: 101 | None 102 | """ 103 | os.makedirs(save_path, exist_ok=True) 104 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 105 | 106 | tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir) 107 | bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir) 108 | 109 | if is_half: 110 | bert_model = bert_model.half().to(device) 111 | else: 112 | bert_model = bert_model.to(device) 113 | 114 | bert_model.eval() 115 | 116 | todo = [] 117 | with open(input_txt_path, "r", encoding="utf8") as f: 118 | lines = f.read().strip("\n").split("\n") 119 | for line in lines: 120 | try: 121 | wav_name, spk_name, language, text = line.split("|") 122 | todo.append([wav_name, text, language.lower()]) 123 | except: 124 | print(line, traceback.format_exc()) 125 | 126 | res = process(todo, f'{save_path}/bert_features', tokenizer, bert_model, device) 127 | 128 | opt = [] 129 | for name, phones, word2ph, norm_text in res: 130 | opt.append("%s\t%s\t%s\t%s" % (name, phones, word2ph, norm_text)) 131 | 132 | with open(f"{save_path}/text2phonemes.txt", "w", encoding="utf8") as f: 133 | f.write("\n".join(opt) + "\n") 134 | 135 | print("文本转音素已完成!") -------------------------------------------------------------------------------- /src/preprocess/get_semantic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | 5 | import traceback 6 | import torch 7 | from module.models import SynthesizerTrn 8 | from utils.utils import get_hparams_from_file 9 | 10 | 11 | def name2go(wav_name: str, 12 | cnhubert_features_path: str, 13 | model: SynthesizerTrn, 14 | device: str, 15 | is_half: bool=False): 16 | """ 17 | 根据音频文件名提取语义特征。 18 | 19 | 参数: 20 | wav_name (str): 音频文件的名称。 21 | cnhubert_features_path (str): 存储cnhubert特征文件的路径。 22 | model (SynthesizerTrn): 用于提取语义特征的模型实例。 23 | device (str): 指定运行模型的设备(如'cpu'或'cuda:0')。 24 | is_half (bool, 可选): 如果为True,则使用半精度浮点数处理数据以加快计算速度,默认为False。 25 | 26 | 返回: 27 | str: 音频文件名和其对应的语义特征,使用制表符('\t')分隔。 28 | """ 29 | if not os.path.exists(cnhubert_features_path): 30 | return 31 | ssl_content = torch.load(f"{cnhubert_features_path}/{wav_name}.pt", map_location="cpu") 32 | if is_half: 33 | ssl_content = ssl_content.half().to(device) 34 | else: 35 | ssl_content = 
ssl_content.to(device) 36 | codes = model.extract_latent(ssl_content) 37 | semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()]) 38 | return f"{wav_name}\t{semantic}" 39 | 40 | 41 | def get_semantic(input_txt_path: str, 42 | save_path: str, 43 | G_path: str='pretrained_models/sovits_weights/pretrained/s2G488k.pth', 44 | config_path: str='src/configs/sovits.json', 45 | is_half: bool=False, 46 | **kwargs 47 | ): 48 | """ 49 | 从文本文件中读取音频文件名,并提取相应的语义特征。 50 | 51 | 参数: 52 | input_txt_path (str): 包含音频文件名的输入文本文件路径。 53 | save_path (str): 保存结果的路径。 54 | G_path (str, 可选): 指向预训练模型权重的路径,默认为'sovits_weights'下的路径。 55 | config_path (str, 可选): 配置文件的路径,默认为'src/configs'下的sovits.json。 56 | is_half (bool, 可选): 是否使用半精度计算,默认为False。 57 | 58 | 使用kwargs接受任何额外的参数,以便未来的扩展。 59 | """ 60 | hubert_features_path = os.path.join(save_path, "cnhubert_features") 61 | device = 'cuda:0' if torch.cuda.is_available() else 'cpu' 62 | 63 | hps = get_hparams_from_file(config_path) 64 | model = SynthesizerTrn( 65 | hps.data.filter_length // 2 + 1, 66 | hps.train.segment_size // hps.data.hop_length, 67 | n_speakers=hps.data.n_speakers, 68 | **hps.model 69 | ) 70 | # 加载模型权重 71 | print( 72 | model.load_state_dict( 73 | torch.load(G_path, map_location=device)["weight"], strict=False 74 | ) 75 | ) 76 | 77 | if is_half: 78 | model = model.half() 79 | model = model.to(device) 80 | model.eval() 81 | 82 | with open(input_txt_path, "r", encoding="utf8") as f: 83 | lines = f.read().strip("\n").split("\n") 84 | 85 | semantic_results = [] 86 | for line in lines: 87 | try: 88 | wav_name, _, _, _ = line.split("|") 89 | wav_name = os.path.basename(wav_name) 90 | semantic = name2go(wav_name, hubert_features_path, model, device, is_half) 91 | semantic_results.append(semantic) 92 | except Exception as e: 93 | # 输出错误信息 94 | print(line, str(e)) 95 | 96 | with open(f"{save_path}/name2semantic.tsv", "w", encoding="utf8") as f: 97 | f.write("\n".join(semantic_results)) 98 | 99 | print('语义特征提取完成!') -------------------------------------------------------------------------------- /src/preprocess/get_ssl_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | 5 | import traceback 6 | import numpy as np 7 | import librosa 8 | from scipy.io import wavfile 9 | import torch 10 | 11 | from utils.utils import load_audio 12 | from feature_extractor.cnhubert import CNHubert 13 | 14 | 15 | MAXX = 0.95 16 | ALPHA = 0.5 17 | 18 | def name2go(wav_name: str, 19 | wav_path: str, 20 | features_path: str, 21 | wav32k_path: str, 22 | model: CNHubert, 23 | device: str, 24 | is_half: bool=False): 25 | """ 26 | 对给定的音频文件进行处理和特征提取。 27 | 28 | 参数: 29 | wav_name (str): 音频文件的名称。 30 | wav_path (str): 音频文件的完整路径。 31 | features_path (str): 用于保存提取出的特征文件的路径。 32 | wav32k_path (str): 用于保存处理后的32k采样率音频文件的路径。 33 | model (CNHubert): 预训练的CNHubert模型。 34 | device (str): 指定运行模型的设备,如'cuda:0'或'cpu'。 35 | is_half (bool): 是否将模型和数据转为半精度浮点数以节省内存,默认为False。 36 | 37 | 返回: 38 | None或str: 如果处理中出现问题,返回音频文件的名称;否则不返回任何内容。 39 | """ 40 | # 加载音频并调整为32k采样率 41 | tmp_audio = load_audio(wav_path, 32000) 42 | tmp_max = np.abs(tmp_audio).max() 43 | # 如果音频的最大绝对值大于2.2,则打印信息并返回 44 | if tmp_max > 2.2: 45 | print("%s-filtered,%s" % (wav_name, tmp_max)) 46 | return 47 | # 对音频数据进行归一化和重采样处理 48 | tmp_audio32 = (tmp_audio / tmp_max * (MAXX * ALPHA * 32768)) + ((1 - ALPHA) * 32768) * tmp_audio 49 | tmp_audio32b = (tmp_audio / tmp_max * (MAXX * ALPHA * 1145.14)) + ((1 - 
ALPHA) * 1145.14) * tmp_audio 50 | tmp_audio = librosa.resample(tmp_audio32b, orig_sr=32000, target_sr=16000) # 重采样到16k采样率 51 | 52 | tensor_wav16 = torch.from_numpy(tmp_audio) 53 | if is_half: 54 | tensor_wav16 = tensor_wav16.half().to(device) 55 | else: 56 | tensor_wav16 = tensor_wav16.to(device) 57 | # 使用CNHubert模型提取特征 58 | ssl = model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1, 2).cpu() 59 | # 检查提取的特征是否包含NaN值 60 | if np.isnan(ssl.detach().numpy()).sum() != 0: 61 | print("nan filtered:%s" % wav_name) 62 | return wav_name 63 | # 保存处理后的音频和特征文件 64 | wavfile.write(os.path.join(wav32k_path, wav_name), 32000, tmp_audio32.astype("int16")) 65 | torch.save(ssl, f"{features_path}/{wav_name}.pt") 66 | 67 | 68 | def get_ssl_features(input_txt_path: str, 69 | save_path: str, 70 | input_wav_path: str = None, 71 | cnhubert_path: str = 'pretrained_models/chinese-hubert-base', 72 | is_half: bool = False, 73 | **kwargs): 74 | """ 75 | 从文本文件中读取音频文件列表,对每个音频文件进行处理并提取特征。 76 | 77 | 参数: 78 | input_txt_path (str): 包含音频文件信息的文本文件路径。 79 | save_path (str): 保存处理结果和特征文件的根目录。 80 | input_wav_path (str): 音频文件的输入目录。如果不为None,则会从这个目录读取音频文件,默认为None。 81 | cnhubert_path (str): CNHubert模型的路径,默认为预训练模型的路径。 82 | is_half (bool): 是否将模型和数据转为半精度浮点数以节省内存,默认为False。 83 | 84 | 返回: 85 | None 86 | """ 87 | # 创建保存32k采样率音频和特征文件的目录 88 | wav32k_path = os.path.join(save_path, "wav32k") 89 | features_path = os.path.join(save_path, "cnhubert_features") 90 | os.makedirs(wav32k_path, exist_ok=True) 91 | os.makedirs(features_path, exist_ok=True) 92 | 93 | # 加载模型并设置运行设备 94 | model = CNHubert(cnhubert_path) 95 | device = 'cuda:0' if torch.cuda.is_available() else 'cpu' 96 | if is_half: 97 | model = model.half() 98 | model = model.to(device) 99 | model.eval() 100 | 101 | # 读取音频文件列表并进行处理 102 | with open(input_txt_path, "r", encoding="utf8") as f: 103 | lines = f.read().strip("\n").split("\n") 104 | 105 | nan_fails = [] 106 | for line in lines: 107 | try: 108 | wav_name, _, _, _ = line.split("|") 109 | if input_wav_path: 110 | wav_name = os.path.basename(wav_name) 111 | wav_path = os.path.join(input_wav_path, wav_name) 112 | else: 113 | wav_path = wav_name 114 | if not os.path.exists(wav_path): 115 | print(f"{wav_path} does not exist") 116 | continue 117 | wav_name = name2go(wav_name, wav_path, features_path, wav32k_path, model, device, is_half) 118 | if wav_name: 119 | nan_fails.append(wav_name) 120 | except: 121 | print(line, traceback.format_exc()) 122 | 123 | # 如果有处理失败的文件,尝试不使用半精度重新处理 124 | if len(nan_fails) > 0 and is_half: 125 | is_half = False 126 | model = model.float() 127 | for wav_name in nan_fails: 128 | try: 129 | name2go(wav_name, wav_path, features_path, wav32k_path, model, device, is_half) 130 | except: 131 | print(wav_name, traceback.format_exc()) 132 | 133 | print('CnHubert特征提取已完成!') 134 | -------------------------------------------------------------------------------- /src/preprocess/process.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from get_phonemes import get_phonemes 4 | from get_ssl_features import get_ssl_features 5 | from get_semantic import get_semantic 6 | 7 | def main(data_dir="../../data/", log_dir="logs/", name="dolly"): 8 | params = { 9 | "input_txt_path": os.path.join(data_dir, f"{name}/{name}.txt"), 10 | "save_path": f"{log_dir}/{name}", 11 | "input_wav_path": os.path.join(data_dir, f"{name}/vocal/") 12 | } 13 | get_phonemes(**params) 14 | get_ssl_features(**params) 15 | get_semantic(**params) 16 | 17 | if __name__ == 
"__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--data_dir", type=str, default="../../data/", help="Directory to save data") 20 | parser.add_argument("--log_dir", type=str, default="logs/", help="Directory to save logs") 21 | parser.add_argument("--name", type=str, default="dolly", help="Name of the logs") 22 | args = parser.parse_args() 23 | 24 | main(args.data_dir, args.log_dir, args.name) -------------------------------------------------------------------------------- /src/text/__init__.py: -------------------------------------------------------------------------------- 1 | from text.symbols import * 2 | 3 | 4 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 5 | 6 | def cleaned_text_to_sequence(cleaned_text): 7 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 8 | Args: 9 | text: string to convert to a sequence 10 | Returns: 11 | List of integers corresponding to the symbols in the text 12 | ''' 13 | phones = [_symbol_to_id[symbol] for symbol in cleaned_text] 14 | return phones 15 | 16 | -------------------------------------------------------------------------------- /src/text/cleaner.py: -------------------------------------------------------------------------------- 1 | from text import chinese, japanese, cleaned_text_to_sequence, symbols, english 2 | 3 | language_module_map = {"zh": chinese, "ja": japanese, "en": english} 4 | special = [ 5 | # ("%", "zh", "SP"), 6 | ("¥", "zh", "SP2"), 7 | ("^", "zh", "SP3"), 8 | # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 9 | ] 10 | 11 | 12 | def clean_text(text, language): 13 | if(language not in language_module_map): 14 | language="en" 15 | text=" " 16 | for special_s, special_l, target_symbol in special: 17 | if special_s in text and language == special_l: 18 | return clean_special(text, language, special_s, target_symbol) 19 | language_module = language_module_map[language] 20 | norm_text = language_module.text_normalize(text) 21 | if language == "zh": 22 | phones, word2ph = language_module.g2p(norm_text) 23 | assert len(phones) == sum(word2ph) 24 | assert len(norm_text) == len(word2ph) 25 | else: 26 | phones = language_module.g2p(norm_text) 27 | word2ph = None 28 | 29 | for ph in phones: 30 | assert ph in symbols 31 | return phones, word2ph, norm_text 32 | 33 | 34 | def clean_special(text, language, special_s, target_symbol): 35 | """ 36 | 特殊静音段sp符号处理 37 | """ 38 | text = text.replace(special_s, ",") 39 | language_module = language_module_map[language] 40 | norm_text = language_module.text_normalize(text) 41 | phones = language_module.g2p(norm_text) 42 | new_ph = [] 43 | for ph in phones[0]: 44 | assert ph in symbols 45 | if ph == ",": 46 | new_ph.append(target_symbol) 47 | else: 48 | new_ph.append(ph) 49 | return new_ph, phones[1], norm_text 50 | 51 | 52 | def text_to_sequence(text, language): 53 | phones = clean_text(text) 54 | return cleaned_text_to_sequence(phones) 55 | 56 | 57 | if __name__ == "__main__": 58 | print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) 59 | -------------------------------------------------------------------------------- /src/text/engdict-hot.rep: -------------------------------------------------------------------------------- 1 | CHATGPT CH AE1 T JH IY1 P IY1 T IY1 2 | LLM EH1 L EH1 L EH1 M -------------------------------------------------------------------------------- /src/text/engdict_cache.pickle: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/src/text/engdict_cache.pickle -------------------------------------------------------------------------------- /src/text/namedict_cache.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZaVang/GPT-SoVits/2fad70072ea91dae560e3da894c719676496684a/src/text/namedict_cache.pickle -------------------------------------------------------------------------------- /src/text/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克
我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23
-1.5\~2|十二到二十三
负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122
这是手机+86 18544139121|这是固话零四二一三三四四一一二二
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /src/text/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from text.zh_normalization.text_normlization import * 15 | -------------------------------------------------------------------------------- /src/text/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip('0')) 25 | if num_string.startswith('0'): 26 | result = DIGITS['0'] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' 32 | r':([0-5][0-9])' 33 | r'(:([0-5][0-9]))?') 34 | 35 | # 时间范围,如8:30-12:30 36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' 37 | r':([0-5][0-9])' 38 | r'(:([0-5][0-9]))?' 
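                           # second half of the range: the '~' or '-' separator and another H:MM(:SS); these extra groups are what replace_time() below uses to detect a range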
39 | r'(~|-)' 40 | r'([0-1]?[0-9]|2[0-3])' 41 | r':([0-5][0-9])' 42 | r'(:([0-5][0-9]))?') 43 | 44 | 45 | def replace_time(match) -> str: 46 | """ 47 | Args: 48 | match (re.Match) 49 | Returns: 50 | str 51 | """ 52 | 53 | is_range = len(match.groups()) > 5 54 | 55 | hour = match.group(1) 56 | minute = match.group(2) 57 | second = match.group(4) 58 | 59 | if is_range: 60 | hour_2 = match.group(6) 61 | minute_2 = match.group(7) 62 | second_2 = match.group(9) 63 | 64 | result = f"{num2str(hour)}点" 65 | if minute.lstrip('0'): 66 | if int(minute) == 30: 67 | result += "半" 68 | else: 69 | result += f"{_time_num2str(minute)}分" 70 | if second and second.lstrip('0'): 71 | result += f"{_time_num2str(second)}秒" 72 | 73 | if is_range: 74 | result += "至" 75 | result += f"{num2str(hour_2)}点" 76 | if minute_2.lstrip('0'): 77 | if int(minute) == 30: 78 | result += "半" 79 | else: 80 | result += f"{_time_num2str(minute_2)}分" 81 | if second_2 and second_2.lstrip('0'): 82 | result += f"{_time_num2str(second_2)}秒" 83 | 84 | return result 85 | 86 | 87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年' 88 | r'((0?[1-9]|1[0-2])月)?' 89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') 90 | 91 | 92 | def replace_date(match) -> str: 93 | """ 94 | Args: 95 | match (re.Match) 96 | Returns: 97 | str 98 | """ 99 | year = match.group(1) 100 | month = match.group(3) 101 | day = match.group(5) 102 | result = "" 103 | if year: 104 | result += f"{verbalize_digit(year)}年" 105 | if month: 106 | result += f"{verbalize_cardinal(month)}月" 107 | if day: 108 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 109 | return result 110 | 111 | 112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 113 | RE_DATE2 = re.compile( 114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') 115 | 116 | 117 | def replace_date2(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | year = match.group(1) 125 | month = match.group(3) 126 | day = match.group(4) 127 | result = "" 128 | if year: 129 | result += f"{verbalize_digit(year)}年" 130 | if month: 131 | result += f"{verbalize_cardinal(month)}月" 132 | if day: 133 | result += f"{verbalize_cardinal(day)}日" 134 | return result 135 | -------------------------------------------------------------------------------- /src/text/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
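# The full-width/half-width tables defined below map code point -> code point, the shape str.translate() expects; a quick sketch of the intended use: '12'.translate(F2H_DIGITS) == '12', 'ABC'.translate(F2H_ASCII_LETTERS) == 'ABC'.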
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = { 22 | ord(char) + 65248: ord(char) 23 | for char in string.ascii_letters 24 | } 25 | 26 | # 英文字符半角 -> 全角映射表 27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 28 | 29 | # 数字字符全角 -> 半角映射表 (num: 10) 30 | F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} 31 | # 数字字符半角 -> 全角映射表 32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 33 | 34 | # 标点符号全角 -> 半角映射表 (num: 32) 35 | F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} 36 | # 标点符号半角 -> 全角映射表 37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 38 | 39 | # 空格 (num: 1) 40 | F2H_SPACE = {'\u3000': ' '} 41 | H2F_SPACE = {' ': '\u3000'} 42 | 43 | # 非"有拼音的汉字"的字符串,可用于NSW提取 44 | if SUPPORT_UCS4: 45 | RE_NSW = re.compile(r'(?:[^' 46 | r'\u3007' # 〇 47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] 51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] 52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] 53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] 54 | r'])+') 55 | else: 56 | RE_NSW = re.compile( # pragma: no cover 57 | r'(?:[^' 58 | r'\u3007' # 〇 59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 62 | r'])+') 63 | -------------------------------------------------------------------------------- /src/text/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile( 25 | r"(? 
str: 34 | if mobile: 35 | sp_parts = phone_string.strip('+').split() 36 | result = ','.join( 37 | [verbalize_digit(part, alt_one=True) for part in sp_parts]) 38 | return result 39 | else: 40 | sil_parts = phone_string.split('-') 41 | result = ','.join( 42 | [verbalize_digit(part, alt_one=True) for part in sil_parts]) 43 | return result 44 | 45 | 46 | def replace_phone(match) -> str: 47 | """ 48 | Args: 49 | match (re.Match) 50 | Returns: 51 | str 52 | """ 53 | return phone2str(match.group(0), mobile=False) 54 | 55 | 56 | def replace_mobile(match) -> str: 57 | """ 58 | Args: 59 | match (re.Match) 60 | Returns: 61 | str 62 | """ 63 | return phone2str(match.group(0)) 64 | -------------------------------------------------------------------------------- /src/text/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') 21 | measure_dict = { 22 | "cm2": "平方厘米", 23 | "cm²": "平方厘米", 24 | "cm3": "立方厘米", 25 | "cm³": "立方厘米", 26 | "cm": "厘米", 27 | "db": "分贝", 28 | "ds": "毫秒", 29 | "kg": "千克", 30 | "km": "千米", 31 | "m2": "平方米", 32 | "m²": "平方米", 33 | "m³": "立方米", 34 | "m3": "立方米", 35 | "ml": "毫升", 36 | "m": "米", 37 | "mm": "毫米", 38 | "s": "秒" 39 | } 40 | 41 | 42 | def replace_temperature(match) -> str: 43 | """ 44 | Args: 45 | match (re.Match) 46 | Returns: 47 | str 48 | """ 49 | sign = match.group(1) 50 | temperature = match.group(2) 51 | unit = match.group(3) 52 | sign: str = "零下" if sign else "" 53 | temperature: str = num2str(temperature) 54 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 55 | result = f"{sign}{temperature}{unit}" 56 | return result 57 | 58 | 59 | def replace_measure(sentence) -> str: 60 | for q_notation in measure_dict: 61 | if q_notation in sentence: 62 | sentence = sentence.replace(q_notation, measure_dict[q_notation]) 63 | return sentence 64 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import HParams -------------------------------------------------------------------------------- /src/utils/cut.py: -------------------------------------------------------------------------------- 1 | import re 2 | from enum import Enum 3 | 4 | SPLITS = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", } 5 | 6 | def split(todo_text): 7 | todo_text = todo_text.replace("……", "。").replace("——", ",") 8 | if todo_text[-1] not in SPLITS: 9 | todo_text += "。" 10 | i_split_head = i_split_tail = 0 11 | len_text = len(todo_text) 12 | todo_texts = [] 13 | while 1: 14 | if i_split_head >= len_text: 15 | break # 结尾一定有标点,所以直接跳出即可,最后一段在上次已加入 16 | if todo_text[i_split_head] in 
SPLITS: 17 | i_split_head += 1 18 | todo_texts.append(todo_text[i_split_tail:i_split_head]) 19 | i_split_tail = i_split_head 20 | else: 21 | i_split_head += 1 22 | return todo_texts 23 | 24 | 25 | def get_first(text): 26 | pattern = "[" + "".join(re.escape(sep) for sep in SPLITS) + "]" 27 | text = re.split(pattern, text)[0].strip() 28 | return text 29 | 30 | 31 | def cut1(inp): 32 | inp = inp.strip("\n") 33 | inps = split(inp) 34 | split_idx = list(range(0, len(inps), 4)) 35 | split_idx[-1] = None 36 | if len(split_idx) > 1: 37 | opts = [] 38 | for idx in range(len(split_idx) - 1): 39 | opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]])) 40 | else: 41 | opts = [inp] 42 | return "\n".join(opts) 43 | 44 | 45 | def cut2(inp): 46 | inp = inp.strip("\n") 47 | inps = split(inp) 48 | if len(inps) < 2: 49 | return inp 50 | opts = [] 51 | summ = 0 52 | tmp_str = "" 53 | for i in range(len(inps)): 54 | summ += len(inps[i]) 55 | tmp_str += inps[i] 56 | if summ > 50: 57 | summ = 0 58 | opts.append(tmp_str) 59 | tmp_str = "" 60 | if tmp_str != "": 61 | opts.append(tmp_str) 62 | # print(opts) 63 | if len(opts) > 1 and len(opts[-1]) < 50: ##如果最后一个太短了,和前一个合一起 64 | opts[-2] = opts[-2] + opts[-1] 65 | opts = opts[:-1] 66 | return "\n".join(opts) 67 | 68 | 69 | def cut3(inp): 70 | inp = inp.strip("\n") 71 | return "\n".join(["%s" % item for item in inp.strip("。").split("。")]) 72 | 73 | 74 | def cut4(inp): 75 | inp = inp.strip("\n") 76 | return "\n".join(["%s" % item for item in inp.strip(".").split(".")]) 77 | 78 | 79 | # contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py 80 | def cut5(inp): 81 | # if not re.search(r'[^\w\s]', inp[-1]): 82 | # inp += '。' 83 | inp = inp.strip("\n") 84 | punds = r'[,.;?!、,。?!;:…]' 85 | items = re.split(f'({punds})', inp) 86 | mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])] 87 | # 在句子不存在符号或句尾无符号的时候保证文本完整 88 | if len(items)%2 == 1: 89 | mergeitems.append(items[-1]) 90 | opt = "\n".join(mergeitems) 91 | return opt 92 | 93 | 94 | CUT_DICT = { 95 | "凑四句一切": cut1, 96 | "凑50字一切": cut2, 97 | "按中文句号。切": cut3, 98 | "按英文句号.切": cut4, 99 | "按标点符号切": cut5 100 | } -------------------------------------------------------------------------------- /src/utils/process_ckpt.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from collections import OrderedDict 3 | from time import time as ttime 4 | import shutil,os 5 | import torch 6 | 7 | def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path 8 | dir=os.path.dirname(path) 9 | name=os.path.basename(path) 10 | tmp_path="%s.pth"%(ttime()) 11 | torch.save(fea,tmp_path) 12 | shutil.move(tmp_path,"%s/%s"%(dir,name)) 13 | 14 | def savee(ckpt, name, epoch, steps, hps): 15 | try: 16 | opt = OrderedDict() 17 | opt["weight"] = {} 18 | for key in ckpt.keys(): 19 | if "enc_q" in key: 20 | continue 21 | opt["weight"][key] = ckpt[key].half() 22 | opt["config"] = hps 23 | opt["info"] = "%sepoch_%siteration" % (epoch, steps) 24 | torch.save(opt, f"{hps.model_dir}/{name}.pth") 25 | # my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) 26 | return "Success." 27 | except: 28 | return traceback.format_exc() 29 | -------------------------------------------------------------------------------- /tools/asr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def check_fw_local_models(): 4 | ''' 5 | 启动时检查本地是否有 Faster Whisper 模型. 
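    Check at startup whether Faster Whisper models exist locally under tools/asr/models/; sizes found there are reported with a '-local' suffix.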
6 | ''' 7 | model_size_list = [ 8 | "tiny", "tiny.en", 9 | "base", "base.en", 10 | "small", "small.en", 11 | "medium", "medium.en", 12 | "large", "large-v1", 13 | "large-v2", "large-v3"] 14 | for i, size in enumerate(model_size_list): 15 | if os.path.exists(f'tools/asr/models/faster-whisper-{size}'): 16 | model_size_list[i] = size + '-local' 17 | return model_size_list 18 | 19 | asr_dict = { 20 | "达摩 ASR (中文)": { 21 | 'lang': ['zh'], 22 | 'size': ['large'], 23 | 'path': 'funasr_asr.py', 24 | }, 25 | "Faster Whisper (多语种)": { 26 | 'lang': ['auto', 'zh', 'en', 'ja'], 27 | 'size': check_fw_local_models(), 28 | 'path': 'fasterwhisper_asr.py' 29 | } 30 | } 31 | 32 | -------------------------------------------------------------------------------- /tools/asr/fasterwhisper_asr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | os.environ["HF_ENDPOINT"]="https://hf-mirror.com" 4 | import traceback 5 | import requests 6 | from glob import glob 7 | 8 | from faster_whisper import WhisperModel 9 | from tqdm import tqdm 10 | 11 | from tools.asr.config import check_fw_local_models 12 | from tools.asr.funasr_asr import only_asr 13 | 14 | os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" 15 | 16 | language_code_list = [ 17 | "af", "am", "ar", "as", "az", 18 | "ba", "be", "bg", "bn", "bo", 19 | "br", "bs", "ca", "cs", "cy", 20 | "da", "de", "el", "en", "es", 21 | "et", "eu", "fa", "fi", "fo", 22 | "fr", "gl", "gu", "ha", "haw", 23 | "he", "hi", "hr", "ht", "hu", 24 | "hy", "id", "is", "it", "ja", 25 | "jw", "ka", "kk", "km", "kn", 26 | "ko", "la", "lb", "ln", "lo", 27 | "lt", "lv", "mg", "mi", "mk", 28 | "ml", "mn", "mr", "ms", "mt", 29 | "my", "ne", "nl", "nn", "no", 30 | "oc", "pa", "pl", "ps", "pt", 31 | "ro", "ru", "sa", "sd", "si", 32 | "sk", "sl", "sn", "so", "sq", 33 | "sr", "su", "sv", "sw", "ta", 34 | "te", "tg", "th", "tk", "tl", 35 | "tr", "tt", "uk", "ur", "uz", 36 | "vi", "yi", "yo", "zh", "yue", 37 | "auto"] 38 | 39 | def execute_asr(input_folder, output_folder, model_size, language,precision): 40 | if '-local' in model_size: 41 | model_size = model_size[:-6] 42 | model_path = f'tools/asr/models/faster-whisper-{model_size}' 43 | else: 44 | model_path = model_size 45 | if language == 'auto': 46 | language = None #不设置语种由模型自动输出概率最高的语种 47 | print("loading faster whisper model:",model_size,model_path) 48 | try: 49 | model = WhisperModel(model_path, device="cuda", compute_type=precision) 50 | except: 51 | return print(traceback.format_exc()) 52 | output = [] 53 | output_file_name = os.path.basename(input_folder) 54 | output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') 55 | 56 | if not os.path.exists(output_folder): 57 | os.makedirs(output_folder) 58 | 59 | for file in tqdm(glob(os.path.join(input_folder, '**/*.wav'), recursive=True)): 60 | try: 61 | segments, info = model.transcribe( 62 | audio = file, 63 | beam_size = 5, 64 | vad_filter = True, 65 | vad_parameters = dict(min_silence_duration_ms=700), 66 | language = language) 67 | text = '' 68 | 69 | if info.language == "zh": 70 | print("检测为中文文本,转funasr处理") 71 | text = only_asr(file) 72 | 73 | if text == '': 74 | for segment in segments: 75 | text += segment.text 76 | output.append(f"{file}|{output_file_name}|{info.language.upper()}|{text}") 77 | except: 78 | return print(traceback.format_exc()) 79 | 80 | with open(output_file_path, "w", encoding="utf-8") as f: 81 | f.write("\n".join(output)) 82 | print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") 83 | return 
output_file_path 84 | 85 | if __name__ == '__main__': 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument("-i", "--input_folder", type=str, required=True, 88 | help="Path to the folder containing WAV files.") 89 | parser.add_argument("-o", "--output_folder", type=str, required=True, 90 | help="Output folder to store transcriptions.") 91 | parser.add_argument("-s", "--model_size", type=str, default='large-v3', 92 | choices=check_fw_local_models(), 93 | help="Model Size of Faster Whisper") 94 | parser.add_argument("-l", "--language", type=str, default='ja', 95 | choices=language_code_list, 96 | help="Language of the audio files.") 97 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 98 | help="fp16 or fp32") 99 | 100 | cmd = parser.parse_args() 101 | output_file_path = execute_asr( 102 | input_folder = cmd.input_folder, 103 | output_folder = cmd.output_folder, 104 | model_size = cmd.model_size, 105 | language = cmd.language, 106 | precision = cmd.precision, 107 | ) 108 | -------------------------------------------------------------------------------- /tools/asr/funasr_asr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import argparse 4 | import os 5 | import traceback 6 | from tqdm import tqdm 7 | 8 | from funasr import AutoModel 9 | 10 | path_asr = 'pretrained_models/asr/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' 11 | path_vad = 'pretrained_models/asr/speech_fsmn_vad_zh-cn-16k-common-pytorch' 12 | path_punc = 'pretrained_models/asr/punc_ct-transformer_zh-cn-common-vocab272727-pytorch' 13 | 14 | model = AutoModel( 15 | model = path_asr, 16 | model_revision = "v2.0.4", 17 | vad_model = path_vad, 18 | vad_model_revision = "v2.0.4", 19 | punc_model = path_punc, 20 | punc_model_revision = "v2.0.4", 21 | ) 22 | 23 | def only_asr(input_file): 24 | try: 25 | text = model.generate(input=input_file)[0]["text"] 26 | except: 27 | text = '' 28 | print(traceback.format_exc()) 29 | return text 30 | 31 | def execute_asr(input_folder, output_folder, model_size, language): 32 | input_file_names = os.listdir(input_folder) 33 | input_file_names.sort() 34 | 35 | output = [] 36 | output_file_name = os.path.basename(input_folder) 37 | 38 | for name in tqdm(input_file_names): 39 | try: 40 | text = model.generate(input="%s/%s"%(input_folder, name))[0]["text"] 41 | output.append(f"{input_folder}/{name}|{output_file_name}|{language.upper()}|{text}") 42 | except: 43 | print(traceback.format_exc()) 44 | 45 | output_folder = output_folder or "output/asr_opt" 46 | os.makedirs(output_folder, exist_ok=True) 47 | output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') 48 | 49 | with open(output_file_path, "w", encoding="utf-8") as f: 50 | f.write("\n".join(output)) 51 | print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") 52 | return output_file_path 53 | 54 | if __name__ == '__main__': 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument("-i", "--input_folder", type=str, required=True, 57 | help="Path to the folder containing WAV files.") 58 | parser.add_argument("-o", "--output_folder", type=str, required=True, 59 | help="Output folder to store transcriptions.") 60 | parser.add_argument("-s", "--model_size", type=str, default='large', 61 | help="Model Size of FunASR is Large") 62 | parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh'], 63 | help="Language of the audio files.") 64 | 
parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 65 | help="fp16 or fp32")#还没接入 66 | 67 | cmd = parser.parse_args() 68 | execute_asr( 69 | input_folder = cmd.input_folder, 70 | output_folder = cmd.output_folder, 71 | model_size = cmd.model_size, 72 | language = cmd.language, 73 | ) 74 | -------------------------------------------------------------------------------- /tools/asr/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /tools/cmd-denoise.py: -------------------------------------------------------------------------------- 1 | import os,argparse 2 | 3 | from modelscope.pipelines import pipeline 4 | from modelscope.utils.constant import Tasks 5 | from tqdm import tqdm 6 | 7 | path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k' 8 | path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" 9 | ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise) 10 | def execute_denoise(input_folder,output_folder): 11 | os.makedirs(output_folder,exist_ok=True) 12 | # print(input_folder) 13 | # print(list(os.listdir(input_folder).sort())) 14 | for name in tqdm(os.listdir(input_folder)): 15 | ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-i", "--input_folder", type=str, required=True, 20 | help="Path to the folder containing WAV files.") 21 | parser.add_argument("-o", "--output_folder", type=str, required=True, 22 | help="Output folder to store transcriptions.") 23 | parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], 24 | help="fp16 or fp32")#还没接入 25 | cmd = parser.parse_args() 26 | execute_denoise( 27 | input_folder = cmd.input_folder, 28 | output_folder = cmd.output_folder, 29 | ) -------------------------------------------------------------------------------- /tools/denoise-model/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /tools/my_utils.py: -------------------------------------------------------------------------------- 1 | import platform,os,traceback 2 | import ffmpeg 3 | import numpy as np 4 | 5 | 6 | def load_audio(file, sr): 7 | try: 8 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 9 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 10 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 11 | file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 12 | if os.path.exists(file) == False: 13 | raise RuntimeError( 14 | "You input a wrong audio path that does not exists, please fix it!" 
15 | ) 16 | out, _ = ( 17 | ffmpeg.input(file, threads=0) 18 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 19 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 20 | ) 21 | except Exception as e: 22 | traceback.print_exc() 23 | raise RuntimeError(f"Failed to load audio: {e}") 24 | 25 | return np.frombuffer(out, np.float32).flatten() 26 | 27 | 28 | def clean_path(path_str): 29 | if platform.system() == 'Windows': 30 | path_str = path_str.replace('/', '\\') 31 | return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 32 | -------------------------------------------------------------------------------- /tools/process_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import argparse 4 | import re 5 | 6 | def convert_xlsx_to_txt(name, language) -> None: 7 | # 读取xlsx文件 8 | xlsx_path = os.path.join('data', name, 'lines.xlsx') 9 | df = pd.read_excel(xlsx_path, header=None) # 不使用表头 10 | 11 | # 创建一个字典来存储音频名称和对应的文本 12 | audio_text_dict = {} 13 | for index, row in df.iterrows(): 14 | text = row[0] # 第一列 15 | audio_name = row[1] # 第二列 16 | if pd.isna(text): 17 | continue 18 | # 删除括号里的部分,包括中文括号 19 | text = re.sub(r'\(.*?\)', '', text).strip() 20 | text = re.sub(r'\[.*?\]', '', text).strip() 21 | text = re.sub(r'(.*?)', '', text).strip() # 处理中文括号 22 | if not text: 23 | continue 24 | audio_text_dict[audio_name] = text 25 | 26 | # 遍历vocal文件夹下的所有wav文件 27 | vocal_dir = os.path.join('data', name, 'vocal') 28 | wav_files = [f for f in os.listdir(vocal_dir) if f.endswith('.wav')] 29 | 30 | total_wav_count = len(wav_files) 31 | no_text_count = 0 32 | 33 | # 打开name.txt文件进行写入 34 | txt_path = os.path.join('data', name, f'{name}.txt') 35 | with open(txt_path, 'w', encoding='utf-8') as f: 36 | for wav_file in wav_files: 37 | # 去掉末尾的-00x 38 | base_name = re.sub(r'-\d{3}$', '', wav_file[:-4]) 39 | 40 | if base_name in audio_text_dict: 41 | text = audio_text_dict[base_name] 42 | f.write(f"{wav_file}|{name}|{language}|{text}\n") 43 | else: 44 | print(f"Warning: {wav_file} does not have a corresponding text.") 45 | no_text_count += 1 46 | 47 | print(f"Total WAV files: {total_wav_count}") 48 | print(f"WAV files without corresponding text: {no_text_count}") 49 | 50 | def main() -> None: 51 | parser = argparse.ArgumentParser(description='Convert lines.xlsx to name.txt') 52 | parser.add_argument('-n', '--name', type=str, help='Name of the dataset') 53 | parser.add_argument('-l', '--language', type=str, help='Language code (e.g., zh, ja, en)') 54 | 55 | args = parser.parse_args() 56 | 57 | convert_xlsx_to_txt(args.name, args.language) 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /tools/slice_audio.py: -------------------------------------------------------------------------------- 1 | import os,sys,numpy as np 2 | import traceback 3 | # parent_directory = os.path.dirname(os.path.abspath(__file__)) 4 | # sys.path.append(parent_directory) 5 | from my_utils import load_audio 6 | from slicer2 import Slicer 7 | import os.path 8 | from argparse import ArgumentParser 9 | import soundfile 10 | import librosa 11 | 12 | def slice(input_path: str, 13 | output_path: str, 14 | db_threshold: int=-40, 15 | min_length: int=5000, 16 | min_interval: int=300, 17 | hop_size: int=10, 18 | max_sil_kept: int=500, 19 | max_amp: float=1.0, 20 | alpha: float=0): 21 | 22 | os.makedirs(output_path, exist_ok=True) 23 | if 
os.path.isfile(input_path): 24 | input_files=[input_path] 25 | elif os.path.isdir(input_path): 26 | input_files=[os.path.join(input_path, name) for name in sorted(list(os.listdir(input_path)))] 27 | else: 28 | return "输入路径存在但既不是文件也不是文件夹" 29 | 30 | max_amp=float(max_amp) 31 | alpha=float(alpha) 32 | for file in input_files: 33 | try: 34 | audio, sr = librosa.load(file, sr=None, mono=False) 35 | # print(audio.shape) 36 | 37 | slicer = Slicer( 38 | sr=sr, # 长音频采样率 39 | threshold= int(db_threshold), # 音量小于这个值视作静音的备选切割点 40 | min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 41 | min_interval= int(min_interval), # 最短切割间隔 42 | hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) 43 | max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长 44 | ) 45 | 46 | chunks = slicer.slice(audio) 47 | for i, (chunk, start, end) in enumerate(chunks): 48 | if len(chunk.shape) > 1: 49 | chunk = chunk.T 50 | tmp_max = np.abs(chunk).max() 51 | if(tmp_max>1):chunk/=tmp_max 52 | chunk = (chunk / tmp_max * (max_amp * alpha)) + (1 - alpha) * chunk 53 | soundfile.write( 54 | os.path.join( 55 | output_path, 56 | f"%s_%d.wav" 57 | % (os.path.basename(file).rsplit(".", maxsplit=1)[0], i), 58 | ), 59 | chunk, 60 | sr, 61 | ) 62 | except: 63 | print(file,"->fail->",traceback.format_exc()) 64 | return "执行完毕,请检查输出文件" 65 | 66 | 67 | if __name__ == "__main__": 68 | parser = ArgumentParser() 69 | parser.add_argument( 70 | "--input_path", 71 | type=str, 72 | help="The audios to be sliced, can be a single file or a directory" 73 | ) 74 | parser.add_argument( 75 | "--output_path", 76 | type=str, 77 | help="Output directory of the sliced audio clips" 78 | ) 79 | parser.add_argument( 80 | "--db_threshold", 81 | type=float, 82 | required=False, 83 | default=-40, 84 | help="The dB threshold for silence detection", 85 | ) 86 | parser.add_argument( 87 | "--min_length", 88 | type=int, 89 | required=False, 90 | default=5000, 91 | help="The minimum milliseconds required for each sliced audio clip", 92 | ) 93 | parser.add_argument( 94 | "--min_interval", 95 | type=int, 96 | required=False, 97 | default=300, 98 | help="The minimum milliseconds for a silence part to be sliced", 99 | ) 100 | parser.add_argument( 101 | "--hop_size", 102 | type=int, 103 | required=False, 104 | default=10, 105 | help="Frame length in milliseconds", 106 | ) 107 | parser.add_argument( 108 | "--max_sil_kept", 109 | type=int, 110 | required=False, 111 | default=500, 112 | help="The maximum silence length kept around the sliced clip, presented in milliseconds", 113 | ) 114 | parser.add_argument( 115 | "--max_amp", 116 | type=float, 117 | required=False, 118 | default=1.0, 119 | help="The maximum amplitude of the sliced audio clips", 120 | ) 121 | parser.add_argument( 122 | "--alpha", 123 | type=float, 124 | required=False, 125 | default=0, 126 | help="The alpha value for amplitude adjustment", 127 | ) 128 | args = parser.parse_args() 129 | 130 | slice(**args.__dict__) 131 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . 
import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import 
torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_123821KB.py: 
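layers_123821KB.py below repeats the Conv2DBNActiv / SeperableConv2DBNActiv / Encoder / Decoder / ASPPModule definitions already seen in layers.py and layers_123812KB.py; the numeric suffixes presumably tie each copy to the pretrained UVR5 checkpoint that imports it (an assumption, the code itself does not say). The shared Encoder/Decoder pair implements the usual U-Net skip pattern: the encoder returns both the strided feature map and the pre-stride skip, and the decoder upsamples by 2, aligns the skip via spec_utils.crop_center, concatenates, and convolves. A minimal shape check, illustrative only; it assumes lib_v5 has been made importable as a package, and the dotted import path below is hypothetical:

```python
# Illustrative shape check of the Encoder/Decoder skip pattern defined above.
# Assumption: the package-style import path is hypothetical; adjust sys.path
# or the import to match however these modules are actually loaded.
import torch
from tools.uvr5.lib.lib_v5 import layers_123812KB as layers

x = torch.randn(1, 2, 64, 128)        # (batch, channels, freq_bins, frames)
enc = layers.Encoder(2, 16, ksize=3, stride=2, pad=1)
dec = layers.Decoder(16 + 16, 16)     # nin = upsampled channels + skip channels

h, skip = enc(x)                      # h: (1, 16, 32, 64), skip: (1, 16, 64, 128)
y = dec(h, skip)                      # upsample x2, align skip, concat, 3x3 conv
print(h.shape, skip.shape, y.shape)   # (1,16,32,64) (1,16,64,128) (1,16,64,128)
```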
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.bottleneck = nn.Sequential( 104 | Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 105 | ) 106 | 107 | def forward(self, x): 108 | _, _, h, w = x.size() 109 | feat1 = F.interpolate( 110 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 111 | ) 112 | feat2 = self.conv2(x) 113 | feat3 = self.conv3(x) 114 | feat4 = self.conv4(x) 115 | feat5 = self.conv5(x) 116 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 117 | bottle = self.bottleneck(out) 118 | return bottle 119 | 
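The three variants that follow (layers_33966KB.py, layers_537227KB.py, layers_537238KB.py) differ from the file above only in the ASPP block: they accept five dilation values and stack seven parallel branches (conv3 through conv7, bottleneck input nin * 7) instead of five (nin * 5); note that conv6 and conv7 reuse dilations[2], so the last two entries of the default (4, 8, 16, 32, 64) are never actually read. In every variant each branch preserves the input's (h, w): the pooled branch is re-interpolated to size (h, w) and the dilated separable convolutions use padding equal to their dilation, which is what makes the channel-wise concatenation valid. A sketch of that invariant, under the same hypothetical import assumption as above:

```python
# Sketch: every ASPP branch keeps the spatial size, so the concat along dim=1
# yields nin * 5 channels, which the 1x1 bottleneck maps back down to nout.
# Assumption: the package-style import path is illustrative only.
import torch
from tools.uvr5.lib.lib_v5 import layers_123821KB as layers

aspp = layers.ASPPModule(nin=32, nout=64, dilations=(4, 8, 16))
x = torch.randn(1, 32, 32, 128)       # (batch, nin, freq_bins, frames)
y = aspp(x)                           # 5 branches of 32 ch -> 160 ch -> bottleneck
print(y.shape)                        # torch.Size([1, 64, 32, 128])
```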
-------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, 
w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_537227KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | 
) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_537238KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class SeperableConv2DBNActiv(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 31 | super(SeperableConv2DBNActiv, self).__init__() 32 | self.conv = nn.Sequential( 33 | nn.Conv2d( 34 | nin, 35 | nin, 36 | kernel_size=ksize, 37 | stride=stride, 38 | padding=pad, 39 | dilation=dilation, 40 | groups=nin, 41 | bias=False, 42 | ), 43 | nn.Conv2d(nin, nout, kernel_size=1, bias=False), 44 | nn.BatchNorm2d(nout), 45 | activ(), 46 | ) 47 | 48 | def __call__(self, x): 49 | return self.conv(x) 50 | 51 | 52 | class Encoder(nn.Module): 53 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 54 | super(Encoder, self).__init__() 55 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 56 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) 57 | 58 | def __call__(self, x): 59 | skip = self.conv1(x) 60 | h = self.conv2(skip) 61 | 62 | return h, skip 63 | 64 | 65 | class Decoder(nn.Module): 66 | def __init__( 67 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 68 | ): 69 | super(Decoder, self).__init__() 70 | self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 71 | self.dropout = nn.Dropout2d(0.1) if dropout else None 72 | 73 | def __call__(self, x, skip=None): 74 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 75 | if skip is not None: 76 | skip = spec_utils.crop_center(skip, x) 77 | x = torch.cat([x, skip], dim=1) 78 | h = self.conv(x) 79 | 80 | if self.dropout is not None: 81 | h = self.dropout(h) 82 | 83 | return h 84 | 85 | 86 | class ASPPModule(nn.Module): 87 | def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): 88 | super(ASPPModule, self).__init__() 89 | self.conv1 = nn.Sequential( 90 | nn.AdaptiveAvgPool2d((1, None)), 91 | Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), 92 | ) 93 | self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) 94 | self.conv3 = SeperableConv2DBNActiv( 95 | nin, nin, 3, 1, dilations[0], 
dilations[0], activ=activ 96 | ) 97 | self.conv4 = SeperableConv2DBNActiv( 98 | nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 99 | ) 100 | self.conv5 = SeperableConv2DBNActiv( 101 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 102 | ) 103 | self.conv6 = SeperableConv2DBNActiv( 104 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 105 | ) 106 | self.conv7 = SeperableConv2DBNActiv( 107 | nin, nin, 3, 1, dilations[2], dilations[2], activ=activ 108 | ) 109 | self.bottleneck = nn.Sequential( 110 | Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) 111 | ) 112 | 113 | def forward(self, x): 114 | _, _, h, w = x.size() 115 | feat1 = F.interpolate( 116 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 117 | ) 118 | feat2 = self.conv2(x) 119 | feat3 = self.conv3(x) 120 | feat4 = self.conv4(x) 121 | feat5 = self.conv5(x) 122 | feat6 = self.conv6(x) 123 | feat7 = self.conv7(x) 124 | out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) 125 | bottle = self.bottleneck(out) 126 | return bottle 127 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/layers_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import spec_utils 6 | 7 | 8 | class Conv2DBNActiv(nn.Module): 9 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): 10 | super(Conv2DBNActiv, self).__init__() 11 | self.conv = nn.Sequential( 12 | nn.Conv2d( 13 | nin, 14 | nout, 15 | kernel_size=ksize, 16 | stride=stride, 17 | padding=pad, 18 | dilation=dilation, 19 | bias=False, 20 | ), 21 | nn.BatchNorm2d(nout), 22 | activ(), 23 | ) 24 | 25 | def __call__(self, x): 26 | return self.conv(x) 27 | 28 | 29 | class Encoder(nn.Module): 30 | def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): 31 | super(Encoder, self).__init__() 32 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) 33 | self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 34 | 35 | def __call__(self, x): 36 | h = self.conv1(x) 37 | h = self.conv2(h) 38 | 39 | return h 40 | 41 | 42 | class Decoder(nn.Module): 43 | def __init__( 44 | self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False 45 | ): 46 | super(Decoder, self).__init__() 47 | self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) 48 | # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) 49 | self.dropout = nn.Dropout2d(0.1) if dropout else None 50 | 51 | def __call__(self, x, skip=None): 52 | x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) 53 | 54 | if skip is not None: 55 | skip = spec_utils.crop_center(skip, x) 56 | x = torch.cat([x, skip], dim=1) 57 | 58 | h = self.conv1(x) 59 | # h = self.conv2(h) 60 | 61 | if self.dropout is not None: 62 | h = self.dropout(h) 63 | 64 | return h 65 | 66 | 67 | class ASPPModule(nn.Module): 68 | def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): 69 | super(ASPPModule, self).__init__() 70 | self.conv1 = nn.Sequential( 71 | nn.AdaptiveAvgPool2d((1, None)), 72 | Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), 73 | ) 74 | self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) 75 | self.conv3 = Conv2DBNActiv( 76 | nin, nout, 3, 1, dilations[0], dilations[0], activ=activ 77 | ) 78 | self.conv4 = Conv2DBNActiv( 79 | nin, nout, 3, 1, 
dilations[1], dilations[1], activ=activ 80 | ) 81 | self.conv5 = Conv2DBNActiv( 82 | nin, nout, 3, 1, dilations[2], dilations[2], activ=activ 83 | ) 84 | self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) 85 | self.dropout = nn.Dropout2d(0.1) if dropout else None 86 | 87 | def forward(self, x): 88 | _, _, h, w = x.size() 89 | feat1 = F.interpolate( 90 | self.conv1(x), size=(h, w), mode="bilinear", align_corners=True 91 | ) 92 | feat2 = self.conv2(x) 93 | feat3 = self.conv3(x) 94 | feat4 = self.conv4(x) 95 | feat5 = self.conv5(x) 96 | out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) 97 | out = self.bottleneck(out) 98 | 99 | if self.dropout is not None: 100 | out = self.dropout(out) 101 | 102 | return out 103 | 104 | 105 | class LSTMModule(nn.Module): 106 | def __init__(self, nin_conv, nin_lstm, nout_lstm): 107 | super(LSTMModule, self).__init__() 108 | self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) 109 | self.lstm = nn.LSTM( 110 | input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True 111 | ) 112 | self.dense = nn.Sequential( 113 | nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() 114 | ) 115 | 116 | def forward(self, x): 117 | N, _, nbins, nframes = x.size() 118 | h = self.conv(x)[:, 0] # N, nbins, nframes 119 | h = h.permute(2, 0, 1) # nframes, N, nbins 120 | h, _ = self.lstm(h) 121 | h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins 122 | h = h.reshape(nframes, N, 1, nbins) 123 | h = h.permute(1, 2, 3, 0) 124 | 125 | return h 126 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/model_param_init.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pathlib 4 | 5 | default_param = {} 6 | default_param["bins"] = 768 7 | default_param["unstable_bins"] = 9 # training only 8 | default_param["reduction_bins"] = 762 # training only 9 | default_param["sr"] = 44100 10 | default_param["pre_filter_start"] = 757 11 | default_param["pre_filter_stop"] = 768 12 | default_param["band"] = {} 13 | 14 | 15 | default_param["band"][1] = { 16 | "sr": 11025, 17 | "hl": 128, 18 | "n_fft": 960, 19 | "crop_start": 0, 20 | "crop_stop": 245, 21 | "lpf_start": 61, # inference only 22 | "res_type": "polyphase", 23 | } 24 | 25 | default_param["band"][2] = { 26 | "sr": 44100, 27 | "hl": 512, 28 | "n_fft": 1536, 29 | "crop_start": 24, 30 | "crop_stop": 547, 31 | "hpf_start": 81, # inference only 32 | "res_type": "sinc_best", 33 | } 34 | 35 | 36 | def int_keys(d): 37 | r = {} 38 | for k, v in d: 39 | if k.isdigit(): 40 | k = int(k) 41 | r[k] = v 42 | return r 43 | 44 | 45 | class ModelParameters(object): 46 | def __init__(self, config_path=""): 47 | if ".pth" == pathlib.Path(config_path).suffix: 48 | import zipfile 49 | 50 | with zipfile.ZipFile(config_path, "r") as zip: 51 | self.param = json.loads( 52 | zip.read("param.json"), object_pairs_hook=int_keys 53 | ) 54 | elif ".json" == pathlib.Path(config_path).suffix: 55 | with open(config_path, "r") as f: 56 | self.param = json.loads(f.read(), object_pairs_hook=int_keys) 57 | else: 58 | self.param = default_param 59 | 60 | for k in [ 61 | "mid_side", 62 | "mid_side_b", 63 | "mid_side_b2", 64 | "stereo_w", 65 | "stereo_n", 66 | "reverse", 67 | ]: 68 | if not k in self.param: 69 | self.param[k] = False 70 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json: 
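The modelparams/*.json files that follow, beginning with 1band_sr16000_hl512.json, are the band-split configurations read by the ModelParameters class defined just above: the top-level keys describe the full-band spectrogram (bins, sr, pre_filter_start/stop), each entry under "band" describes one resampled band (sr, hl (presumably the STFT hop length), n_fft, crop_start/stop, the lpf/hpf bin boundaries flagged "inference only" in the defaults, and res_type for the resampler), int_keys() turns the string band indices into integers, and any stereo/mid-side flag a config omits is filled in as False. A loading sketch, with the same caveat that the import path is assumed for illustration:

```python
# Sketch: reading one of the configs below through ModelParameters.
# Assumption: the import path is illustrative; adjust sys.path as needed.
from tools.uvr5.lib.lib_v5.model_param_init import ModelParameters

mp = ModelParameters("tools/uvr5/lib/lib_v5/modelparams/2band_32000.json")
print(mp.param["sr"], mp.param["bins"])      # 32000 768, full-band settings
for band, cfg in sorted(mp.param["band"].items()):
    # int_keys() converted "1"/"2" into ints, so bands sort numerically
    print(band, cfg["sr"], cfg["hl"], cfg["n_fft"], cfg["res_type"])
print(mp.param["mid_side"])                  # False, omitted flags default to False
```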
-------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 16000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 16000, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 32000, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "kaiser_fast" 14 | } 15 | }, 16 | "sr": 32000, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 33075, 8 | "hl": 384, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 33075, 17 | "pre_filter_start": 1000, 18 | "pre_filter_stop": 1021 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 1024, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 256, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 256, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 256, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 256, 18 | "pre_filter_stop": 256 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | "reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 1024, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 1024 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 1024, 3 | "unstable_bins": 0, 4 | 
"reduction_bins": 0, 5 | "band": { 6 | "1": { 7 | "sr": 44100, 8 | "hl": 512, 9 | "n_fft": 2048, 10 | "crop_start": 0, 11 | "crop_stop": 700, 12 | "hpf_start": -1, 13 | "res_type": "sinc_best" 14 | } 15 | }, 16 | "sr": 44100, 17 | "pre_filter_start": 1023, 18 | "pre_filter_stop": 700 19 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_32000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 118, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 32000, 18 | "hl": 352, 19 | "n_fft": 1024, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 44, 23 | "hpf_stop": 23, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 32000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 512, 3 | "unstable_bins": 7, 4 | "reduction_bins": 510, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 160, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 192, 12 | "lpf_start": 41, 13 | "lpf_stop": 139, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 44100, 18 | "hl": 640, 19 | "n_fft": 1024, 20 | "crop_start": 10, 21 | "crop_stop": 320, 22 | "hpf_start": 47, 23 | "hpf_stop": 15, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 44100, 28 | "pre_filter_start": 510, 29 | "pre_filter_stop": 512 30 | } 31 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/2band_48000.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 705, 5 | "band": { 6 | "1": { 7 | "sr": 6000, 8 | "hl": 66, 9 | "n_fft": 512, 10 | "crop_start": 0, 11 | "crop_stop": 240, 12 | "lpf_start": 60, 13 | "lpf_stop": 240, 14 | "res_type": "sinc_fastest" 15 | }, 16 | "2": { 17 | "sr": 48000, 18 | "hl": 528, 19 | "n_fft": 1536, 20 | "crop_start": 22, 21 | "crop_stop": 505, 22 | "hpf_start": 82, 23 | "hpf_stop": 22, 24 | "res_type": "sinc_medium" 25 | } 26 | }, 27 | "sr": 48000, 28 | "pre_filter_start": 710, 29 | "pre_filter_stop": 731 30 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 5, 4 | "reduction_bins": 733, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 768, 10 | "crop_start": 0, 11 | "crop_stop": 278, 12 | "lpf_start": 28, 13 | "lpf_stop": 140, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 22050, 18 | "hl": 256, 19 | "n_fft": 768, 20 | "crop_start": 14, 21 | "crop_stop": 322, 22 | "hpf_start": 70, 23 | "hpf_stop": 14, 24 | "lpf_start": 283, 25 | "lpf_stop": 314, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 44100, 30 | "hl": 512, 31 | "n_fft": 768, 32 | "crop_start": 131, 33 | "crop_stop": 313, 34 | "hpf_start": 154, 35 | "hpf_stop": 141, 36 | "res_type": "sinc_medium" 37 | } 
38 | }, 39 | "sr": 44100, 40 | "pre_filter_start": 757, 41 | "pre_filter_stop": 768 42 | } 43 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side": true, 3 | "bins": 768, 4 | "unstable_bins": 5, 5 | "reduction_bins": 733, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 768, 11 | "crop_start": 0, 12 | "crop_stop": 278, 13 | "lpf_start": 28, 14 | "lpf_stop": 140, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 256, 20 | "n_fft": 768, 21 | "crop_start": 14, 22 | "crop_stop": 322, 23 | "hpf_start": 70, 24 | "hpf_stop": 14, 25 | "lpf_start": 283, 26 | "lpf_stop": 314, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 512, 32 | "n_fft": 768, 33 | "crop_start": 131, 34 | "crop_stop": 313, 35 | "hpf_start": 154, 36 | "hpf_stop": 141, 37 | "res_type": "sinc_medium" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 757, 42 | "pre_filter_stop": 768 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 640, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 187, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 768, 21 | "crop_start": 0, 22 | "crop_stop": 212, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 174, 26 | "lpf_stop": 209, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 640, 33 | "crop_start": 66, 34 | "crop_stop": 307, 35 | "hpf_start": 86, 36 | "hpf_stop": 72, 37 | "res_type": "kaiser_fast" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 639, 42 | "pre_filter_stop": 640 43 | } 44 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "reduction_bins": 668, 5 | "band": { 6 | "1": { 7 | "sr": 11025, 8 | "hl": 128, 9 | "n_fft": 1024, 10 | "crop_start": 0, 11 | "crop_stop": 186, 12 | "lpf_start": 37, 13 | "lpf_stop": 73, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 11025, 18 | "hl": 128, 19 | "n_fft": 512, 20 | "crop_start": 4, 21 | "crop_stop": 185, 22 | "hpf_start": 36, 23 | "hpf_stop": 18, 24 | "lpf_start": 93, 25 | "lpf_stop": 185, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 22050, 30 | "hl": 256, 31 | "n_fft": 512, 32 | "crop_start": 46, 33 | "crop_stop": 186, 34 | "hpf_start": 93, 35 | "hpf_stop": 46, 36 | "lpf_start": 164, 37 | "lpf_stop": 186, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 512, 43 | "n_fft": 768, 44 | "crop_start": 121, 45 | "crop_stop": 382, 46 | "hpf_start": 138, 47 | "hpf_stop": 123, 48 | "res_type": "sinc_medium" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 740, 53 | "pre_filter_stop": 768 54 | } 55 | -------------------------------------------------------------------------------- 
/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 768, 3 | "unstable_bins": 7, 4 | "mid_side": true, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } 56 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 
| "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json: -------------------------------------------------------------------------------- 1 | { 2 | "reverse": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json: -------------------------------------------------------------------------------- 1 | { 2 | "stereo_w": true, 3 | "bins": 768, 4 | "unstable_bins": 7, 5 | "reduction_bins": 668, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 128, 10 | "n_fft": 1024, 11 | "crop_start": 0, 12 | "crop_stop": 186, 13 | "lpf_start": 37, 14 | "lpf_stop": 73, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 11025, 19 | "hl": 128, 20 | "n_fft": 512, 21 | "crop_start": 4, 22 | "crop_stop": 185, 23 | "hpf_start": 36, 24 | "hpf_stop": 18, 25 | "lpf_start": 93, 26 | "lpf_stop": 185, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 22050, 31 | "hl": 256, 32 | "n_fft": 512, 33 | "crop_start": 46, 34 | "crop_stop": 186, 35 | "hpf_start": 93, 36 | "hpf_stop": 46, 37 | "lpf_start": 164, 38 | "lpf_stop": 186, 39 | "res_type": "polyphase" 40 | }, 41 | "4": { 42 | "sr": 44100, 43 | "hl": 512, 44 | "n_fft": 768, 45 | "crop_start": 121, 46 | "crop_stop": 382, 47 | "hpf_start": 138, 48 | "hpf_stop": 123, 49 | "res_type": "sinc_medium" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 740, 54 | "pre_filter_stop": 768 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | 
"crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 637, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "convert_channels": "stereo_n", 49 | "res_type": "kaiser_fast" 50 | } 51 | }, 52 | "sr": 44100, 53 | "pre_filter_start": 668, 54 | "pre_filter_stop": 672 55 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/4band_v3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bins": 672, 3 | "unstable_bins": 8, 4 | "reduction_bins": 530, 5 | "band": { 6 | "1": { 7 | "sr": 7350, 8 | "hl": 80, 9 | "n_fft": 640, 10 | "crop_start": 0, 11 | "crop_stop": 85, 12 | "lpf_start": 25, 13 | "lpf_stop": 53, 14 | "res_type": "polyphase" 15 | }, 16 | "2": { 17 | "sr": 7350, 18 | "hl": 80, 19 | "n_fft": 320, 20 | "crop_start": 4, 21 | "crop_stop": 87, 22 | "hpf_start": 25, 23 | "hpf_stop": 12, 24 | "lpf_start": 31, 25 | "lpf_stop": 62, 26 | "res_type": "polyphase" 27 | }, 28 | "3": { 29 | "sr": 14700, 30 | "hl": 160, 31 | "n_fft": 512, 32 | "crop_start": 17, 33 | "crop_stop": 216, 34 | "hpf_start": 48, 35 | "hpf_stop": 24, 36 | "lpf_start": 139, 37 | "lpf_stop": 210, 38 | "res_type": "polyphase" 39 | }, 40 | "4": { 41 | "sr": 44100, 42 | "hl": 480, 43 | "n_fft": 960, 44 | "crop_start": 78, 45 | "crop_stop": 383, 46 | "hpf_start": 130, 47 | "hpf_stop": 86, 48 | "res_type": "kaiser_fast" 49 | } 50 | }, 51 | "sr": 44100, 52 | "pre_filter_start": 668, 53 | "pre_filter_stop": 672 54 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/modelparams/ensemble.json: -------------------------------------------------------------------------------- 1 | { 2 | "mid_side_b2": true, 3 | "bins": 1280, 4 | "unstable_bins": 7, 5 | "reduction_bins": 565, 6 | "band": { 7 | "1": { 8 | "sr": 11025, 9 | "hl": 108, 10 | "n_fft": 2048, 11 | "crop_start": 0, 12 | "crop_stop": 374, 13 | "lpf_start": 92, 14 | "lpf_stop": 186, 15 | "res_type": "polyphase" 16 | }, 17 | "2": { 18 | "sr": 22050, 19 | "hl": 216, 20 | "n_fft": 1536, 21 | "crop_start": 0, 22 | "crop_stop": 
424, 23 | "hpf_start": 68, 24 | "hpf_stop": 34, 25 | "lpf_start": 348, 26 | "lpf_stop": 418, 27 | "res_type": "polyphase" 28 | }, 29 | "3": { 30 | "sr": 44100, 31 | "hl": 432, 32 | "n_fft": 1280, 33 | "crop_start": 132, 34 | "crop_stop": 614, 35 | "hpf_start": 172, 36 | "hpf_stop": 144, 37 | "res_type": "polyphase" 38 | } 39 | }, 40 | "sr": 44100, 41 | "pre_filter_start": 1280, 42 | "pre_filter_stop": 1280 43 | } -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets.py: -------------------------------------------------------------------------------- 1 | import layers 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import spec_utils 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 16) 44 | self.stg1_high_band_net = BaseASPPNet(2, 16) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(8, 16) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(16, 32) 51 | 52 | self.out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if 
aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_123812KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = 
F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_123821KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 
90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_33966KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_33966KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16, 32)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 16) 43 | self.stg1_high_band_net = BaseASPPNet(2, 16) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(8, 16) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(16, 32) 50 | 51 | self.out = nn.Conv2d(32, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = 
F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_537227KB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import layers_537238KB as layers 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 64) 44 | self.stg1_high_band_net = BaseASPPNet(2, 64) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(32, 64) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(64, 128) 51 | 52 | self.out = nn.Conv2d(128, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] // 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 
= self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_537238KB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | from . import layers_537238KB as layers 7 | 8 | 9 | class BaseASPPNet(nn.Module): 10 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 11 | super(BaseASPPNet, self).__init__() 12 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 13 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 14 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 15 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 16 | 17 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 18 | 19 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 20 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 21 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 22 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 23 | 24 | def __call__(self, x): 25 | h, e1 = self.enc1(x) 26 | h, e2 = self.enc2(h) 27 | h, e3 = self.enc3(h) 28 | h, e4 = self.enc4(h) 29 | 30 | h = self.aspp(h) 31 | 32 | h = self.dec4(h, e4) 33 | h = self.dec3(h, e3) 34 | h = self.dec2(h, e2) 35 | h = self.dec1(h, e1) 36 | 37 | return h 38 | 39 | 40 | class CascadedASPPNet(nn.Module): 41 | def __init__(self, n_fft): 42 | super(CascadedASPPNet, self).__init__() 43 | self.stg1_low_band_net = BaseASPPNet(2, 64) 44 | self.stg1_high_band_net = BaseASPPNet(2, 64) 45 | 46 | self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 47 | self.stg2_full_band_net = BaseASPPNet(32, 64) 48 | 49 | self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) 50 | self.stg3_full_band_net = BaseASPPNet(64, 128) 51 | 52 | self.out = nn.Conv2d(128, 2, 1, bias=False) 53 | self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) 54 | self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) 55 | 56 | self.max_bin = n_fft // 2 57 | self.output_bin = n_fft // 2 + 1 58 | 59 | self.offset = 128 60 | 61 | def forward(self, x, aggressiveness=None): 62 | mix = x.detach() 63 | x = x.clone() 64 | 65 | x = x[:, :, : self.max_bin] 66 | 67 | bandw = x.size()[2] 
// 2 68 | aux1 = torch.cat( 69 | [ 70 | self.stg1_low_band_net(x[:, :, :bandw]), 71 | self.stg1_high_band_net(x[:, :, bandw:]), 72 | ], 73 | dim=2, 74 | ) 75 | 76 | h = torch.cat([x, aux1], dim=1) 77 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 78 | 79 | h = torch.cat([x, aux1, aux2], dim=1) 80 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 81 | 82 | mask = torch.sigmoid(self.out(h)) 83 | mask = F.pad( 84 | input=mask, 85 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 86 | mode="replicate", 87 | ) 88 | 89 | if self.training: 90 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 91 | aux1 = F.pad( 92 | input=aux1, 93 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 94 | mode="replicate", 95 | ) 96 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 97 | aux2 = F.pad( 98 | input=aux2, 99 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 100 | mode="replicate", 101 | ) 102 | return mask * mix, aux1 * mix, aux2 * mix 103 | else: 104 | if aggressiveness: 105 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 106 | mask[:, :, : aggressiveness["split_bin"]], 107 | 1 + aggressiveness["value"] / 3, 108 | ) 109 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 110 | mask[:, :, aggressiveness["split_bin"] :], 111 | 1 + aggressiveness["value"], 112 | ) 113 | 114 | return mask * mix 115 | 116 | def predict(self, x_mag, aggressiveness=None): 117 | h = self.forward(x_mag, aggressiveness) 118 | 119 | if self.offset > 0: 120 | h = h[:, :, :, self.offset : -self.offset] 121 | assert h.size()[3] > 0 122 | 123 | return h 124 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_61968KB.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . 
import layers_123821KB as layers 6 | 7 | 8 | class BaseASPPNet(nn.Module): 9 | def __init__(self, nin, ch, dilations=(4, 8, 16)): 10 | super(BaseASPPNet, self).__init__() 11 | self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) 12 | self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) 13 | self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) 14 | self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) 15 | 16 | self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) 17 | 18 | self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) 19 | self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) 20 | self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) 21 | self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) 22 | 23 | def __call__(self, x): 24 | h, e1 = self.enc1(x) 25 | h, e2 = self.enc2(h) 26 | h, e3 = self.enc3(h) 27 | h, e4 = self.enc4(h) 28 | 29 | h = self.aspp(h) 30 | 31 | h = self.dec4(h, e4) 32 | h = self.dec3(h, e3) 33 | h = self.dec2(h, e2) 34 | h = self.dec1(h, e1) 35 | 36 | return h 37 | 38 | 39 | class CascadedASPPNet(nn.Module): 40 | def __init__(self, n_fft): 41 | super(CascadedASPPNet, self).__init__() 42 | self.stg1_low_band_net = BaseASPPNet(2, 32) 43 | self.stg1_high_band_net = BaseASPPNet(2, 32) 44 | 45 | self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) 46 | self.stg2_full_band_net = BaseASPPNet(16, 32) 47 | 48 | self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) 49 | self.stg3_full_band_net = BaseASPPNet(32, 64) 50 | 51 | self.out = nn.Conv2d(64, 2, 1, bias=False) 52 | self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) 53 | self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) 54 | 55 | self.max_bin = n_fft // 2 56 | self.output_bin = n_fft // 2 + 1 57 | 58 | self.offset = 128 59 | 60 | def forward(self, x, aggressiveness=None): 61 | mix = x.detach() 62 | x = x.clone() 63 | 64 | x = x[:, :, : self.max_bin] 65 | 66 | bandw = x.size()[2] // 2 67 | aux1 = torch.cat( 68 | [ 69 | self.stg1_low_band_net(x[:, :, :bandw]), 70 | self.stg1_high_band_net(x[:, :, bandw:]), 71 | ], 72 | dim=2, 73 | ) 74 | 75 | h = torch.cat([x, aux1], dim=1) 76 | aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) 77 | 78 | h = torch.cat([x, aux1, aux2], dim=1) 79 | h = self.stg3_full_band_net(self.stg3_bridge(h)) 80 | 81 | mask = torch.sigmoid(self.out(h)) 82 | mask = F.pad( 83 | input=mask, 84 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 85 | mode="replicate", 86 | ) 87 | 88 | if self.training: 89 | aux1 = torch.sigmoid(self.aux1_out(aux1)) 90 | aux1 = F.pad( 91 | input=aux1, 92 | pad=(0, 0, 0, self.output_bin - aux1.size()[2]), 93 | mode="replicate", 94 | ) 95 | aux2 = torch.sigmoid(self.aux2_out(aux2)) 96 | aux2 = F.pad( 97 | input=aux2, 98 | pad=(0, 0, 0, self.output_bin - aux2.size()[2]), 99 | mode="replicate", 100 | ) 101 | return mask * mix, aux1 * mix, aux2 * mix 102 | else: 103 | if aggressiveness: 104 | mask[:, :, : aggressiveness["split_bin"]] = torch.pow( 105 | mask[:, :, : aggressiveness["split_bin"]], 106 | 1 + aggressiveness["value"] / 3, 107 | ) 108 | mask[:, :, aggressiveness["split_bin"] :] = torch.pow( 109 | mask[:, :, aggressiveness["split_bin"] :], 110 | 1 + aggressiveness["value"], 111 | ) 112 | 113 | return mask * mix 114 | 115 | def predict(self, x_mag, aggressiveness=None): 116 | h = self.forward(x_mag, aggressiveness) 117 | 118 | if self.offset > 0: 119 | h = h[:, :, :, self.offset : -self.offset] 120 | assert h.size()[3] > 0 121 | 122 | return h 123 | -------------------------------------------------------------------------------- /tools/uvr5/lib/lib_v5/nets_new.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from . import layers_new 6 | 7 | 8 | class BaseNet(nn.Module): 9 | def __init__( 10 | self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) 11 | ): 12 | super(BaseNet, self).__init__() 13 | self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) 14 | self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) 15 | self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) 16 | self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) 17 | self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) 18 | 19 | self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) 20 | 21 | self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) 22 | self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) 23 | self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) 24 | self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) 25 | self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) 26 | 27 | def __call__(self, x): 28 | e1 = self.enc1(x) 29 | e2 = self.enc2(e1) 30 | e3 = self.enc3(e2) 31 | e4 = self.enc4(e3) 32 | e5 = self.enc5(e4) 33 | 34 | h = self.aspp(e5) 35 | 36 | h = self.dec4(h, e4) 37 | h = self.dec3(h, e3) 38 | h = self.dec2(h, e2) 39 | h = torch.cat([h, self.lstm_dec2(h)], dim=1) 40 | h = self.dec1(h, e1) 41 | 42 | return h 43 | 44 | 45 | class CascadedNet(nn.Module): 46 | def __init__(self, n_fft, nout=32, nout_lstm=128): 47 | super(CascadedNet, self).__init__() 48 | 49 | self.max_bin = n_fft // 2 50 | self.output_bin = n_fft // 2 + 1 51 | self.nin_lstm = self.max_bin // 2 52 | self.offset = 64 53 | 54 | self.stg1_low_band_net = nn.Sequential( 55 | BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), 56 | layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), 57 | ) 58 | 59 | self.stg1_high_band_net = BaseNet( 60 | 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 61 | ) 62 | 63 | self.stg2_low_band_net = nn.Sequential( 64 | BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), 65 | layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), 66 | ) 67 | self.stg2_high_band_net = BaseNet( 68 | nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 69 | ) 70 | 71 | self.stg3_full_band_net = BaseNet( 72 | 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm 73 | ) 74 | 75 | self.out = nn.Conv2d(nout, 2, 1, bias=False) 76 | self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) 77 | 78 | def forward(self, x): 79 | x = x[:, :, : self.max_bin] 80 | 81 | bandw = x.size()[2] // 2 82 | l1_in = x[:, :, :bandw] 83 | h1_in = x[:, :, bandw:] 84 | l1 = self.stg1_low_band_net(l1_in) 85 | h1 = self.stg1_high_band_net(h1_in) 86 | aux1 = torch.cat([l1, h1], dim=2) 87 | 88 | l2_in = torch.cat([l1_in, l1], dim=1) 89 | h2_in = torch.cat([h1_in, h1], dim=1) 90 | l2 = self.stg2_low_band_net(l2_in) 91 | h2 = self.stg2_high_band_net(h2_in) 92 | aux2 = torch.cat([l2, h2], dim=2) 93 | 94 | f3_in = torch.cat([x, aux1, aux2], dim=1) 95 | f3 = self.stg3_full_band_net(f3_in) 96 | 97 | mask = torch.sigmoid(self.out(f3)) 98 | mask = F.pad( 99 | input=mask, 100 | pad=(0, 0, 0, self.output_bin - mask.size()[2]), 101 | mode="replicate", 102 | ) 103 | 104 | if self.training: 105 | aux = torch.cat([aux1, aux2], dim=1) 106 | aux = torch.sigmoid(self.aux_out(aux)) 107 | aux = F.pad( 108 | input=aux, 109 | pad=(0, 0, 0, self.output_bin - aux.size()[2]), 110 | 
mode="replicate", 111 | ) 112 | return mask, aux 113 | else: 114 | return mask 115 | 116 | def predict_mask(self, x): 117 | mask = self.forward(x) 118 | 119 | if self.offset > 0: 120 | mask = mask[:, :, :, self.offset : -self.offset] 121 | assert mask.size()[3] > 0 122 | 123 | return mask 124 | 125 | def predict(self, x, aggressiveness=None): 126 | mask = self.forward(x) 127 | pred_mag = x * mask 128 | 129 | if self.offset > 0: 130 | pred_mag = pred_mag[:, :, :, self.offset : -self.offset] 131 | assert pred_mag.size()[3] > 0 132 | 133 | return pred_mag 134 | -------------------------------------------------------------------------------- /tools/uvr5/lib/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import torch 5 | from tqdm import tqdm 6 | 7 | 8 | def load_data(file_name: str = "./lib/name_params.json") -> dict: 9 | with open(file_name, "r") as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def make_padding(width, cropsize, offset): 16 | left = offset 17 | roi_size = cropsize - left * 2 18 | if roi_size == 0: 19 | roi_size = cropsize 20 | right = roi_size - (width % roi_size) + left 21 | 22 | return left, right, roi_size 23 | 24 | 25 | def inference(X_spec, device, model, aggressiveness, data): 26 | """ 27 | data : dic configs 28 | """ 29 | 30 | def _execute( 31 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True 32 | ): 33 | model.eval() 34 | with torch.no_grad(): 35 | preds = [] 36 | 37 | iterations = [n_window] 38 | 39 | total_iterations = sum(iterations) 40 | for i in tqdm(range(n_window)): 41 | start = i * roi_size 42 | X_mag_window = X_mag_pad[ 43 | None, :, :, start : start + data["window_size"] 44 | ] 45 | X_mag_window = torch.from_numpy(X_mag_window) 46 | if is_half: 47 | X_mag_window = X_mag_window.half() 48 | X_mag_window = X_mag_window.to(device) 49 | 50 | pred = model.predict(X_mag_window, aggressiveness) 51 | 52 | pred = pred.detach().cpu().numpy() 53 | preds.append(pred[0]) 54 | 55 | pred = np.concatenate(preds, axis=2) 56 | return pred 57 | 58 | def preprocess(X_spec): 59 | X_mag = np.abs(X_spec) 60 | X_phase = np.angle(X_spec) 61 | 62 | return X_mag, X_phase 63 | 64 | X_mag, X_phase = preprocess(X_spec) 65 | 66 | coef = X_mag.max() 67 | X_mag_pre = X_mag / coef 68 | 69 | n_frame = X_mag_pre.shape[2] 70 | pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) 71 | n_window = int(np.ceil(n_frame / roi_size)) 72 | 73 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 74 | 75 | if list(model.state_dict().values())[0].dtype == torch.float16: 76 | is_half = True 77 | else: 78 | is_half = False 79 | pred = _execute( 80 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 81 | ) 82 | pred = pred[:, :, :n_frame] 83 | 84 | if data["tta"]: 85 | pad_l += roi_size // 2 86 | pad_r += roi_size // 2 87 | n_window += 1 88 | 89 | X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") 90 | 91 | pred_tta = _execute( 92 | X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half 93 | ) 94 | pred_tta = pred_tta[:, :, roi_size // 2 :] 95 | pred_tta = pred_tta[:, :, :n_frame] 96 | 97 | return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) 98 | else: 99 | return pred * coef, X_mag, np.exp(1.0j * X_phase) 100 | 101 | 102 | def _get_name_params(model_path, model_hash): 103 | data = load_data() 104 | flag = False 105 | ModelName = 
model_path 106 | for type in list(data): 107 | for model in list(data[type][0]): 108 | for i in range(len(data[type][0][model])): 109 | if str(data[type][0][model][i]["hash_name"]) == model_hash: 110 | flag = True 111 | elif str(data[type][0][model][i]["hash_name"]) in ModelName: 112 | flag = True 113 | 114 | if flag: 115 | model_params_auto = data[type][0][model][i]["model_params"] 116 | param_name_auto = data[type][0][model][i]["param_name"] 117 | if type == "equivalent": 118 | return param_name_auto, model_params_auto 119 | else: 120 | flag = False 121 | return param_name_auto, model_params_auto 122 | --------------------------------------------------------------------------------
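The modelparams JSON files above (4band_v3.json, ensemble.json, and the rest) describe how the input mix is split into frequency bands before separation. Each band entry carries the band's resample rate ("sr"), STFT hop length ("hl") and FFT size ("n_fft"), the bin range kept from that band ("crop_start"/"crop_stop"), crossover-filter bins ("lpf_*"/"hpf_*"), and the "res_type" handed to the resampler; the top-level "bins" and "pre_filter_*" fields appear to refer to the stacked, combined spectrogram rather than any single band. The actual multi-band spectrogram construction lives in spec_utils.py (not reproduced in this dump), so the following is only an assumption-laden sketch of what one band entry means, using librosa and examples/dolly.wav from this repository as a stand-in input.

import json

import librosa

with open("tools/uvr5/lib/lib_v5/modelparams/4band_v3.json") as f:
    mp = json.load(f)

# Load the mix at the model's top-level rate (44100 Hz here); mono for brevity.
y, _ = librosa.load("examples/dolly.wav", sr=mp["sr"], mono=True)

band = mp["band"]["1"]  # lowest band: 7350 Hz, hop 80, n_fft 640
y_band = librosa.resample(
    y, orig_sr=mp["sr"], target_sr=band["sr"], res_type=band["res_type"]
)
spec = librosa.stft(y_band, n_fft=band["n_fft"], hop_length=band["hl"])
spec = spec[band["crop_start"]:band["crop_stop"], :]  # keep only this band's bins
print(spec.shape)  # (85, n_frames) for band "1" of 4band_v3.json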
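All of the CascadedASPPNet/CascadedNet variants above expose the same predict(x_mag, aggressiveness) interface with a fixed time-axis crop (self.offset), and tools/uvr5/lib/utils.py drives them window by window over a magnitude spectrogram. Below is a minimal, hedged sketch of how those pieces fit together; it assumes the repository root is on PYTHONPATH and the uvr5 tool requirements are installed, and the n_fft, window_size and aggressiveness values are illustrative placeholders rather than entries read from name_params.json.

import numpy as np
import torch

from tools.uvr5.lib import utils
from tools.uvr5.lib.lib_v5 import nets_61968KB as nets

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

n_fft = 2048
model = nets.CascadedASPPNet(n_fft).to(device).eval()

# Complex stereo spectrogram, shape (2, n_fft // 2 + 1, frames); a random
# placeholder stands in for the STFT of a real mix.
frames = 1024
rng = np.random.default_rng(0)
X_spec = (rng.standard_normal((2, n_fft // 2 + 1, frames))
          + 1j * rng.standard_normal((2, n_fft // 2 + 1, frames))).astype(np.complex64)

data = {"window_size": 512, "tta": False}          # keys read by utils.inference
aggressiveness = {"split_bin": 256, "value": 0.1}  # keys read by model.predict

pred_mag, X_mag, phase = utils.inference(X_spec, device, model, aggressiveness, data)
print(pred_mag.shape)  # same shape as X_mag; multiply by `phase` for a complex spectrogram

utils.inference pads the magnitude with make_padding so that each window of data["window_size"] frames still yields window_size - 2 * model.offset valid frames after predict() crops model.offset frames from both sides; with "tta" enabled it runs a second pass shifted by half the valid window and averages the two predictions.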