├── .github └── workflows │ ├── genlocale.yml │ ├── pull_format.yml │ ├── push_format.yml │ └── unitest.yml ├── .gitignore ├── README.md ├── RVC-GUI.bat ├── config.py ├── docs ├── GUI.JPG └── GUI20230508.JPG ├── infer ├── infer-pm-index256.py ├── train-index.py └── trans_weights.py ├── infer_pack ├── attentions.py ├── commons.py ├── models.py ├── models_onnx.py ├── models_onnx_moess.py ├── modelsv2.py ├── modules.py └── transforms.py ├── my_utils.py ├── requirements.txt ├── rvcgui.py ├── setup.bat ├── trainset_preprocess_pipeline_print.py └── vc_infer_pipeline.py /.github/workflows/genlocale.yml: -------------------------------------------------------------------------------- 1 | name: genlocale 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | golangci: 8 | name: genlocale 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check out 12 | uses: actions/checkout@master 13 | 14 | - name: Run locale generation 15 | run: | 16 | python3 extract_locale.py 17 | cd i18n && python3 locale_diff.py 18 | 19 | - name: Commit back 20 | if: ${{ !github.head_ref }} 21 | continue-on-error: true 22 | run: | 23 | git config --local user.name 'github-actions[bot]' 24 | git config --local user.email '41898282+github-actions[bot]@users.noreply.github.com' 25 | git add --all 26 | git commit -m "🎨 同步 locale" 27 | 28 | - name: Create Pull Request 29 | if: ${{ !github.head_ref }} 30 | continue-on-error: true 31 | uses: peter-evans/create-pull-request@v4 32 | 33 | -------------------------------------------------------------------------------- /.github/workflows/pull_format.yml: -------------------------------------------------------------------------------- 1 | name: pull format 2 | 3 | on: [pull_request] 4 | 5 | permissions: 6 | contents: write 7 | jobs: 8 | pull_format: 9 | runs-on: ubuntu-latest 10 | continue-on-error: true 11 | steps: 12 | - name: checkout 13 | continue-on-error: true 14 | uses: actions/checkout@v3 15 | with: 16 | ref: ${{ github.head_ref }} 17 | fetch-depth: 0 18 | 19 | 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Install Black 26 | run: pip install black 27 | 28 | - name: Run Black 29 | # run: black $(git ls-files '*.py') 30 | run: black . 31 | 32 | - name: Commit Back 33 | uses: stefanzweifel/git-auto-commit-action@v4 34 | with: 35 | commit_message: Apply Code Formatter Change 36 | -------------------------------------------------------------------------------- /.github/workflows/push_format.yml: -------------------------------------------------------------------------------- 1 | name: push format 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | permissions: 9 | contents: write 10 | pull-requests: write 11 | jobs: 12 | push_format: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | with: 17 | ref: ${{github.ref_name}} 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install Black 25 | run: pip install black 26 | 27 | - name: Run Black 28 | # run: black $(git ls-files '*.py') 29 | run: black . 
30 | 31 | - name: Commit Back 32 | continue-on-error: true 33 | id: commitback 34 | run: | 35 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 36 | git config --local user.name "github-actions[bot]" 37 | git add --all 38 | git commit -m "Format code" 39 | 40 | - name: Create Pull Request 41 | if: steps.commitback.outcome == 'success' 42 | continue-on-error: true 43 | uses: peter-evans/create-pull-request@v4 44 | with: 45 | body: Apply Code Formatter Change 46 | commit-message: Automatic code format 47 | -------------------------------------------------------------------------------- /.github/workflows/unitest.yml: -------------------------------------------------------------------------------- 1 | name: unitest 2 | on: [ push, pull_request ] 3 | jobs: 4 | build: 5 | runs-on: ${{ matrix.os }} 6 | strategy: 7 | matrix: 8 | python-version: ["3.8", "3.9", "3.10"] 9 | os: [ubuntu-latest] 10 | fail-fast: false 11 | 12 | steps: 13 | - uses: actions/checkout@master 14 | - name: Set up Python ${{ matrix.python-version }} 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - name: Install dependencies 19 | run: | 20 | sudo apt update 21 | sudo apt -y install ffmpeg 22 | sudo apt -y install -qq aria2 23 | aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d ./ -o hubert_base.pt 24 | python -m pip install --upgrade pip 25 | python -m pip install --upgrade setuptools 26 | python -m pip install --upgrade wheel 27 | pip install torch torchvision torchaudio 28 | pip install -r requirements.txt 29 | - name: Test step 1 & 2 30 | run: | 31 | mkdir -p logs/mi-test 32 | touch logs/mi-test/preprocess.log 33 | python trainset_preprocess_pipeline_print.py logs/mute/0_gt_wavs 48000 8 logs/mi-test True 34 | touch logs/mi-test/extract_f0_feature.log 35 | python extract_f0_print.py logs/mi-test $(nproc) pm 36 | python extract_feature_print.py cpu 1 0 0 logs/mi-test 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | /TEMP 4 | *.pyd 5 | hubert_base.pt 6 | /logs 7 | models/ 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 
3 | # RVC GUI
4 | 
5 | For audio file inference only
6 | 
7 | 
8 | 
9 | 
10 | 
11 | 
12 | 13 | 14 | 15 | 16 | 17 | 18 | ## GUI 19 | 20 | ![GUI](https://github.com/Tiger14n/RVC-GUI/raw/main/docs/GUI.JPG) 21 |

22 | 
23 | ## Direct setup for Windows users
24 | Download the prebuilt package: [Windows-pkg](https://github.com/Tiger14n/RVC-GUI/releases/tag/Windows-pkg)
25 | 
26 | 

27 | ## Preparing the environment
28 | 
29 | 
30 | * Install Python 3.8 or later if you have not already.
31 | 
32 | * Execute the commands for your platform:
33 | 
34 | Windows with Nvidia cards
35 | ```bash
36 | python -m pip install -U pip setuptools wheel
37 | pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu118
38 | pip install -r requirements.txt
39 | ```
40 | Other platforms
41 | ```bash
42 | python -m pip install -U pip setuptools wheel
43 | pip install -U torch torchaudio
44 | pip install -r requirements.txt
45 | ```
46 | 
47 | Fix for Apple silicon Macs
48 | ```bash
49 | pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
50 | 
51 | export PYTORCH_ENABLE_MPS_FALLBACK=1
52 | ```
53 | 
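A quick sanity check that PyTorch sees the expected backend (a minimal sketch mirroring the device detection in `config.py`; not part of the repo):

```python
import torch

# Report which backend will be used, in the same priority order as config.py:
# CUDA first, then Apple silicon MPS, then plain CPU.
if torch.cuda.is_available():
    print("CUDA device:", torch.cuda.get_device_name(0))
elif torch.backends.mps.is_available():
    print("Apple silicon MPS backend available")
    print("Remember to set PYTORCH_ENABLE_MPS_FALLBACK=1")
else:
    print("No supported GPU found, inference will run on CPU")
```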
54 | 
55 | * Download [hubert_base.pt](https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt/) and place it in the root folder
56 | 
57 | 
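If you would rather fetch it from a script, here is a minimal sketch (assumes the `requests` package is installed; the URL matches the one used in `unitest.yml`):

```python
import requests

# Stream hubert_base.pt into the repository root in 1 MiB chunks.
url = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt"
with requests.get(url, stream=True) as r:
    r.raise_for_status()
    with open("hubert_base.pt", "wb") as f:
        for chunk in r.iter_content(chunk_size=1 << 20):
            f.write(chunk)
```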
58 | 
59 | * Then use this command to start RVC GUI:
60 | ```bash
61 | python rvcgui.py
62 | ```
63 | Or run this file on Windows:
64 | ```
65 | RVC-GUI.bat
66 | ```
67 | 
68 | ## Loading models
69 | Use the import button to import a model from a zip file:
70 | * The .zip must contain the ".pth" weight file.
71 | * The .zip should also contain the feature retrieval file ".index" (recommended).
72 | 
73 | Or place the model manually in root/models:
74 | ```
75 | models
76 | ├───Person1
77 | │   ├───xxxx.pth
78 | │   ├───xxxx.index
79 | │   └───xxxx.npy
80 | └───Person2
81 |     ├───xxxx.pth
82 |     ├───...
83 |     └───...
84 | ```
85 | 
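For reference, a minimal sketch of how a layout like this can be enumerated (a hypothetical helper, not part of `rvcgui.py`; the `xxxx` names above are placeholders):

```python
import glob
import os

def list_models(models_dir="models"):
    """Yield (name, weight, index) for each voice folder under models/."""
    for person in sorted(os.listdir(models_dir)):
        folder = os.path.join(models_dir, person)
        if not os.path.isdir(folder):
            continue
        # One .pth weight file is required; the .index retrieval file is optional.
        pth = glob.glob(os.path.join(folder, "*.pth"))
        index = glob.glob(os.path.join(folder, "*.index"))
        if pth:
            yield person, pth[0], index[0] if index else None

for name, weight, index in list_models():
    print(name, weight, index)
```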
86 | 87 | 88 |
89 | 
90 | ### How to get models?
91 | * Join the [AI Hub](https://discord.gg/aihub) Discord
92 | * [Community Models on HuggingFace](https://huggingface.co/QuickWick/Music-AI-Voices/tree/main) by Wicked aka QuickWick
93 | 
94 | 
95 | 96 | K7#4523 97 | 98 | 99 | -------------------------------------------------------------------------------- /RVC-GUI.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | REM Get the path of the script's directory 4 | set "scriptDir=%~dp0" 5 | 6 | REM Set the path to the Python runtime folder 7 | set "runtimeFolder=%scriptDir%runtime" 8 | 9 | REM Check if the runtime folder exists 10 | 11 | REM Check if the runtime folder exists 12 | if exist "%runtimeFolder%\python.exe" ( 13 | REM Runtime folder exists, so run the file using the runtime Python 14 | echo Running with the runtime Python. 15 | "runtime/python.exe" rvcgui.py --pycmd "runtime/python.exe" 16 | pause 17 | ) else ( 18 | REM Runtime folder does not exist, so run the file using the system Python 19 | echo Running with the system Python. 20 | python.exe rvcgui.py --pycmd python.exe 21 | pause 22 | ) -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import sys 4 | import torch 5 | from multiprocessing import cpu_count 6 | 7 | 8 | class Config: 9 | def __init__(self): 10 | self.device = "cuda:0" 11 | self.is_half = True 12 | self.n_cpu = 0 13 | self.gpu_name = None 14 | self.gpu_mem = None 15 | ( 16 | self.python_cmd, 17 | self.listen_port, 18 | self.iscolab, 19 | self.noparallel, 20 | self.noautoopen, 21 | self.use_gfloat, 22 | self.paperspace, 23 | ) = self.arg_parse() 24 | 25 | if self.use_gfloat: 26 | print("Using g_float instead of g_half") 27 | self.is_half = False 28 | self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() 29 | 30 | def arg_parse(self) -> tuple: 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("--port", type=int, default=7865, help="Listen port") 33 | parser.add_argument( 34 | "--pycmd", type=str, default="python", help="Python command" 35 | ) 36 | parser.add_argument("--colab", action="store_true", help="Launch in colab") 37 | parser.add_argument( 38 | "--noparallel", action="store_true", help="Disable parallel processing" 39 | ) 40 | parser.add_argument( 41 | "--noautoopen", 42 | action="store_true", 43 | help="Do not open in browser automatically", 44 | ) 45 | parser.add_argument( # this argument (if set to false) allows windows users to avoid the "slow_conv2d_cpu not implemented for 'Half'" exception 46 | "--use_gfloat", action="store_true", help="Will use g_float instead of g_half during voice conversion." 47 | ) 48 | parser.add_argument( # Fork Feature. Paperspace integration for web UI 49 | "--paperspace", action="store_true", help="Note that this argument just shares a gradio link for the web UI. Thus can be used on other non-local CLI systems." 
50 | ) 51 | cmd_opts = parser.parse_args() 52 | 53 | cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865 54 | 55 | return ( 56 | cmd_opts.pycmd, 57 | cmd_opts.port, 58 | cmd_opts.colab, 59 | cmd_opts.noparallel, 60 | cmd_opts.noautoopen, 61 | cmd_opts.use_gfloat, 62 | cmd_opts.paperspace, 63 | ) 64 | 65 | def device_config(self) -> tuple: 66 | if torch.cuda.is_available(): 67 | i_device = int(self.device.split(":")[-1]) 68 | self.gpu_name = torch.cuda.get_device_name(i_device) 69 | if ( 70 | ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) 71 | or "P40" in self.gpu_name.upper() 72 | or "1060" in self.gpu_name 73 | or "1070" in self.gpu_name 74 | or "1080" in self.gpu_name 75 | ): 76 | print("16系/10系显卡和P40强制单精度") 77 | self.is_half = False 78 | with open("trainset_preprocess_pipeline_print.py", "r") as f: 79 | strr = f.read().replace("3.7", "3.0") 80 | with open("trainset_preprocess_pipeline_print.py", "w") as f: 81 | f.write(strr) 82 | else: 83 | self.gpu_name = None 84 | self.gpu_mem = int( 85 | torch.cuda.get_device_properties(i_device).total_memory 86 | / 1024 87 | / 1024 88 | / 1024 89 | + 0.4 90 | ) 91 | if self.gpu_mem <= 4: 92 | with open("trainset_preprocess_pipeline_print.py", "r") as f: 93 | strr = f.read().replace("3.7", "3.0") 94 | with open("trainset_preprocess_pipeline_print.py", "w") as f: 95 | f.write(strr) 96 | elif torch.backends.mps.is_available(): 97 | print("No supported Nvidia cards found, using MPS for inference ") 98 | self.device = "mps" 99 | else: 100 | print("No supported Nvidia cards found, using CPU for inference") 101 | self.device = "cpu" 102 | if not self.use_gfloat: # Fork Feature: Force g_float (is_half = False) if --use_gfloat arg is used. 103 | self.is_half = False 104 | 105 | if self.n_cpu == 0: 106 | self.n_cpu = cpu_count() 107 | 108 | if self.is_half: 109 | # 6G显存配置 110 | x_pad = 3 111 | x_query = 10 112 | x_center = 60 113 | x_max = 65 114 | else: 115 | # 5G显存配置 116 | x_pad = 1 117 | x_query = 6 118 | x_center = 38 119 | x_max = 41 120 | 121 | if self.gpu_mem != None and self.gpu_mem <= 4: 122 | x_pad = 1 123 | x_query = 5 124 | x_center = 30 125 | x_max = 32 126 | 127 | return x_pad, x_query, x_center, x_max 128 | -------------------------------------------------------------------------------- /docs/GUI.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiger14n/RVC-GUI/0c2e2b158e0fdff0ed91a53d9fea2b0b3dc4752b/docs/GUI.JPG -------------------------------------------------------------------------------- /docs/GUI20230508.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiger14n/RVC-GUI/0c2e2b158e0fdff0ed91a53d9fea2b0b3dc4752b/docs/GUI20230508.JPG -------------------------------------------------------------------------------- /infer/infer-pm-index256.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | 对源特征进行检索 4 | """ 5 | import torch, pdb, os, parselmouth 6 | 7 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 8 | import numpy as np 9 | import soundfile as sf 10 | 11 | # from models import SynthesizerTrn256#hifigan_nonsf 12 | # from infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf 13 | from infer_pack.models import ( 14 | SynthesizerTrnMs256NSFsid as SynthesizerTrn256, 15 | ) # hifigan_nsf 16 | 17 | # from infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf 18 | # from 
models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf 19 | # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf 20 | 21 | 22 | from scipy.io import wavfile 23 | from fairseq import checkpoint_utils 24 | 25 | # import pyworld 26 | import librosa 27 | import torch.nn.functional as F 28 | import scipy.signal as signal 29 | 30 | # import torchcrepe 31 | from time import time as ttime 32 | 33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt" # 35 | print("load model(s) from {}".format(model_path)) 36 | models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( 37 | [model_path], 38 | suffix="", 39 | ) 40 | model = models[0] 41 | model = model.to(device) 42 | model = model.half() 43 | model.eval() 44 | 45 | # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256 46 | # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256 47 | net_g = SynthesizerTrn256( 48 | 1025, 49 | 32, 50 | 192, 51 | 192, 52 | 768, 53 | 2, 54 | 6, 55 | 3, 56 | 0, 57 | "1", 58 | [3, 7, 11], 59 | [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 60 | [10, 10, 2, 2], 61 | 512, 62 | [16, 16, 4, 4], 63 | 183, 64 | 256, 65 | is_half=True, 66 | ) # hifigan#512#256#no_dropout 67 | # net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3 68 | # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr 69 | # 70 | # net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)#ms 71 | # net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)#idwt2 72 | 73 | # weights=torch.load("infer/ft-mi_1k-noD.pt") 74 | # weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt") 75 | # weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt") 76 | # weights=torch.load("infer/ft-mi-sim1k.pt") 77 | weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt") 78 | print(net_g.load_state_dict(weights, strict=True)) 79 | 80 | net_g.eval().to(device) 81 | net_g.half() 82 | 83 | 84 | def get_f0(x, p_len, f0_up_key=0): 85 | time_step = 160 / 16000 * 1000 86 | f0_min = 50 87 | f0_max = 1100 88 | f0_mel_min = 1127 * np.log(1 + f0_min / 700) 89 | f0_mel_max = 1127 * np.log(1 + f0_max / 700) 90 | 91 | f0 = ( 92 | parselmouth.Sound(x, 16000) 93 | .to_pitch_ac( 94 | time_step=time_step / 1000, 95 | voicing_threshold=0.6, 96 | pitch_floor=f0_min, 97 | pitch_ceiling=f0_max, 98 | ) 99 | .selected_array["frequency"] 100 | ) 101 | 102 | pad_size = (p_len - len(f0) + 1) // 2 103 | if pad_size > 0 or p_len - len(f0) - pad_size > 0: 104 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") 105 | f0 *= pow(2, f0_up_key / 12) 106 | f0bak = f0.copy() 107 | 108 | f0_mel = 1127 * np.log(1 + f0 / 700) 109 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( 110 | f0_mel_max - f0_mel_min 111 | ) + 1 112 | f0_mel[f0_mel <= 1] = 1 113 | f0_mel[f0_mel > 255] = 255 114 | # f0_mel[f0_mel > 188] = 188 115 | f0_coarse = np.rint(f0_mel).astype(np.int) 116 | return f0_coarse, f0bak 
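# --- Hypothetical usage note (not part of the original script) ---
# get_f0 maps the Praat pitch track onto the 1-255 coarse mel scale expected by
# the model's pitch embedding, and returns the raw Hz curve alongside it, e.g.:
#     coarse, f0_hz = get_f0(audio, p_len=audio.shape[0] // 160, f0_up_key=-2)
# f0_up_key transposes in semitones via f0 *= 2 ** (f0_up_key / 12).
# Note: np.rint(f0_mel).astype(np.int) above relies on the deprecated np.int
# alias (removed in NumPy >= 1.24); np.int64 is the safe replacement.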
117 | 118 | 119 | import faiss 120 | 121 | index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index") 122 | big_npy = np.load("infer/big_src_feature_mi.npy") 123 | ta0 = ta1 = ta2 = 0 124 | for idx, name in enumerate( 125 | [ 126 | "冬之花clip1.wav", 127 | ] 128 | ): ## 129 | wav_path = "todo-songs/%s" % name # 130 | f0_up_key = -2 # 131 | audio, sampling_rate = sf.read(wav_path) 132 | if len(audio.shape) > 1: 133 | audio = librosa.to_mono(audio.transpose(1, 0)) 134 | if sampling_rate != 16000: 135 | audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) 136 | 137 | feats = torch.from_numpy(audio).float() 138 | if feats.dim() == 2: # double channels 139 | feats = feats.mean(-1) 140 | assert feats.dim() == 1, feats.dim() 141 | feats = feats.view(1, -1) 142 | padding_mask = torch.BoolTensor(feats.shape).fill_(False) 143 | inputs = { 144 | "source": feats.half().to(device), 145 | "padding_mask": padding_mask.to(device), 146 | "output_layer": 9, # layer 9 147 | } 148 | if torch.cuda.is_available(): 149 | torch.cuda.synchronize() 150 | t0 = ttime() 151 | with torch.no_grad(): 152 | logits = model.extract_features(**inputs) 153 | feats = model.final_proj(logits[0]) 154 | 155 | ####索引优化 156 | npy = feats[0].cpu().numpy().astype("float32") 157 | D, I = index.search(npy, 1) 158 | feats = ( 159 | torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device) 160 | ) 161 | 162 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) 163 | if torch.cuda.is_available(): 164 | torch.cuda.synchronize() 165 | t1 = ttime() 166 | # p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存 167 | p_len = min(feats.shape[1], 10000) # 168 | pitch, pitchf = get_f0(audio, p_len, f0_up_key) 169 | p_len = min(feats.shape[1], 10000, pitch.shape[0]) # 太大了爆显存 170 | if torch.cuda.is_available(): 171 | torch.cuda.synchronize() 172 | t2 = ttime() 173 | feats = feats[:, :p_len, :] 174 | pitch = pitch[:p_len] 175 | pitchf = pitchf[:p_len] 176 | p_len = torch.LongTensor([p_len]).to(device) 177 | pitch = torch.LongTensor(pitch).unsqueeze(0).to(device) 178 | sid = torch.LongTensor([0]).to(device) 179 | pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device) 180 | with torch.no_grad(): 181 | audio = ( 182 | net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] 183 | .data.cpu() 184 | .float() 185 | .numpy() 186 | ) # nsf 187 | if torch.cuda.is_available(): 188 | torch.cuda.synchronize() 189 | t3 = ttime() 190 | ta0 += t1 - t0 191 | ta1 += t2 - t1 192 | ta2 += t3 - t2 193 | # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)## 194 | # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)## 195 | # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)## 196 | wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio) ## 197 | 198 | 199 | print(ta0, ta1, ta2) # 200 | -------------------------------------------------------------------------------- /infer/train-index.py: -------------------------------------------------------------------------------- 1 | """ 2 | 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个 3 | """ 4 | import faiss, numpy as np, os 5 | 6 | # ###########如果是原始特征要先写save 7 | inp_root = r"E:\codes\py39\dataset\mi\2-co256" 8 | npys = [] 9 | for name in sorted(list(os.listdir(inp_root))): 10 | phone = np.load("%s/%s" % (inp_root, name)) 11 | npys.append(phone) 12 | big_npy = np.concatenate(npys, 0) 13 | print(big_npy.shape) # (6196072, 192)#fp32#4.43G 14 | np.save("infer/big_src_feature_mi.npy", 
big_npy) 15 | 16 | ##################train+add 17 | # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy") 18 | print(big_npy.shape) 19 | index = faiss.index_factory(256, "IVF512,Flat") # mi 20 | print("training") 21 | index_ivf = faiss.extract_index_ivf(index) # 22 | index_ivf.nprobe = 9 23 | index.train(big_npy) 24 | faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index") 25 | print("adding") 26 | index.add(big_npy) 27 | faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index") 28 | """ 29 | 大小(都是FP32) 30 | big_src_feature 2.95G 31 | (3098036, 256) 32 | big_emb 4.43G 33 | (6196072, 192) 34 | big_emb双倍是因为求特征要repeat后再加pitch 35 | 36 | """ 37 | -------------------------------------------------------------------------------- /infer/trans_weights.py: -------------------------------------------------------------------------------- 1 | import torch, pdb 2 | 3 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf# 4 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf# 5 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf# 6 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf# 7 | a = torch.load( 8 | r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth" 9 | )[ 10 | "model" 11 | ] # sim_nsf# 12 | for key in a.keys(): 13 | a[key] = a[key].half() 14 | # torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")# 15 | # torch.save(a,"ft-mi-sim1k.pt")# 16 | torch.save(a, "ft-mi-no_opt-no_dropout.pt") # 17 | -------------------------------------------------------------------------------- /infer_pack/attentions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from infer_pack import commons 9 | from infer_pack import modules 10 | from infer_pack.modules import LayerNorm 11 | 12 | 13 | class Encoder(nn.Module): 14 | def __init__( 15 | self, 16 | hidden_channels, 17 | filter_channels, 18 | n_heads, 19 | n_layers, 20 | kernel_size=1, 21 | p_dropout=0.0, 22 | window_size=10, 23 | **kwargs 24 | ): 25 | super().__init__() 26 | self.hidden_channels = hidden_channels 27 | self.filter_channels = filter_channels 28 | self.n_heads = n_heads 29 | self.n_layers = n_layers 30 | self.kernel_size = kernel_size 31 | self.p_dropout = p_dropout 32 | self.window_size = window_size 33 | 34 | self.drop = nn.Dropout(p_dropout) 35 | self.attn_layers = nn.ModuleList() 36 | self.norm_layers_1 = nn.ModuleList() 37 | self.ffn_layers = nn.ModuleList() 38 | self.norm_layers_2 = nn.ModuleList() 39 | for i in range(self.n_layers): 40 | self.attn_layers.append( 41 | MultiHeadAttention( 42 | hidden_channels, 43 | hidden_channels, 44 | n_heads, 45 | p_dropout=p_dropout, 46 | window_size=window_size, 47 | ) 48 | ) 49 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 50 | self.ffn_layers.append( 51 | FFN( 52 | hidden_channels, 53 | hidden_channels, 54 | filter_channels, 55 | kernel_size, 56 | p_dropout=p_dropout, 57 | ) 58 | ) 59 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 60 | 61 | def forward(self, x, x_mask): 62 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 63 | x = x * x_mask 64 | for i in range(self.n_layers): 65 | y = 
self.attn_layers[i](x, x, attn_mask) 66 | y = self.drop(y) 67 | x = self.norm_layers_1[i](x + y) 68 | 69 | y = self.ffn_layers[i](x, x_mask) 70 | y = self.drop(y) 71 | x = self.norm_layers_2[i](x + y) 72 | x = x * x_mask 73 | return x 74 | 75 | 76 | class Decoder(nn.Module): 77 | def __init__( 78 | self, 79 | hidden_channels, 80 | filter_channels, 81 | n_heads, 82 | n_layers, 83 | kernel_size=1, 84 | p_dropout=0.0, 85 | proximal_bias=False, 86 | proximal_init=True, 87 | **kwargs 88 | ): 89 | super().__init__() 90 | self.hidden_channels = hidden_channels 91 | self.filter_channels = filter_channels 92 | self.n_heads = n_heads 93 | self.n_layers = n_layers 94 | self.kernel_size = kernel_size 95 | self.p_dropout = p_dropout 96 | self.proximal_bias = proximal_bias 97 | self.proximal_init = proximal_init 98 | 99 | self.drop = nn.Dropout(p_dropout) 100 | self.self_attn_layers = nn.ModuleList() 101 | self.norm_layers_0 = nn.ModuleList() 102 | self.encdec_attn_layers = nn.ModuleList() 103 | self.norm_layers_1 = nn.ModuleList() 104 | self.ffn_layers = nn.ModuleList() 105 | self.norm_layers_2 = nn.ModuleList() 106 | for i in range(self.n_layers): 107 | self.self_attn_layers.append( 108 | MultiHeadAttention( 109 | hidden_channels, 110 | hidden_channels, 111 | n_heads, 112 | p_dropout=p_dropout, 113 | proximal_bias=proximal_bias, 114 | proximal_init=proximal_init, 115 | ) 116 | ) 117 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 118 | self.encdec_attn_layers.append( 119 | MultiHeadAttention( 120 | hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout 121 | ) 122 | ) 123 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 124 | self.ffn_layers.append( 125 | FFN( 126 | hidden_channels, 127 | hidden_channels, 128 | filter_channels, 129 | kernel_size, 130 | p_dropout=p_dropout, 131 | causal=True, 132 | ) 133 | ) 134 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 135 | 136 | def forward(self, x, x_mask, h, h_mask): 137 | """ 138 | x: decoder input 139 | h: encoder output 140 | """ 141 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( 142 | device=x.device, dtype=x.dtype 143 | ) 144 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 145 | x = x * x_mask 146 | for i in range(self.n_layers): 147 | y = self.self_attn_layers[i](x, x, self_attn_mask) 148 | y = self.drop(y) 149 | x = self.norm_layers_0[i](x + y) 150 | 151 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 152 | y = self.drop(y) 153 | x = self.norm_layers_1[i](x + y) 154 | 155 | y = self.ffn_layers[i](x, x_mask) 156 | y = self.drop(y) 157 | x = self.norm_layers_2[i](x + y) 158 | x = x * x_mask 159 | return x 160 | 161 | 162 | class MultiHeadAttention(nn.Module): 163 | def __init__( 164 | self, 165 | channels, 166 | out_channels, 167 | n_heads, 168 | p_dropout=0.0, 169 | window_size=None, 170 | heads_share=True, 171 | block_length=None, 172 | proximal_bias=False, 173 | proximal_init=False, 174 | ): 175 | super().__init__() 176 | assert channels % n_heads == 0 177 | 178 | self.channels = channels 179 | self.out_channels = out_channels 180 | self.n_heads = n_heads 181 | self.p_dropout = p_dropout 182 | self.window_size = window_size 183 | self.heads_share = heads_share 184 | self.block_length = block_length 185 | self.proximal_bias = proximal_bias 186 | self.proximal_init = proximal_init 187 | self.attn = None 188 | 189 | self.k_channels = channels // n_heads 190 | self.conv_q = nn.Conv1d(channels, channels, 1) 191 | self.conv_k = nn.Conv1d(channels, channels, 1) 192 | self.conv_v = 
nn.Conv1d(channels, channels, 1) 193 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 194 | self.drop = nn.Dropout(p_dropout) 195 | 196 | if window_size is not None: 197 | n_heads_rel = 1 if heads_share else n_heads 198 | rel_stddev = self.k_channels**-0.5 199 | self.emb_rel_k = nn.Parameter( 200 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 201 | * rel_stddev 202 | ) 203 | self.emb_rel_v = nn.Parameter( 204 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) 205 | * rel_stddev 206 | ) 207 | 208 | nn.init.xavier_uniform_(self.conv_q.weight) 209 | nn.init.xavier_uniform_(self.conv_k.weight) 210 | nn.init.xavier_uniform_(self.conv_v.weight) 211 | if proximal_init: 212 | with torch.no_grad(): 213 | self.conv_k.weight.copy_(self.conv_q.weight) 214 | self.conv_k.bias.copy_(self.conv_q.bias) 215 | 216 | def forward(self, x, c, attn_mask=None): 217 | q = self.conv_q(x) 218 | k = self.conv_k(c) 219 | v = self.conv_v(c) 220 | 221 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 222 | 223 | x = self.conv_o(x) 224 | return x 225 | 226 | def attention(self, query, key, value, mask=None): 227 | # reshape [b, d, t] -> [b, n_h, t, d_k] 228 | b, d, t_s, t_t = (*key.size(), query.size(2)) 229 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 230 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 231 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 232 | 233 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 234 | if self.window_size is not None: 235 | assert ( 236 | t_s == t_t 237 | ), "Relative attention is only available for self-attention." 238 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 239 | rel_logits = self._matmul_with_relative_keys( 240 | query / math.sqrt(self.k_channels), key_relative_embeddings 241 | ) 242 | scores_local = self._relative_position_to_absolute_position(rel_logits) 243 | scores = scores + scores_local 244 | if self.proximal_bias: 245 | assert t_s == t_t, "Proximal bias is only available for self-attention." 246 | scores = scores + self._attention_bias_proximal(t_s).to( 247 | device=scores.device, dtype=scores.dtype 248 | ) 249 | if mask is not None: 250 | scores = scores.masked_fill(mask == 0, -1e4) 251 | if self.block_length is not None: 252 | assert ( 253 | t_s == t_t 254 | ), "Local attention is only available for self-attention." 
255 | block_mask = ( 256 | torch.ones_like(scores) 257 | .triu(-self.block_length) 258 | .tril(self.block_length) 259 | ) 260 | scores = scores.masked_fill(block_mask == 0, -1e4) 261 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 262 | p_attn = self.drop(p_attn) 263 | output = torch.matmul(p_attn, value) 264 | if self.window_size is not None: 265 | relative_weights = self._absolute_position_to_relative_position(p_attn) 266 | value_relative_embeddings = self._get_relative_embeddings( 267 | self.emb_rel_v, t_s 268 | ) 269 | output = output + self._matmul_with_relative_values( 270 | relative_weights, value_relative_embeddings 271 | ) 272 | output = ( 273 | output.transpose(2, 3).contiguous().view(b, d, t_t) 274 | ) # [b, n_h, t_t, d_k] -> [b, d, t_t] 275 | return output, p_attn 276 | 277 | def _matmul_with_relative_values(self, x, y): 278 | """ 279 | x: [b, h, l, m] 280 | y: [h or 1, m, d] 281 | ret: [b, h, l, d] 282 | """ 283 | ret = torch.matmul(x, y.unsqueeze(0)) 284 | return ret 285 | 286 | def _matmul_with_relative_keys(self, x, y): 287 | """ 288 | x: [b, h, l, d] 289 | y: [h or 1, m, d] 290 | ret: [b, h, l, m] 291 | """ 292 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 293 | return ret 294 | 295 | def _get_relative_embeddings(self, relative_embeddings, length): 296 | max_relative_position = 2 * self.window_size + 1 297 | # Pad first before slice to avoid using cond ops. 298 | pad_length = max(length - (self.window_size + 1), 0) 299 | slice_start_position = max((self.window_size + 1) - length, 0) 300 | slice_end_position = slice_start_position + 2 * length - 1 301 | if pad_length > 0: 302 | padded_relative_embeddings = F.pad( 303 | relative_embeddings, 304 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), 305 | ) 306 | else: 307 | padded_relative_embeddings = relative_embeddings 308 | used_relative_embeddings = padded_relative_embeddings[ 309 | :, slice_start_position:slice_end_position 310 | ] 311 | return used_relative_embeddings 312 | 313 | def _relative_position_to_absolute_position(self, x): 314 | """ 315 | x: [b, h, l, 2*l-1] 316 | ret: [b, h, l, l] 317 | """ 318 | batch, heads, length, _ = x.size() 319 | # Concat columns of pad to shift from relative to absolute indexing. 320 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) 321 | 322 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 323 | x_flat = x.view([batch, heads, length * 2 * length]) 324 | x_flat = F.pad( 325 | x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) 326 | ) 327 | 328 | # Reshape and slice out the padded elements. 329 | x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ 330 | :, :, :length, length - 1 : 331 | ] 332 | return x_final 333 | 334 | def _absolute_position_to_relative_position(self, x): 335 | """ 336 | x: [b, h, l, l] 337 | ret: [b, h, l, 2*l-1] 338 | """ 339 | batch, heads, length, _ = x.size() 340 | # padd along column 341 | x = F.pad( 342 | x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) 343 | ) 344 | x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) 345 | # add 0's in the beginning that will skew the elements after reshape 346 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 347 | x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] 348 | return x_final 349 | 350 | def _attention_bias_proximal(self, length): 351 | """Bias for self-attention to encourage attention to close positions. 
352 | Args: 353 | length: an integer scalar. 354 | Returns: 355 | a Tensor with shape [1, 1, length, length] 356 | """ 357 | r = torch.arange(length, dtype=torch.float32) 358 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 359 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 360 | 361 | 362 | class FFN(nn.Module): 363 | def __init__( 364 | self, 365 | in_channels, 366 | out_channels, 367 | filter_channels, 368 | kernel_size, 369 | p_dropout=0.0, 370 | activation=None, 371 | causal=False, 372 | ): 373 | super().__init__() 374 | self.in_channels = in_channels 375 | self.out_channels = out_channels 376 | self.filter_channels = filter_channels 377 | self.kernel_size = kernel_size 378 | self.p_dropout = p_dropout 379 | self.activation = activation 380 | self.causal = causal 381 | 382 | if causal: 383 | self.padding = self._causal_padding 384 | else: 385 | self.padding = self._same_padding 386 | 387 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 388 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 389 | self.drop = nn.Dropout(p_dropout) 390 | 391 | def forward(self, x, x_mask): 392 | x = self.conv_1(self.padding(x * x_mask)) 393 | if self.activation == "gelu": 394 | x = x * torch.sigmoid(1.702 * x) 395 | else: 396 | x = torch.relu(x) 397 | x = self.drop(x) 398 | x = self.conv_2(self.padding(x * x_mask)) 399 | return x * x_mask 400 | 401 | def _causal_padding(self, x): 402 | if self.kernel_size == 1: 403 | return x 404 | pad_l = self.kernel_size - 1 405 | pad_r = 0 406 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 407 | x = F.pad(x, commons.convert_pad_shape(padding)) 408 | return x 409 | 410 | def _same_padding(self, x): 411 | if self.kernel_size == 1: 412 | return x 413 | pad_l = (self.kernel_size - 1) // 2 414 | pad_r = self.kernel_size // 2 415 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 416 | x = F.pad(x, commons.convert_pad_shape(padding)) 417 | return x 418 | -------------------------------------------------------------------------------- /infer_pack/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size * dilation - dilation) / 2) 16 | 17 | 18 | def convert_pad_shape(pad_shape): 19 | l = pad_shape[::-1] 20 | pad_shape = [item for sublist in l for item in sublist] 21 | return pad_shape 22 | 23 | 24 | def kl_divergence(m_p, logs_p, m_q, logs_q): 25 | """KL(P||Q)""" 26 | kl = (logs_q - logs_p) - 0.5 27 | kl += ( 28 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) 29 | ) 30 | return kl 31 | 32 | 33 | def rand_gumbel(shape): 34 | """Sample from the Gumbel distribution, protect from overflows.""" 35 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 36 | return -torch.log(-torch.log(uniform_samples)) 37 | 38 | 39 | def rand_gumbel_like(x): 40 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 41 | return g 42 | 43 | 44 | def slice_segments(x, ids_str, segment_size=4): 45 | ret = torch.zeros_like(x[:, :, :segment_size]) 46 | for i in range(x.size(0)): 47 | idx_str = ids_str[i] 48 | idx_end = idx_str + segment_size 49 | ret[i] = x[i, :, idx_str:idx_end] 50 | 
return ret 51 | 52 | 53 | def slice_segments2(x, ids_str, segment_size=4): 54 | ret = torch.zeros_like(x[:, :segment_size]) 55 | for i in range(x.size(0)): 56 | idx_str = ids_str[i] 57 | idx_end = idx_str + segment_size 58 | ret[i] = x[i, idx_str:idx_end] 59 | return ret 60 | 61 | 62 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 63 | b, d, t = x.size() 64 | if x_lengths is None: 65 | x_lengths = t 66 | ids_str_max = x_lengths - segment_size + 1 67 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 68 | ret = slice_segments(x, ids_str, segment_size) 69 | return ret, ids_str 70 | 71 | 72 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): 73 | position = torch.arange(length, dtype=torch.float) 74 | num_timescales = channels // 2 75 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( 76 | num_timescales - 1 77 | ) 78 | inv_timescales = min_timescale * torch.exp( 79 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment 80 | ) 81 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 82 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 83 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 84 | signal = signal.view(1, channels, length) 85 | return signal 86 | 87 | 88 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 89 | b, channels, length = x.size() 90 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 91 | return x + signal.to(dtype=x.dtype, device=x.device) 92 | 93 | 94 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 95 | b, channels, length = x.size() 96 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 97 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 98 | 99 | 100 | def subsequent_mask(length): 101 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 102 | return mask 103 | 104 | 105 | @torch.jit.script 106 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 107 | n_channels_int = n_channels[0] 108 | in_act = input_a + input_b 109 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 110 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 111 | acts = t_act * s_act 112 | return acts 113 | 114 | 115 | def convert_pad_shape(pad_shape): 116 | l = pad_shape[::-1] 117 | pad_shape = [item for sublist in l for item in sublist] 118 | return pad_shape 119 | 120 | 121 | def shift_1d(x): 122 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 123 | return x 124 | 125 | 126 | def sequence_mask(length, max_length=None): 127 | if max_length is None: 128 | max_length = length.max() 129 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 130 | return x.unsqueeze(0) < length.unsqueeze(1) 131 | 132 | 133 | def generate_path(duration, mask): 134 | """ 135 | duration: [b, 1, t_x] 136 | mask: [b, 1, t_y, t_x] 137 | """ 138 | device = duration.device 139 | 140 | b, _, t_y, t_x = mask.shape 141 | cum_duration = torch.cumsum(duration, -1) 142 | 143 | cum_duration_flat = cum_duration.view(b * t_x) 144 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 145 | path = path.view(b, t_x, t_y) 146 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 147 | path = path.unsqueeze(1).transpose(2, 3) * mask 148 | return path 149 | 150 | 151 | def clip_grad_value_(parameters, clip_value, norm_type=2): 
152 | if isinstance(parameters, torch.Tensor): 153 | parameters = [parameters] 154 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 155 | norm_type = float(norm_type) 156 | if clip_value is not None: 157 | clip_value = float(clip_value) 158 | 159 | total_norm = 0 160 | for p in parameters: 161 | param_norm = p.grad.data.norm(norm_type) 162 | total_norm += param_norm.item() ** norm_type 163 | if clip_value is not None: 164 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 165 | total_norm = total_norm ** (1.0 / norm_type) 166 | return total_norm 167 | -------------------------------------------------------------------------------- /infer_pack/models_onnx.py: -------------------------------------------------------------------------------- 1 | import math, pdb, os 2 | from time import time as ttime 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from infer_pack import modules 7 | from infer_pack import attentions 8 | from infer_pack import commons 9 | from infer_pack.commons import init_weights, get_padding 10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 11 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 12 | from infer_pack.commons import init_weights 13 | import numpy as np 14 | from infer_pack import commons 15 | 16 | 17 | class TextEncoder256(nn.Module): 18 | def __init__( 19 | self, 20 | out_channels, 21 | hidden_channels, 22 | filter_channels, 23 | n_heads, 24 | n_layers, 25 | kernel_size, 26 | p_dropout, 27 | f0=True, 28 | ): 29 | super().__init__() 30 | self.out_channels = out_channels 31 | self.hidden_channels = hidden_channels 32 | self.filter_channels = filter_channels 33 | self.n_heads = n_heads 34 | self.n_layers = n_layers 35 | self.kernel_size = kernel_size 36 | self.p_dropout = p_dropout 37 | self.emb_phone = nn.Linear(256, hidden_channels) 38 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 39 | if f0 == True: 40 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 41 | self.encoder = attentions.Encoder( 42 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 43 | ) 44 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 45 | 46 | def forward(self, phone, pitch, lengths): 47 | if pitch == None: 48 | x = self.emb_phone(phone) 49 | else: 50 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 51 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 52 | x = self.lrelu(x) 53 | x = torch.transpose(x, 1, -1) # [b, h, t] 54 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 55 | x.dtype 56 | ) 57 | x = self.encoder(x * x_mask, x_mask) 58 | stats = self.proj(x) * x_mask 59 | 60 | m, logs = torch.split(stats, self.out_channels, dim=1) 61 | return m, logs, x_mask 62 | 63 | 64 | class TextEncoder768(nn.Module): 65 | def __init__( 66 | self, 67 | out_channels, 68 | hidden_channels, 69 | filter_channels, 70 | n_heads, 71 | n_layers, 72 | kernel_size, 73 | p_dropout, 74 | f0=True, 75 | ): 76 | super().__init__() 77 | self.out_channels = out_channels 78 | self.hidden_channels = hidden_channels 79 | self.filter_channels = filter_channels 80 | self.n_heads = n_heads 81 | self.n_layers = n_layers 82 | self.kernel_size = kernel_size 83 | self.p_dropout = p_dropout 84 | self.emb_phone = nn.Linear(768, hidden_channels) 85 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 86 | if f0 == True: 87 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 88 | self.encoder = attentions.Encoder( 89 | hidden_channels, 
filter_channels, n_heads, n_layers, kernel_size, p_dropout 90 | ) 91 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 92 | 93 | def forward(self, phone, pitch, lengths): 94 | if pitch == None: 95 | x = self.emb_phone(phone) 96 | else: 97 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 98 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 99 | x = self.lrelu(x) 100 | x = torch.transpose(x, 1, -1) # [b, h, t] 101 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 102 | x.dtype 103 | ) 104 | x = self.encoder(x * x_mask, x_mask) 105 | stats = self.proj(x) * x_mask 106 | 107 | m, logs = torch.split(stats, self.out_channels, dim=1) 108 | return m, logs, x_mask 109 | 110 | 111 | class ResidualCouplingBlock(nn.Module): 112 | def __init__( 113 | self, 114 | channels, 115 | hidden_channels, 116 | kernel_size, 117 | dilation_rate, 118 | n_layers, 119 | n_flows=4, 120 | gin_channels=0, 121 | ): 122 | super().__init__() 123 | self.channels = channels 124 | self.hidden_channels = hidden_channels 125 | self.kernel_size = kernel_size 126 | self.dilation_rate = dilation_rate 127 | self.n_layers = n_layers 128 | self.n_flows = n_flows 129 | self.gin_channels = gin_channels 130 | 131 | self.flows = nn.ModuleList() 132 | for i in range(n_flows): 133 | self.flows.append( 134 | modules.ResidualCouplingLayer( 135 | channels, 136 | hidden_channels, 137 | kernel_size, 138 | dilation_rate, 139 | n_layers, 140 | gin_channels=gin_channels, 141 | mean_only=True, 142 | ) 143 | ) 144 | self.flows.append(modules.Flip()) 145 | 146 | def forward(self, x, x_mask, g=None, reverse=False): 147 | if not reverse: 148 | for flow in self.flows: 149 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 150 | else: 151 | for flow in reversed(self.flows): 152 | x = flow(x, x_mask, g=g, reverse=reverse) 153 | return x 154 | 155 | def remove_weight_norm(self): 156 | for i in range(self.n_flows): 157 | self.flows[i * 2].remove_weight_norm() 158 | 159 | 160 | class PosteriorEncoder(nn.Module): 161 | def __init__( 162 | self, 163 | in_channels, 164 | out_channels, 165 | hidden_channels, 166 | kernel_size, 167 | dilation_rate, 168 | n_layers, 169 | gin_channels=0, 170 | ): 171 | super().__init__() 172 | self.in_channels = in_channels 173 | self.out_channels = out_channels 174 | self.hidden_channels = hidden_channels 175 | self.kernel_size = kernel_size 176 | self.dilation_rate = dilation_rate 177 | self.n_layers = n_layers 178 | self.gin_channels = gin_channels 179 | 180 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 181 | self.enc = modules.WN( 182 | hidden_channels, 183 | kernel_size, 184 | dilation_rate, 185 | n_layers, 186 | gin_channels=gin_channels, 187 | ) 188 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 189 | 190 | def forward(self, x, x_lengths, g=None): 191 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 192 | x.dtype 193 | ) 194 | x = self.pre(x) * x_mask 195 | x = self.enc(x, x_mask, g=g) 196 | stats = self.proj(x) * x_mask 197 | m, logs = torch.split(stats, self.out_channels, dim=1) 198 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 199 | return z, m, logs, x_mask 200 | 201 | def remove_weight_norm(self): 202 | self.enc.remove_weight_norm() 203 | 204 | 205 | class Generator(torch.nn.Module): 206 | def __init__( 207 | self, 208 | initial_channel, 209 | resblock, 210 | resblock_kernel_sizes, 211 | resblock_dilation_sizes, 212 | upsample_rates, 213 | upsample_initial_channel, 214 | upsample_kernel_sizes, 215 | 
gin_channels=0, 216 | ): 217 | super(Generator, self).__init__() 218 | self.num_kernels = len(resblock_kernel_sizes) 219 | self.num_upsamples = len(upsample_rates) 220 | self.conv_pre = Conv1d( 221 | initial_channel, upsample_initial_channel, 7, 1, padding=3 222 | ) 223 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 224 | 225 | self.ups = nn.ModuleList() 226 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 227 | self.ups.append( 228 | weight_norm( 229 | ConvTranspose1d( 230 | upsample_initial_channel // (2**i), 231 | upsample_initial_channel // (2 ** (i + 1)), 232 | k, 233 | u, 234 | padding=(k - u) // 2, 235 | ) 236 | ) 237 | ) 238 | 239 | self.resblocks = nn.ModuleList() 240 | for i in range(len(self.ups)): 241 | ch = upsample_initial_channel // (2 ** (i + 1)) 242 | for j, (k, d) in enumerate( 243 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 244 | ): 245 | self.resblocks.append(resblock(ch, k, d)) 246 | 247 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 248 | self.ups.apply(init_weights) 249 | 250 | if gin_channels != 0: 251 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 252 | 253 | def forward(self, x, g=None): 254 | x = self.conv_pre(x) 255 | if g is not None: 256 | x = x + self.cond(g) 257 | 258 | for i in range(self.num_upsamples): 259 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 260 | x = self.ups[i](x) 261 | xs = None 262 | for j in range(self.num_kernels): 263 | if xs is None: 264 | xs = self.resblocks[i * self.num_kernels + j](x) 265 | else: 266 | xs += self.resblocks[i * self.num_kernels + j](x) 267 | x = xs / self.num_kernels 268 | x = F.leaky_relu(x) 269 | x = self.conv_post(x) 270 | x = torch.tanh(x) 271 | 272 | return x 273 | 274 | def remove_weight_norm(self): 275 | for l in self.ups: 276 | remove_weight_norm(l) 277 | for l in self.resblocks: 278 | l.remove_weight_norm() 279 | 280 | 281 | class SineGen(torch.nn.Module): 282 | """Definition of sine generator 283 | SineGen(samp_rate, harmonic_num = 0, 284 | sine_amp = 0.1, noise_std = 0.003, 285 | voiced_threshold = 0, 286 | flag_for_pulse=False) 287 | samp_rate: sampling rate in Hz 288 | harmonic_num: number of harmonic overtones (default 0) 289 | sine_amp: amplitude of sine-wavefrom (default 0.1) 290 | noise_std: std of Gaussian noise (default 0.003) 291 | voiced_thoreshold: F0 threshold for U/V classification (default 0) 292 | flag_for_pulse: this SinGen is used inside PulseGen (default False) 293 | Note: when flag_for_pulse is True, the first time step of a voiced 294 | segment is always sin(np.pi) or cos(0) 295 | """ 296 | 297 | def __init__( 298 | self, 299 | samp_rate, 300 | harmonic_num=0, 301 | sine_amp=0.1, 302 | noise_std=0.003, 303 | voiced_threshold=0, 304 | flag_for_pulse=False, 305 | ): 306 | super(SineGen, self).__init__() 307 | self.sine_amp = sine_amp 308 | self.noise_std = noise_std 309 | self.harmonic_num = harmonic_num 310 | self.dim = self.harmonic_num + 1 311 | self.sampling_rate = samp_rate 312 | self.voiced_threshold = voiced_threshold 313 | 314 | def _f02uv(self, f0): 315 | # generate uv signal 316 | uv = torch.ones_like(f0) 317 | uv = uv * (f0 > self.voiced_threshold) 318 | return uv 319 | 320 | def forward(self, f0, upp): 321 | """sine_tensor, uv = forward(f0) 322 | input F0: tensor(batchsize=1, length, dim=1) 323 | f0 for unvoiced steps should be 0 324 | output sine_tensor: tensor(batchsize=1, length, dim) 325 | output uv: tensor(batchsize=1, length, 1) 326 | """ 327 | with torch.no_grad(): 328 | f0 = 
f0[:, None].transpose(1, 2) 329 | f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) 330 | # fundamental component 331 | f0_buf[:, :, 0] = f0[:, :, 0] 332 | for idx in np.arange(self.harmonic_num): 333 | f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( 334 | idx + 2 335 | ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic 336 | rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 337 | rand_ini = torch.rand( 338 | f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device 339 | ) 340 | rand_ini[:, 0] = 0 341 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 342 | tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 343 | tmp_over_one *= upp 344 | tmp_over_one = F.interpolate( 345 | tmp_over_one.transpose(2, 1), 346 | scale_factor=upp, 347 | mode="linear", 348 | align_corners=True, 349 | ).transpose(2, 1) 350 | rad_values = F.interpolate( 351 | rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" 352 | ).transpose( 353 | 2, 1 354 | ) ####### 355 | tmp_over_one %= 1 356 | tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 357 | cumsum_shift = torch.zeros_like(rad_values) 358 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 359 | sine_waves = torch.sin( 360 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi 361 | ) 362 | sine_waves = sine_waves * self.sine_amp 363 | uv = self._f02uv(f0) 364 | uv = F.interpolate( 365 | uv.transpose(2, 1), scale_factor=upp, mode="nearest" 366 | ).transpose(2, 1) 367 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 368 | noise = noise_amp * torch.randn_like(sine_waves) 369 | sine_waves = sine_waves * uv + noise 370 | return sine_waves, uv, noise 371 | 372 | 373 | class SourceModuleHnNSF(torch.nn.Module): 374 | """SourceModule for hn-nsf 375 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, 376 | add_noise_std=0.003, voiced_threshod=0) 377 | sampling_rate: sampling_rate in Hz 378 | harmonic_num: number of harmonic above F0 (default: 0) 379 | sine_amp: amplitude of sine source signal (default: 0.1) 380 | add_noise_std: std of additive Gaussian noise (default: 0.003) 381 | note that amplitude of noise in unvoiced is decided 382 | by sine_amp 383 | voiced_threshold: threhold to set U/V given F0 (default: 0) 384 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) 385 | F0_sampled (batchsize, length, 1) 386 | Sine_source (batchsize, length, 1) 387 | noise_source (batchsize, length 1) 388 | uv (batchsize, length, 1) 389 | """ 390 | 391 | def __init__( 392 | self, 393 | sampling_rate, 394 | harmonic_num=0, 395 | sine_amp=0.1, 396 | add_noise_std=0.003, 397 | voiced_threshod=0, 398 | is_half=True, 399 | ): 400 | super(SourceModuleHnNSF, self).__init__() 401 | 402 | self.sine_amp = sine_amp 403 | self.noise_std = add_noise_std 404 | self.is_half = is_half 405 | # to produce sine waveforms 406 | self.l_sin_gen = SineGen( 407 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod 408 | ) 409 | 410 | # to merge source harmonics into a single excitation 411 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) 412 | self.l_tanh = torch.nn.Tanh() 413 | 414 | def forward(self, x, upp=None): 415 | sine_wavs, uv, _ = self.l_sin_gen(x, upp) 416 | if self.is_half: 417 | sine_wavs = sine_wavs.half() 418 | sine_merge = self.l_tanh(self.l_linear(sine_wavs)) 419 | return sine_merge, None, None # noise, uv 420 | 421 | 422 | class GeneratorNSF(torch.nn.Module): 423 | def __init__( 424 | self, 425 | initial_channel, 426 | 
resblock, 427 | resblock_kernel_sizes, 428 | resblock_dilation_sizes, 429 | upsample_rates, 430 | upsample_initial_channel, 431 | upsample_kernel_sizes, 432 | gin_channels, 433 | sr, 434 | is_half=False, 435 | ): 436 | super(GeneratorNSF, self).__init__() 437 | self.num_kernels = len(resblock_kernel_sizes) 438 | self.num_upsamples = len(upsample_rates) 439 | 440 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) 441 | self.m_source = SourceModuleHnNSF( 442 | sampling_rate=sr, harmonic_num=0, is_half=is_half 443 | ) 444 | self.noise_convs = nn.ModuleList() 445 | self.conv_pre = Conv1d( 446 | initial_channel, upsample_initial_channel, 7, 1, padding=3 447 | ) 448 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 449 | 450 | self.ups = nn.ModuleList() 451 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 452 | c_cur = upsample_initial_channel // (2 ** (i + 1)) 453 | self.ups.append( 454 | weight_norm( 455 | ConvTranspose1d( 456 | upsample_initial_channel // (2**i), 457 | upsample_initial_channel // (2 ** (i + 1)), 458 | k, 459 | u, 460 | padding=(k - u) // 2, 461 | ) 462 | ) 463 | ) 464 | if i + 1 < len(upsample_rates): 465 | stride_f0 = np.prod(upsample_rates[i + 1 :]) 466 | self.noise_convs.append( 467 | Conv1d( 468 | 1, 469 | c_cur, 470 | kernel_size=stride_f0 * 2, 471 | stride=stride_f0, 472 | padding=stride_f0 // 2, 473 | ) 474 | ) 475 | else: 476 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 477 | 478 | self.resblocks = nn.ModuleList() 479 | for i in range(len(self.ups)): 480 | ch = upsample_initial_channel // (2 ** (i + 1)) 481 | for j, (k, d) in enumerate( 482 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 483 | ): 484 | self.resblocks.append(resblock(ch, k, d)) 485 | 486 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 487 | self.ups.apply(init_weights) 488 | 489 | if gin_channels != 0: 490 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 491 | 492 | self.upp = np.prod(upsample_rates) 493 | 494 | def forward(self, x, f0, g=None): 495 | har_source, noi_source, uv = self.m_source(f0, self.upp) 496 | har_source = har_source.transpose(1, 2) 497 | x = self.conv_pre(x) 498 | if g is not None: 499 | x = x + self.cond(g) 500 | 501 | for i in range(self.num_upsamples): 502 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 503 | x = self.ups[i](x) 504 | x_source = self.noise_convs[i](har_source) 505 | x = x + x_source 506 | xs = None 507 | for j in range(self.num_kernels): 508 | if xs is None: 509 | xs = self.resblocks[i * self.num_kernels + j](x) 510 | else: 511 | xs += self.resblocks[i * self.num_kernels + j](x) 512 | x = xs / self.num_kernels 513 | x = F.leaky_relu(x) 514 | x = self.conv_post(x) 515 | x = torch.tanh(x) 516 | return x 517 | 518 | def remove_weight_norm(self): 519 | for l in self.ups: 520 | remove_weight_norm(l) 521 | for l in self.resblocks: 522 | l.remove_weight_norm() 523 | 524 | 525 | sr2sr = { 526 | "32k": 32000, 527 | "40k": 40000, 528 | "48k": 48000, 529 | } 530 | 531 | 532 | class SynthesizerTrnMsNSFsidM(nn.Module): 533 | def __init__( 534 | self, 535 | spec_channels, 536 | segment_size, 537 | inter_channels, 538 | hidden_channels, 539 | filter_channels, 540 | n_heads, 541 | n_layers, 542 | kernel_size, 543 | p_dropout, 544 | resblock, 545 | resblock_kernel_sizes, 546 | resblock_dilation_sizes, 547 | upsample_rates, 548 | upsample_initial_channel, 549 | upsample_kernel_sizes, 550 | spk_embed_dim, 551 | gin_channels, 552 | sr, 553 | **kwargs 554 | ): 555 | 
super().__init__() 556 | if type(sr) == type("strr"): 557 | sr = sr2sr[sr] 558 | self.spec_channels = spec_channels 559 | self.inter_channels = inter_channels 560 | self.hidden_channels = hidden_channels 561 | self.filter_channels = filter_channels 562 | self.n_heads = n_heads 563 | self.n_layers = n_layers 564 | self.kernel_size = kernel_size 565 | self.p_dropout = p_dropout 566 | self.resblock = resblock 567 | self.resblock_kernel_sizes = resblock_kernel_sizes 568 | self.resblock_dilation_sizes = resblock_dilation_sizes 569 | self.upsample_rates = upsample_rates 570 | self.upsample_initial_channel = upsample_initial_channel 571 | self.upsample_kernel_sizes = upsample_kernel_sizes 572 | self.segment_size = segment_size 573 | self.gin_channels = gin_channels 574 | # self.hop_length = hop_length# 575 | self.spk_embed_dim = spk_embed_dim 576 | if self.gin_channels == 256: 577 | self.enc_p = TextEncoder256( 578 | inter_channels, 579 | hidden_channels, 580 | filter_channels, 581 | n_heads, 582 | n_layers, 583 | kernel_size, 584 | p_dropout, 585 | ) 586 | else: 587 | self.enc_p = TextEncoder768( 588 | inter_channels, 589 | hidden_channels, 590 | filter_channels, 591 | n_heads, 592 | n_layers, 593 | kernel_size, 594 | p_dropout, 595 | ) 596 | self.dec = GeneratorNSF( 597 | inter_channels, 598 | resblock, 599 | resblock_kernel_sizes, 600 | resblock_dilation_sizes, 601 | upsample_rates, 602 | upsample_initial_channel, 603 | upsample_kernel_sizes, 604 | gin_channels=gin_channels, 605 | sr=sr, 606 | is_half=kwargs["is_half"], 607 | ) 608 | self.enc_q = PosteriorEncoder( 609 | spec_channels, 610 | inter_channels, 611 | hidden_channels, 612 | 5, 613 | 1, 614 | 16, 615 | gin_channels=gin_channels, 616 | ) 617 | self.flow = ResidualCouplingBlock( 618 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 619 | ) 620 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 621 | self.speaker_map = None 622 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 623 | 624 | def remove_weight_norm(self): 625 | self.dec.remove_weight_norm() 626 | self.flow.remove_weight_norm() 627 | self.enc_q.remove_weight_norm() 628 | 629 | def construct_spkmixmap(self, n_speaker): 630 | self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels)) 631 | for i in range(n_speaker): 632 | self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) 633 | self.speaker_map = self.speaker_map.unsqueeze(0) 634 | 635 | def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None): 636 | if self.speaker_map is not None: # [N, S] * [S, B, 1, H] 637 | g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] 638 | g = g * self.speaker_map # [N, S, B, 1, H] 639 | g = torch.sum(g, dim=1) # [N, 1, B, 1, H] 640 | g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N] 641 | else: 642 | g = g.unsqueeze(0) 643 | g = self.emb_g(g).transpose(1, 2) 644 | 645 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) 646 | z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask 647 | z = self.flow(z_p, x_mask, g=g, reverse=True) 648 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) 649 | return o 650 | 651 | 652 | class MultiPeriodDiscriminator(torch.nn.Module): 653 | def __init__(self, use_spectral_norm=False): 654 | super(MultiPeriodDiscriminator, self).__init__() 655 | periods = [2, 3, 5, 7, 11, 17] 656 | # periods = [3, 5, 7, 11, 17, 23, 37] 657 | 658 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 659 | discs = discs + [ 660 | 
DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 661 | ] 662 | self.discriminators = nn.ModuleList(discs) 663 | 664 | def forward(self, y, y_hat): 665 | y_d_rs = [] # 666 | y_d_gs = [] 667 | fmap_rs = [] 668 | fmap_gs = [] 669 | for i, d in enumerate(self.discriminators): 670 | y_d_r, fmap_r = d(y) 671 | y_d_g, fmap_g = d(y_hat) 672 | # for j in range(len(fmap_r)): 673 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 674 | y_d_rs.append(y_d_r) 675 | y_d_gs.append(y_d_g) 676 | fmap_rs.append(fmap_r) 677 | fmap_gs.append(fmap_g) 678 | 679 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 680 | 681 | 682 | class MultiPeriodDiscriminatorV2(torch.nn.Module): 683 | def __init__(self, use_spectral_norm=False): 684 | super(MultiPeriodDiscriminatorV2, self).__init__() 685 | # periods = [2, 3, 5, 7, 11, 17] 686 | periods = [2, 3, 5, 7, 11, 17, 23, 37] 687 | 688 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 689 | discs = discs + [ 690 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 691 | ] 692 | self.discriminators = nn.ModuleList(discs) 693 | 694 | def forward(self, y, y_hat): 695 | y_d_rs = [] # 696 | y_d_gs = [] 697 | fmap_rs = [] 698 | fmap_gs = [] 699 | for i, d in enumerate(self.discriminators): 700 | y_d_r, fmap_r = d(y) 701 | y_d_g, fmap_g = d(y_hat) 702 | # for j in range(len(fmap_r)): 703 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 704 | y_d_rs.append(y_d_r) 705 | y_d_gs.append(y_d_g) 706 | fmap_rs.append(fmap_r) 707 | fmap_gs.append(fmap_g) 708 | 709 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 710 | 711 | 712 | class DiscriminatorS(torch.nn.Module): 713 | def __init__(self, use_spectral_norm=False): 714 | super(DiscriminatorS, self).__init__() 715 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 716 | self.convs = nn.ModuleList( 717 | [ 718 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 719 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 720 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 721 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 722 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 723 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 724 | ] 725 | ) 726 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 727 | 728 | def forward(self, x): 729 | fmap = [] 730 | 731 | for l in self.convs: 732 | x = l(x) 733 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 734 | fmap.append(x) 735 | x = self.conv_post(x) 736 | fmap.append(x) 737 | x = torch.flatten(x, 1, -1) 738 | 739 | return x, fmap 740 | 741 | 742 | class DiscriminatorP(torch.nn.Module): 743 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 744 | super(DiscriminatorP, self).__init__() 745 | self.period = period 746 | self.use_spectral_norm = use_spectral_norm 747 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 748 | self.convs = nn.ModuleList( 749 | [ 750 | norm_f( 751 | Conv2d( 752 | 1, 753 | 32, 754 | (kernel_size, 1), 755 | (stride, 1), 756 | padding=(get_padding(kernel_size, 1), 0), 757 | ) 758 | ), 759 | norm_f( 760 | Conv2d( 761 | 32, 762 | 128, 763 | (kernel_size, 1), 764 | (stride, 1), 765 | padding=(get_padding(kernel_size, 1), 0), 766 | ) 767 | ), 768 | norm_f( 769 | Conv2d( 770 | 128, 771 | 512, 772 | (kernel_size, 1), 773 | (stride, 1), 774 | padding=(get_padding(kernel_size, 1), 0), 775 | ) 776 | ), 777 | norm_f( 778 | Conv2d( 779 | 512, 780 | 1024, 781 | (kernel_size, 1), 782 | (stride, 1), 783 | 
padding=(get_padding(kernel_size, 1), 0), 784 | ) 785 | ), 786 | norm_f( 787 | Conv2d( 788 | 1024, 789 | 1024, 790 | (kernel_size, 1), 791 | 1, 792 | padding=(get_padding(kernel_size, 1), 0), 793 | ) 794 | ), 795 | ] 796 | ) 797 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 798 | 799 | def forward(self, x): 800 | fmap = [] 801 | 802 | # 1d to 2d 803 | b, c, t = x.shape 804 | if t % self.period != 0: # pad first 805 | n_pad = self.period - (t % self.period) 806 | x = F.pad(x, (0, n_pad), "reflect") 807 | t = t + n_pad 808 | x = x.view(b, c, t // self.period, self.period) 809 | 810 | for l in self.convs: 811 | x = l(x) 812 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 813 | fmap.append(x) 814 | x = self.conv_post(x) 815 | fmap.append(x) 816 | x = torch.flatten(x, 1, -1) 817 | 818 | return x, fmap 819 | -------------------------------------------------------------------------------- /infer_pack/models_onnx_moess.py: -------------------------------------------------------------------------------- 1 | import math, pdb, os 2 | from time import time as ttime 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from infer_pack import modules 7 | from infer_pack import attentions 8 | from infer_pack import commons 9 | from infer_pack.commons import init_weights, get_padding 10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 11 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 12 | from infer_pack.commons import init_weights 13 | import numpy as np 14 | from infer_pack import commons 15 | 16 | 17 | class TextEncoder256(nn.Module): 18 | def __init__( 19 | self, 20 | out_channels, 21 | hidden_channels, 22 | filter_channels, 23 | n_heads, 24 | n_layers, 25 | kernel_size, 26 | p_dropout, 27 | f0=True, 28 | ): 29 | super().__init__() 30 | self.out_channels = out_channels 31 | self.hidden_channels = hidden_channels 32 | self.filter_channels = filter_channels 33 | self.n_heads = n_heads 34 | self.n_layers = n_layers 35 | self.kernel_size = kernel_size 36 | self.p_dropout = p_dropout 37 | self.emb_phone = nn.Linear(256, hidden_channels) 38 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 39 | if f0 == True: 40 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 41 | self.encoder = attentions.Encoder( 42 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 43 | ) 44 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 45 | 46 | def forward(self, phone, pitch, lengths): 47 | if pitch == None: 48 | x = self.emb_phone(phone) 49 | else: 50 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 51 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 52 | x = self.lrelu(x) 53 | x = torch.transpose(x, 1, -1) # [b, h, t] 54 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 55 | x.dtype 56 | ) 57 | x = self.encoder(x * x_mask, x_mask) 58 | stats = self.proj(x) * x_mask 59 | 60 | m, logs = torch.split(stats, self.out_channels, dim=1) 61 | return m, logs, x_mask 62 | 63 | 64 | class TextEncoder256Sim(nn.Module): 65 | def __init__( 66 | self, 67 | out_channels, 68 | hidden_channels, 69 | filter_channels, 70 | n_heads, 71 | n_layers, 72 | kernel_size, 73 | p_dropout, 74 | f0=True, 75 | ): 76 | super().__init__() 77 | self.out_channels = out_channels 78 | self.hidden_channels = hidden_channels 79 | self.filter_channels = filter_channels 80 | self.n_heads = n_heads 81 | self.n_layers = n_layers 82 | self.kernel_size = kernel_size 83 | self.p_dropout = 
p_dropout 84 | self.emb_phone = nn.Linear(256, hidden_channels) 85 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 86 | if f0 == True: 87 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 88 | self.encoder = attentions.Encoder( 89 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 90 | ) 91 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 92 | 93 | def forward(self, phone, pitch, lengths): 94 | if pitch == None: 95 | x = self.emb_phone(phone) 96 | else: 97 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 98 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 99 | x = self.lrelu(x) 100 | x = torch.transpose(x, 1, -1) # [b, h, t] 101 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 102 | x.dtype 103 | ) 104 | x = self.encoder(x * x_mask, x_mask) 105 | x = self.proj(x) * x_mask 106 | return x, x_mask 107 | 108 | 109 | class ResidualCouplingBlock(nn.Module): 110 | def __init__( 111 | self, 112 | channels, 113 | hidden_channels, 114 | kernel_size, 115 | dilation_rate, 116 | n_layers, 117 | n_flows=4, 118 | gin_channels=0, 119 | ): 120 | super().__init__() 121 | self.channels = channels 122 | self.hidden_channels = hidden_channels 123 | self.kernel_size = kernel_size 124 | self.dilation_rate = dilation_rate 125 | self.n_layers = n_layers 126 | self.n_flows = n_flows 127 | self.gin_channels = gin_channels 128 | 129 | self.flows = nn.ModuleList() 130 | for i in range(n_flows): 131 | self.flows.append( 132 | modules.ResidualCouplingLayer( 133 | channels, 134 | hidden_channels, 135 | kernel_size, 136 | dilation_rate, 137 | n_layers, 138 | gin_channels=gin_channels, 139 | mean_only=True, 140 | ) 141 | ) 142 | self.flows.append(modules.Flip()) 143 | 144 | def forward(self, x, x_mask, g=None, reverse=False): 145 | if not reverse: 146 | for flow in self.flows: 147 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 148 | else: 149 | for flow in reversed(self.flows): 150 | x = flow(x, x_mask, g=g, reverse=reverse) 151 | return x 152 | 153 | def remove_weight_norm(self): 154 | for i in range(self.n_flows): 155 | self.flows[i * 2].remove_weight_norm() 156 | 157 | 158 | class PosteriorEncoder(nn.Module): 159 | def __init__( 160 | self, 161 | in_channels, 162 | out_channels, 163 | hidden_channels, 164 | kernel_size, 165 | dilation_rate, 166 | n_layers, 167 | gin_channels=0, 168 | ): 169 | super().__init__() 170 | self.in_channels = in_channels 171 | self.out_channels = out_channels 172 | self.hidden_channels = hidden_channels 173 | self.kernel_size = kernel_size 174 | self.dilation_rate = dilation_rate 175 | self.n_layers = n_layers 176 | self.gin_channels = gin_channels 177 | 178 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 179 | self.enc = modules.WN( 180 | hidden_channels, 181 | kernel_size, 182 | dilation_rate, 183 | n_layers, 184 | gin_channels=gin_channels, 185 | ) 186 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 187 | 188 | def forward(self, x, x_lengths, g=None): 189 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 190 | x.dtype 191 | ) 192 | x = self.pre(x) * x_mask 193 | x = self.enc(x, x_mask, g=g) 194 | stats = self.proj(x) * x_mask 195 | m, logs = torch.split(stats, self.out_channels, dim=1) 196 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 197 | return z, m, logs, x_mask 198 | 199 | def remove_weight_norm(self): 200 | self.enc.remove_weight_norm() 201 | 202 | 203 | class Generator(torch.nn.Module): 204 | def __init__( 205 | self, 206 | 
initial_channel, 207 | resblock, 208 | resblock_kernel_sizes, 209 | resblock_dilation_sizes, 210 | upsample_rates, 211 | upsample_initial_channel, 212 | upsample_kernel_sizes, 213 | gin_channels=0, 214 | ): 215 | super(Generator, self).__init__() 216 | self.num_kernels = len(resblock_kernel_sizes) 217 | self.num_upsamples = len(upsample_rates) 218 | self.conv_pre = Conv1d( 219 | initial_channel, upsample_initial_channel, 7, 1, padding=3 220 | ) 221 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 222 | 223 | self.ups = nn.ModuleList() 224 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 225 | self.ups.append( 226 | weight_norm( 227 | ConvTranspose1d( 228 | upsample_initial_channel // (2**i), 229 | upsample_initial_channel // (2 ** (i + 1)), 230 | k, 231 | u, 232 | padding=(k - u) // 2, 233 | ) 234 | ) 235 | ) 236 | 237 | self.resblocks = nn.ModuleList() 238 | for i in range(len(self.ups)): 239 | ch = upsample_initial_channel // (2 ** (i + 1)) 240 | for j, (k, d) in enumerate( 241 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 242 | ): 243 | self.resblocks.append(resblock(ch, k, d)) 244 | 245 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 246 | self.ups.apply(init_weights) 247 | 248 | if gin_channels != 0: 249 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 250 | 251 | def forward(self, x, g=None): 252 | x = self.conv_pre(x) 253 | if g is not None: 254 | x = x + self.cond(g) 255 | 256 | for i in range(self.num_upsamples): 257 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 258 | x = self.ups[i](x) 259 | xs = None 260 | for j in range(self.num_kernels): 261 | if xs is None: 262 | xs = self.resblocks[i * self.num_kernels + j](x) 263 | else: 264 | xs += self.resblocks[i * self.num_kernels + j](x) 265 | x = xs / self.num_kernels 266 | x = F.leaky_relu(x) 267 | x = self.conv_post(x) 268 | x = torch.tanh(x) 269 | 270 | return x 271 | 272 | def remove_weight_norm(self): 273 | for l in self.ups: 274 | remove_weight_norm(l) 275 | for l in self.resblocks: 276 | l.remove_weight_norm() 277 | 278 | 279 | class SineGen(torch.nn.Module): 280 | """Definition of sine generator 281 | SineGen(samp_rate, harmonic_num = 0, 282 | sine_amp = 0.1, noise_std = 0.003, 283 | voiced_threshold = 0, 284 | flag_for_pulse=False) 285 | samp_rate: sampling rate in Hz 286 | harmonic_num: number of harmonic overtones (default 0) 287 | sine_amp: amplitude of sine-wavefrom (default 0.1) 288 | noise_std: std of Gaussian noise (default 0.003) 289 | voiced_thoreshold: F0 threshold for U/V classification (default 0) 290 | flag_for_pulse: this SinGen is used inside PulseGen (default False) 291 | Note: when flag_for_pulse is True, the first time step of a voiced 292 | segment is always sin(np.pi) or cos(0) 293 | """ 294 | 295 | def __init__( 296 | self, 297 | samp_rate, 298 | harmonic_num=0, 299 | sine_amp=0.1, 300 | noise_std=0.003, 301 | voiced_threshold=0, 302 | flag_for_pulse=False, 303 | ): 304 | super(SineGen, self).__init__() 305 | self.sine_amp = sine_amp 306 | self.noise_std = noise_std 307 | self.harmonic_num = harmonic_num 308 | self.dim = self.harmonic_num + 1 309 | self.sampling_rate = samp_rate 310 | self.voiced_threshold = voiced_threshold 311 | 312 | def _f02uv(self, f0): 313 | # generate uv signal 314 | uv = torch.ones_like(f0) 315 | uv = uv * (f0 > self.voiced_threshold) 316 | return uv 317 | 318 | def forward(self, f0, upp): 319 | """sine_tensor, uv = forward(f0) 320 | input F0: tensor(batchsize=1, length, dim=1) 321 | f0 for 
unvoiced steps should be 0 322 | output sine_tensor: tensor(batchsize=1, length, dim) 323 | output uv: tensor(batchsize=1, length, 1) 324 | """ 325 | with torch.no_grad(): 326 | f0 = f0[:, None].transpose(1, 2) 327 | f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) 328 | # fundamental component 329 | f0_buf[:, :, 0] = f0[:, :, 0] 330 | for idx in np.arange(self.harmonic_num): 331 | f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( 332 | idx + 2 333 | ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic 334 | rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 335 | rand_ini = torch.rand( 336 | f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device 337 | ) 338 | rand_ini[:, 0] = 0 339 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 340 | tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 341 | tmp_over_one *= upp 342 | tmp_over_one = F.interpolate( 343 | tmp_over_one.transpose(2, 1), 344 | scale_factor=upp, 345 | mode="linear", 346 | align_corners=True, 347 | ).transpose(2, 1) 348 | rad_values = F.interpolate( 349 | rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" 350 | ).transpose( 351 | 2, 1 352 | ) ####### 353 | tmp_over_one %= 1 354 | tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 355 | cumsum_shift = torch.zeros_like(rad_values) 356 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 357 | sine_waves = torch.sin( 358 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi 359 | ) 360 | sine_waves = sine_waves * self.sine_amp 361 | uv = self._f02uv(f0) 362 | uv = F.interpolate( 363 | uv.transpose(2, 1), scale_factor=upp, mode="nearest" 364 | ).transpose(2, 1) 365 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 366 | noise = noise_amp * torch.randn_like(sine_waves) 367 | sine_waves = sine_waves * uv + noise 368 | return sine_waves, uv, noise 369 | 370 | 371 | class SourceModuleHnNSF(torch.nn.Module): 372 | """SourceModule for hn-nsf 373 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, 374 | add_noise_std=0.003, voiced_threshod=0) 375 | sampling_rate: sampling_rate in Hz 376 | harmonic_num: number of harmonic above F0 (default: 0) 377 | sine_amp: amplitude of sine source signal (default: 0.1) 378 | add_noise_std: std of additive Gaussian noise (default: 0.003) 379 | note that amplitude of noise in unvoiced is decided 380 | by sine_amp 381 | voiced_threshold: threhold to set U/V given F0 (default: 0) 382 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) 383 | F0_sampled (batchsize, length, 1) 384 | Sine_source (batchsize, length, 1) 385 | noise_source (batchsize, length 1) 386 | uv (batchsize, length, 1) 387 | """ 388 | 389 | def __init__( 390 | self, 391 | sampling_rate, 392 | harmonic_num=0, 393 | sine_amp=0.1, 394 | add_noise_std=0.003, 395 | voiced_threshod=0, 396 | is_half=True, 397 | ): 398 | super(SourceModuleHnNSF, self).__init__() 399 | 400 | self.sine_amp = sine_amp 401 | self.noise_std = add_noise_std 402 | self.is_half = is_half 403 | # to produce sine waveforms 404 | self.l_sin_gen = SineGen( 405 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod 406 | ) 407 | 408 | # to merge source harmonics into a single excitation 409 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) 410 | self.l_tanh = torch.nn.Tanh() 411 | 412 | def forward(self, x, upp=None): 413 | sine_wavs, uv, _ = self.l_sin_gen(x, upp) 414 | if self.is_half: 415 | sine_wavs = sine_wavs.half() 416 | sine_merge = 
self.l_tanh(self.l_linear(sine_wavs)) 417 | return sine_merge, None, None # noise, uv 418 | 419 | 420 | class GeneratorNSF(torch.nn.Module): 421 | def __init__( 422 | self, 423 | initial_channel, 424 | resblock, 425 | resblock_kernel_sizes, 426 | resblock_dilation_sizes, 427 | upsample_rates, 428 | upsample_initial_channel, 429 | upsample_kernel_sizes, 430 | gin_channels, 431 | sr, 432 | is_half=False, 433 | ): 434 | super(GeneratorNSF, self).__init__() 435 | self.num_kernels = len(resblock_kernel_sizes) 436 | self.num_upsamples = len(upsample_rates) 437 | 438 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) 439 | self.m_source = SourceModuleHnNSF( 440 | sampling_rate=sr, harmonic_num=0, is_half=is_half 441 | ) 442 | self.noise_convs = nn.ModuleList() 443 | self.conv_pre = Conv1d( 444 | initial_channel, upsample_initial_channel, 7, 1, padding=3 445 | ) 446 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 447 | 448 | self.ups = nn.ModuleList() 449 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 450 | c_cur = upsample_initial_channel // (2 ** (i + 1)) 451 | self.ups.append( 452 | weight_norm( 453 | ConvTranspose1d( 454 | upsample_initial_channel // (2**i), 455 | upsample_initial_channel // (2 ** (i + 1)), 456 | k, 457 | u, 458 | padding=(k - u) // 2, 459 | ) 460 | ) 461 | ) 462 | if i + 1 < len(upsample_rates): 463 | stride_f0 = np.prod(upsample_rates[i + 1 :]) 464 | self.noise_convs.append( 465 | Conv1d( 466 | 1, 467 | c_cur, 468 | kernel_size=stride_f0 * 2, 469 | stride=stride_f0, 470 | padding=stride_f0 // 2, 471 | ) 472 | ) 473 | else: 474 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 475 | 476 | self.resblocks = nn.ModuleList() 477 | for i in range(len(self.ups)): 478 | ch = upsample_initial_channel // (2 ** (i + 1)) 479 | for j, (k, d) in enumerate( 480 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 481 | ): 482 | self.resblocks.append(resblock(ch, k, d)) 483 | 484 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 485 | self.ups.apply(init_weights) 486 | 487 | if gin_channels != 0: 488 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 489 | 490 | self.upp = np.prod(upsample_rates) 491 | 492 | def forward(self, x, f0, g=None): 493 | har_source, noi_source, uv = self.m_source(f0, self.upp) 494 | har_source = har_source.transpose(1, 2) 495 | x = self.conv_pre(x) 496 | if g is not None: 497 | x = x + self.cond(g) 498 | 499 | for i in range(self.num_upsamples): 500 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 501 | x = self.ups[i](x) 502 | x_source = self.noise_convs[i](har_source) 503 | x = x + x_source 504 | xs = None 505 | for j in range(self.num_kernels): 506 | if xs is None: 507 | xs = self.resblocks[i * self.num_kernels + j](x) 508 | else: 509 | xs += self.resblocks[i * self.num_kernels + j](x) 510 | x = xs / self.num_kernels 511 | x = F.leaky_relu(x) 512 | x = self.conv_post(x) 513 | x = torch.tanh(x) 514 | return x 515 | 516 | def remove_weight_norm(self): 517 | for l in self.ups: 518 | remove_weight_norm(l) 519 | for l in self.resblocks: 520 | l.remove_weight_norm() 521 | 522 | 523 | sr2sr = { 524 | "32k": 32000, 525 | "40k": 40000, 526 | "48k": 48000, 527 | } 528 | 529 | 530 | class SynthesizerTrnMs256NSFsidM(nn.Module): 531 | def __init__( 532 | self, 533 | spec_channels, 534 | segment_size, 535 | inter_channels, 536 | hidden_channels, 537 | filter_channels, 538 | n_heads, 539 | n_layers, 540 | kernel_size, 541 | p_dropout, 542 | resblock, 543 | 
resblock_kernel_sizes, 544 | resblock_dilation_sizes, 545 | upsample_rates, 546 | upsample_initial_channel, 547 | upsample_kernel_sizes, 548 | spk_embed_dim, 549 | gin_channels, 550 | sr, 551 | **kwargs 552 | ): 553 | super().__init__() 554 | if type(sr) == type("strr"): 555 | sr = sr2sr[sr] 556 | self.spec_channels = spec_channels 557 | self.inter_channels = inter_channels 558 | self.hidden_channels = hidden_channels 559 | self.filter_channels = filter_channels 560 | self.n_heads = n_heads 561 | self.n_layers = n_layers 562 | self.kernel_size = kernel_size 563 | self.p_dropout = p_dropout 564 | self.resblock = resblock 565 | self.resblock_kernel_sizes = resblock_kernel_sizes 566 | self.resblock_dilation_sizes = resblock_dilation_sizes 567 | self.upsample_rates = upsample_rates 568 | self.upsample_initial_channel = upsample_initial_channel 569 | self.upsample_kernel_sizes = upsample_kernel_sizes 570 | self.segment_size = segment_size 571 | self.gin_channels = gin_channels 572 | # self.hop_length = hop_length# 573 | self.spk_embed_dim = spk_embed_dim 574 | self.enc_p = TextEncoder256( 575 | inter_channels, 576 | hidden_channels, 577 | filter_channels, 578 | n_heads, 579 | n_layers, 580 | kernel_size, 581 | p_dropout, 582 | ) 583 | self.dec = GeneratorNSF( 584 | inter_channels, 585 | resblock, 586 | resblock_kernel_sizes, 587 | resblock_dilation_sizes, 588 | upsample_rates, 589 | upsample_initial_channel, 590 | upsample_kernel_sizes, 591 | gin_channels=gin_channels, 592 | sr=sr, 593 | is_half=kwargs["is_half"], 594 | ) 595 | self.enc_q = PosteriorEncoder( 596 | spec_channels, 597 | inter_channels, 598 | hidden_channels, 599 | 5, 600 | 1, 601 | 16, 602 | gin_channels=gin_channels, 603 | ) 604 | self.flow = ResidualCouplingBlock( 605 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 606 | ) 607 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 608 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 609 | 610 | def remove_weight_norm(self): 611 | self.dec.remove_weight_norm() 612 | self.flow.remove_weight_norm() 613 | self.enc_q.remove_weight_norm() 614 | 615 | def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None): 616 | g = self.emb_g(sid).unsqueeze(-1) 617 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) 618 | z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask 619 | z = self.flow(z_p, x_mask, g=g, reverse=True) 620 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) 621 | return o 622 | 623 | 624 | class SynthesizerTrnMs256NSFsid_sim(nn.Module): 625 | """ 626 | Synthesizer for Training 627 | """ 628 | 629 | def __init__( 630 | self, 631 | spec_channels, 632 | segment_size, 633 | inter_channels, 634 | hidden_channels, 635 | filter_channels, 636 | n_heads, 637 | n_layers, 638 | kernel_size, 639 | p_dropout, 640 | resblock, 641 | resblock_kernel_sizes, 642 | resblock_dilation_sizes, 643 | upsample_rates, 644 | upsample_initial_channel, 645 | upsample_kernel_sizes, 646 | spk_embed_dim, 647 | # hop_length, 648 | gin_channels=0, 649 | use_sdp=True, 650 | **kwargs 651 | ): 652 | super().__init__() 653 | self.spec_channels = spec_channels 654 | self.inter_channels = inter_channels 655 | self.hidden_channels = hidden_channels 656 | self.filter_channels = filter_channels 657 | self.n_heads = n_heads 658 | self.n_layers = n_layers 659 | self.kernel_size = kernel_size 660 | self.p_dropout = p_dropout 661 | self.resblock = resblock 662 | self.resblock_kernel_sizes = resblock_kernel_sizes 663 | 
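# ---- annotation (editor's note, not part of the original file) ------------
# This "_sim" class is the simplified export variant: TextEncoder256Sim
# projects straight to features (there is no (m, logs) prior to sample
# from), so forward() below is fully deterministic: encoder output ->
# reverse flow -> NSF decoder. Two defects in the lines that follow are
# flagged there in comments: the GeneratorNSF(...) call omits the `sr`
# argument that GeneratorNSF requires, and remove_weight_norm() referenced
# an enc_q that this class never constructs.
# ----------------------------------------------------------------------------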
self.resblock_dilation_sizes = resblock_dilation_sizes
664 |         self.upsample_rates = upsample_rates
665 |         self.upsample_initial_channel = upsample_initial_channel
666 |         self.upsample_kernel_sizes = upsample_kernel_sizes
667 |         self.segment_size = segment_size
668 |         self.gin_channels = gin_channels
669 |         # self.hop_length = hop_length#
670 |         self.spk_embed_dim = spk_embed_dim
671 |         self.enc_p = TextEncoder256Sim(
672 |             inter_channels,
673 |             hidden_channels,
674 |             filter_channels,
675 |             n_heads,
676 |             n_layers,
677 |             kernel_size,
678 |             p_dropout,
679 |         )
680 |         self.dec = GeneratorNSF(
681 |             inter_channels,
682 |             resblock,
683 |             resblock_kernel_sizes,
684 |             resblock_dilation_sizes,
685 |             upsample_rates,
686 |             upsample_initial_channel,
687 |             upsample_kernel_sizes,
688 |             gin_channels=gin_channels,
689 |             is_half=kwargs["is_half"],  # caution: GeneratorNSF also requires an `sr` argument, which is not passed here
690 |         )
691 | 
692 |         self.flow = ResidualCouplingBlock(
693 |             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
694 |         )
695 |         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
696 |         print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
697 | 
698 |     def remove_weight_norm(self):
699 |         self.dec.remove_weight_norm()
700 |         self.flow.remove_weight_norm()
701 |         # this "_sim" variant builds no enc_q, so there is nothing further to strip here
702 | 
703 |     def forward(
704 |         self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
705 |     ):  # y (the spectrogram) is no longer needed here
706 |         g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast over t
707 |         x, x_mask = self.enc_p(phone, pitch, phone_lengths)
708 |         x = self.flow(x, x_mask, g=g, reverse=True)
709 |         o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
710 |         return o
711 | 
712 | 
713 | class MultiPeriodDiscriminator(torch.nn.Module):
714 |     def __init__(self, use_spectral_norm=False):
715 |         super(MultiPeriodDiscriminator, self).__init__()
716 |         periods = [2, 3, 5, 7, 11, 17]
717 |         # periods = [3, 5, 7, 11, 17, 23, 37]
718 | 
719 |         discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
720 |         discs = discs + [
721 |             DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
722 |         ]
723 |         self.discriminators = nn.ModuleList(discs)
724 | 
725 |     def forward(self, y, y_hat):
726 |         y_d_rs = []
727 |         y_d_gs = []
728 |         fmap_rs = []
729 |         fmap_gs = []
730 |         for i, d in enumerate(self.discriminators):
731 |             y_d_r, fmap_r = d(y)
732 |             y_d_g, fmap_g = d(y_hat)
733 |             # for j in range(len(fmap_r)):
734 |             #     print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
735 |             y_d_rs.append(y_d_r)
736 |             y_d_gs.append(y_d_g)
737 |             fmap_rs.append(fmap_r)
738 |             fmap_gs.append(fmap_g)
739 | 
740 |         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
741 | 
742 | 
743 | class DiscriminatorS(torch.nn.Module):
744 |     def __init__(self, use_spectral_norm=False):
745 |         super(DiscriminatorS, self).__init__()
746 |         norm_f = spectral_norm if use_spectral_norm else weight_norm
747 |         self.convs = nn.ModuleList(
748 |             [
749 |                 norm_f(Conv1d(1, 16, 15, 1, padding=7)),
750 |                 norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
751 |                 norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
752 |                 norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
753 |                 norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
754 |                 norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
755 |             ]
756 |         )
757 |         self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
758 | 
759 |     def forward(self, x):
760 |         fmap = []
761 | 
762 |         for l in self.convs:
763 |             x = l(x)
764 |             x = F.leaky_relu(x, modules.LRELU_SLOPE)
765 |             fmap.append(x)
766 |         x = self.conv_post(x)
767 |         fmap.append(x)
768 |         x
= torch.flatten(x, 1, -1) 769 | 770 | return x, fmap 771 | 772 | 773 | class DiscriminatorP(torch.nn.Module): 774 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 775 | super(DiscriminatorP, self).__init__() 776 | self.period = period 777 | self.use_spectral_norm = use_spectral_norm 778 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 779 | self.convs = nn.ModuleList( 780 | [ 781 | norm_f( 782 | Conv2d( 783 | 1, 784 | 32, 785 | (kernel_size, 1), 786 | (stride, 1), 787 | padding=(get_padding(kernel_size, 1), 0), 788 | ) 789 | ), 790 | norm_f( 791 | Conv2d( 792 | 32, 793 | 128, 794 | (kernel_size, 1), 795 | (stride, 1), 796 | padding=(get_padding(kernel_size, 1), 0), 797 | ) 798 | ), 799 | norm_f( 800 | Conv2d( 801 | 128, 802 | 512, 803 | (kernel_size, 1), 804 | (stride, 1), 805 | padding=(get_padding(kernel_size, 1), 0), 806 | ) 807 | ), 808 | norm_f( 809 | Conv2d( 810 | 512, 811 | 1024, 812 | (kernel_size, 1), 813 | (stride, 1), 814 | padding=(get_padding(kernel_size, 1), 0), 815 | ) 816 | ), 817 | norm_f( 818 | Conv2d( 819 | 1024, 820 | 1024, 821 | (kernel_size, 1), 822 | 1, 823 | padding=(get_padding(kernel_size, 1), 0), 824 | ) 825 | ), 826 | ] 827 | ) 828 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 829 | 830 | def forward(self, x): 831 | fmap = [] 832 | 833 | # 1d to 2d 834 | b, c, t = x.shape 835 | if t % self.period != 0: # pad first 836 | n_pad = self.period - (t % self.period) 837 | x = F.pad(x, (0, n_pad), "reflect") 838 | t = t + n_pad 839 | x = x.view(b, c, t // self.period, self.period) 840 | 841 | for l in self.convs: 842 | x = l(x) 843 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 844 | fmap.append(x) 845 | x = self.conv_post(x) 846 | fmap.append(x) 847 | x = torch.flatten(x, 1, -1) 848 | 849 | return x, fmap 850 | -------------------------------------------------------------------------------- /infer_pack/modelsv2.py: -------------------------------------------------------------------------------- 1 | import math, pdb, os 2 | from time import time as ttime 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from infer_pack import modules 7 | from infer_pack import attentions 8 | from infer_pack import commons 9 | from infer_pack.commons import init_weights, get_padding 10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 11 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 12 | from infer_pack.commons import init_weights 13 | import numpy as np 14 | from infer_pack import commons 15 | 16 | 17 | class TextEncoder256(nn.Module): 18 | def __init__( 19 | self, 20 | out_channels, 21 | hidden_channels, 22 | filter_channels, 23 | n_heads, 24 | n_layers, 25 | kernel_size, 26 | p_dropout, 27 | f0=True, 28 | ): 29 | super().__init__() 30 | self.out_channels = out_channels 31 | self.hidden_channels = hidden_channels 32 | self.filter_channels = filter_channels 33 | self.n_heads = n_heads 34 | self.n_layers = n_layers 35 | self.kernel_size = kernel_size 36 | self.p_dropout = p_dropout 37 | self.emb_phone = nn.Linear(256, hidden_channels) 38 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 39 | if f0 == True: 40 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 41 | self.encoder = attentions.Encoder( 42 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 43 | ) 44 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 45 | 46 | def forward(self, phone, pitch, lengths): 47 | if pitch == None: 
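# [annotation] Shape walk-through, under the usual conventions of this file:
# phone is [b, t, 256] HuBERT-style features, pitch is [b, t] integer bins
# (or None when f0=False), lengths is [b]. Phone and pitch embeddings are
# summed, scaled by sqrt(hidden_channels), run through the attention
# encoder, then projected to 2 * out_channels and split into the prior
# mean m and log-std logs, each [b, out_channels, t]. (`pitch is None`
# would be the idiomatic test below.)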
48 | x = self.emb_phone(phone) 49 | else: 50 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 51 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 52 | x = self.lrelu(x) 53 | x = torch.transpose(x, 1, -1) # [b, h, t] 54 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 55 | x.dtype 56 | ) 57 | x = self.encoder(x * x_mask, x_mask) 58 | stats = self.proj(x) * x_mask 59 | 60 | m, logs = torch.split(stats, self.out_channels, dim=1) 61 | return m, logs, x_mask 62 | class TextEncoder768(nn.Module): 63 | def __init__( 64 | self, 65 | out_channels, 66 | hidden_channels, 67 | filter_channels, 68 | n_heads, 69 | n_layers, 70 | kernel_size, 71 | p_dropout, 72 | f0=True, 73 | ): 74 | super().__init__() 75 | self.out_channels = out_channels 76 | self.hidden_channels = hidden_channels 77 | self.filter_channels = filter_channels 78 | self.n_heads = n_heads 79 | self.n_layers = n_layers 80 | self.kernel_size = kernel_size 81 | self.p_dropout = p_dropout 82 | self.emb_phone = nn.Linear(768, hidden_channels) 83 | self.lrelu = nn.LeakyReLU(0.1, inplace=True) 84 | if f0 == True: 85 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 86 | self.encoder = attentions.Encoder( 87 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout 88 | ) 89 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 90 | 91 | def forward(self, phone, pitch, lengths): 92 | if pitch == None: 93 | x = self.emb_phone(phone) 94 | else: 95 | x = self.emb_phone(phone) + self.emb_pitch(pitch) 96 | x = x * math.sqrt(self.hidden_channels) # [b, t, h] 97 | x = self.lrelu(x) 98 | x = torch.transpose(x, 1, -1) # [b, h, t] 99 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( 100 | x.dtype 101 | ) 102 | x = self.encoder(x * x_mask, x_mask) 103 | stats = self.proj(x) * x_mask 104 | 105 | m, logs = torch.split(stats, self.out_channels, dim=1) 106 | return m, logs, x_mask 107 | 108 | class ResidualCouplingBlock(nn.Module): 109 | def __init__( 110 | self, 111 | channels, 112 | hidden_channels, 113 | kernel_size, 114 | dilation_rate, 115 | n_layers, 116 | n_flows=4, 117 | gin_channels=0, 118 | ): 119 | super().__init__() 120 | self.channels = channels 121 | self.hidden_channels = hidden_channels 122 | self.kernel_size = kernel_size 123 | self.dilation_rate = dilation_rate 124 | self.n_layers = n_layers 125 | self.n_flows = n_flows 126 | self.gin_channels = gin_channels 127 | 128 | self.flows = nn.ModuleList() 129 | for i in range(n_flows): 130 | self.flows.append( 131 | modules.ResidualCouplingLayer( 132 | channels, 133 | hidden_channels, 134 | kernel_size, 135 | dilation_rate, 136 | n_layers, 137 | gin_channels=gin_channels, 138 | mean_only=True, 139 | ) 140 | ) 141 | self.flows.append(modules.Flip()) 142 | 143 | def forward(self, x, x_mask, g=None, reverse=False): 144 | if not reverse: 145 | for flow in self.flows: 146 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 147 | else: 148 | for flow in reversed(self.flows): 149 | x = flow(x, x_mask, g=g, reverse=reverse) 150 | return x 151 | 152 | def remove_weight_norm(self): 153 | for i in range(self.n_flows): 154 | self.flows[i * 2].remove_weight_norm() 155 | 156 | 157 | class PosteriorEncoder(nn.Module): 158 | def __init__( 159 | self, 160 | in_channels, 161 | out_channels, 162 | hidden_channels, 163 | kernel_size, 164 | dilation_rate, 165 | n_layers, 166 | gin_channels=0, 167 | ): 168 | super().__init__() 169 | self.in_channels = in_channels 170 | self.out_channels = out_channels 171 | 
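# [annotation] PosteriorEncoder implements q(z|y): a 1x1 pre-projection,
# a WaveNet stack (modules.WN, optionally conditioned on g), and a 1x1
# projection to 2 * out_channels split into (m, logs); forward() returns
# the reparameterized sample z = (m + randn_like(m) * exp(logs)) * mask.
# Hedged usage sketch (names assumed):
#   z, m_q, logs_q, y_mask = enc_q(spec, spec_lengths, g=g)
# with spec shaped [b, in_channels, t_spec].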
self.hidden_channels = hidden_channels 172 | self.kernel_size = kernel_size 173 | self.dilation_rate = dilation_rate 174 | self.n_layers = n_layers 175 | self.gin_channels = gin_channels 176 | 177 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 178 | self.enc = modules.WN( 179 | hidden_channels, 180 | kernel_size, 181 | dilation_rate, 182 | n_layers, 183 | gin_channels=gin_channels, 184 | ) 185 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 186 | 187 | def forward(self, x, x_lengths, g=None): 188 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( 189 | x.dtype 190 | ) 191 | x = self.pre(x) * x_mask 192 | x = self.enc(x, x_mask, g=g) 193 | stats = self.proj(x) * x_mask 194 | m, logs = torch.split(stats, self.out_channels, dim=1) 195 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 196 | return z, m, logs, x_mask 197 | 198 | def remove_weight_norm(self): 199 | self.enc.remove_weight_norm() 200 | 201 | 202 | class Generator(torch.nn.Module): 203 | def __init__( 204 | self, 205 | initial_channel, 206 | resblock, 207 | resblock_kernel_sizes, 208 | resblock_dilation_sizes, 209 | upsample_rates, 210 | upsample_initial_channel, 211 | upsample_kernel_sizes, 212 | gin_channels=0, 213 | ): 214 | super(Generator, self).__init__() 215 | self.num_kernels = len(resblock_kernel_sizes) 216 | self.num_upsamples = len(upsample_rates) 217 | self.conv_pre = Conv1d( 218 | initial_channel, upsample_initial_channel, 7, 1, padding=3 219 | ) 220 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 221 | 222 | self.ups = nn.ModuleList() 223 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 224 | self.ups.append( 225 | weight_norm( 226 | ConvTranspose1d( 227 | upsample_initial_channel // (2**i), 228 | upsample_initial_channel // (2 ** (i + 1)), 229 | k, 230 | u, 231 | padding=(k - u) // 2, 232 | ) 233 | ) 234 | ) 235 | 236 | self.resblocks = nn.ModuleList() 237 | for i in range(len(self.ups)): 238 | ch = upsample_initial_channel // (2 ** (i + 1)) 239 | for j, (k, d) in enumerate( 240 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 241 | ): 242 | self.resblocks.append(resblock(ch, k, d)) 243 | 244 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 245 | self.ups.apply(init_weights) 246 | 247 | if gin_channels != 0: 248 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 249 | 250 | def forward(self, x, g=None): 251 | x = self.conv_pre(x) 252 | if g is not None: 253 | x = x + self.cond(g) 254 | 255 | for i in range(self.num_upsamples): 256 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 257 | x = self.ups[i](x) 258 | xs = None 259 | for j in range(self.num_kernels): 260 | if xs is None: 261 | xs = self.resblocks[i * self.num_kernels + j](x) 262 | else: 263 | xs += self.resblocks[i * self.num_kernels + j](x) 264 | x = xs / self.num_kernels 265 | x = F.leaky_relu(x) 266 | x = self.conv_post(x) 267 | x = torch.tanh(x) 268 | 269 | return x 270 | 271 | def remove_weight_norm(self): 272 | for l in self.ups: 273 | remove_weight_norm(l) 274 | for l in self.resblocks: 275 | l.remove_weight_norm() 276 | 277 | 278 | class SineGen(torch.nn.Module): 279 | """Definition of sine generator 280 | SineGen(samp_rate, harmonic_num = 0, 281 | sine_amp = 0.1, noise_std = 0.003, 282 | voiced_threshold = 0, 283 | flag_for_pulse=False) 284 | samp_rate: sampling rate in Hz 285 | harmonic_num: number of harmonic overtones (default 0) 286 | sine_amp: amplitude of sine-wavefrom (default 0.1) 287 | noise_std: std of 
Gaussian noise (default 0.003) 288 | voiced_thoreshold: F0 threshold for U/V classification (default 0) 289 | flag_for_pulse: this SinGen is used inside PulseGen (default False) 290 | Note: when flag_for_pulse is True, the first time step of a voiced 291 | segment is always sin(np.pi) or cos(0) 292 | """ 293 | 294 | def __init__( 295 | self, 296 | samp_rate, 297 | harmonic_num=0, 298 | sine_amp=0.1, 299 | noise_std=0.003, 300 | voiced_threshold=0, 301 | flag_for_pulse=False, 302 | ): 303 | super(SineGen, self).__init__() 304 | self.sine_amp = sine_amp 305 | self.noise_std = noise_std 306 | self.harmonic_num = harmonic_num 307 | self.dim = self.harmonic_num + 1 308 | self.sampling_rate = samp_rate 309 | self.voiced_threshold = voiced_threshold 310 | 311 | def _f02uv(self, f0): 312 | # generate uv signal 313 | uv = torch.ones_like(f0) 314 | uv = uv * (f0 > self.voiced_threshold) 315 | return uv 316 | 317 | def forward(self, f0, upp): 318 | """sine_tensor, uv = forward(f0) 319 | input F0: tensor(batchsize=1, length, dim=1) 320 | f0 for unvoiced steps should be 0 321 | output sine_tensor: tensor(batchsize=1, length, dim) 322 | output uv: tensor(batchsize=1, length, 1) 323 | """ 324 | with torch.no_grad(): 325 | f0 = f0[:, None].transpose(1, 2) 326 | f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) 327 | # fundamental component 328 | f0_buf[:, :, 0] = f0[:, :, 0] 329 | for idx in np.arange(self.harmonic_num): 330 | f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( 331 | idx + 2 332 | ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic 333 | rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 334 | rand_ini = torch.rand( 335 | f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device 336 | ) 337 | rand_ini[:, 0] = 0 338 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 339 | tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 340 | tmp_over_one *= upp 341 | tmp_over_one = F.interpolate( 342 | tmp_over_one.transpose(2, 1), 343 | scale_factor=upp, 344 | mode="linear", 345 | align_corners=True, 346 | ).transpose(2, 1) 347 | rad_values = F.interpolate( 348 | rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" 349 | ).transpose( 350 | 2, 1 351 | ) ####### 352 | tmp_over_one %= 1 353 | tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 354 | cumsum_shift = torch.zeros_like(rad_values) 355 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 356 | sine_waves = torch.sin( 357 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi 358 | ) 359 | sine_waves = sine_waves * self.sine_amp 360 | uv = self._f02uv(f0) 361 | uv = F.interpolate( 362 | uv.transpose(2, 1), scale_factor=upp, mode="nearest" 363 | ).transpose(2, 1) 364 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 365 | noise = noise_amp * torch.randn_like(sine_waves) 366 | sine_waves = sine_waves * uv + noise 367 | return sine_waves, uv, noise 368 | 369 | 370 | class SourceModuleHnNSF(torch.nn.Module): 371 | """SourceModule for hn-nsf 372 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, 373 | add_noise_std=0.003, voiced_threshod=0) 374 | sampling_rate: sampling_rate in Hz 375 | harmonic_num: number of harmonic above F0 (default: 0) 376 | sine_amp: amplitude of sine source signal (default: 0.1) 377 | add_noise_std: std of additive Gaussian noise (default: 0.003) 378 | note that amplitude of noise in unvoiced is decided 379 | by sine_amp 380 | voiced_threshold: threhold to set U/V given F0 (default: 0) 381 | Sine_source, 
noise_source = SourceModuleHnNSF(F0_sampled) 382 | F0_sampled (batchsize, length, 1) 383 | Sine_source (batchsize, length, 1) 384 | noise_source (batchsize, length 1) 385 | uv (batchsize, length, 1) 386 | """ 387 | 388 | def __init__( 389 | self, 390 | sampling_rate, 391 | harmonic_num=0, 392 | sine_amp=0.1, 393 | add_noise_std=0.003, 394 | voiced_threshod=0, 395 | is_half=True, 396 | ): 397 | super(SourceModuleHnNSF, self).__init__() 398 | 399 | self.sine_amp = sine_amp 400 | self.noise_std = add_noise_std 401 | self.is_half = is_half 402 | # to produce sine waveforms 403 | self.l_sin_gen = SineGen( 404 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod 405 | ) 406 | 407 | # to merge source harmonics into a single excitation 408 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) 409 | self.l_tanh = torch.nn.Tanh() 410 | 411 | def forward(self, x, upp=None): 412 | sine_wavs, uv, _ = self.l_sin_gen(x, upp) 413 | if self.is_half: 414 | sine_wavs = sine_wavs.half() 415 | sine_merge = self.l_tanh(self.l_linear(sine_wavs)) 416 | return sine_merge, None, None # noise, uv 417 | 418 | 419 | class GeneratorNSF(torch.nn.Module): 420 | def __init__( 421 | self, 422 | initial_channel, 423 | resblock, 424 | resblock_kernel_sizes, 425 | resblock_dilation_sizes, 426 | upsample_rates, 427 | upsample_initial_channel, 428 | upsample_kernel_sizes, 429 | gin_channels, 430 | sr, 431 | is_half=False, 432 | ): 433 | super(GeneratorNSF, self).__init__() 434 | self.num_kernels = len(resblock_kernel_sizes) 435 | self.num_upsamples = len(upsample_rates) 436 | 437 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) 438 | self.m_source = SourceModuleHnNSF( 439 | sampling_rate=sr, harmonic_num=0, is_half=is_half 440 | ) 441 | self.noise_convs = nn.ModuleList() 442 | self.conv_pre = Conv1d( 443 | initial_channel, upsample_initial_channel, 7, 1, padding=3 444 | ) 445 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 446 | 447 | self.ups = nn.ModuleList() 448 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 449 | c_cur = upsample_initial_channel // (2 ** (i + 1)) 450 | self.ups.append( 451 | weight_norm( 452 | ConvTranspose1d( 453 | upsample_initial_channel // (2**i), 454 | upsample_initial_channel // (2 ** (i + 1)), 455 | k, 456 | u, 457 | padding=(k - u) // 2, 458 | ) 459 | ) 460 | ) 461 | if i + 1 < len(upsample_rates): 462 | stride_f0 = np.prod(upsample_rates[i + 1 :]) 463 | self.noise_convs.append( 464 | Conv1d( 465 | 1, 466 | c_cur, 467 | kernel_size=stride_f0 * 2, 468 | stride=stride_f0, 469 | padding=stride_f0 // 2, 470 | ) 471 | ) 472 | else: 473 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) 474 | 475 | self.resblocks = nn.ModuleList() 476 | for i in range(len(self.ups)): 477 | ch = upsample_initial_channel // (2 ** (i + 1)) 478 | for j, (k, d) in enumerate( 479 | zip(resblock_kernel_sizes, resblock_dilation_sizes) 480 | ): 481 | self.resblocks.append(resblock(ch, k, d)) 482 | 483 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 484 | self.ups.apply(init_weights) 485 | 486 | if gin_channels != 0: 487 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) 488 | 489 | self.upp = np.prod(upsample_rates) 490 | 491 | def forward(self, x, f0, g=None): 492 | har_source, noi_source, uv = self.m_source(f0, self.upp) 493 | har_source = har_source.transpose(1, 2) 494 | x = self.conv_pre(x) 495 | if g is not None: 496 | x = x + self.cond(g) 497 | 498 | for i in range(self.num_upsamples): 499 | 
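# [annotation] Per upsampling stage i, the channel width is
# upsample_initial_channel // 2**(i + 1). har_source was synthesized at the
# final audio rate (upp = prod(upsample_rates)); noise_convs[i] has stride
# prod(upsample_rates[i + 1:]) (kernel_size 1 on the last stage), which
# downsamples the harmonic excitation to exactly this stage's resolution
# before it is added in. Each stage then averages its num_kernels ResBlocks.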
x = F.leaky_relu(x, modules.LRELU_SLOPE) 500 | x = self.ups[i](x) 501 | x_source = self.noise_convs[i](har_source) 502 | x = x + x_source 503 | xs = None 504 | for j in range(self.num_kernels): 505 | if xs is None: 506 | xs = self.resblocks[i * self.num_kernels + j](x) 507 | else: 508 | xs += self.resblocks[i * self.num_kernels + j](x) 509 | x = xs / self.num_kernels 510 | x = F.leaky_relu(x) 511 | x = self.conv_post(x) 512 | x = torch.tanh(x) 513 | return x 514 | 515 | def remove_weight_norm(self): 516 | for l in self.ups: 517 | remove_weight_norm(l) 518 | for l in self.resblocks: 519 | l.remove_weight_norm() 520 | 521 | 522 | sr2sr = { 523 | "32k": 32000, 524 | "40k": 40000, 525 | "48k": 48000, 526 | } 527 | 528 | 529 | class SynthesizerTrnMs256NSFsid(nn.Module): 530 | def __init__( 531 | self, 532 | spec_channels, 533 | segment_size, 534 | inter_channels, 535 | hidden_channels, 536 | filter_channels, 537 | n_heads, 538 | n_layers, 539 | kernel_size, 540 | p_dropout, 541 | resblock, 542 | resblock_kernel_sizes, 543 | resblock_dilation_sizes, 544 | upsample_rates, 545 | upsample_initial_channel, 546 | upsample_kernel_sizes, 547 | spk_embed_dim, 548 | gin_channels, 549 | sr, 550 | **kwargs 551 | ): 552 | super().__init__() 553 | if type(sr) == type("strr"): 554 | sr = sr2sr[sr] 555 | self.spec_channels = spec_channels 556 | self.inter_channels = inter_channels 557 | self.hidden_channels = hidden_channels 558 | self.filter_channels = filter_channels 559 | self.n_heads = n_heads 560 | self.n_layers = n_layers 561 | self.kernel_size = kernel_size 562 | self.p_dropout = p_dropout 563 | self.resblock = resblock 564 | self.resblock_kernel_sizes = resblock_kernel_sizes 565 | self.resblock_dilation_sizes = resblock_dilation_sizes 566 | self.upsample_rates = upsample_rates 567 | self.upsample_initial_channel = upsample_initial_channel 568 | self.upsample_kernel_sizes = upsample_kernel_sizes 569 | self.segment_size = segment_size 570 | self.gin_channels = gin_channels 571 | # self.hop_length = hop_length# 572 | self.spk_embed_dim = spk_embed_dim 573 | self.enc_p = TextEncoder256( 574 | inter_channels, 575 | hidden_channels, 576 | filter_channels, 577 | n_heads, 578 | n_layers, 579 | kernel_size, 580 | p_dropout, 581 | ) 582 | self.dec = GeneratorNSF( 583 | inter_channels, 584 | resblock, 585 | resblock_kernel_sizes, 586 | resblock_dilation_sizes, 587 | upsample_rates, 588 | upsample_initial_channel, 589 | upsample_kernel_sizes, 590 | gin_channels=gin_channels, 591 | sr=sr, 592 | is_half=kwargs["is_half"], 593 | ) 594 | self.enc_q = PosteriorEncoder( 595 | spec_channels, 596 | inter_channels, 597 | hidden_channels, 598 | 5, 599 | 1, 600 | 16, 601 | gin_channels=gin_channels, 602 | ) 603 | self.flow = ResidualCouplingBlock( 604 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 605 | ) 606 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 607 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 608 | 609 | def remove_weight_norm(self): 610 | self.dec.remove_weight_norm() 611 | self.flow.remove_weight_norm() 612 | self.enc_q.remove_weight_norm() 613 | 614 | def forward( 615 | self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds 616 | ): # 这里ds是id,[bs,1] 617 | # print(1,pitch.shape)#[bs,t] 618 | g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 619 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) 620 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 621 | z_p = self.flow(z, y_mask, g=g) 622 | z_slice, 
ids_slice = commons.rand_slice_segments(
623 |             z, y_lengths, self.segment_size
624 |         )
625 |         # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
626 |         pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
627 |         # print(-2,pitchf.shape,z_slice.shape)
628 |         o = self.dec(z_slice, pitchf, g=g)
629 |         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
630 | 
631 |     def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
632 |         g = self.emb_g(sid).unsqueeze(-1)
633 |         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
634 |         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
635 |         z = self.flow(z_p, x_mask, g=g, reverse=True)
636 |         o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
637 |         return o, x_mask, (z, z_p, m_p, logs_p)
638 | class SynthesizerTrnMs768NSFsid(nn.Module):
639 |     def __init__(
640 |         self,
641 |         spec_channels,
642 |         segment_size,
643 |         inter_channels,
644 |         hidden_channels,
645 |         filter_channels,
646 |         n_heads,
647 |         n_layers,
648 |         kernel_size,
649 |         p_dropout,
650 |         resblock,
651 |         resblock_kernel_sizes,
652 |         resblock_dilation_sizes,
653 |         upsample_rates,
654 |         upsample_initial_channel,
655 |         upsample_kernel_sizes,
656 |         spk_embed_dim,
657 |         gin_channels,
658 |         sr,
659 |         **kwargs
660 |     ):
661 |         super().__init__()
662 |         if isinstance(sr, str):
663 |             sr = sr2sr[sr]
664 |         self.spec_channels = spec_channels
665 |         self.inter_channels = inter_channels
666 |         self.hidden_channels = hidden_channels
667 |         self.filter_channels = filter_channels
668 |         self.n_heads = n_heads
669 |         self.n_layers = n_layers
670 |         self.kernel_size = kernel_size
671 |         self.p_dropout = p_dropout
672 |         self.resblock = resblock
673 |         self.resblock_kernel_sizes = resblock_kernel_sizes
674 |         self.resblock_dilation_sizes = resblock_dilation_sizes
675 |         self.upsample_rates = upsample_rates
676 |         self.upsample_initial_channel = upsample_initial_channel
677 |         self.upsample_kernel_sizes = upsample_kernel_sizes
678 |         self.segment_size = segment_size
679 |         self.gin_channels = gin_channels
680 |         # self.hop_length = hop_length#
681 |         self.spk_embed_dim = spk_embed_dim
682 |         self.enc_p = TextEncoder768(
683 |             inter_channels,
684 |             hidden_channels,
685 |             filter_channels,
686 |             n_heads,
687 |             n_layers,
688 |             kernel_size,
689 |             p_dropout,
690 |         )
691 |         self.dec = GeneratorNSF(
692 |             inter_channels,
693 |             resblock,
694 |             resblock_kernel_sizes,
695 |             resblock_dilation_sizes,
696 |             upsample_rates,
697 |             upsample_initial_channel,
698 |             upsample_kernel_sizes,
699 |             gin_channels=gin_channels,
700 |             sr=sr,
701 |             is_half=kwargs["is_half"],
702 |         )
703 |         self.enc_q = PosteriorEncoder(
704 |             spec_channels,
705 |             inter_channels,
706 |             hidden_channels,
707 |             5,
708 |             1,
709 |             16,
710 |             gin_channels=gin_channels,
711 |         )
712 |         self.flow = ResidualCouplingBlock(
713 |             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
714 |         )
715 |         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
716 |         print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
717 | 
718 |     def remove_weight_norm(self):
719 |         self.dec.remove_weight_norm()
720 |         self.flow.remove_weight_norm()
721 |         self.enc_q.remove_weight_norm()
722 | 
723 |     def forward(
724 |         self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
725 |     ):  # ds is the speaker id, shape [bs, 1]
726 |         # print(1,pitch.shape)#[bs,t]
727 |         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast over t
728 |         m_p, logs_p, x_mask = self.enc_p(phone,
pitch, phone_lengths) 729 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 730 | z_p = self.flow(z, y_mask, g=g) 731 | z_slice, ids_slice = commons.rand_slice_segments( 732 | z, y_lengths, self.segment_size 733 | ) 734 | # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) 735 | pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) 736 | # print(-2,pitchf.shape,z_slice.shape) 737 | o = self.dec(z_slice, pitchf, g=g) 738 | return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) 739 | 740 | def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): 741 | g = self.emb_g(sid).unsqueeze(-1) 742 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) 743 | z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask 744 | z = self.flow(z_p, x_mask, g=g, reverse=True) 745 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) 746 | return o, x_mask, (z, z_p, m_p, logs_p) 747 | 748 | 749 | class SynthesizerTrnMs256NSFsid_nono(nn.Module): 750 | def __init__( 751 | self, 752 | spec_channels, 753 | segment_size, 754 | inter_channels, 755 | hidden_channels, 756 | filter_channels, 757 | n_heads, 758 | n_layers, 759 | kernel_size, 760 | p_dropout, 761 | resblock, 762 | resblock_kernel_sizes, 763 | resblock_dilation_sizes, 764 | upsample_rates, 765 | upsample_initial_channel, 766 | upsample_kernel_sizes, 767 | spk_embed_dim, 768 | gin_channels, 769 | sr=None, 770 | **kwargs 771 | ): 772 | super().__init__() 773 | self.spec_channels = spec_channels 774 | self.inter_channels = inter_channels 775 | self.hidden_channels = hidden_channels 776 | self.filter_channels = filter_channels 777 | self.n_heads = n_heads 778 | self.n_layers = n_layers 779 | self.kernel_size = kernel_size 780 | self.p_dropout = p_dropout 781 | self.resblock = resblock 782 | self.resblock_kernel_sizes = resblock_kernel_sizes 783 | self.resblock_dilation_sizes = resblock_dilation_sizes 784 | self.upsample_rates = upsample_rates 785 | self.upsample_initial_channel = upsample_initial_channel 786 | self.upsample_kernel_sizes = upsample_kernel_sizes 787 | self.segment_size = segment_size 788 | self.gin_channels = gin_channels 789 | # self.hop_length = hop_length# 790 | self.spk_embed_dim = spk_embed_dim 791 | self.enc_p = TextEncoder256( 792 | inter_channels, 793 | hidden_channels, 794 | filter_channels, 795 | n_heads, 796 | n_layers, 797 | kernel_size, 798 | p_dropout, 799 | f0=False, 800 | ) 801 | self.dec = Generator( 802 | inter_channels, 803 | resblock, 804 | resblock_kernel_sizes, 805 | resblock_dilation_sizes, 806 | upsample_rates, 807 | upsample_initial_channel, 808 | upsample_kernel_sizes, 809 | gin_channels=gin_channels, 810 | ) 811 | self.enc_q = PosteriorEncoder( 812 | spec_channels, 813 | inter_channels, 814 | hidden_channels, 815 | 5, 816 | 1, 817 | 16, 818 | gin_channels=gin_channels, 819 | ) 820 | self.flow = ResidualCouplingBlock( 821 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 822 | ) 823 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 824 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 825 | 826 | def remove_weight_norm(self): 827 | self.dec.remove_weight_norm() 828 | self.flow.remove_weight_norm() 829 | self.enc_q.remove_weight_norm() 830 | 831 | def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] 832 | g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 833 | m_p, logs_p, x_mask = 
self.enc_p(phone, None, phone_lengths) 834 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 835 | z_p = self.flow(z, y_mask, g=g) 836 | z_slice, ids_slice = commons.rand_slice_segments( 837 | z, y_lengths, self.segment_size 838 | ) 839 | o = self.dec(z_slice, g=g) 840 | return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) 841 | 842 | def infer(self, phone, phone_lengths, sid, max_len=None): 843 | g = self.emb_g(sid).unsqueeze(-1) 844 | m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) 845 | z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask 846 | z = self.flow(z_p, x_mask, g=g, reverse=True) 847 | o = self.dec((z * x_mask)[:, :, :max_len], g=g) 848 | return o, x_mask, (z, z_p, m_p, logs_p) 849 | class SynthesizerTrnMs768NSFsid_nono(nn.Module): 850 | def __init__( 851 | self, 852 | spec_channels, 853 | segment_size, 854 | inter_channels, 855 | hidden_channels, 856 | filter_channels, 857 | n_heads, 858 | n_layers, 859 | kernel_size, 860 | p_dropout, 861 | resblock, 862 | resblock_kernel_sizes, 863 | resblock_dilation_sizes, 864 | upsample_rates, 865 | upsample_initial_channel, 866 | upsample_kernel_sizes, 867 | spk_embed_dim, 868 | gin_channels, 869 | sr=None, 870 | **kwargs 871 | ): 872 | super().__init__() 873 | self.spec_channels = spec_channels 874 | self.inter_channels = inter_channels 875 | self.hidden_channels = hidden_channels 876 | self.filter_channels = filter_channels 877 | self.n_heads = n_heads 878 | self.n_layers = n_layers 879 | self.kernel_size = kernel_size 880 | self.p_dropout = p_dropout 881 | self.resblock = resblock 882 | self.resblock_kernel_sizes = resblock_kernel_sizes 883 | self.resblock_dilation_sizes = resblock_dilation_sizes 884 | self.upsample_rates = upsample_rates 885 | self.upsample_initial_channel = upsample_initial_channel 886 | self.upsample_kernel_sizes = upsample_kernel_sizes 887 | self.segment_size = segment_size 888 | self.gin_channels = gin_channels 889 | # self.hop_length = hop_length# 890 | self.spk_embed_dim = spk_embed_dim 891 | self.enc_p = TextEncoder768( 892 | inter_channels, 893 | hidden_channels, 894 | filter_channels, 895 | n_heads, 896 | n_layers, 897 | kernel_size, 898 | p_dropout, 899 | f0=False, 900 | ) 901 | self.dec = Generator( 902 | inter_channels, 903 | resblock, 904 | resblock_kernel_sizes, 905 | resblock_dilation_sizes, 906 | upsample_rates, 907 | upsample_initial_channel, 908 | upsample_kernel_sizes, 909 | gin_channels=gin_channels, 910 | ) 911 | self.enc_q = PosteriorEncoder( 912 | spec_channels, 913 | inter_channels, 914 | hidden_channels, 915 | 5, 916 | 1, 917 | 16, 918 | gin_channels=gin_channels, 919 | ) 920 | self.flow = ResidualCouplingBlock( 921 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels 922 | ) 923 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) 924 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) 925 | 926 | def remove_weight_norm(self): 927 | self.dec.remove_weight_norm() 928 | self.flow.remove_weight_norm() 929 | self.enc_q.remove_weight_norm() 930 | 931 | def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] 932 | g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 933 | m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) 934 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) 935 | z_p = self.flow(z, y_mask, g=g) 936 | z_slice, ids_slice = commons.rand_slice_segments( 937 | z, y_lengths, self.segment_size 938 | ) 939 | o = self.dec(z_slice, 
g=g) 940 | return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) 941 | 942 | def infer(self, phone, phone_lengths, sid, max_len=None): 943 | g = self.emb_g(sid).unsqueeze(-1) 944 | m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) 945 | z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask 946 | z = self.flow(z_p, x_mask, g=g, reverse=True) 947 | o = self.dec((z * x_mask)[:, :, :max_len], g=g) 948 | return o, x_mask, (z, z_p, m_p, logs_p) 949 | 950 | 951 | class MultiPeriodDiscriminator(torch.nn.Module): 952 | def __init__(self, use_spectral_norm=False): 953 | super(MultiPeriodDiscriminator, self).__init__() 954 | periods = [2, 3, 5, 7, 11, 17] 955 | # periods = [3, 5, 7, 11, 17, 23, 37] 956 | 957 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 958 | discs = discs + [ 959 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 960 | ] 961 | self.discriminators = nn.ModuleList(discs) 962 | 963 | def forward(self, y, y_hat): 964 | y_d_rs = [] # 965 | y_d_gs = [] 966 | fmap_rs = [] 967 | fmap_gs = [] 968 | for i, d in enumerate(self.discriminators): 969 | y_d_r, fmap_r = d(y) 970 | y_d_g, fmap_g = d(y_hat) 971 | # for j in range(len(fmap_r)): 972 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 973 | y_d_rs.append(y_d_r) 974 | y_d_gs.append(y_d_g) 975 | fmap_rs.append(fmap_r) 976 | fmap_gs.append(fmap_g) 977 | 978 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 979 | 980 | class MultiPeriodDiscriminatorV2(torch.nn.Module): 981 | def __init__(self, use_spectral_norm=False): 982 | super(MultiPeriodDiscriminatorV2, self).__init__() 983 | # periods = [2, 3, 5, 7, 11, 17] 984 | periods = [2,3, 5, 7, 11, 17, 23, 37] 985 | 986 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 987 | discs = discs + [ 988 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods 989 | ] 990 | self.discriminators = nn.ModuleList(discs) 991 | 992 | def forward(self, y, y_hat): 993 | y_d_rs = [] # 994 | y_d_gs = [] 995 | fmap_rs = [] 996 | fmap_gs = [] 997 | for i, d in enumerate(self.discriminators): 998 | y_d_r, fmap_r = d(y) 999 | y_d_g, fmap_g = d(y_hat) 1000 | # for j in range(len(fmap_r)): 1001 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) 1002 | y_d_rs.append(y_d_r) 1003 | y_d_gs.append(y_d_g) 1004 | fmap_rs.append(fmap_r) 1005 | fmap_gs.append(fmap_g) 1006 | 1007 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 1008 | 1009 | 1010 | class DiscriminatorS(torch.nn.Module): 1011 | def __init__(self, use_spectral_norm=False): 1012 | super(DiscriminatorS, self).__init__() 1013 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 1014 | self.convs = nn.ModuleList( 1015 | [ 1016 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 1017 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 1018 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 1019 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 1020 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 1021 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 1022 | ] 1023 | ) 1024 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 1025 | 1026 | def forward(self, x): 1027 | fmap = [] 1028 | 1029 | for l in self.convs: 1030 | x = l(x) 1031 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 1032 | fmap.append(x) 1033 | x = self.conv_post(x) 1034 | fmap.append(x) 1035 | x = torch.flatten(x, 1, -1) 1036 | 1037 | return x, fmap 1038 | 1039 | 1040 | class DiscriminatorP(torch.nn.Module): 1041 | def 
__init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 1042 | super(DiscriminatorP, self).__init__() 1043 | self.period = period 1044 | self.use_spectral_norm = use_spectral_norm 1045 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 1046 | self.convs = nn.ModuleList( 1047 | [ 1048 | norm_f( 1049 | Conv2d( 1050 | 1, 1051 | 32, 1052 | (kernel_size, 1), 1053 | (stride, 1), 1054 | padding=(get_padding(kernel_size, 1), 0), 1055 | ) 1056 | ), 1057 | norm_f( 1058 | Conv2d( 1059 | 32, 1060 | 128, 1061 | (kernel_size, 1), 1062 | (stride, 1), 1063 | padding=(get_padding(kernel_size, 1), 0), 1064 | ) 1065 | ), 1066 | norm_f( 1067 | Conv2d( 1068 | 128, 1069 | 512, 1070 | (kernel_size, 1), 1071 | (stride, 1), 1072 | padding=(get_padding(kernel_size, 1), 0), 1073 | ) 1074 | ), 1075 | norm_f( 1076 | Conv2d( 1077 | 512, 1078 | 1024, 1079 | (kernel_size, 1), 1080 | (stride, 1), 1081 | padding=(get_padding(kernel_size, 1), 0), 1082 | ) 1083 | ), 1084 | norm_f( 1085 | Conv2d( 1086 | 1024, 1087 | 1024, 1088 | (kernel_size, 1), 1089 | 1, 1090 | padding=(get_padding(kernel_size, 1), 0), 1091 | ) 1092 | ), 1093 | ] 1094 | ) 1095 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 1096 | 1097 | def forward(self, x): 1098 | fmap = [] 1099 | 1100 | # 1d to 2d 1101 | b, c, t = x.shape 1102 | if t % self.period != 0: # pad first 1103 | n_pad = self.period - (t % self.period) 1104 | x = F.pad(x, (0, n_pad), "reflect") 1105 | t = t + n_pad 1106 | x = x.view(b, c, t // self.period, self.period) 1107 | 1108 | for l in self.convs: 1109 | x = l(x) 1110 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 1111 | fmap.append(x) 1112 | x = self.conv_post(x) 1113 | fmap.append(x) 1114 | x = torch.flatten(x, 1, -1) 1115 | 1116 | return x, fmap 1117 | -------------------------------------------------------------------------------- /infer_pack/modules.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import scipy 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 10 | from torch.nn.utils import weight_norm, remove_weight_norm 11 | 12 | from infer_pack import commons 13 | from infer_pack.commons import init_weights, get_padding 14 | from infer_pack.transforms import piecewise_rational_quadratic_transform 15 | 16 | 17 | LRELU_SLOPE = 0.1 18 | 19 | 20 | class LayerNorm(nn.Module): 21 | def __init__(self, channels, eps=1e-5): 22 | super().__init__() 23 | self.channels = channels 24 | self.eps = eps 25 | 26 | self.gamma = nn.Parameter(torch.ones(channels)) 27 | self.beta = nn.Parameter(torch.zeros(channels)) 28 | 29 | def forward(self, x): 30 | x = x.transpose(1, -1) 31 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 32 | return x.transpose(1, -1) 33 | 34 | 35 | class ConvReluNorm(nn.Module): 36 | def __init__( 37 | self, 38 | in_channels, 39 | hidden_channels, 40 | out_channels, 41 | kernel_size, 42 | n_layers, 43 | p_dropout, 44 | ): 45 | super().__init__() 46 | self.in_channels = in_channels 47 | self.hidden_channels = hidden_channels 48 | self.out_channels = out_channels 49 | self.kernel_size = kernel_size 50 | self.n_layers = n_layers 51 | self.p_dropout = p_dropout 52 | assert n_layers > 1, "Number of layers should be larger than 0." 
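# The stack built below is: one conv mapping in_channels -> hidden_channels, then
# n_layers - 1 convs at hidden_channels (hence the assert above), each followed by
# LayerNorm, ReLU and dropout; a zero-initialized 1x1 projection feeds the residual
# connection in forward(). (Descriptive note added for readability.)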
53 | 54 | self.conv_layers = nn.ModuleList() 55 | self.norm_layers = nn.ModuleList() 56 | self.conv_layers.append( 57 | nn.Conv1d( 58 | in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 59 | ) 60 | ) 61 | self.norm_layers.append(LayerNorm(hidden_channels)) 62 | self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) 63 | for _ in range(n_layers - 1): 64 | self.conv_layers.append( 65 | nn.Conv1d( 66 | hidden_channels, 67 | hidden_channels, 68 | kernel_size, 69 | padding=kernel_size // 2, 70 | ) 71 | ) 72 | self.norm_layers.append(LayerNorm(hidden_channels)) 73 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 74 | self.proj.weight.data.zero_() 75 | self.proj.bias.data.zero_() 76 | 77 | def forward(self, x, x_mask): 78 | x_org = x 79 | for i in range(self.n_layers): 80 | x = self.conv_layers[i](x * x_mask) 81 | x = self.norm_layers[i](x) 82 | x = self.relu_drop(x) 83 | x = x_org + self.proj(x) 84 | return x * x_mask 85 | 86 | 87 | class DDSConv(nn.Module): 88 | """ 89 | Dialted and Depth-Separable Convolution 90 | """ 91 | 92 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): 93 | super().__init__() 94 | self.channels = channels 95 | self.kernel_size = kernel_size 96 | self.n_layers = n_layers 97 | self.p_dropout = p_dropout 98 | 99 | self.drop = nn.Dropout(p_dropout) 100 | self.convs_sep = nn.ModuleList() 101 | self.convs_1x1 = nn.ModuleList() 102 | self.norms_1 = nn.ModuleList() 103 | self.norms_2 = nn.ModuleList() 104 | for i in range(n_layers): 105 | dilation = kernel_size**i 106 | padding = (kernel_size * dilation - dilation) // 2 107 | self.convs_sep.append( 108 | nn.Conv1d( 109 | channels, 110 | channels, 111 | kernel_size, 112 | groups=channels, 113 | dilation=dilation, 114 | padding=padding, 115 | ) 116 | ) 117 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 118 | self.norms_1.append(LayerNorm(channels)) 119 | self.norms_2.append(LayerNorm(channels)) 120 | 121 | def forward(self, x, x_mask, g=None): 122 | if g is not None: 123 | x = x + g 124 | for i in range(self.n_layers): 125 | y = self.convs_sep[i](x * x_mask) 126 | y = self.norms_1[i](y) 127 | y = F.gelu(y) 128 | y = self.convs_1x1[i](y) 129 | y = self.norms_2[i](y) 130 | y = F.gelu(y) 131 | y = self.drop(y) 132 | x = x + y 133 | return x * x_mask 134 | 135 | 136 | class WN(torch.nn.Module): 137 | def __init__( 138 | self, 139 | hidden_channels, 140 | kernel_size, 141 | dilation_rate, 142 | n_layers, 143 | gin_channels=0, 144 | p_dropout=0, 145 | ): 146 | super(WN, self).__init__() 147 | assert kernel_size % 2 == 1 148 | self.hidden_channels = hidden_channels 149 | self.kernel_size = (kernel_size,) 150 | self.dilation_rate = dilation_rate 151 | self.n_layers = n_layers 152 | self.gin_channels = gin_channels 153 | self.p_dropout = p_dropout 154 | 155 | self.in_layers = torch.nn.ModuleList() 156 | self.res_skip_layers = torch.nn.ModuleList() 157 | self.drop = nn.Dropout(p_dropout) 158 | 159 | if gin_channels != 0: 160 | cond_layer = torch.nn.Conv1d( 161 | gin_channels, 2 * hidden_channels * n_layers, 1 162 | ) 163 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") 164 | 165 | for i in range(n_layers): 166 | dilation = dilation_rate**i 167 | padding = int((kernel_size * dilation - dilation) / 2) 168 | in_layer = torch.nn.Conv1d( 169 | hidden_channels, 170 | 2 * hidden_channels, 171 | kernel_size, 172 | dilation=dilation, 173 | padding=padding, 174 | ) 175 | in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") 176 | 
self.in_layers.append(in_layer) 177 | 178 | # last one is not necessary 179 | if i < n_layers - 1: 180 | res_skip_channels = 2 * hidden_channels 181 | else: 182 | res_skip_channels = hidden_channels 183 | 184 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 185 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") 186 | self.res_skip_layers.append(res_skip_layer) 187 | 188 | def forward(self, x, x_mask, g=None, **kwargs): 189 | output = torch.zeros_like(x) 190 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 191 | 192 | if g is not None: 193 | g = self.cond_layer(g) 194 | 195 | for i in range(self.n_layers): 196 | x_in = self.in_layers[i](x) 197 | if g is not None: 198 | cond_offset = i * 2 * self.hidden_channels 199 | g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] 200 | else: 201 | g_l = torch.zeros_like(x_in) 202 | 203 | acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) 204 | acts = self.drop(acts) 205 | 206 | res_skip_acts = self.res_skip_layers[i](acts) 207 | if i < self.n_layers - 1: 208 | res_acts = res_skip_acts[:, : self.hidden_channels, :] 209 | x = (x + res_acts) * x_mask 210 | output = output + res_skip_acts[:, self.hidden_channels :, :] 211 | else: 212 | output = output + res_skip_acts 213 | return output * x_mask 214 | 215 | def remove_weight_norm(self): 216 | if self.gin_channels != 0: 217 | torch.nn.utils.remove_weight_norm(self.cond_layer) 218 | for l in self.in_layers: 219 | torch.nn.utils.remove_weight_norm(l) 220 | for l in self.res_skip_layers: 221 | torch.nn.utils.remove_weight_norm(l) 222 | 223 | 224 | class ResBlock1(torch.nn.Module): 225 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 226 | super(ResBlock1, self).__init__() 227 | self.convs1 = nn.ModuleList( 228 | [ 229 | weight_norm( 230 | Conv1d( 231 | channels, 232 | channels, 233 | kernel_size, 234 | 1, 235 | dilation=dilation[0], 236 | padding=get_padding(kernel_size, dilation[0]), 237 | ) 238 | ), 239 | weight_norm( 240 | Conv1d( 241 | channels, 242 | channels, 243 | kernel_size, 244 | 1, 245 | dilation=dilation[1], 246 | padding=get_padding(kernel_size, dilation[1]), 247 | ) 248 | ), 249 | weight_norm( 250 | Conv1d( 251 | channels, 252 | channels, 253 | kernel_size, 254 | 1, 255 | dilation=dilation[2], 256 | padding=get_padding(kernel_size, dilation[2]), 257 | ) 258 | ), 259 | ] 260 | ) 261 | self.convs1.apply(init_weights) 262 | 263 | self.convs2 = nn.ModuleList( 264 | [ 265 | weight_norm( 266 | Conv1d( 267 | channels, 268 | channels, 269 | kernel_size, 270 | 1, 271 | dilation=1, 272 | padding=get_padding(kernel_size, 1), 273 | ) 274 | ), 275 | weight_norm( 276 | Conv1d( 277 | channels, 278 | channels, 279 | kernel_size, 280 | 1, 281 | dilation=1, 282 | padding=get_padding(kernel_size, 1), 283 | ) 284 | ), 285 | weight_norm( 286 | Conv1d( 287 | channels, 288 | channels, 289 | kernel_size, 290 | 1, 291 | dilation=1, 292 | padding=get_padding(kernel_size, 1), 293 | ) 294 | ), 295 | ] 296 | ) 297 | self.convs2.apply(init_weights) 298 | 299 | def forward(self, x, x_mask=None): 300 | for c1, c2 in zip(self.convs1, self.convs2): 301 | xt = F.leaky_relu(x, LRELU_SLOPE) 302 | if x_mask is not None: 303 | xt = xt * x_mask 304 | xt = c1(xt) 305 | xt = F.leaky_relu(xt, LRELU_SLOPE) 306 | if x_mask is not None: 307 | xt = xt * x_mask 308 | xt = c2(xt) 309 | x = xt + x 310 | if x_mask is not None: 311 | x = x * x_mask 312 | return x 313 | 314 | def remove_weight_norm(self): 315 | for l in 
self.convs1: 316 | remove_weight_norm(l) 317 | for l in self.convs2: 318 | remove_weight_norm(l) 319 | 320 | 321 | class ResBlock2(torch.nn.Module): 322 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 323 | super(ResBlock2, self).__init__() 324 | self.convs = nn.ModuleList( 325 | [ 326 | weight_norm( 327 | Conv1d( 328 | channels, 329 | channels, 330 | kernel_size, 331 | 1, 332 | dilation=dilation[0], 333 | padding=get_padding(kernel_size, dilation[0]), 334 | ) 335 | ), 336 | weight_norm( 337 | Conv1d( 338 | channels, 339 | channels, 340 | kernel_size, 341 | 1, 342 | dilation=dilation[1], 343 | padding=get_padding(kernel_size, dilation[1]), 344 | ) 345 | ), 346 | ] 347 | ) 348 | self.convs.apply(init_weights) 349 | 350 | def forward(self, x, x_mask=None): 351 | for c in self.convs: 352 | xt = F.leaky_relu(x, LRELU_SLOPE) 353 | if x_mask is not None: 354 | xt = xt * x_mask 355 | xt = c(xt) 356 | x = xt + x 357 | if x_mask is not None: 358 | x = x * x_mask 359 | return x 360 | 361 | def remove_weight_norm(self): 362 | for l in self.convs: 363 | remove_weight_norm(l) 364 | 365 | 366 | class Log(nn.Module): 367 | def forward(self, x, x_mask, reverse=False, **kwargs): 368 | if not reverse: 369 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 370 | logdet = torch.sum(-y, [1, 2]) 371 | return y, logdet 372 | else: 373 | x = torch.exp(x) * x_mask 374 | return x 375 | 376 | 377 | class Flip(nn.Module): 378 | def forward(self, x, *args, reverse=False, **kwargs): 379 | x = torch.flip(x, [1]) 380 | if not reverse: 381 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 382 | return x, logdet 383 | else: 384 | return x 385 | 386 | 387 | class ElementwiseAffine(nn.Module): 388 | def __init__(self, channels): 389 | super().__init__() 390 | self.channels = channels 391 | self.m = nn.Parameter(torch.zeros(channels, 1)) 392 | self.logs = nn.Parameter(torch.zeros(channels, 1)) 393 | 394 | def forward(self, x, x_mask, reverse=False, **kwargs): 395 | if not reverse: 396 | y = self.m + torch.exp(self.logs) * x 397 | y = y * x_mask 398 | logdet = torch.sum(self.logs * x_mask, [1, 2]) 399 | return y, logdet 400 | else: 401 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 402 | return x 403 | 404 | 405 | class ResidualCouplingLayer(nn.Module): 406 | def __init__( 407 | self, 408 | channels, 409 | hidden_channels, 410 | kernel_size, 411 | dilation_rate, 412 | n_layers, 413 | p_dropout=0, 414 | gin_channels=0, 415 | mean_only=False, 416 | ): 417 | assert channels % 2 == 0, "channels should be divisible by 2" 418 | super().__init__() 419 | self.channels = channels 420 | self.hidden_channels = hidden_channels 421 | self.kernel_size = kernel_size 422 | self.dilation_rate = dilation_rate 423 | self.n_layers = n_layers 424 | self.half_channels = channels // 2 425 | self.mean_only = mean_only 426 | 427 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 428 | self.enc = WN( 429 | hidden_channels, 430 | kernel_size, 431 | dilation_rate, 432 | n_layers, 433 | p_dropout=p_dropout, 434 | gin_channels=gin_channels, 435 | ) 436 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 437 | self.post.weight.data.zero_() 438 | self.post.bias.data.zero_() 439 | 440 | def forward(self, x, x_mask, g=None, reverse=False): 441 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 442 | h = self.pre(x0) * x_mask 443 | h = self.enc(h, x_mask, g=g) 444 | stats = self.post(h) * x_mask 445 | if not self.mean_only: 446 | m, logs = torch.split(stats, 
[self.half_channels] * 2, 1) 447 | else: 448 | m = stats 449 | logs = torch.zeros_like(m) 450 | 451 | if not reverse: 452 | x1 = m + x1 * torch.exp(logs) * x_mask 453 | x = torch.cat([x0, x1], 1) 454 | logdet = torch.sum(logs, [1, 2]) 455 | return x, logdet 456 | else: 457 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 458 | x = torch.cat([x0, x1], 1) 459 | return x 460 | 461 | def remove_weight_norm(self): 462 | self.enc.remove_weight_norm() 463 | 464 | 465 | class ConvFlow(nn.Module): 466 | def __init__( 467 | self, 468 | in_channels, 469 | filter_channels, 470 | kernel_size, 471 | n_layers, 472 | num_bins=10, 473 | tail_bound=5.0, 474 | ): 475 | super().__init__() 476 | self.in_channels = in_channels 477 | self.filter_channels = filter_channels 478 | self.kernel_size = kernel_size 479 | self.n_layers = n_layers 480 | self.num_bins = num_bins 481 | self.tail_bound = tail_bound 482 | self.half_channels = in_channels // 2 483 | 484 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 485 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) 486 | self.proj = nn.Conv1d( 487 | filter_channels, self.half_channels * (num_bins * 3 - 1), 1 488 | ) 489 | self.proj.weight.data.zero_() 490 | self.proj.bias.data.zero_() 491 | 492 | def forward(self, x, x_mask, g=None, reverse=False): 493 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1) 494 | h = self.pre(x0) 495 | h = self.convs(h, x_mask, g=g) 496 | h = self.proj(h) * x_mask 497 | 498 | b, c, t = x0.shape 499 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 500 | 501 | unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) 502 | unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( 503 | self.filter_channels 504 | ) 505 | unnormalized_derivatives = h[..., 2 * self.num_bins :] 506 | 507 | x1, logabsdet = piecewise_rational_quadratic_transform( 508 | x1, 509 | unnormalized_widths, 510 | unnormalized_heights, 511 | unnormalized_derivatives, 512 | inverse=reverse, 513 | tails="linear", 514 | tail_bound=self.tail_bound, 515 | ) 516 | 517 | x = torch.cat([x0, x1], 1) * x_mask 518 | logdet = torch.sum(logabsdet * x_mask, [1, 2]) 519 | if not reverse: 520 | return x, logdet 521 | else: 522 | return x 523 | -------------------------------------------------------------------------------- /infer_pack/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform( 13 | inputs, 14 | unnormalized_widths, 15 | unnormalized_heights, 16 | unnormalized_derivatives, 17 | inverse=False, 18 | tails=None, 19 | tail_bound=1.0, 20 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 21 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 22 | min_derivative=DEFAULT_MIN_DERIVATIVE, 23 | ): 24 | if tails is None: 25 | spline_fn = rational_quadratic_spline 26 | spline_kwargs = {} 27 | else: 28 | spline_fn = unconstrained_rational_quadratic_spline 29 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound} 30 | 31 | outputs, logabsdet = spline_fn( 32 | inputs=inputs, 33 | unnormalized_widths=unnormalized_widths, 34 | unnormalized_heights=unnormalized_heights, 35 | unnormalized_derivatives=unnormalized_derivatives, 36 | inverse=inverse, 37 | min_bin_width=min_bin_width, 38 | min_bin_height=min_bin_height, 
39 | min_derivative=min_derivative, 40 | **spline_kwargs 41 | ) 42 | return outputs, logabsdet 43 | 44 | 45 | def searchsorted(bin_locations, inputs, eps=1e-6): 46 | bin_locations[..., -1] += eps 47 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 48 | 49 | 50 | def unconstrained_rational_quadratic_spline( 51 | inputs, 52 | unnormalized_widths, 53 | unnormalized_heights, 54 | unnormalized_derivatives, 55 | inverse=False, 56 | tails="linear", 57 | tail_bound=1.0, 58 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 59 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 60 | min_derivative=DEFAULT_MIN_DERIVATIVE, 61 | ): 62 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 63 | outside_interval_mask = ~inside_interval_mask 64 | 65 | outputs = torch.zeros_like(inputs) 66 | logabsdet = torch.zeros_like(inputs) 67 | 68 | if tails == "linear": 69 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 70 | constant = np.log(np.exp(1 - min_derivative) - 1) 71 | unnormalized_derivatives[..., 0] = constant 72 | unnormalized_derivatives[..., -1] = constant 73 | 74 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 75 | logabsdet[outside_interval_mask] = 0 76 | else: 77 | raise RuntimeError("{} tails are not implemented.".format(tails)) 78 | 79 | ( 80 | outputs[inside_interval_mask], 81 | logabsdet[inside_interval_mask], 82 | ) = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, 89 | right=tail_bound, 90 | bottom=-tail_bound, 91 | top=tail_bound, 92 | min_bin_width=min_bin_width, 93 | min_bin_height=min_bin_height, 94 | min_derivative=min_derivative, 95 | ) 96 | 97 | return outputs, logabsdet 98 | 99 | 100 | def rational_quadratic_spline( 101 | inputs, 102 | unnormalized_widths, 103 | unnormalized_heights, 104 | unnormalized_derivatives, 105 | inverse=False, 106 | left=0.0, 107 | right=1.0, 108 | bottom=0.0, 109 | top=1.0, 110 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 111 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 112 | min_derivative=DEFAULT_MIN_DERIVATIVE, 113 | ): 114 | if torch.min(inputs) < left or torch.max(inputs) > right: 115 | raise ValueError("Input to a transform is not within its domain") 116 | 117 | num_bins = unnormalized_widths.shape[-1] 118 | 119 | if min_bin_width * num_bins > 1.0: 120 | raise ValueError("Minimal bin width too large for the number of bins") 121 | if min_bin_height * num_bins > 1.0: 122 | raise ValueError("Minimal bin height too large for the number of bins") 123 | 124 | widths = F.softmax(unnormalized_widths, dim=-1) 125 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 126 | cumwidths = torch.cumsum(widths, dim=-1) 127 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) 128 | cumwidths = (right - left) * cumwidths + left 129 | cumwidths[..., 0] = left 130 | cumwidths[..., -1] = right 131 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 132 | 133 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 134 | 135 | heights = F.softmax(unnormalized_heights, dim=-1) 136 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 137 | cumheights = torch.cumsum(heights, dim=-1) 138 | cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) 139 | cumheights = (top - 
bottom) * cumheights + bottom 140 | cumheights[..., 0] = bottom 141 | cumheights[..., -1] = top 142 | heights = cumheights[..., 1:] - cumheights[..., :-1] 143 | 144 | if inverse: 145 | bin_idx = searchsorted(cumheights, inputs)[..., None] 146 | else: 147 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 148 | 149 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 150 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 151 | 152 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 153 | delta = heights / widths 154 | input_delta = delta.gather(-1, bin_idx)[..., 0] 155 | 156 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 157 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 158 | 159 | input_heights = heights.gather(-1, bin_idx)[..., 0] 160 | 161 | if inverse: 162 | a = (inputs - input_cumheights) * ( 163 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 164 | ) + input_heights * (input_delta - input_derivatives) 165 | b = input_heights * input_derivatives - (inputs - input_cumheights) * ( 166 | input_derivatives + input_derivatives_plus_one - 2 * input_delta 167 | ) 168 | c = -input_delta * (inputs - input_cumheights) 169 | 170 | discriminant = b.pow(2) - 4 * a * c 171 | assert (discriminant >= 0).all() 172 | 173 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 174 | outputs = root * input_bin_widths + input_cumwidths 175 | 176 | theta_one_minus_theta = root * (1 - root) 177 | denominator = input_delta + ( 178 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 179 | * theta_one_minus_theta 180 | ) 181 | derivative_numerator = input_delta.pow(2) * ( 182 | input_derivatives_plus_one * root.pow(2) 183 | + 2 * input_delta * theta_one_minus_theta 184 | + input_derivatives * (1 - root).pow(2) 185 | ) 186 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 187 | 188 | return outputs, -logabsdet 189 | else: 190 | theta = (inputs - input_cumwidths) / input_bin_widths 191 | theta_one_minus_theta = theta * (1 - theta) 192 | 193 | numerator = input_heights * ( 194 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta 195 | ) 196 | denominator = input_delta + ( 197 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta) 198 | * theta_one_minus_theta 199 | ) 200 | outputs = input_cumheights + numerator / denominator 201 | 202 | derivative_numerator = input_delta.pow(2) * ( 203 | input_derivatives_plus_one * theta.pow(2) 204 | + 2 * input_delta * theta_one_minus_theta 205 | + input_derivatives * (1 - theta).pow(2) 206 | ) 207 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 208 | 209 | return outputs, logabsdet 210 | -------------------------------------------------------------------------------- /my_utils.py: -------------------------------------------------------------------------------- 1 | import ffmpeg 2 | import numpy as np 3 | 4 | 5 | def load_audio(file, sr): 6 | try: 7 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 8 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 9 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
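# Illustrative usage (hypothetical file name): load_audio("clip.wav", 16000) returns
# a mono float32 NumPy waveform resampled to 16 kHz, ready for feature extraction.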
10 | file = ( 11 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 12 | ) # 防止小白拷路径头尾带了空格和"和回车 13 | out, _ = ( 14 | ffmpeg.input(file, threads=0) 15 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) 16 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 17 | ) 18 | except Exception as e: 19 | raise RuntimeError(f"Failed to load audio: {e}") 20 | 21 | return np.frombuffer(out, np.float32).flatten() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numba==0.56.4 2 | numpy==1.23.5 3 | scipy==1.9.3 4 | librosa==0.9.2 5 | llvmlite==0.39.0 6 | fairseq==0.12.2 7 | faiss-cpu==1.7.0; sys_platform == "darwin" 8 | faiss-cpu==1.7.2; sys_platform != "darwin" 9 | gradio 10 | Cython 11 | future>=0.18.3 12 | pydub>=0.25.1 13 | soundfile>=0.12.1 14 | ffmpeg-python>=0.2.0 15 | tensorboardX 16 | functorch>=2.0.0 17 | Jinja2>=3.1.2 18 | json5>=0.9.11 19 | Markdown 20 | matplotlib>=3.7.1 21 | matplotlib-inline>=0.1.6 22 | praat-parselmouth>=0.4.3 23 | Pillow>=9.1.1 24 | pyworld>=0.3.2 25 | resampy>=0.4.2 26 | scikit-learn>=1.2.2 27 | starlette>=0.26.1 28 | tensorboard 29 | tensorboard-data-server 30 | tensorboard-plugin-wit 31 | torchgen>=0.0.1 32 | tqdm>=4.65.0 33 | tornado>=6.2 34 | Werkzeug>=2.2.3 35 | uc-micro-py>=1.0.1 36 | sympy>=1.11.1 37 | tabulate>=0.9.0 38 | PyYAML>=6.0 39 | pyasn1>=0.4.8 40 | pyasn1-modules>=0.2.8 41 | fsspec>=2023.3.0 42 | absl-py>=1.4.0 43 | audioread 44 | uvicorn>=0.21.1 45 | colorama>=0.4.6 46 | customtkinter 47 | torchcrepe -------------------------------------------------------------------------------- /rvcgui.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | from tkinter import filedialog 4 | import soundfile as sf 5 | import tkinter as tk 6 | import customtkinter as ctk 7 | 8 | import os 9 | import sys 10 | import torch 11 | import warnings 12 | import customtkinter as ctk 13 | 14 | now_dir = os.getcwd() 15 | sys.path.append(now_dir) 16 | tmp = os.path.join(now_dir, "TEMP") 17 | os.makedirs(os.path.join(now_dir, "models"), exist_ok=True) 18 | os.makedirs(os.path.join(now_dir, "output"), exist_ok=True) 19 | os.environ["TEMP"] = tmp 20 | warnings.filterwarnings("ignore") 21 | torch.manual_seed(114514) 22 | 23 | from vc_infer_pipeline import VC 24 | from fairseq import checkpoint_utils 25 | from scipy.io import wavfile 26 | from my_utils import load_audio 27 | from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono 28 | from infer_pack.modelsv2 import SynthesizerTrnMs768NSFsid_nono, SynthesizerTrnMs768NSFsid 29 | from multiprocessing import cpu_count 30 | import threading 31 | from time import sleep 32 | from time import sleep 33 | import traceback 34 | import numpy as np 35 | import subprocess 36 | import zipfile 37 | from config import Config 38 | 39 | config = Config() 40 | 41 | 42 | 43 | def extract_model_from_zip(zip_path, output_dir): 44 | # Extract the folder name from the zip file path 45 | folder_name = os.path.splitext(os.path.basename(zip_path))[0] 46 | 47 | # Create a folder with the same name as the zip file inside the output directory 48 | output_folder = os.path.join(output_dir, folder_name) 49 | os.makedirs(output_folder, exist_ok=True) 50 | 51 | with zipfile.ZipFile(zip_path, 'r') as zip_ref: 52 | for member in zip_ref.namelist(): 53 | if (member.endswith('.pth') and not 
(os.path.basename(member).startswith("G_") or os.path.basename(member).startswith("D_")) and zip_ref.getinfo(member).file_size < 200*(1024**2)) or (member.endswith('.index') and not (os.path.basename(member).startswith("trained"))): 54 | # Extract the file to the output folder 55 | zip_ref.extract(member, output_folder) 56 | 57 | # Move the file to the top level of the output folder 58 | file_path = os.path.join(output_folder, member) 59 | new_path = os.path.join(output_folder, os.path.basename(file_path)) 60 | os.rename(file_path, new_path) 61 | 62 | print(f"Model files extracted to folder: {output_folder}") 63 | 64 | 65 | def play_audio(file_path): 66 | if sys.platform == 'win32': 67 | audio_file = os.path.abspath(file_path) 68 | subprocess.call(['start', '', audio_file], shell=True) 69 | elif sys.platform == 'darwin': 70 | audio_file = os.path.abspath(file_path) 71 | subprocess.call(['open', audio_file]) 72 | elif sys.platform == 'linux': 73 | audio_file = os.path.abspath(file_path) 74 | subprocess.call(['xdg-open', audio_file]) 75 | 76 | def get_full_path(path): 77 | return os.path.abspath(path) 78 | 79 | hubert_model = None 80 | device = config.device 81 | print(device) 82 | is_half = config.is_half 83 | 84 | def load_hubert(): 85 | global hubert_model 86 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task( 87 | ["hubert_base.pt"], 88 | suffix="", 89 | ) 90 | hubert_model = models[0] 91 | hubert_model = hubert_model.to(config.device) 92 | if is_half: 93 | hubert_model = hubert_model.half() 94 | else: 95 | hubert_model = hubert_model.float() 96 | hubert_model.eval() 97 | 98 | 99 | def vc_single( 100 | sid, 101 | input_audio, 102 | f0_up_key, 103 | f0_file, 104 | f0_method, 105 | file_index, 106 | index_rate, 107 | crepe_hop_length, 108 | output_path=None, 109 | ): # spk_item, input_audio0, vc_transform0,f0_file,f0method0 110 | global tgt_sr, net_g, vc, hubert_model 111 | if input_audio is None: 112 | return "You need to upload an audio", None 113 | f0_up_key = int(f0_up_key) 114 | try: 115 | audio = load_audio(input_audio, 16000) 116 | times = [0, 0, 0] 117 | if hubert_model is None: 118 | load_hubert() 119 | if_f0 = cpt.get("f0", 1) 120 | file_index = ( 121 | file_index.strip(" ") 122 | .strip('"') 123 | .strip("\n") 124 | .strip('"') 125 | .strip(" ") 126 | .replace("trained", "added") 127 | ) # users often point at the "trained" index by mistake; swap in the "added" one automatically 128 | 129 | audio_opt = vc.pipeline( 130 | hubert_model, 131 | net_g, 132 | sid, 133 | audio, 134 | times, 135 | f0_up_key, 136 | f0_method, 137 | file_index, 138 | # file_big_npy, 139 | index_rate, 140 | if_f0, 141 | version, 142 | crepe_hop_length, 143 | None, 144 | ) 145 | print( 146 | "npy: ", times[0], "s, f0: ", times[1], "s, infer: ", times[2], "s", sep="" 147 | ) 148 | 149 | if output_path is not None: 150 | sf.write(output_path, audio_opt, tgt_sr, format='WAV') 151 | 152 | return "Success", (tgt_sr, audio_opt) 153 | except: 154 | info = traceback.format_exc() 155 | print(info) 156 | return info, (None, None) 157 | 158 | 159 | def vc_multi( 160 | sid, 161 | dir_path, 162 | opt_root, 163 | paths, 164 | f0_up_key, 165 | f0_method, 166 | file_index, 167 | index_rate, 168 | ): 169 | try: 170 | dir_path = ( 171 | dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") 172 | ) # strip stray spaces, quotes, and newlines copied in with the path 173 | opt_root = opt_root.strip(" ").strip( 174 | '"').strip("\n").strip('"').strip(" ") 175 | os.makedirs(opt_root, exist_ok=True) 176 | try: 177 | if dir_path != "": 178 | paths = [os.path.join(dir_path, name) 179 | for name in os.listdir(dir_path)] 180 | else: 181
| paths = [path.name for path in paths] 182 | except: 183 | traceback.print_exc() 184 | paths = [path.name for path in paths] 185 | infos = [] 186 | for path in paths: 187 | info, opt = vc_single( 188 | sid, 189 | path, 190 | f0_up_key, 191 | None, 192 | f0_method, 193 | file_index, 194 | index_rate, 195 | ) 196 | if info == "Success": 197 | try: 198 | tgt_sr, audio_opt = opt 199 | wavfile.write( 200 | "%s/%s" % (opt_root, os.path.basename(path) 201 | ), tgt_sr, audio_opt 202 | ) 203 | except: 204 | info = traceback.format_exc() 205 | infos.append("%s->%s" % (os.path.basename(path), info)) 206 | yield "\n".join(infos) 207 | yield "\n".join(infos) 208 | except: 209 | yield traceback.format_exc() 210 | 211 | 212 | # only one voice model can be loaded globally per tab 213 | def get_vc(weight_root, sid): 214 | global n_spk, tgt_sr, net_g, vc, cpt, version 215 | if sid == "" or sid == []: 216 | global hubert_model 217 | if hubert_model is not None: # polling may call this repeatedly, so check whether sid switched from a loaded model to none 218 | print("clean_empty_cache") 219 | del net_g, n_spk, vc, hubert_model, tgt_sr # ,cpt 220 | hubert_model = net_g = n_spk = vc = tgt_sr = None 221 | if torch.cuda.is_available(): 222 | torch.cuda.empty_cache() 223 | ### without the steps below the cache is not fully cleaned up 224 | if_f0 = cpt.get("f0", 1) 225 | version = cpt.get("version", "v1") 226 | if version == "v1": 227 | if if_f0 == 1: 228 | net_g = SynthesizerTrnMs256NSFsid( 229 | *cpt["config"], is_half=config.is_half 230 | ) 231 | else: 232 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 233 | elif version == "v2": 234 | if if_f0 == 1: 235 | net_g = SynthesizerTrnMs768NSFsid( 236 | *cpt["config"], is_half=config.is_half 237 | ) 238 | else: 239 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 240 | del net_g, cpt 241 | if torch.cuda.is_available(): 242 | torch.cuda.empty_cache() 243 | cpt = None 244 | return {"visible": False, "__type__": "update"} 245 | person = weight_root 246 | print("loading %s" % person) 247 | cpt = torch.load(person, map_location="cpu") 248 | tgt_sr = cpt["config"][-1] 249 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk 250 | if_f0 = cpt.get("f0", 1) 251 | version = cpt.get("version", "v1") 252 | if version == "v1": 253 | if if_f0 == 1: 254 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half) 255 | else: 256 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) 257 | elif version == "v2": 258 | if if_f0 == 1: 259 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half) 260 | else: 261 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) 262 | del net_g.enc_q 263 | print(net_g.load_state_dict(cpt["weight"], strict=False)) 264 | net_g.eval().to(config.device) 265 | if config.is_half: 266 | net_g = net_g.half() 267 | else: 268 | net_g = net_g.float() 269 | vc = VC(tgt_sr, config) 270 | n_spk = cpt["config"][-3] 271 | return {"visible": True, "maximum": n_spk, "__type__": "update"} 272 | 273 | 274 | def clean(): 275 | return {"value": "", "__type__": "update"} 276 | 277 | 278 | def if_done(done, p): 279 | while True: 280 | if p.poll() is None: 281 | sleep(0.5) 282 | else: 283 | break 284 | done[0] = True 285 | 286 | 287 | def if_done_multi(done, ps): 288 | while True: 289 | # poll() is None means the process has not finished yet 290 | # keep waiting as long as any process is still running 291 | flag = 1 292 | for p in ps: 293 | if p.poll() is None: 294 | flag = 0 295 | sleep(0.5) 296 | break 297 | if flag == 1: 298 | break 299 | done[0] = True 300 | 301 | 302 | # window 303 | 304 | 305 | def outputkey(length=5): 306 | # generate all possible characters 307 | characters = string.ascii_letters +
string.digits 308 | return ''.join(random.choices(characters, k=length)) 309 | # choose `length` characters randomly from the list and join them into a string 310 | 311 | def refresh_model_list(): 312 | global model_folders 313 | model_folders = [f for f in os.listdir(models_dir) if os.path.isdir(os.path.join( 314 | models_dir, f)) and any(f.endswith(".pth") for f in os.listdir(os.path.join(models_dir, f)))] 315 | model_list.configure(values=model_folders) 316 | model_list.update() 317 | 318 | def browse_zip(): 319 | global zip_file 320 | zip_file = filedialog.askopenfilename( 321 | initialdir=os.getcwd(), 322 | title="Select file", 323 | filetypes=(("zip files", "*.zip"), ("all files", "*.*")), 324 | ) 325 | extract_model_from_zip(zip_file, models_dir) 326 | refresh_model_list() 327 | 328 | def get_output_path(file_path): 329 | 330 | if not os.path.exists(file_path): 331 | # change the file extension to .wav 332 | 333 | return file_path # File path does not exist, return as is 334 | 335 | # Split file path into directory, base filename, and extension 336 | dir_name, file_name = os.path.split(file_path) 337 | file_name, file_ext = os.path.splitext(file_name) 338 | 339 | # Initialize index to 1 340 | index = 1 341 | 342 | # Increment index until a new file path is found 343 | while True: 344 | new_dir = f"{dir_name}\\{chosenOne}\\" 345 | new_file_name = f"{file_name}_RVC_{index}{file_ext}" 346 | new_file_path = os.path.join(new_dir, new_file_name) 347 | if not os.path.exists(new_file_path): 348 | # change the file extension to .wav 349 | if not os.path.exists(new_dir): 350 | os.makedirs(new_dir) 351 | new_file_path = os.path.splitext(new_file_path)[0] + ".wav" 352 | return new_file_path # Found new file path, return it 353 | index += 1 354 | 355 | def on_button_click(): 356 | output_audio_frame.pack_forget() 357 | result_state.pack_forget() 358 | run_button.configure(state="disabled") 359 | 360 | # Get values from user input widgets 361 | sid = sid_entry.get() 362 | input_audio = input_audio_entry.get() 363 | f0_pitch = round(f0_pitch_entry.get()) 364 | crepe_hop_length = round((crepe_hop_length_entry.get()) * 64) 365 | f0_file = None 366 | f0_method = f0_method_entry.get() 367 | file_index = file_index_entry.get() 368 | # file_big_npy = file_big_npy_entry.get() 369 | index_rate = round(index_rate_entry.get(),2) 370 | global output_file 371 | output_file = get_output_path(input_audio) 372 | print("sid: ", sid, "input_audio: ", input_audio, "f0_pitch: ", f0_pitch, "f0_file: ", f0_file, "f0_method: ", f0_method, 373 | "file_index: ", file_index, "file_big_npy: ", "index_rate: ", index_rate, "output_file: ", output_file) 374 | # Call the vc_single function with the user input values 375 | if model_loaded == True and os.path.isfile(input_audio): 376 | try: 377 | loading_frame.pack(padx=10, pady=10) 378 | loading_progress.start() 379 | 380 | result, audio_opt = vc_single( 381 | 0, input_audio, f0_pitch, None, f0_method, file_index, index_rate,crepe_hop_length, output_file) 382 | # output_label.configure(text=result + "\n saved at" + output_file) 383 | print(os.path.join(output_file)) 384 | if os.path.exists(output_file) and os.path.getsize(output_file) > 0: 385 | print(output_file) 386 | 387 | run_button.configure(state="enabled") 388 | message = result 389 | result_state.configure(text_color="green") 390 | last_output_file.configure(text=output_file) 391 | output_audio_frame.pack(padx=10, pady=10) 392 | else: 393 | message = result 394 | result_state.configure(text_color="red") 395 | 396 | 
except Exception as e: 397 | print(e) 398 | message = "Voice conversion failed", e 399 | 400 | # Update the output label with the result 401 | # output_label.configure(text=result + "\n saved at" + output_file) 402 | 403 | run_button.configure(state="enabled") 404 | else: 405 | message = "Please select a model and input audio file" 406 | run_button.configure(state="enabled") 407 | result_state.configure(text_color="red") 408 | 409 | loading_progress.stop() 410 | loading_frame.pack_forget() 411 | result_state.pack(padx=10, pady=10, side="top") 412 | result_state.configure(text=message) 413 | 414 | 415 | def browse_file(): 416 | filepath = filedialog.askopenfilename ( 417 | filetypes=[("Audio Files", ["*.mp3","*.wav"])]) 418 | filepath = os.path.normpath(filepath) # Normalize file path 419 | input_audio_entry.delete(0, tk.END) 420 | input_audio_entry.insert(0, filepath) 421 | 422 | 423 | 424 | def start_processing(): 425 | 426 | t = threading.Thread(target=on_button_click) 427 | t.start() 428 | 429 | 430 | # Create tkinter window and widgets 431 | root = ctk.CTk() 432 | ctk.set_appearance_mode("dark") 433 | root.title("RVC GUI") 434 | # Get screen dimensions 435 | screen_width = root.winfo_screenwidth() 436 | screen_height = root.winfo_screenheight() 437 | 438 | # Set GUI dimensions as a percentage of screen size 439 | 440 | gui_height = int(screen_height * 0.85) # 80% of screen height 441 | gui_dimensions = f"800x{gui_height}" 442 | 443 | root.geometry(gui_dimensions) 444 | root.resizable(False, True) 445 | 446 | model_loaded = False 447 | 448 | def selected_model(choice): 449 | global chosenOne 450 | chosenOne = choice 451 | file_index_entry.delete(0, ctk.END) 452 | model_dir = os.path.join(models_dir, choice) 453 | pth_files = [f for f in os.listdir(model_dir) if os.path.isfile(os.path.join(model_dir, f)) 454 | and f.endswith(".pth") and not (f.startswith("G_") or f.startswith("D_")) 455 | and os.path.getsize(os.path.join(model_dir, f)) < 200*(1024**2)] 456 | 457 | if pth_files: 458 | global pth_file_path 459 | pth_file_path = os.path.join(model_dir, pth_files[0]) 460 | npy_files = [f for f in os.listdir(model_dir) if os.path.isfile(os.path.join(model_dir, f)) 461 | and f.endswith(".index")] 462 | if npy_files: 463 | npy_files_dir = [os.path.join(model_dir, f) for f in npy_files] 464 | if len(npy_files_dir) == 1: 465 | index_file = npy_files_dir[0] 466 | print(f".pth file directory: {pth_file_path}") 467 | print(f".index file directory: {index_file}") 468 | file_index_entry.insert(0, os.path.normpath(index_file)) 469 | else: 470 | print(f"Incomplete set of .index files found in {model_dir}") 471 | else: 472 | print(f"No .index files found in {model_dir}") 473 | get_vc(pth_file_path, 0) 474 | global model_loaded 475 | model_loaded = True 476 | else: 477 | print(f"No eligible .pth files found in {model_dir}") 478 | 479 | 480 | def index_slider_event(value): 481 | index_rate_label.configure( 482 | text='Feature retrieval rate: %s' % round(value, 2)) 483 | # print(value) 484 | 485 | 486 | def pitch_slider_event(value): 487 | f0_pitch_label.configure(text='Pitch: %s' % round(value)) 488 | # print(value) 489 | 490 | def crepe_hop_length_slider_event(value): 491 | crepe_hop_length_label.configure(text='crepe hop: %s' % round((value) * 64)) 492 | # print(value) 493 | 494 | 495 | # hide crepe hop length slider if crepe is not selected 496 | def crepe_hop_length_slider_visibility(value): 497 | if value == "crepe" or value == "crepe-tiny": 498 | crepe_hop_length_label.grid(row=2, column=0, padx=10, 
pady=5, ) 499 | crepe_hop_length_entry.grid(row=2, column=1, padx=10, pady=5, ) 500 | else: 501 | crepe_hop_length_label.grid_remove() 502 | crepe_hop_length_entry.grid_remove() 503 | 504 | def update_config(selected): 505 | global device, is_half # declare newconfig as a global variable 506 | if selected == "GPU": 507 | device = "cuda:0" 508 | # is_half = True 509 | else: 510 | if torch.backends.mps.is_available(): 511 | device = "mps" 512 | # is_half = False 513 | else: 514 | device = "cpu" 515 | is_half = False 516 | 517 | config.device = device 518 | config.is_half = is_half 519 | 520 | 521 | if "pth_file_path" in globals(): 522 | load_hubert() 523 | get_vc(pth_file_path, 0) 524 | 525 | 526 | models_dir = "./models" 527 | model_folders = [f for f in os.listdir(models_dir) if os.path.isdir(os.path.join( 528 | models_dir, f)) and any(f.endswith(".pth") for f in os.listdir(os.path.join(models_dir, f)))] 529 | 530 | 531 | master_frame = ctk.CTkFrame(master=root, height=500) 532 | master_frame.pack(padx=5, pady=5) 533 | 534 | 535 | left_frame = ctk.CTkFrame(master=master_frame, ) 536 | left_frame.grid(row=0, column=0, padx=10, pady=10, sticky="nsew") 537 | 538 | right_frame = ctk.CTkFrame(master=master_frame, ) 539 | right_frame.grid(row=0, column=1, pady=10, padx=10, sticky="nsew") 540 | 541 | 542 | inputpath_frame = ctk.CTkFrame(master=left_frame) 543 | inputpath_frame.grid(row=0, column=0, padx=15, pady=10, sticky="nsew") 544 | 545 | 546 | output_audio_frame = ctk.CTkFrame(master=root) 547 | 548 | select_model_frame = ctk.CTkFrame(left_frame) 549 | select_model_frame.grid(row=1, column=0, padx=15, pady=10, sticky="nsew") 550 | 551 | pitch_frame = ctk.CTkFrame(left_frame) 552 | pitch_frame.grid(row=3, column=0, padx=10, pady=5, sticky="nsew") 553 | 554 | 555 | 556 | # Get the list of .pth files in the models directory 557 | 558 | 559 | 560 | sid_label = ctk.CTkLabel(select_model_frame, text="Speaker ID:") 561 | sid_entry = ctk.CTkEntry(select_model_frame) 562 | sid_entry.insert(0, "0") 563 | sid_entry.configure(state="disabled") 564 | 565 | # intiilizing model select widget 566 | select_model = ctk.StringVar(value="Select a model") 567 | model_list = ctk.CTkOptionMenu(select_model_frame, values=model_folders, 568 | command=selected_model, 569 | variable=select_model 570 | ) 571 | 572 | # intiilizing audio file input widget 573 | input_audio_label = ctk.CTkLabel(inputpath_frame, text="Input audio file:") 574 | browse_button = ctk.CTkButton( 575 | inputpath_frame, text="Browse", command=browse_file) 576 | input_audio_entry = ctk.CTkEntry(inputpath_frame) 577 | 578 | # intiilizing pitch widget 579 | f0_pitch_label = ctk.CTkLabel(pitch_frame, text="Pitch: 0") 580 | f0_pitch_entry = ctk.CTkSlider( 581 | pitch_frame, from_=-20, to=20, number_of_steps=100, command=pitch_slider_event, ) 582 | f0_pitch_entry.set(0) 583 | 584 | # intiilizing crepe hop length widget 585 | crepe_hop_length_label = ctk.CTkLabel(pitch_frame, text="crepe hop: 128") 586 | crepe_hop_length_entry = ctk.CTkSlider( 587 | pitch_frame, from_=1, to=8, number_of_steps=7, command=crepe_hop_length_slider_event) 588 | crepe_hop_length_entry.set(2) 589 | 590 | # intiilizing f0 file widget 591 | #f0_file_label = ctk.CTkLabel(right_frame, text="F0 file (Optional/Not Tested)") 592 | #f0_file_entry = ctk.CTkEntry(right_frame, width=250) 593 | 594 | # intiilizing f0 method widget 595 | f0_method_label = ctk.CTkLabel( 596 | pitch_frame, text="F0 method") 597 | f0_method_entry = ctk.CTkSegmentedButton( 598 | pitch_frame, height=40, 
values=["dio", "pm","harvest", "crepe", "crepe-tiny" ], command=crepe_hop_length_slider_visibility) 599 | f0_method_entry.set("dio") 600 | 601 | # intiilizing index file widget 602 | file_index_label = ctk.CTkLabel(right_frame, text=".index File (Recommended)") 603 | file_index_entry = ctk.CTkEntry(right_frame, width=250) 604 | 605 | # intiilizing big npy file widget 606 | 607 | 608 | 609 | # intiilizing index rate widget 610 | index_rate_entry = ctk.CTkSlider( 611 | right_frame, from_=0, to=1, number_of_steps=20, command=index_slider_event, ) 612 | index_rate_entry.set(0.4) 613 | index_rate_label = ctk.CTkLabel( 614 | right_frame, text="Feature retrieval rate: 0.4" ) 615 | 616 | # intiilizing run button widget 617 | run_button = ctk.CTkButton( 618 | left_frame, fg_color="green", hover_color="darkgreen", text="Convert", command=start_processing) 619 | 620 | # intiilizing output label widget 621 | output_label = ctk.CTkLabel(right_frame, text="") 622 | 623 | # intiilizing Notes label widget 624 | notes_label = ctk.CTkLabel(left_frame, justify="left", text_color="#8A8A8A", text="Tips: \n 1. harvest and crepe are the highest quality, but also the slowest methods. \n 2. dio and pm are the lightest and fastest methods, but also the lowest quality.") 625 | 626 | # intiilizing loading progress bar widget 627 | 628 | loading_frame = ctk.CTkFrame(master=root, width=200) 629 | 630 | laoding_label = ctk.CTkLabel(loading_frame, text="Converting..., If the window is not responding, Please wait.") 631 | laoding_label.pack(padx=10, pady=10) 632 | loading_progress = ctk.CTkProgressBar(master=loading_frame, width=200) 633 | loading_progress.configure(mode="indeterminate") 634 | loading_progress.pack(padx=10, pady=10) 635 | 636 | # intiilizing result state label widget 637 | result_state = ctk.CTkLabel( 638 | root, text="", height=50, width=100, corner_radius=10) 639 | 640 | # intiilizing change device widget 641 | change_device_label = ctk.CTkLabel( right_frame, text="Processing mode") 642 | change_device = ctk.CTkSegmentedButton( 643 | right_frame, command=lambda value: update_config(value)) 644 | change_device.configure( 645 | values=["GPU", "CPU"]) 646 | 647 | if "cpu" in device.lower() or device.lower() == "cpu": 648 | change_device.set("CPU") 649 | change_device.configure(state="disabled") 650 | 651 | else: 652 | change_device.set("GPU") 653 | 654 | # intiilizing last output label & open output button widget 655 | last_output_label = ctk.CTkLabel(output_audio_frame, text="Output path: ") 656 | last_output_file = ctk.CTkLabel(output_audio_frame, text="", text_color="green") 657 | open_output_button = ctk.CTkButton(output_audio_frame, text="Open", command=lambda: play_audio(output_file)) 658 | 659 | # intiilizing import models button widget 660 | import_moodels_button = ctk.CTkButton(right_frame, fg_color="darkred", hover_color="black", corner_radius=20, text="Import model from .zip", command=browse_zip) 661 | 662 | 663 | 664 | # button = ctk.CTkButton(root, text="Open Window", command=open_window) 665 | # button.pack() 666 | 667 | 668 | 669 | # Packing widgets into window 670 | notes_label.grid(row=5, column=0, padx=10, pady=10) 671 | change_device_label.grid(row=1, column=0, columnspan=2, padx=10, pady=5) 672 | change_device.grid(row=2, column=0, columnspan=2, padx=10, pady=5) 673 | last_output_label.grid( pady=10, row=0, column=0) 674 | last_output_file.grid( pady=10, row=0, column=1) 675 | open_output_button.grid(pady=10, row=1, column=0, columnspan=2) 676 | import_moodels_button.grid(padx=10, 
pady=10, row=0, column=0) 677 | model_list.grid(padx=10, pady=10, row=0, column=2) 678 | sid_label.grid(padx=10, pady=10, row=0, column=0) 679 | sid_entry.grid(padx=0, pady=10, row=0, column= 1) 680 | browse_button.grid(padx=10, pady=10, row=0, column=2) 681 | input_audio_label.grid(padx=10, pady=10, row=0, column=0) 682 | input_audio_entry.grid(padx=10, pady=10, row=0, column=1) 683 | f0_method_label.grid(padx=10, pady=10, row=0, column=0) 684 | f0_method_entry.grid(padx=10, pady=10, row=0, column=1) 685 | #crepe_hop_length_label.grid(padx=10, pady=10, row=1, column=0) 686 | #crepe_hop_length_entry.grid(padx=10, pady=10, row=1, column=1) 687 | f0_pitch_label.grid(padx=10, pady=10, row=3, column=0) 688 | f0_pitch_entry.grid(padx=10, pady=10, row=3, column=1) 689 | #0_file_label.grid(padx=10, pady=10) 690 | #f0_file_entry.grid(padx=10, pady=10) 691 | file_index_label.grid(padx=10, pady=10) 692 | file_index_entry.grid(padx=10, pady=10) 693 | 694 | 695 | index_rate_label.grid(padx=10, pady=10) 696 | index_rate_entry.grid(padx=10, pady=10) 697 | run_button.grid(padx=30, pady=30, row=4, column=0, columnspan=2) 698 | output_label.grid(padx=0, pady=10) 699 | 700 | root.mainloop() 701 | -------------------------------------------------------------------------------- /setup.bat: -------------------------------------------------------------------------------- 1 | python -m pip install -U pip setuptools wheel 2 | pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu118 3 | pip install -r requirements.txt -------------------------------------------------------------------------------- /trainset_preprocess_pipeline_print.py: -------------------------------------------------------------------------------- 1 | import sys, os, multiprocessing 2 | from scipy import signal 3 | 4 | now_dir = os.getcwd() 5 | sys.path.append(now_dir) 6 | 7 | inp_root = sys.argv[1] 8 | sr = int(sys.argv[2]) 9 | n_p = int(sys.argv[3]) 10 | exp_dir = sys.argv[4] 11 | noparallel = sys.argv[5] == "True" 12 | import numpy as np, os, traceback 13 | from slicer2 import Slicer 14 | import librosa, traceback 15 | from scipy.io import wavfile 16 | import multiprocessing 17 | from my_utils import load_audio 18 | 19 | mutex = multiprocessing.Lock() 20 | f = open("%s/preprocess.log" % exp_dir, "a+") 21 | 22 | 23 | def println(strr): 24 | mutex.acquire() 25 | print(strr) 26 | f.write("%s\n" % strr) 27 | f.flush() 28 | mutex.release() 29 | 30 | 31 | class PreProcess: 32 | def __init__(self, sr, exp_dir): 33 | self.slicer = Slicer( 34 | sr=sr, 35 | threshold=-40, 36 | min_length=800, 37 | min_interval=400, 38 | hop_size=15, 39 | max_sil_kept=150, 40 | ) 41 | self.sr = sr 42 | self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr) 43 | self.per = 3.0 44 | self.overlap = 0.3 45 | self.tail = self.per + self.overlap 46 | self.max = 0.95 47 | self.alpha = 0.8 48 | self.exp_dir = exp_dir 49 | self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir 50 | self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir 51 | os.makedirs(self.exp_dir, exist_ok=True) 52 | os.makedirs(self.gt_wavs_dir, exist_ok=True) 53 | os.makedirs(self.wavs16k_dir, exist_ok=True) 54 | 55 | def norm_write(self, tmp_audio, idx0, idx1): 56 | tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + ( 57 | 1 - self.alpha 58 | ) * tmp_audio 59 | wavfile.write( 60 | "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), 61 | self.sr, 62 | tmp_audio.astype(np.float32), 63 | ) 64 | tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, 
    def pipeline(self, path, idx0):
        try:
            audio = load_audio(path, self.sr)
            # a zero-phase digital filter causes pre-ringing noise, so lfilter is used instead
            # audio = signal.filtfilt(self.bh, self.ah, audio)
            audio = signal.lfilter(self.bh, self.ah, audio)

            idx1 = 0
            for audio in self.slicer.slice(audio):
                i = 0
                while 1:
                    start = int(self.sr * (self.per - self.overlap) * i)
                    i += 1
                    if len(audio[start:]) > self.tail * self.sr:
                        tmp_audio = audio[start : start + int(self.per * self.sr)]
                        self.norm_write(tmp_audio, idx0, idx1)
                        idx1 += 1
                    else:
                        # keep the remainder (shorter than per) as its own chunk
                        tmp_audio = audio[start:]
                        idx1 += 1
                        break
                self.norm_write(tmp_audio, idx0, idx1)
            println("%s->Suc." % path)
        except:
            println("%s->%s" % (path, traceback.format_exc()))

    def pipeline_mp(self, infos):
        for path, idx0 in infos:
            self.pipeline(path, idx0)

    def pipeline_mp_inp_dir(self, inp_root, n_p):
        try:
            infos = [
                ("%s/%s" % (inp_root, name), idx)
                for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
            ]
            if noparallel:
                for i in range(n_p):
                    self.pipeline_mp(infos[i::n_p])
            else:
                ps = []
                for i in range(n_p):
                    p = multiprocessing.Process(
                        target=self.pipeline_mp, args=(infos[i::n_p],)
                    )
                    p.start()
                    ps.append(p)
                for p in ps:
                    p.join()
        except:
            println("Fail. %s" % traceback.format_exc())


def preprocess_trainset(inp_root, sr, n_p, exp_dir):
    pp = PreProcess(sr, exp_dir)
    println("start preprocess")
    println(sys.argv)
    pp.pipeline_mp_inp_dir(inp_root, n_p)
    println("end preprocess")


if __name__ == "__main__":
    preprocess_trainset(inp_root, sr, n_p, exp_dir)
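# Illustrative invocation (argument order follows the sys.argv parsing at the
# top of this file: inp_root, sr, n_p, exp_dir, noparallel; paths are examples):
#
#   python trainset_preprocess_pipeline_print.py ./raw_wavs 40000 4 logs/my-exp False
#
# This filters, slices, and normalizes every file in inp_root, writing
# <exp_dir>/0_gt_wavs (at sr) and <exp_dir>/1_16k_wavs (at 16 kHz).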
--------------------------------------------------------------------------------
/vc_infer_pipeline.py:
--------------------------------------------------------------------------------
import numpy as np, parselmouth, torch
from time import time as ttime
import torch.nn.functional as F
import torchcrepe  # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
import pyworld, os, traceback, faiss
from scipy import signal
from torch import Tensor  # Fork Feature. Used for pitch prediction in the torchcrepe f0 inference computation

bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)


class VC(object):
    def __init__(self, tgt_sr, config):
        self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
            config.x_pad,
            config.x_query,
            config.x_center,
            config.x_max,
            config.is_half,
        )
        self.sr = 16000  # HuBERT input sample rate
        self.window = 160  # samples per frame
        self.t_pad = self.sr * self.x_pad  # padding added before/after each chunk
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sr * self.x_query  # search radius around each cut point
        self.t_center = self.sr * self.x_center  # spacing between cut-point queries
        self.t_max = self.sr * self.x_max  # duration threshold below which no cutting is needed
        self.device = config.device

    # region f0 Overhaul Region
    # Fork Feature: Get the best torch device to use for f0 algorithms that
    # require a torch device. Returns a torch.device.
    def get_optimal_torch_device(self, index: int = 0) -> torch.device:
        # Prefer a CUDA device
        if torch.cuda.is_available():
            return torch.device(f"cuda:{index % torch.cuda.device_count()}")  # Very fast
        elif torch.backends.mps.is_available():
            return torch.device("mps")
        # TODO: an extra branch could grab "xla" devices here; requires the torch_xla.core.xla_model library
        # Otherwise fall back to the CPU
        return torch.device("cpu")

    # Get the f0 via the parselmouth computation
    def get_f0_pm_computation(self, x, time_step, f0_min, f0_max, p_len):
        f0 = (
            parselmouth.Sound(x, self.sr)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=f0_min,
                pitch_ceiling=f0_max,
            )
            .selected_array["frequency"]
        )
        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(
                f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
            )
        return f0

    # Get the f0 via the pyworld computation. Fork Feature: +dio along with harvest
    def get_f0_pyworld_computation(self, x, f0_min, f0_max, f0_type):
        if f0_type == "harvest":
            f0, t = pyworld.harvest(
                x.astype(np.double),
                fs=self.sr,
                f0_ceil=f0_max,
                f0_floor=f0_min,
                frame_period=10,
            )
        elif f0_type == "dio":
            f0, t = pyworld.dio(
                x.astype(np.double),
                fs=self.sr,
                f0_ceil=f0_max,
                f0_floor=f0_min,
                frame_period=10,
            )
        f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
        f0 = signal.medfilt(f0, 3)
        return f0
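    # Frame-alignment note: with frame_period=10 (ms) at self.sr = 16000,
    # harvest/dio emit one f0 value per 160 samples, which is exactly
    # self.window, the per-frame hop used everywhere else in this pipeline.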
    # Fork Feature: Get the f0 via the crepe algorithm from torchcrepe
    def get_f0_crepe_computation(
        self,
        x,
        f0_min,
        f0_max,
        p_len,
        hop_length=128,  # 512 before. Hop length changes how quickly the pitch track can jump; lower hop lengths mean more pitch accuracy but longer inference time.
        model="full",  # Either use crepe-tiny "tiny" or crepe "full". Default is full
    ):
        x = x.astype(np.float32)  # fixes the F.conv2d exception: double had to be converted to float
        x /= np.quantile(np.abs(x), 0.999)
        torch_device = self.get_optimal_torch_device()
        audio = torch.from_numpy(x).to(torch_device, copy=True)
        audio = torch.unsqueeze(audio, dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True).detach()
        audio = audio.detach()
        print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
        pitch: Tensor = torchcrepe.predict(
            audio,
            self.sr,
            hop_length,
            f0_min,
            f0_max,
            model,
            batch_size=hop_length * 2,
            device=torch_device,
            pad=True,
        )
        p_len = p_len or x.shape[0] // hop_length
        # Resize the pitch curve to p_len frames for the final f0
        source = np.array(pitch.squeeze(0).cpu().float().numpy())
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * p_len, len(source)) / p_len,
            np.arange(0, len(source)),
            source,
        )
        f0 = np.nan_to_num(target)
        return f0  # Resized f0

    # endregion

    def get_f0(self, x, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0=None):
        time_step = self.window / self.sr * 1000
        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        if f0_method == "pm":
            f0 = self.get_f0_pm_computation(x, time_step, f0_min, f0_max, p_len)
        elif f0_method == "harvest":
            f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "harvest")
        elif f0_method == "dio":  # Fork Feature
            f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "dio")
        elif f0_method == "crepe":  # Fork Feature: the crepe f0 algorithm
            f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
        elif f0_method == "crepe-tiny":  # Fork Feature: the crepe-tiny model
            f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length, "tiny")

        print("Using the following f0 method: " + f0_method)
        f0 *= pow(2, f0_up_key / 12)
        # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        tf0 = self.sr // self.window  # f0 points per second
        if inp_f0 is not None:
            delta_t = np.round(
                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
            ).astype("int16")
            replace_f0 = np.interp(
                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
            )
            shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
            f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
                :shape
            ]
        # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        f0bak = f0.copy()
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(int)  # np.int was removed in NumPy >= 1.24

        return f0_coarse, f0bak
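    # Worked example of the coarse quantization above (values approximate):
    # for f0 = 440 Hz,
    #   f0_mel     = 1127 * ln(1 + 440/700)  ~  549.6
    #   f0_mel_min = 1127 * ln(1 +  50/700)  ~   77.8
    #   f0_mel_max = 1127 * ln(1 + 1100/700) ~ 1064.4
    #   coarse     = (549.6 - 77.8) * 254 / (1064.4 - 77.8) + 1  ~  122
    # so every f0 in [50, 1100] Hz lands in an integer bin 1..255, and
    # unvoiced frames (f0 = 0) are clamped to bin 1.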
    def vc(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        times,
        index,
        big_npy,
        index_rate,
        version,
    ):  # ,file_index,file_big_npy
        feats = torch.from_numpy(audio0)
        if self.is_half:
            feats = feats.half()
        else:
            feats = feats.float()
        if feats.dim() == 2:  # stereo: average the two channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)

        inputs = {
            "source": feats.to(self.device),
            "padding_mask": padding_mask,
            "output_layer": 9 if version == "v1" else 12,
        }
        t0 = ttime()
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]

        if index is not None and big_npy is not None and index_rate != 0:
            npy = feats[0].cpu().numpy()
            if self.is_half:
                npy = npy.astype("float32")

            # _, I = index.search(npy, 1)
            # npy = big_npy[I.squeeze()]

            score, ix = index.search(npy, k=8)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

            if self.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        t1 = ttime()
        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
            if pitch is not None and pitchf is not None:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            if pitch is not None and pitchf is not None:
                audio1 = (
                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
                    .data.cpu()
                    .float()
                    .numpy()
                )
            else:
                audio1 = (
                    (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
                )
        del feats, p_len, padding_mask
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        t2 = ttime()
        times[0] += t1 - t0
        times[2] += t2 - t1
        return audio1
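    # Feature retrieval in vc() above: each HuBERT frame is replaced by an
    # inverse-square-distance weighted average of its k=8 nearest neighbors
    # from the training-set index, then blended back linearly:
    #
    #   weight_j = (1 / score_j**2) / sum_k (1 / score_k**2)
    #   feats    = index_rate * sum_j weight_j * big_npy[ix_j]
    #              + (1 - index_rate) * feats
    #
    # index_rate = 0 bypasses retrieval entirely; index_rate = 1 keeps only
    # retrieved training-set features.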
    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,
        times,
        f0_up_key,
        f0_method,
        file_index,
        # file_big_npy,
        index_rate,
        if_f0,
        version,
        crepe_hop_length,
        f0_file=None,
    ):
        if (
            file_index != ""
            # and file_big_npy != ""
            # and os.path.exists(file_big_npy)
            and os.path.exists(file_index)
            and index_rate != 0
        ):
            try:
                index = faiss.read_index(file_index)
                # big_npy = np.load(file_big_npy)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except:
                traceback.print_exc()
                index = big_npy = None
        else:
            index = big_npy = None
        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            # cut long audio where the windowed moving sum is smallest (the quietest point)
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for t in range(self.t_center, audio.shape[0], self.t_center):
                opt_ts.append(
                    t
                    - self.t_query
                    + np.where(
                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                    )[0][0]
                )
        s = 0
        audio_opt = []
        t = None
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        inp_f0 = None
        if hasattr(f0_file, "name"):
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                inp_f0 = []
                for line in lines:
                    inp_f0.append([float(i) for i in line.split(",")])
                inp_f0 = np.array(inp_f0, dtype="float32")
            except:
                traceback.print_exc()
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        pitch, pitchf = None, None
        if if_f0 == 1:
            pitch, pitchf = self.get_f0(
                audio_pad, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        t2 = ttime()
        times[1] += t2 - t1
        for t in opt_ts:
            t = t // self.window * self.window
            if if_f0 == 1:
                audio_opt.append(
                    self.vc(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            else:
                audio_opt.append(
                    self.vc(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        None,
                        None,
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            s = t
        if if_f0 == 1:
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    pitch[:, t // self.window :] if t is not None else pitch,
                    pitchf[:, t // self.window :] if t is not None else pitchf,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        else:
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    None,
                    None,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        audio_opt = np.concatenate(audio_opt)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt
--------------------------------------------------------------------------------
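# Appendix: a minimal sketch of driving VC.pipeline directly from Python.
# Everything below is illustrative: hubert_model, net_g, and config are
# placeholders for objects that rvcgui.py constructs elsewhere (config only
# needs the attributes VC.__init__ reads: x_pad, x_query, x_center, x_max,
# is_half, device), and the paths are examples.
#
#   from my_utils import load_audio
#   from vc_infer_pipeline import VC
#
#   audio = load_audio("input.wav", 16000)   # 16 kHz mono float32
#   vc = VC(tgt_sr=40000, config=config)
#   audio_out = vc.pipeline(
#       hubert_model, net_g, 0, audio, [0, 0, 0],
#       f0_up_key=0, f0_method="harvest", file_index="added.index",
#       index_rate=0.4, if_f0=1, version="v2", crepe_hop_length=128,
#   )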